#Importações

In [1]:
#Utilitários para a manipulação dos dados
import pandas as pd
import numpy as np

#Gerar as wordembeddings
import gensim

#Pré-processamento (padronizão e limpeza)
import spacy 

#Construção do modelo e avaliação
from sklearn.model_selection import train_test_split 
from sklearn.svm import SVC 
from sklearn.metrics import f1_score

# Carregando o Dataset

In [2]:
df = pd.read_csv('/content/drive/My Drive/Datasets/Texts/CSTR.csv')

In [3]:
df

Unnamed: 0,file_name,text,class
0,126.txt,Rhetorical (Rhet) is a programming / knowledge...,ArtificiallIntelligence
1,5.txt,Reduction is the operation of transforming a p...,ArtificiallIntelligence
2,48.txt,"For years, researchers have used knowledge-int...",ArtificiallIntelligence
3,81.txt,Proceedings of a workshop held in conjunction ...,ArtificiallIntelligence
4,25.txt,The Medication Advisor is the latest project o...,ArtificiallIntelligence
...,...,...,...
294,39.txt,Scoring protocols are a broad class of voting ...,Theory
295,5.txt,We study the behavior of Range Voting and Norm...,Theory
296,28.txt,Using entropy of traffic distributions has bee...,Theory
297,27.txt,We study the complexity of influencing electio...,Theory


In [4]:
df['class'].unique()

array(['ArtificiallIntelligence', 'Robotics', 'Systems', 'Theory'],
      dtype=object)

In [5]:
df_treino, df_teste = train_test_split(df, test_size = 0.3, stratify=df['class'])

In [6]:
df_treino

Unnamed: 0,file_name,text,class
241,7.txt,Memory hardware reliability is an indispensabl...,Systems
225,51.txt,Augmented reality is the merging of synthetic ...,Robotics
30,36.txt,One of the less appreciated obstacles to scali...,ArtificiallIntelligence
159,76.txt,We propose and implement a novel method for vi...,Robotics
165,30.txt,"In this report, we describe methods of acquiri...",Robotics
...,...,...,...
266,33.txt,Recently Gla{\ss}er et al. have shown that for...,Theory
167,3.txt,We propose a means of extending Conditional Ra...,Robotics
180,34.txt,"In this paper, we present a method for propaga...",Robotics
282,1.txt,"This note is a commentary on, and critique of,...",Theory


In [7]:
df_teste

Unnamed: 0,file_name,text,class
46,39.txt,We propose a compaction of WordNet senses for ...,ArtificiallIntelligence
126,70.txt,Finding lineal features in an image is an impo...,ArtificiallIntelligence
6,76.txt,Based on the observation that the unpredictabl...,ArtificiallIntelligence
254,6.txt,"For many election systems, bribery (and relate...",Theory
157,35.txt,Tracking is frequently considered a frame-to-f...,Robotics
...,...,...,...
244,5.txt,Most computing users today have access to clus...,Systems
275,35.txt,The study of semifeasible algorithms was initi...,Theory
280,8.txt,This paper is concerned with the computational...,Theory
243,3.txt,We argue for transactions as the synchronizati...,Systems


In [8]:
df_treino.groupby('class').count()

Unnamed: 0_level_0,file_name,text
class,Unnamed: 1_level_1,Unnamed: 2_level_1
ArtificiallIntelligence,89,89
Robotics,70,70
Systems,18,18
Theory,32,32


In [9]:
df_teste.groupby('class').count()

Unnamed: 0_level_0,file_name,text
class,Unnamed: 1_level_1,Unnamed: 2_level_1
ArtificiallIntelligence,39,39
Robotics,30,30
Systems,7,7
Theory,14,14


# Pré-processamento

In [10]:
nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])

In [19]:
def preprocessamento (texto): 
  final_tokens = []
  doc = nlp(texto)
  for token in doc:
    if(token.is_alpha and not token.is_stop):
          final_tokens.append(token.lemma_.lower())
      
  return ' '.join(final_tokens)

In [20]:
df_treino['tokens'] = df_treino['text'].apply(preprocessamento)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [21]:
df_treino

Unnamed: 0,file_name,text,class,tokens
241,7.txt,Memory hardware reliability is an indispensabl...,Systems,memory hardware reliability indispensable syst...
225,51.txt,Augmented reality is the merging of synthetic ...,Robotics,augment reality merging synthetic sensory info...
30,36.txt,One of the less appreciated obstacles to scali...,ArtificiallIntelligence,appreciated obstacle scale multi agent system ...
159,76.txt,We propose and implement a novel method for vi...,Robotics,propose implement novel method visual space tr...
165,30.txt,"In this report, we describe methods of acquiri...",Robotics,report describe method acquire environment map...
...,...,...,...,...
266,33.txt,Recently Gla{\ss}er et al. have shown that for...,Theory,recently et al show class include pspace np ho...
167,3.txt,We propose a means of extending Conditional Ra...,Robotics,propose means extend conditional random field ...
180,34.txt,"In this paper, we present a method for propaga...",Robotics,paper present method propagate segmentation in...
282,1.txt,"This note is a commentary on, and critique of,...",Theory,note commentary critique andre luiz barbosa pa...


# Gerando as representações

In [22]:
#Definindo o número de dimensões
num_dimensoes = 100

In [23]:
modelo_linguagem = gensim.models.FastText(df_treino['tokens'],sg=1,min_count=2,window=10, size=num_dimensoes, iter=50)

In [17]:
modelo_linguagem = gensim.models.Word2Vec(df_treino['tokens'],sg=0,min_count=2,window=10, size=num_dimensoes, iter=50)

In [24]:
modelo_linguagem.wv.vocab

{' ': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a4e0>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a438>,
 'b': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a630>,
 'c': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a518>,
 'd': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a390>,
 'e': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a240>,
 'f': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a6a0>,
 'g': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a550>,
 'h': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a470>,
 'i': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a710>,
 'j': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a128>,
 'k': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a400>,
 'l': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a0f0>,
 'm': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a668>,
 'n': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a160>,
 'o': <gensim.models.keyedvectors.Vocab at 0x7fa56f96a2e8>,
 'p': <gensim.models.keyedvectors.Vocab 