# Importações

In [2]:
# Carregar e visualizar os dados
import pandas as pd
import numpy as np

#Biblioteca para gerar e manipular as word embeddings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#Biblioteca para preprocessar os textos 
import spacy

#Importando bibliotecas do ScikitLearn para: 
# - Fazer a divisão entre treino e teste
from sklearn.model_selection import train_test_split
# - Importando um modelo de classificação SVC (Support Vector Classifier)
from sklearn.svm import SVC
# - Importando a função para avaliar o resultado da classificação
from sklearn.metrics import f1_score

# Dataset

In [3]:
# Load CSTR.csv into a DataFrame.
# NOTE(review): the absolute path only resolves where a drive is mounted at
# /content/drive — confirm before running in another environment.
df = pd.read_csv('/content/drive/My Drive/Datasets/Texts/CSTR.csv')

In [4]:
# Display the loaded frame; the output below shows the columns file_name, text and class.
df

Unnamed: 0,file_name,text,class
0,126.txt,Rhetorical (Rhet) is a programming / knowledge...,ArtificiallIntelligence
1,5.txt,Reduction is the operation of transforming a p...,ArtificiallIntelligence
2,48.txt,"For years, researchers have used knowledge-int...",ArtificiallIntelligence
3,81.txt,Proceedings of a workshop held in conjunction ...,ArtificiallIntelligence
4,25.txt,The Medication Advisor is the latest project o...,ArtificiallIntelligence
...,...,...,...
294,39.txt,Scoring protocols are a broad class of voting ...,Theory
295,5.txt,We study the behavior of Range Voting and Norm...,Theory
296,28.txt,Using entropy of traffic distributions has bee...,Theory
297,27.txt,We study the complexity of influencing electio...,Theory


In [5]:
# Stratified 70/30 split on the class label. random_state pins the shuffle so
# the split (and every score computed from it) is the same on each run.
df_treino, df_teste = train_test_split(df, test_size=0.3, stratify=df['class'], random_state=42)

# Pré-processamento

In [7]:
# Load the 'en_core_web_sm' spaCy pipeline with the parser and NER components
# disabled; `preprocessamento` only reads lemma_, is_alpha and is_stop from tokens.
nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])

In [8]:
def preprocessamento(texto):
  """Turn a raw text into a list of lowercased lemmas.

  The text is run through the module-level spaCy pipeline `nlp`. Tokens that
  are not purely alphabetic, or that are flagged as stop words, are dropped.

  Args:
    texto: The text to process.

  Returns:
    A list of strings, one lowercased lemma per kept token, in token order.
  """
  lemas = []
  for token in nlp(texto):
    # Skip anything that is not an alphabetic, non-stop-word token.
    if not token.is_alpha or token.is_stop:
      continue
    lemas.append(token.lemma_.lower())
  return lemas

In [14]:
def tagged_data(textos): 
  """Wrap each text in a TaggedDocument ready for Doc2Vec.

  Each document's words come from `preprocessamento`, and its single tag is
  the text's position in `textos` converted to a string ('0', '1', ...).
  The tag is positional: it is not taken from any index the input may carry.

  Args:
    textos: An iterable of raw text strings.

  Returns:
    A list of TaggedDocument objects in the same order as `textos`.
  """
  documentos = []
  for posicao, texto in enumerate(textos):
    palavras = preprocessamento(texto)
    documentos.append(TaggedDocument(words=palavras, tags=[str(posicao)]))
  return documentos

In [37]:
def construir_vetores_treino(modelo, tagged_data, num_dim):
  """Build the feature matrix for documents the model was trained on.

  For each tagged document, the vector stored in the model under the
  document's first tag is copied into the corresponding row.

  Args:
    modelo: A trained model exposing `docvecs`, indexable by document tag.
    tagged_data: A sequence of (words, tags) pairs such as TaggedDocument;
      only the first tag of each pair is used.
    num_dim: Number of columns of the matrix; must equal the length of the
      vectors stored in the model.

  Returns:
    A NumPy array of shape (len(tagged_data), num_dim).
  """
  matrix = np.zeros((len(tagged_data), num_dim))
  for i, documento in enumerate(tagged_data):
    # Read the tag from the argument, not from the notebook-level
    # `tagged_treino`, so the function works for whatever list it is given.
    primeira_tag = documento[1][0]
    matrix[i] = modelo.docvecs[primeira_tag]
  return matrix

In [79]:
def construir_vetores_teste(modelo, tagged_data, num_dim):
  """Build the feature matrix for documents the model has not seen.

  Each row is obtained by calling `modelo.infer_vector` on the document's
  word list. The documents' tags are not used.

  Args:
    modelo: A model exposing `infer_vector(words)`.
    tagged_data: A sequence of (words, tags) pairs such as TaggedDocument.
    num_dim: Number of columns of the matrix; must equal the length of the
      vectors returned by `infer_vector`.

  Returns:
    A NumPy array of shape (len(tagged_data), num_dim).
  """
  total = len(tagged_data)
  vetores = np.zeros((total, num_dim))
  for linha, documento in enumerate(tagged_data):
    palavras = documento[0]
    vetores[linha] = modelo.infer_vector(palavras)
  return vetores

# Gerando a representação do conjunto de treino

In [16]:
# Number of dimensions of the document vectors. It is passed as `num_dim` to
# the matrix builders, so it must match the vector size of the Doc2Vec model.
num_dimensoes = 100

In [18]:
# Preprocess the training texts and tag each one with its position ('0', '1', ...).
tagged_treino = tagged_data(df_treino['text'])

In [19]:
# Display the tagged training documents (this prints the whole list).
tagged_treino

[TaggedDocument(words=['coarse', 'grain', 'task', 'parallelism', 'exist', 'sequential', 'code', 'leveraged', 'boost', 'use', 'chip', 'multi', 'processor', 'large', 'task', 'execute', 'thousand', 'line', 'code', 'complex', 'analyze', 'manage', 'statically', 'report', 'describe', 'programming', 'system', 'call', 'parallelization', 'consist', 'programming', 'interface', 'support', 'system', 'interface', 'small', 'language', 'primitive', 'mark', 'possibly', 'parallel', 'task', 'possible', 'dependence', 'support', 'system', 'implement', 'software', 'ensure', 'correct', 'parallel', 'execution', 'speculative', 'parallelization', 'speculative', 'communication', 'speculative', 'memory', 'allocation', 'manage', 'parallelism', 'dynamically', 'tolerate', 'unevenness', 'task', 'size', 'inter', 'task', 'delay', 'hardware', 'speed', 'evaluate', 'size', 'benchmark', 'application', 'suggestible', 'parallelization', 'obtain', 'time', 'speedup', 'processor', 'sequential', 'legacy', 'application', 'thousa

In [54]:
# Create the Doc2Vec model. vector_size reuses num_dimensoes so the model's
# vectors always have the width the matrix builders are called with.
modelo = Doc2Vec(vector_size=num_dimensoes, min_count=2, dm=1, dm_concat=1, epochs=100)

In [55]:
# Build the model's vocabulary from the tagged training documents.
modelo.build_vocab(tagged_treino)

In [56]:
# Train the model for the number of epochs configured on the model itself.
# `epochs` is the attribute that matches the constructor argument; `iter` is
# only a deprecated alias for it.
modelo.train(tagged_treino, total_examples=modelo.corpus_count, epochs=modelo.epochs)

  


In [59]:
# Reduce memory usage by discarding temporary training data; the keyword
# arguments ask to keep the document-tag vectors and the state used for inference.
# NOTE(review): this method may not exist in newer gensim releases — confirm
# against the installed version.
modelo.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [60]:
# Look up the learned vector of the training document tagged '57'.
modelo.docvecs['57']

array([-0.09531692,  0.14777881,  0.01527601, -0.27760738,  0.01821178,
       -0.14955133,  0.08100421,  0.12160641,  0.19397856,  0.09932791,
        0.22632395,  0.22932132,  0.12475091,  0.00220783,  0.25249875,
        0.00781584,  0.07512537,  0.00733879,  0.15391338, -0.17754707,
       -0.06815126, -0.02188533,  0.04624556,  0.02631936, -0.17116177,
        0.08597803, -0.21453023, -0.02994613,  0.05753513, -0.12077526,
       -0.10479087, -0.0329353 , -0.04519742,  0.07604741, -0.16221151,
        0.0929579 , -0.18130033, -0.27628458,  0.07411991,  0.0520831 ,
        0.34202477,  0.05335351, -0.23433001,  0.00145978, -0.15037946,
       -0.08914318,  0.05399174,  0.11934274, -0.12055969,  0.13413334,
       -0.00782364, -0.12108607,  0.07882651, -0.0920041 , -0.15703125,
       -0.16427594,  0.00209779,  0.14311707, -0.16013536, -0.06704012,
       -0.17469823,  0.06495933, -0.03898958, -0.08428542,  0.1060891 ,
        0.1188658 , -0.0368067 , -0.3003535 , -0.14760385,  0.01

In [61]:
# Check the vector length; it should equal num_dimensoes.
modelo.docvecs['57'].shape

(100,)

In [62]:
# Training feature matrix: one row per training document, read from the trained model.
repr_treino = construir_vetores_treino(modelo, tagged_treino, num_dimensoes)

In [63]:
# Display the training feature matrix.
repr_treino

array([[-7.56554976e-02,  1.32469073e-01, -3.20364833e-01, ...,
         2.95264069e-02,  1.99019074e-01,  1.26098141e-01],
       [ 3.61324817e-01, -1.39179677e-01,  6.61531746e-01, ...,
         3.86130512e-02,  7.69469440e-01,  4.02653039e-01],
       [ 3.29764068e-01,  4.27896291e-01,  3.44802916e-01, ...,
         2.06045270e-01,  4.44042683e-01, -5.51453307e-02],
       ...,
       [-2.71313079e-02,  1.72711400e-04, -6.98105022e-02, ...,
        -3.89761515e-02,  4.41701226e-02, -2.33745929e-02],
       [ 9.08939317e-02, -2.83858836e-01,  2.38789976e-01, ...,
         1.57619044e-02,  6.30073100e-02, -4.50621434e-02],
       [ 2.49088928e-03,  3.57903563e-03,  5.41862659e-02, ...,
         1.25495065e-03,  1.20423585e-01, -8.92963111e-02]])

In [64]:
# Shape is (number of training documents, num_dimensoes).
repr_treino.shape

(209, 100)

# Construindo o modelo de classificação no treino

In [65]:
# Support vector classifier with a linear kernel; all other settings are left at their defaults.
classificador = SVC(kernel='linear')

In [66]:
# Fit on the training matrix. Rows line up with df_treino['class'] because
# tagged_treino was built by enumerating df_treino['text'] in order.
classificador.fit(repr_treino, df_treino['class'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Testando no conjunto de teste

In [80]:
# Preprocess the test texts. The tags restart at '0' here, but
# construir_vetores_teste only reads each document's words, never its tags.
tagged_test = tagged_data(df_teste['text'])

In [81]:
# Display the tagged test documents (this prints the whole list).
tagged_test

[TaggedDocument(words=['paper', 'review', 'compare', 'technique', 'model', 'base', 'pose', 'recovery', 'extrinsic', 'camera', 'calibration', 'monocular', 'image', 'classify', 'solution', 'report', 'literature', 'analytical', 'perspective', 'affine', 'numerical', 'perspective', 'present', 'reformulation', 'important', 'numerical', 'perspective', 'solution', 'lowe', 'algorithm', 'phong', 'horaud', 'algorithm', 'improvement', 'lowe', 'algorithm', 'consist', 'eliminate', 'simplify', 'assumption', 'projective', 'equation', 'careful', 'experimental', 'evaluation', 'reveal', 'result', 'fully', 'projective', 'algorithm', 'superexponential', 'convergence', 'property', 'wide', 'range', 'initial', 'solution', 'realistic', 'usage', 'condition', 'order', 'magnitude', 'accurate', 'original', 'formulation', 'arguably', 'well', 'computation', 'time', 'property', 'extension', 'phong', 'horaud', 'algorithm', 'good', 'knowledge', 'method', 'independent', 'orientation', 'recovery', 'actually', 'exploit', 

In [82]:
# Test feature matrix: one vector per test document, inferred with infer_vector.
repr_teste = construir_vetores_teste(modelo, tagged_test, num_dimensoes)

In [83]:
# Display the test feature matrix.
repr_teste

array([[ 0.14957792, -0.02110978, -0.17338872, ..., -0.57491559,
        -0.0523877 , -0.90663123],
       [ 0.14022163,  0.98347157, -0.15225066, ...,  1.53474355,
        -0.06904658,  0.21300852],
       [ 1.20019698, -1.02697945, -0.13053896, ..., -0.31462449,
        -1.03807938, -1.12245822],
       ...,
       [ 0.32470039, -0.4967989 , -0.94054562, ..., -0.24284711,
        -1.1354928 ,  1.03895724],
       [ 0.43019155, -1.17508042,  0.32049364, ..., -0.72684658,
        -0.60636187,  1.29158974],
       [-0.32256991, -0.17799577,  1.47638035, ...,  0.20273982,
         1.21486187,  0.06626972]])

In [84]:
# Shape is (number of test documents, num_dimensoes).
repr_teste.shape

(90, 100)

In [85]:
# Predict a class label for every row of the test matrix.
predicoes = classificador.predict(repr_teste)

In [86]:
# Display the predicted labels.
predicoes

array(['Robotics', 'ArtificiallIntelligence', 'Theory',
       'ArtificiallIntelligence', 'Systems', 'Theory',
       'ArtificiallIntelligence', 'Theory', 'ArtificiallIntelligence',
       'ArtificiallIntelligence', 'Systems', 'Theory', 'Systems',
       'Theory', 'ArtificiallIntelligence', 'Robotics', 'Robotics',
       'Robotics', 'Robotics', 'Robotics', 'Robotics', 'Systems',
       'Theory', 'ArtificiallIntelligence', 'Theory',
       'ArtificiallIntelligence', 'Robotics', 'ArtificiallIntelligence',
       'Theory', 'ArtificiallIntelligence', 'Theory', 'Theory',
       'ArtificiallIntelligence', 'Robotics', 'ArtificiallIntelligence',
       'Robotics', 'Theory', 'Robotics', 'Theory', 'Robotics',
       'ArtificiallIntelligence', 'Robotics', 'ArtificiallIntelligence',
       'ArtificiallIntelligence', 'Systems', 'Robotics',
       'ArtificiallIntelligence', 'ArtificiallIntelligence', 'Robotics',
       'Systems', 'Theory', 'Robotics', 'Robotics', 'Robotics',
       'Robotics', 'Theo

In [87]:
# Macro-averaged F1 of the predictions against the true test labels.
f1_score(df_teste['class'],predicoes,average='macro')

0.772909546593757