# Importações

In [3]:
# Carregar e visualizar os dados
import pandas as pd
import numpy as np

#Biblioteca para gerar e manipular as word embeddings
import gensim
from gensim.models import KeyedVectors

#Biblioteca para preprocessar os textos 
import spacy

#O tqdm é utilizado para exibir uma barra de progresso
#Os demais comandos são utilizados para o correto funcionamento no Colab
from tqdm import tqdm
from IPython import get_ipython
def tqdm_clear(*args, **kwargs):
    getattr(tqdm, '_instances', {}).clear()

#Importando bibliotecas do ScikitLearn para: 
# - Faze a divisão entre treino e teste
from sklearn.model_selection import train_test_split
# - Importando um modelo de classificação SVC (Support Vector Classifier)
from sklearn.svm import SVC
# - Importande a função para avaliar o resultado da classificação
from sklearn.metrics import f1_score

#Carregando o Dataset

In [4]:
df = pd.read_csv('/content/drive/My Drive/Datasets/Texts/CSTR.csv')

In [5]:
df

Unnamed: 0,file_name,text,class
0,126.txt,Rhetorical (Rhet) is a programming / knowledge...,ArtificiallIntelligence
1,5.txt,Reduction is the operation of transforming a p...,ArtificiallIntelligence
2,48.txt,"For years, researchers have used knowledge-int...",ArtificiallIntelligence
3,81.txt,Proceedings of a workshop held in conjunction ...,ArtificiallIntelligence
4,25.txt,The Medication Advisor is the latest project o...,ArtificiallIntelligence
...,...,...,...
294,39.txt,Scoring protocols are a broad class of voting ...,Theory
295,5.txt,We study the behavior of Range Voting and Norm...,Theory
296,28.txt,Using entropy of traffic distributions has bee...,Theory
297,27.txt,We study the complexity of influencing electio...,Theory


In [6]:
#Listando as classes da coleção
df['class'].unique()

array(['ArtificiallIntelligence', 'Robotics', 'Systems', 'Theory'],
      dtype=object)

In [7]:
#Dividindo a coleção entre treino e teste
df_treino, df_teste = train_test_split(df, test_size=0.3, stratify=df['class'])

In [8]:
df_treino.shape

(209, 3)

In [9]:
df_teste.shape

(90, 3)

In [10]:
df_treino.groupby('class').count()

Unnamed: 0_level_0,file_name,text
class,Unnamed: 1_level_1,Unnamed: 2_level_1
ArtificiallIntelligence,89,89
Robotics,70,70
Systems,18,18
Theory,32,32


In [11]:
df_teste.groupby('class').count()

Unnamed: 0_level_0,file_name,text
class,Unnamed: 1_level_1,Unnamed: 2_level_1
ArtificiallIntelligence,39,39
Robotics,30,30
Systems,7,7
Theory,14,14


# Pré-processamento dos textos

In [12]:
# Initializando o spacy, porém, desabilitando recursos que não iremos utilizar
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [13]:
def preprocessamento (texto): 
  final_tokens = []
  doc = nlp(texto)
  for token in doc:
    if(token.is_alpha and not token.is_stop):
          final_tokens.append(token.lemma_.lower())
      
  return ' '.join(final_tokens)

In [14]:
#Pré-processando o treino
tqdm.pandas()
df_treino['tokens'] = df_treino['text'].progress_apply(preprocessamento)

100%|██████████| 209/209 [00:02<00:00, 71.81it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
df_treino

Unnamed: 0,file_name,text,class,tokens
0,126.txt,Rhetorical (Rhet) is a programming / knowledge...,ArtificiallIntelligence,rhetorical rhet programming knowledge represen...
88,18.txt,This paper presents an integrated approach to ...,ArtificiallIntelligence,paper present integrate approach build affect ...
155,56.txt,Since scalp EEG recordings are measured in mic...,Robotics,scalp eeg recording measure microvolt electric...
58,13.txt,This report proposes a generalization of Dynam...,ArtificiallIntelligence,report propose generalization dynamic predicat...
130,26.txt,We combine scene-space based methods with Baye...,Robotics,combine scene space base method bayesian model...
...,...,...,...,...
213,97.txt,This paper studies the task of using a mobile ...,Robotics,paper study task mobile camera platform search...
296,28.txt,Using entropy of traffic distributions has bee...,Theory,entropy traffic distribution show aid wide var...
159,76.txt,We propose and implement a novel method for vi...,Robotics,propose implement novel method visual space tr...
284,9.txt,Voting and elections are at the core of democr...,Theory,voting election core democratic society people...


# Gerando as *Document Embeddings* a partir das *Word Embeddings*

In [16]:
#Definindo o número de dimensoes
num_dimensoes = 100

In [19]:
#Gerando as word embeddings para o treino
modelo_linguagem = gensim.models.Word2Vec(df_treino['tokens'],sg=0,min_count=2,window=10, size=num_dimensoes, iter=50)

In [20]:
#Gerando a representação para os documentos a partir das word embeddings contidas no modelo modelo_w2v_sg
def construir_representacao(textos, num_dimensoes, metodo): 
    matrix = np.zeros((len(textos), num_dimensoes))

    for i in range(len(textos)):
        tokens = textos.iloc[i]
        matrix[i] = soma_vetores(tokens)
        if(metodo == 'average' and len(tokens) > 0): 
            matrix[i] = matrix[i]/len(tokens)
    return matrix

def soma_vetores(lista_tokens): 
  vetor_combinado = np.zeros(num_dimensoes)
  for token in lista_tokens: 
      try:
          vetor_combinado += modelo_linguagem.wv.get_vector(token)
      except KeyError:
          continue 
  return vetor_combinado



In [21]:
repr_treino = construir_representacao(df_treino['tokens'], num_dimensoes, 'sum')

In [22]:
repr_treino

array([[  9.73907869,  12.37462949,  -6.42558777, ...,  12.87991156,
          5.75684165,  24.92363133],
       [ 25.21470276,  35.73425687,  -5.28574483, ...,  16.98329428,
          3.16250054,  52.63155454],
       [ 27.96372958,  31.09946702, -13.14698417, ...,  23.4711189 ,
          6.8324971 ,  43.84056123],
       ...,
       [ 40.38443252,  23.78829491, -26.16372506, ...,  24.5282011 ,
         26.46866097,  87.86154931],
       [ 37.37096535,  53.86291095, -20.76796724, ...,  21.15344422,
          2.56712138,  69.3442057 ],
       [ 23.58719484,  37.43717156, -24.81408076, ...,  36.71608964,
         13.03803926,  74.73618925]])

In [23]:
repr_treino.shape

(209, 100)

# Construindo um modelo de Classificação no Treino

In [24]:
classifier = SVC(kernel='linear')

In [25]:
classifier.fit(repr_treino, df_treino['class'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Gerando a representação do conjunto de teste para avaliar o modelo

In [26]:
#Primeiro é necessário obter os tokes dos textos do teste
df_teste['tokens'] = df_teste['text'].progress_apply(preprocessamento)
#Posteriomente se obtém as document embeddgins a partir das word embeddings geradas no treino
repr_teste = construir_representacao(df_teste['tokens'], num_dimensoes, 'sum')

100%|██████████| 90/90 [00:01<00:00, 87.68it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
repr_teste.shape

(90, 100)

# Avaliando os resultados da classificação no conjunto de teste

In [28]:
classifier.predict(repr_teste)

array(['Theory', 'Robotics', 'Systems', 'Robotics', 'Robotics',
       'ArtificiallIntelligence', 'Robotics', 'Robotics', 'Robotics',
       'Theory', 'Robotics', 'ArtificiallIntelligence',
       'ArtificiallIntelligence', 'Robotics', 'Theory',
       'ArtificiallIntelligence', 'Robotics', 'ArtificiallIntelligence',
       'Robotics', 'ArtificiallIntelligence', 'ArtificiallIntelligence',
       'Robotics', 'ArtificiallIntelligence', 'Robotics', 'Robotics',
       'Theory', 'ArtificiallIntelligence', 'Robotics',
       'ArtificiallIntelligence', 'Systems', 'ArtificiallIntelligence',
       'Systems', 'ArtificiallIntelligence', 'Robotics',
       'ArtificiallIntelligence', 'Robotics', 'ArtificiallIntelligence',
       'Robotics', 'Theory', 'ArtificiallIntelligence', 'Theory',
       'Systems', 'ArtificiallIntelligence', 'Robotics',
       'ArtificiallIntelligence', 'Robotics', 'ArtificiallIntelligence',
       'ArtificiallIntelligence', 'ArtificiallIntelligence',
       'ArtificiallInte

In [29]:
f1_score(df_teste['class'], classifier.predict(repr_teste), average='macro')

0.5745115995115995

In [30]:
f1_score(df_teste['class'], classifier.predict(repr_teste), average='micro')

0.6111111111111112