# Importações

In [1]:
# Carregar e visualizar os dados
import pandas as pd
import numpy as np

#Biblioteca para gerar e manipular as word embeddings
import gensim
from gensim.models import KeyedVectors

#Biblioteca para preprocessar os textos 
import spacy

#Importando bibliotecas do ScikitLearn para: 
# - Faze a divisão entre treino e teste
from sklearn.model_selection import train_test_split
# - Importando um modelo de classificação SVC (Support Vector Classifier)
from sklearn.svm import SVC
# - Importande a função para avaliar o resultado da classificação
from sklearn.metrics import f1_score

#Carregando o Dataset

In [3]:
df = pd.read_csv('/content/drive/My Drive/Datasets/Texts/CSTR.csv')

In [4]:
df

Unnamed: 0,file_name,text,class
0,126.txt,Rhetorical (Rhet) is a programming / knowledge...,ArtificiallIntelligence
1,5.txt,Reduction is the operation of transforming a p...,ArtificiallIntelligence
2,48.txt,"For years, researchers have used knowledge-int...",ArtificiallIntelligence
3,81.txt,Proceedings of a workshop held in conjunction ...,ArtificiallIntelligence
4,25.txt,The Medication Advisor is the latest project o...,ArtificiallIntelligence
...,...,...,...
294,39.txt,Scoring protocols are a broad class of voting ...,Theory
295,5.txt,We study the behavior of Range Voting and Norm...,Theory
296,28.txt,Using entropy of traffic distributions has bee...,Theory
297,27.txt,We study the complexity of influencing electio...,Theory


In [5]:
#Listando as classes da coleção
df['class'].unique()

array(['ArtificiallIntelligence', 'Robotics', 'Systems', 'Theory'],
      dtype=object)

In [6]:
#Dividindo a coleção entre treino e teste
df_treino, df_teste = train_test_split(df, test_size=0.3, stratify=df['class'])

# Pré-processamento dos textos

In [21]:
nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])

In [23]:
def preprocessamento(texto):
  return [token.lemma_.lower() for token in nlp(texto) if token.is_alpha and not token.is_stop]

In [25]:
#Preprocessando o treino
df_treino['tokens'] = df_treino['text'].apply(preprocessamento)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
df_treino

Unnamed: 0,file_name,text,class,tokens
237,24.txt,The data layout of a program is critical to pe...,Systems,"[datum, layout, program, critical, performance..."
289,34.txt,Preference aggregation in a multiagent setting...,Theory,"[preference, aggregation, multiagent, setting,..."
107,117.txt,There is a fundamental division between two ap...,ArtificiallIntelligence,"[fundamental, division, approach, cognition, i..."
14,47.txt,Natural language generation is a knowledge-int...,ArtificiallIntelligence,"[natural, language, generation, knowledge, int..."
10,56.txt,Neurons in the visual cortex are known to poss...,ArtificiallIntelligence,"[neuron, visual, cortex, know, possess, locali..."
...,...,...,...,...
279,45.txt,Unambiguity in alternating Turing machines has...,Theory,"[unambiguity, alternate, turing, machine, rece..."
26,84.txt,We describe some simple domain-independent imp...,ArtificiallIntelligence,"[describe, simple, domain, independent, improv..."
238,21.txt,Software transactional memory systems enable a...,Systems,"[software, transactional, memory, system, enab..."
61,20.txt,This study of the Fall 2002 Computer Programmi...,ArtificiallIntelligence,"[study, fall, computer, programming, csc, cour..."


#Gerando as representações

In [27]:
#número de dimensões
num_dimensoes = 100

In [29]:
modelo_linguagem = gensim.models.Word2Vec(df_treino['tokens'],sg=0,min_count=2,window=10,size=num_dimensoes, iter=50)

In [38]:
def construir_representacao(textos, num_dimensoes, metodo):

  matrix = np.zeros((len(textos), num_dimensoes))

  for i in range(len(textos)):
    tokens = textos.iloc[i]
    matrix[i] = soma_vetores(tokens, num_dimensoes)
    if metodo == 'average' and len(tokens) > 0:
      matrix[i] = matrix[i]/len(tokens)
  
  return matrix

def soma_vetores(tokens, num_dimensoes): 
  vetor_texto = np.zeros(num_dimensoes)
  for token in tokens:
    try: 
      vetor_texto += modelo_linguagem.wv.get_vector(token)
    except KeyError:
      continue 
  return vetor_texto 

In [39]:
repr_treino = construir_representacao(df_treino['tokens'], num_dimensoes, 'sum')

In [40]:
repr_treino

array([[ -4.95350236,  21.27726462,  26.21927706, ...,  22.39286217,
         -3.22852063,  -0.23166905],
       [ 33.50306972, -45.08158656,  23.53531729, ...,  -4.84292804,
        -10.88526754, -11.09137373],
       [-48.55771159,   5.26844899, -19.64602363, ...,  32.93304568,
         -4.60542911,  16.09825373],
       ...,
       [-12.64128875,  19.46455547,  25.44160598, ...,  33.25555335,
         -2.59555877,   0.74415854],
       [  1.52532381,   3.82348572,  39.29577235, ...,   8.55317837,
         30.70047313,  26.55599136],
       [-15.18093174,  19.53441319,  19.28081651, ...,  -8.84902747,
         18.18806618,  17.62296161]])

In [41]:
repr_treino.shape

(209, 100)

#Construindo um modelo de Classificação

In [42]:
classificador = SVC(kernel='linear')

In [43]:
classificador.fit(repr_treino, df_treino['class'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

#Testando no conjunto de Teste

In [44]:
df_teste

Unnamed: 0,file_name,text,class
202,2.txt,The Conservation Laboratory of the George East...,Robotics
125,108.txt,"This paper describes a formalism, Statistical ...",ArtificiallIntelligence
183,45.txt,Appearance-based object recognition systems ar...,Robotics
286,18.txt,We investigate issues related to two hard prob...,Theory
265,44.txt,We investigate the relative complexity of the ...,Theory
...,...,...,...
281,41.txt,Given a p-order A over a universe of strings (...,Theory
88,18.txt,This paper presents an integrated approach to ...,ArtificiallIntelligence
246,23.txt,"The use of multi-core, multi-processor machine...",Systems
81,114.txt,This report contains a small corpus of transcr...,ArtificiallIntelligence


In [45]:
df_teste['tokens'] = df_teste['text'].apply(preprocessamento)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [46]:
df_teste

Unnamed: 0,file_name,text,class,tokens
202,2.txt,The Conservation Laboratory of the George East...,Robotics,"[conservation, laboratory, george, eastman, ho..."
125,108.txt,"This paper describes a formalism, Statistical ...",ArtificiallIntelligence,"[paper, describe, formalism, statistical, even..."
183,45.txt,Appearance-based object recognition systems ar...,Robotics,"[appearance, base, object, recognition, system..."
286,18.txt,We investigate issues related to two hard prob...,Theory,"[investigate, issue, relate, hard, problem, re..."
265,44.txt,We investigate the relative complexity of the ...,Theory,"[investigate, relative, complexity, graph, iso..."
...,...,...,...,...
281,41.txt,Given a p-order A over a universe of strings (...,Theory,"[give, p, order, universe, string, transitive,..."
88,18.txt,This paper presents an integrated approach to ...,ArtificiallIntelligence,"[paper, present, integrate, approach, build, a..."
246,23.txt,"The use of multi-core, multi-processor machine...",Systems,"[use, multi, core, multi, processor, machine, ..."
81,114.txt,This report contains a small corpus of transcr...,ArtificiallIntelligence,"[report, contain, small, corpus, transcription..."


In [47]:
repr_teste = construir_representacao(df_teste['tokens'], num_dimensoes, 'sum')

In [51]:
predicoes = classificador.predict(repr_teste)

In [52]:
predicoes

array(['Theory', 'ArtificiallIntelligence', 'ArtificiallIntelligence',
       'Theory', 'Theory', 'ArtificiallIntelligence',
       'ArtificiallIntelligence', 'ArtificiallIntelligence', 'Systems',
       'ArtificiallIntelligence', 'ArtificiallIntelligence',
       'ArtificiallIntelligence', 'Robotics', 'Robotics', 'Robotics',
       'Robotics', 'Robotics', 'ArtificiallIntelligence', 'Theory',
       'Robotics', 'Theory', 'Theory', 'Systems', 'Robotics',
       'ArtificiallIntelligence', 'ArtificiallIntelligence', 'Systems',
       'ArtificiallIntelligence', 'Robotics', 'ArtificiallIntelligence',
       'Robotics', 'Robotics', 'ArtificiallIntelligence', 'Robotics',
       'ArtificiallIntelligence', 'Robotics', 'ArtificiallIntelligence',
       'Theory', 'ArtificiallIntelligence', 'ArtificiallIntelligence',
       'ArtificiallIntelligence', 'ArtificiallIntelligence',
       'ArtificiallIntelligence', 'Robotics', 'Robotics', 'Robotics',
       'Robotics', 'ArtificiallIntelligence', 'Robot

In [53]:
df_teste['class']

202                   Robotics
125    ArtificiallIntelligence
183                   Robotics
286                     Theory
265                     Theory
                ...           
281                     Theory
88     ArtificiallIntelligence
246                    Systems
81     ArtificiallIntelligence
219                   Robotics
Name: class, Length: 90, dtype: object

In [54]:
f1_score(df_teste['class'],predicoes,average='macro')

0.8068174653215637

#Comparando com a BOW

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
vetorizador = CountVectorizer(min_df=2)

In [58]:
bow_treino = vetorizador.fit_transform(df_treino['text'])

In [60]:
classificador.fit(bow_treino,df_treino['class'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [61]:
bow_teste = vetorizador.transform(df_teste['text'])

In [62]:
predicoes = classificador.predict(bow_teste)

In [63]:
f1_score(df_teste['class'],predicoes,average='macro')

0.7442238476721236