# Importando o Spacy

In [1]:
import spacy

# Determinando a gama de recursos que o spaCy vai utilizar

In [2]:
# Load the `en_core_web_sm` pipeline by name and bind it to `nlp`,
# the callable the cells below use to process text.
nlp = spacy.load(name="en_core_web_sm")

# Tokenização

In [3]:
"""Gerando um objeto o qual gerará um objeto iteravel e cada item representa um token, e 
   cada token contém, além da palavra em si, sua versão simplificada, função sintática, etc."""
# Process a sample text; the result is iterated token by token in the next cell.
sentenca = nlp(
    'Rafael is working at Google in the South America. He works very hard :D'
)

In [4]:
# The `.text` attribute returns the text of the respective token
for token in sentenca:
    texto_do_token = token.text
    print(texto_do_token)

Rafael
is
working
at
Google
in
the
South
America
.
He
works
very
hard
:D


In [5]:
# A text can also be walked sentence by sentence instead of token by token:
# the object returned by `nlp` exposes its sentences through the `sents` property.
doc = nlp(
    'Rafael is working at Google in the South America. He works very hard!'
)

In [6]:
# Print the text of each sentence in `doc.sents`, one per line
for sent in doc.sents:
    texto_da_sentenca = sent.text
    print(texto_da_sentenca)

Rafael is working at Google in the South America.
He works very hard!


# Detectando Stopwords

In [7]:
# Sample text used by the stopword check in the next cell
sentenca = nlp(
    'Rafael is working at Google in the South America. He works very hard!'
)

In [8]:
# `.is_stop` is True when the token is in the stopword list
for token in sentenca:
    linha = f'{token.text} - {token.is_stop}'
    print(linha)

Rafael - False
is - True
working - False
at - True
Google - False
in - True
the - True
South - False
America - False
. - False
He - True
works - False
very - True
hard - False
! - False


# Detectando tokens compostos apenas por caracteres alfabéticos

In [10]:
# Sample text mixing words, a year, punctuation and a price
sentenca = nlp(
    'Rafael is working at Google in the South America since 1999. He works very hard! This product costs €90.89'
)
# `.is_alpha` is True only when the token consists solely of alphabetic characters
for token in sentenca:
    linha = f'{token.text} - {token.is_alpha}'
    print(linha)

Rafael - True
is - True
working - True
at - True
Google - True
in - True
the - True
South - True
America - True
since - True
1999 - False
. - False
He - True
works - True
very - True
hard - True
! - False
This - True
product - True
costs - True
€ - False
90.89 - False


# Simplificação das Palavras


## Lematização

In [None]:
# Sample text with inflected verbs and plural nouns for the lemmatization cell below
sentenca = nlp(
    'Rafael has drinked two coffees while the computer is computing the values of the matrices.'
)

In [None]:
# Show each token next to its lemma (`lemma_`)
for token in sentenca:
    linha = f'{token.text} - {token.lemma_}'
    print(linha)

Rafael - Rafael
has - have
drinked - drink
two - two
coffees - coffee
while - while
the - the
computer - computer
is - be
computing - compute
the - the
values - value
of - of
the - the
matrices - matrix
. - .


# POS Tagging

In [None]:
# Sample text used by the part-of-speech cell below
sentenca = nlp(
    'Rafael is working at Google in the South America. He works very hard!'
)

In [None]:
# Show each token with its `pos_` and `tag_` attributes
for token in sentenca:
    linha = f'{token.text} - {token.pos_} - {token.tag_}'
    print(linha)

Rafael - PROPN - NNP
is - AUX - VBZ
working - VERB - VBG
at - ADP - IN
Google - PROPN - NNP
in - ADP - IN
the - DET - DT
South - PROPN - NNP
America - PROPN - NNP
. - PUNCT - .
He - PRON - PRP
works - VERB - VBZ
very - ADV - RB
hard - ADV - RB
! - PUNCT - .


# Entidades Nomeadas

In [None]:
sentenca = nlp('Rafael is working at Google in the South America. He works very hard!')
# `.ents` returns only the entities identified in the text. For each one, print its
# text, its label and the description `spacy.explain` gives for that label.
# An f-string is used, like the other cells of this notebook; it renders a missing
# explanation (None) as "None", exactly as the previous explicit str() call did.
for entidade in sentenca.ents:
    print(f'{entidade.text} - {entidade.label_} - {spacy.explain(entidade.label_)}')

Rafael - PERSON - People, including fictional
Google - ORG - Companies, agencies, institutions, etc.
the South America - LOC - Non-GPE locations, mountain ranges, bodies of water


In [None]:
# `noun_chunks` yields the noun phrases found in `sentenca`, not only proper names
# (the output below also lists the pronoun "He").
for sintagma in sentenca.noun_chunks:
    print(sintagma.text)

Rafael
Google
the South America
He


# Integrando o spaCy e o scikit-learn para pré-processar os textos e gerar uma representação estruturada

In [None]:
# Importing the required libraries
import numpy as np
import pandas as pd

# Show up to 200 characters per cell so the example sentences are not truncated
# when a DataFrame is displayed. The fully qualified option key is used instead of
# the abbreviated 'max_colwidth', which pandas only resolves by pattern matching.
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

import spacy
# Rebind `nlp` to the `en_core_web_sm` pipeline (the same model name loaded at the
# top of the notebook); `preprocessamento` below reads this global.
nlp = spacy.load(name="en_core_web_sm")

In [None]:
"""Criando a função para preprocessar os textos.
   Nessa função, serão removidos os tokens não alfabéticos (pontuação,
   números e símbolos) e as stopwords, e os termos restantes serão lematizados"""
def preprocessamento(texto):
    """Turn a raw text into a space-separated string of lemmas.

    The text is processed with the global spaCy pipeline ``nlp``. A token is
    kept only when ``token.is_alpha`` is true and ``token.is_stop`` is false,
    so punctuation, numbers, symbols and stopwords are all discarded. Each
    kept token contributes its ``lemma_`` exactly as the pipeline returns it
    (no case folding is applied here).

    Args:
        texto: The text to preprocess.

    Returns:
        The lemmas of the kept tokens, in their original order, joined by a
        single space. An empty string when no token is kept.
    """
    doc = nlp(texto)
    lemas = [
        token.lemma_
        for token in doc
        if token.is_alpha and not token.is_stop
    ]
    return ' '.join(lemas)

In [None]:
# Creating the dataset: each entry pairs a sentence with the class label assigned to it
corpus_rotulado = [
    ('Goku is a hero in the Dragon Ball since 1989! Goku saved the earth so many times.', 'Dragon Ball'),
    ('The 7 Dragon balls can make wishes come true! Each ball contains his own dragon.', 'Dragon Ball'),
    ('If the wishes are superfluous, the dragon balls will become dark.', 'Dragon Ball'),
    ('Seiya is a bronze knight and is one of the main Knights of the Zodiac. He saved Athena several times.', 'Cav. Zod.'),
    ('A knight of the zodiac wear a bronze, silver or a gold cloth to protect Athena.', 'Cav. Zod.'),
    ('Saint Seiya: Knights of the Zodiac is a Japanese manga in which mystical warriors called the Saints fight wearing sacred cloths.', 'Cav. Zod.'),
]
texts = [texto for texto, rotulo in corpus_rotulado]
classes = [rotulo for texto, rotulo in corpus_rotulado]

# One row per sentence, with the raw text and its label side by side
df = pd.DataFrame({'texts': texts, 'classes': classes})

In [None]:
"""Aplicando a função de preprocessamento na base de dados
   e gerando uma nova coluna"""
# Run every raw text through `preprocessamento` and store the result in a new column
df['texts_preprocessed'] = df['texts'].map(preprocessamento)

In [None]:
# Bare expression as the cell's last statement: the notebook renders the frame,
# now including the `texts_preprocessed` column, as the cell output.
df

Unnamed: 0,texts,classes,texts_preprocessed
0,Goku is a hero in the Dragon Ball since 1989! Goku saved the earth so many times.,Dragon Ball,Goku hero Dragon Ball Goku save earth time
1,The 7 Dragon balls can make wishes come true! Each ball contains his own dragon.,Dragon Ball,Dragon ball wish come true ball contain dragon
2,"If the wishes are superfluous, the dragon balls will become dark.",Dragon Ball,wish superfluous dragon ball dark
3,Seiya is a bronze knight and is one of the main Knights of the Zodiac. He saved Athena several times.,Cav. Zod.,Seiya bronze knight main Knights Zodiac save Athena time
4,"A knight of the zodiac wear a bronze, silver or a gold cloth to protect Athena.",Cav. Zod.,knight zodiac wear bronze silver gold cloth protect Athena
5,Saint Seiya: Knights of the Zodiac is a Japanese manga in which mystical warriors called the Saints fight wearing sacred cloths.,Cav. Zod.,Saint Seiya Knights Zodiac japanese manga mystical warrior call saint fight wear sacred cloth


In [None]:
# Create the vectorizer; with min_df=2 a term must appear in at least two documents
# to enter the vocabulary.
vetorizador = CountVectorizer(min_df=2)
# A single fit_transform call learns the vocabulary and returns the structured
# (document x term) count representation.
representacao = vetorizador.fit_transform(raw_documents=df['texts_preprocessed'])

In [None]:
# `vocabulary_` maps each term to its column index; sorting the terms by that
# index gives the labels in the same order as the matrix columns.
vocabulario = vetorizador.vocabulary_
colunas = sorted(vocabulario, key=vocabulario.get)

# Build the DataFrame from the dense form of the matrix, labelled with the terms
df_repr = pd.DataFrame(data=representacao.toarray(), columns=colunas)

In [None]:
# Bare expression as the cell's last statement: displays the term-count table,
# one row per text and one column per vocabulary term.
df_repr

Unnamed: 0,athena,ball,bronze,cloth,dragon,knight,knights,save,seiya,time,wear,wish,zodiac
0,0,1,0,0,1,0,0,1,0,1,0,0,0
1,0,2,0,0,2,0,0,0,0,0,0,1,0
2,0,1,0,0,1,0,0,0,0,0,0,1,0
3,1,0,1,0,0,1,1,1,1,1,0,0,1
4,1,0,1,1,0,1,0,0,0,0,1,0,1
5,0,0,0,1,0,0,1,0,1,0,1,0,1


In [None]:
# (number of texts, number of vocabulary terms) of the term-count table
df_repr.shape

(6, 13)