# Importações

In [10]:
# Carregar e visualizar os dados
import pandas as pd

#Biblioteca para gerar e manipular as word embeddings
import gensim
from gensim.models import KeyedVectors

#Biblioteca para preprocessar os textos 
import spacy

#O tqdm é utilizado para exibir uma barra de progresso
#Os demais comandos são utilizados para o correto funcionamento no Colab
from tqdm import tqdm
from IPython import get_ipython
def tqdm_clear(*args, **kwargs):
    getattr(tqdm, '_instances', {}).clear()



#Base de Dados

In [4]:
df = pd.read_csv('/content/drive/My Drive/Datasets/Texts/AmazonFoodReviews.csv')

In [5]:
df.head(10)

Unnamed: 0,Score,Summary,Text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...
5,4,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,5,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,5,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,5,Yay Barley,Right now I'm mostly just sprouting this so my...
9,5,Healthy Dog Food,This is a very healthy dog food. Good for thei...


#Pré-processando os Textos da Coleção

In [6]:
# Initializando o spacy, porém, desabilitando recursos que não iremos utilizar
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [7]:
def preprocessamento (texto): 
  final_tokens = []
  doc = nlp(texto)
  for token in doc:
    if(token.is_alpha and not token.is_stop):
          final_tokens.append(token.lemma_.lower())
      
  return ' '.join(final_tokens)

In [11]:
tqdm.pandas()
df['Tokens'] = df['Text'].progress_apply(preprocessamento)

100%|██████████| 568454/568454 [53:16<00:00, 177.83it/s]


In [12]:
#Salvando para garantir
df.to_csv('/content/drive/My Drive/Datasets/Texts/AmazonFoodReviewsWithTokens.csv', index=False)

In [13]:
df.head(n=10)

Unnamed: 0,Score,Summary,Text,Tokens
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,"[buy, vitality, can, dog, food, product, find,..."
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,"[product, arrive, label, jumbo, salt, peanut, ..."
2,4,"""Delight"" says it all",This is a confection that has been around a fe...,"[confection, century, light, pillowy, citrus, ..."
3,2,Cough Medicine,If you are looking for the secret ingredient i...,"[look, secret, ingredient, robitussin, believe..."
4,5,Great taffy,Great taffy at a great price. There was a wid...,"[great, taffy, great, price, wide, assortment,..."
5,4,Nice Taffy,I got a wild hair for taffy and ordered this f...,"[get, wild, hair, taffy, order, pound, bag, ta..."
6,5,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...,"[saltwater, taffy, great, flavor, soft, chewy,..."
7,5,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...,"[taffy, good, soft, chewy, flavor, amazing, de..."
8,5,Yay Barley,Right now I'm mostly just sprouting this so my...,"[right, sprout, cat, eat, grass, love, rotate,..."
9,5,Healthy Dog Food,This is a very healthy dog food. Good for thei...,"[healthy, dog, food, good, digestion, good, sm..."


#Treinando as Word Embeddings

## Word2Vec

###Skip-Gram

In [15]:
#Gerando o modelo SkipGram

#Primeiro parâmetros: coleção de textos
#sg=1: arquitetura do tipo Skip-Gram
#min_count=5: número mínimo de vezes que uma palavra deve aparecer ao longo da coleção
#window: tamanho do contexto
#size: número de dimenções da embedding
modelo_w2v_sg = gensim.models.Word2Vec(df['Tokens'],sg=1,min_count=5,window=10, size=100)

In [19]:
#Salvando o modelo inteiro SkipGram
modelo_w2v_sg.save("/content/drive/My Drive/Datasets/Texts/AmazonFoodReviews_model_w2v.model")
#Salvando apenas as Word Embeddings
modelo_w2v_sg.wv.save("/content/drive/My Drive/Datasets/Texts/AmazonFoodReviews_model_w2v.kv")

#Posteriormente para carregar ambos fica
#modelo_w2v_sg = gensim.models.Word2Vec.load("/content/drive/My Drive/Datasets/Texts/AmazonFoodReviews_model_w2v.model") 
#wv_kv_sg = gensim.models.KeyedVectors.load("/content/drive/My Drive/Datasets/Texts/AmazonFoodReviews_model_w2v.kv", mmap='r')

### CBOW

In [17]:
#Parâmetros similares ao Skip-Gram, porém, com sg=0
modelo_w2v_cbow = gensim.models.Word2Vec(df['Tokens'],sg=1,min_count=5,window=10, size=100)

# FastText

In [24]:
#Parêmtros similares aos da abordagem Word2Vec, porém, o modelo utilizado é o FastText
modelo_ft = gensim.models.FastText(df['Tokens'],min_count=5,window=10, size=100)

# Consultando as Palavras Mais Similares nos Diferentes Modelos

In [30]:
#Consultando as 20 palavras mais similares a 'cheese' no modelo skipgram
modelo_w2v_sg.wv.most_similar('cheese', topn=20)

  if np.issubdtype(vec.dtype, np.int):


[('cheddar', 0.805945634841919),
 ('asiago', 0.7987725734710693),
 ('macaroni', 0.7951961755752563),
 ('havarti', 0.7851487398147583),
 ('mozzarella', 0.7811793684959412),
 ('mac', 0.7750540971755981),
 ('chevre', 0.7730174660682678),
 ('curds', 0.7673276662826538),
 ('cheesier', 0.7610152363777161),
 ('parmesan', 0.7524831891059875),
 ('brie', 0.7496126890182495),
 ('castello', 0.747927725315094),
 ('dofino', 0.7445870637893677),
 ('cheesy', 0.7418321371078491),
 ('chedder', 0.7398743629455566),
 ('roquefort', 0.7304426431655884),
 ('provolone', 0.7300024032592773),
 ('velveeta', 0.7291139364242554),
 ('romano', 0.7269539833068848),
 ('chedar', 0.7259482145309448)]

In [31]:
#Consultando as 20 palavras mais similares a 'cheese' no modelo cbow
modelo_w2v_cbow.wv.most_similar('cheese', topn=20)

  if np.issubdtype(vec.dtype, np.int):


[('asiago', 0.7946245074272156),
 ('cheddar', 0.7904796600341797),
 ('mozzarella', 0.7788733839988708),
 ('macaroni', 0.7728685140609741),
 ('chedar', 0.7703266143798828),
 ('havarti', 0.7702367305755615),
 ('parmesan', 0.7679916620254517),
 ('cheesier', 0.7597986459732056),
 ('chevre', 0.7589219212532043),
 ('brie', 0.7582805156707764),
 ('mac', 0.757365345954895),
 ('curds', 0.7530713081359863),
 ('provolone', 0.7515742778778076),
 ('chedder', 0.7480548620223999),
 ('dofino', 0.7386725544929504),
 ('chesse', 0.738013505935669),
 ('castello', 0.735872745513916),
 ('cheesy', 0.7305408120155334),
 ('mascarpone', 0.728222668170929),
 ('grated', 0.722386360168457)]

In [32]:
modelo_ft.wv.most_similar('cheese', topn=20)

  if np.issubdtype(vec.dtype, np.int):


[('cheeese', 0.9560588002204895),
 ('cheesey', 0.9551533460617065),
 ('cheeses', 0.9144033789634705),
 ('theese', 0.8912648558616638),
 ('cheesesteak', 0.8825253248214722),
 ('cheesestick', 0.8648257255554199),
 ('cheeseburger', 0.8571553230285645),
 ('cheesiest', 0.8520803451538086),
 ('cheesier', 0.8470945358276367),
 ('cheeseball', 0.8310973048210144),
 ('cheesy', 0.8292456865310669),
 ('cheesoning', 0.8183912038803101),
 ('cheesecloth', 0.8124037981033325),
 ('cheesburger', 0.8101001977920532),
 ('cheezwhiz', 0.8055063486099243),
 ('cheezy', 0.798599123954773),
 ('cheez', 0.7783213257789612),
 ('creamcheese', 0.777782678604126),
 ('cheezit', 0.7625910043716431),
 ('cheeze', 0.7613284587860107)]

#Obtendo os Vetores das Palavras

In [33]:
modelo_ft.wv.get_vector('cheese')

array([-0.08184408, -2.2311382 , -2.492365  ,  3.257404  , -1.9422194 ,
        2.7177088 , -2.2955086 ,  1.1486218 ,  0.23529346, -3.368218  ,
        3.7355335 , -0.5069643 , -0.5499422 ,  2.2903469 ,  1.134696  ,
        2.670308  ,  1.4461482 ,  1.8927674 ,  0.5051987 ,  2.8341556 ,
        0.06283331, -1.2707167 ,  2.4234438 ,  3.8059785 ,  6.4608674 ,
       -2.1753607 ,  1.2648573 , -0.63337964,  0.5241564 ,  1.577323  ,
       -3.2373915 ,  0.24699497, -3.226584  ,  1.8641607 , -1.2343727 ,
        0.91023755, -0.64831525, -1.6202412 ,  2.9555364 ,  0.03749143,
       -1.249742  ,  0.72780824,  1.9836384 ,  0.7922271 ,  0.61221975,
        2.5431392 , -0.2680755 , -0.97211474,  1.9580075 , -2.5684462 ,
       -2.4829323 , -4.6846886 ,  3.1529284 ,  0.4547629 ,  0.63352   ,
       -0.92202497,  1.4747317 , -3.1064172 ,  1.7622886 ,  1.0706652 ,
        3.3747735 ,  2.7456727 , -1.2298013 ,  0.18480334, -0.08525322,
        0.06383915, -1.4466848 , -1.2302431 , -2.2721653 ,  0.54

#Obtendo os Velores Mais Parecidos Após Operações com os Vetores das Palavras

In [65]:
#positive: soma
#negative: subtrai
modelo_w2v_cbow.wv.most_similar('avocado', topn=1)

  if np.issubdtype(vec.dtype, np.int):


[('avocados', 0.6898744702339172)]

In [66]:
modelo_w2v_cbow.wv.most_similar(positive=['avocado','salsa'], topn=1)

  if np.issubdtype(vec.dtype, np.int):


[('guacamole', 0.7307560443878174)]

In [67]:
modelo_w2v_cbow.wv.most_similar(positive=['avocado','salsa'],negative=['salt'], topn=1)

  if np.issubdtype(vec.dtype, np.int):


[('avocados', 0.58322674036026)]

In [68]:
modelo_w2v_cbow.wv.most_similar(positive=['lemon','water'], topn=1)

  if np.issubdtype(vec.dtype, np.int):


[('lemonaide', 0.7784677743911743)]

# Vocabulário do Modelo

In [69]:
"""Além de retornar o vocabulário pode ser utilizado para
   verificar se uma palavra pertence ao vocabulário de treinamento"""
modelo_w2v_cbow.wv.vocab

{'buy': <gensim.models.keyedvectors.Vocab at 0x7f0d774bdc18>,
 'vitality': <gensim.models.keyedvectors.Vocab at 0x7f0d754a98d0>,
 'can': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9940>,
 'dog': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9978>,
 'food': <gensim.models.keyedvectors.Vocab at 0x7f0d754a99b0>,
 'product': <gensim.models.keyedvectors.Vocab at 0x7f0d754a99e8>,
 'find': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9a20>,
 'good': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9a58>,
 'quality': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9a90>,
 'look': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9ac8>,
 'like': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9b00>,
 'stew': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9b38>,
 'process': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9b70>,
 'meat': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9ba8>,
 'smell': <gensim.models.keyedvectors.Vocab at 0x7f0d754a9be0>,
 'better': <gensim.models.keyedvectors.Vocab

In [71]:
modelo_w2v_sg['cheap']

  """Entry point for launching an IPython kernel.


array([-0.1259561 , -0.4343633 ,  0.11544825,  0.37036207,  0.02404923,
        0.12771359, -0.11024404,  0.07505909,  0.00588032,  0.11527938,
       -0.15609452,  0.23163009,  0.4501444 ,  0.44907334,  0.19261187,
       -0.09032837,  0.03712239,  0.16728175,  0.12081986,  0.35452172,
        0.03871497,  0.06429012,  0.3569138 ,  0.03053388, -0.1945995 ,
       -0.24858478, -0.21007095, -0.01617339, -0.16569327,  0.12372867,
        0.12434195,  0.18353032,  0.01307029,  0.12009319,  0.29589656,
        0.23793861, -0.15751565,  0.13425985,  0.12706701, -0.08293684,
       -0.22191533,  0.2917584 , -0.12052703, -0.09782092, -0.08856533,
        0.2080423 , -0.0045195 ,  0.02030792,  0.19006802, -0.20325619,
        0.22794808, -0.42969278, -0.13556704, -0.05725838,  0.16113901,
       -0.16998972, -0.35072988, -0.04790407,  0.11391092,  0.20651056,
        0.27056786, -0.36972976, -0.00334894, -0.24998936,  0.2361827 ,
       -0.15815926,  0.14368317,  0.36902148,  0.16334164, -0.04