# Training word2vec for Item Title

Word2Vec was introduced in two papers between September and October 2013, by a team of researchers at Google. Along with the papers, the researchers published their implementation in C. The Python implementation was done soon after the 1st paper, by Gensim.


![](https://miro.medium.com/fit/c/1838/551/0*_j8UK1NpsCY_yUk2)

ref: https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial#Training-the-model

In [None]:
import numpy as np
import pandas as pd
from time import time
#For displaying complete rows info
pd.options.display.max_colwidth=500
import tensorflow as tf
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import os
import seaborn as sns
import missingno as msno
%matplotlib inline


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
#sns.set_theme(style="whitegrid")

def read_json(input_file):
    '''
    Read a JSON Lines file into a flat DataFrame.

    Each non-blank line of ``input_file`` is parsed as one JSON document and
    the resulting records are flattened with ``pd.json_normalize`` (nested
    keys become dotted column names).

    Parameters
    ----------
    input_file : str or path-like
        Path of the JSON Lines file to read.

    Returns
    -------
    pandas.DataFrame
        One row per JSON record; empty when the file holds no records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    '''
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding.
    with open(input_file, encoding='utf-8') as f:
        # Iterating the file splits on newlines only. str.splitlines() also
        # breaks on characters such as U+2028, which may appear unescaped
        # inside a JSON string and would cut a record in two.
        # Blank lines are skipped instead of crashing json.loads.
        records = [json.loads(line) for line in f if line.strip()]

    return pd.json_normalize(records)

# Same listing as at the top of the notebook: print the path of every input file.
for root, _dirs, names in os.walk('/kaggle/input'):
    for fname in names:
        print(os.path.join(root, fname))

## Read Text in Dataset

Read all the text in the dataset: the item titles and the search strings from the users' interaction history.

In [None]:
# Load the item data file; its `title` column is used further down to build
# the training corpus. The trailing expression previews the first rows.
df_item = read_json('/kaggle/input/meli-data-challenge-2020/item_data.jl') 
df_item.head()

In [None]:
# Load the training dataset and keep a random sample of 300,000 rows.
# NOTE(review): no random_state is passed to sample(), so the selected rows
# (and everything derived from them) differ between runs.
df = read_json('/kaggle/input/meli-data-challenge-2020/train_dataset.jl').sample(n=300000)
df.shape

In [None]:
# Collect the string-valued `event_info` entries from every row's
# `user_history` (presumably the search queries; other event types carry a
# non-string `event_info` and are skipped).
df_search = list()
# Iterate over the column directly: df.iloc[i] would build a whole row Series
# for each of the sampled rows just to read one field.
for i, history in enumerate(df['user_history']):
    if i % 10000 == 0:
        print(i)  # coarse progress indicator
    df_search.extend(
        event['event_info']
        for event in history
        if isinstance(event['event_info'], str)
    )

# Drop duplicates; np.unique also returns the values sorted.
df_search = np.unique(df_search)
df_search[:10]

## Gensim Word2Vec Implementation

We use Gensim implementation of word2vec: https://radimrehurek.com/gensim/models/word2vec.html

In [None]:
import multiprocessing

import gensim
import string
import re

# Number of CPUs reported by the OS; passed to Word2Vec as `workers` below.
cores = multiprocessing.cpu_count()


In [None]:
import re
from unidecode import unidecode

def func_tokenizer(text):
    """Turn a value into a list of lowercase ASCII-letter tokens.

    Steps: coerce to ``str``; transliterate with ``unidecode``; lowercase;
    drop ``<!-- ... -->`` comments; treat every character outside ``A-Z`` /
    ``a-z`` as a separator (digits, punctuation and underscores included);
    split on whitespace and discard single-character tokens.

    Parameters
    ----------
    text : object
        Value to tokenize; converted with ``str()`` first.

    Returns
    -------
    list of str
        Tokens of length >= 2, or ``["<pad>"]`` when no token survives.
    """
    text = str(text)

    # Remove accents by transliterating the text.
    text = unidecode(text)

    text = text.lower()

    # Remove HTML-style comment tags before punctuation is stripped.
    text = re.sub(r"<!--?.*?-->", "", text)

    # Single pass replacing the former chain of substitutions: digits,
    # non-word characters and anything else outside A-Za-z become separators.
    # The later punctuation-specific substitutions could never match once
    # this had run, so they were dropped.
    text = re.sub(r"[^A-Za-z]+", " ", text)

    # split() already yields whitespace-free tokens; keep those longer than
    # one character.
    tokens = [t for t in text.split() if len(t) > 1]

    # Never return an empty sentence.
    if not tokens:
        tokens.append("<pad>")
    return tokens

In [None]:
# Spot-check the tokenizer: show the raw title of the fifth item next to its tokens.
df_item.iloc[4].title, func_tokenizer(df_item.iloc[4].title)

In [None]:
from tqdm import tqdm

# Build the training corpus: one token list per item title, followed by one
# token list per unique search string.
sentences = [func_tokenizer(title) for title in tqdm(df_item["title"])]

# Tokenize each search string individually. Passing the whole `df_search`
# array to the tokenizer would yield the same (meaningless) sentence on
# every iteration.
sentences.extend(func_tokenizer(query) for query in tqdm(df_search))

In [None]:
# Preview the first five tokenized sentences.
sentences[:5]

In [None]:
# Configure the model only; no corpus is passed here, so the vocabulary is
# built in a separate step and training happens later.
model = gensim.models.Word2Vec(
    size=100,          # dimensionality of the word vectors
    window=2,          # maximum distance between target and context word
    min_count=50,      # ignore words with a total count below 50
    sample=6e-5,       # down-sampling threshold for very frequent words
    negative=20,       # number of negative samples per positive example
    alpha=0.03,        # initial learning rate
    min_alpha=0.0007,  # learning rate floor reached as training progresses
    workers=cores,     # number of worker threads
)

# Scan the corpus once to count words and build the vocabulary.
model.build_vocab(sentences, progress_per=1000)

In [None]:
# Number of sentences recorded by build_vocab; reused as total_examples when training.
model.corpus_count

In [None]:
# Peek at ten words that made it into the vocabulary.
list(model.wv.vocab)[:10]

### Train a model

In [None]:
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec

# init callback class
class callback(CallbackAny2Vec):
    """
    Callback that prints the training loss after each epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as
    a running total: the figure printed for an epoch is the difference from
    the value recorded at the end of the previous epoch.
    """
    def __init__(self):
        self.epoch = 0
        # Initialised here so the attribute always exists; starting at 0
        # makes the first epoch's difference equal to the raw loss.
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        """Print this epoch's loss and remember the running total.

        Parameters
        ----------
        model : object
            The model being trained; must provide ``get_latest_training_loss()``.
        """
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [None]:
# Wall-clock start, used for the timing message printed after training.
t = time()

epochs = 100

# Train on the same corpus the vocabulary was built from.
# compute_loss=True is required for the callback's get_latest_training_loss()
# call to report a value; a fresh callback instance starts its epoch counter at 0.
model.train(sentences, 
            total_examples=model.corpus_count, 
            epochs=epochs, 
            report_delay=1,
            compute_loss = True,
            callbacks=[callback()])

# Elapsed time in minutes, rounded to two decimals.
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Sanity check: list the words closest to 'dell' in the trained embedding space.
model.wv.similar_by_word('dell')

In [None]:
# model = gensim.models.Word2Vec(sentences=frase_tokens, min_count=2,size=100,workers=4)
# model

### t-SNE visualizations

Visualization of similar words

In [None]:
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def tsnescatterplot(model, word, list_names):
    """Plot a 2-D t-SNE projection of a query word and two groups of other words.

    Three groups are drawn with seaborn: the query word (red), the words
    returned by ``model.wv.most_similar([word])`` (blue) and the words in
    ``list_names`` (green).
    Adapted from
    https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial#Training-the-model

    Parameters
    ----------
    model : object exposing ``.wv``
        Trained model; ``model.wv`` must support ``__getitem__`` with a list
        of words and ``most_similar``.
    word : str
        Query word.
    list_names : iterable of str
        Additional words to plot.

    Returns
    -------
    None
        The plot is drawn on a newly created matplotlib figure.
    """
    # Vectors are collected in a list and stacked once at the end. The width
    # of the matrix comes from the vectors themselves, so the function works
    # for any embedding size instead of assuming 100 dimensions.
    word_labels = [word]
    color_list = ['red']
    vectors = [model.wv.__getitem__([word])]

    # The closest words to the query, in blue.
    for neighbour, _score in model.wv.most_similar([word]):
        vectors.append(model.wv.__getitem__([neighbour]))
        word_labels.append(neighbour)
        color_list.append('blue')

    # The caller-supplied words, in green.
    for wrd in list_names:
        vectors.append(model.wv.__getitem__([wrd]))
        word_labels.append(wrd)
        color_list.append('green')

    arrays = np.concatenate(vectors, axis=0)

    # Kept from the original: switches numpy printing to non-scientific
    # notation (a process-wide setting).
    np.set_printoptions(suppress=True)

    # t-SNE is run directly on the full-size vectors (no PCA pre-reduction).
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(arrays)

    # Local name chosen so the notebook-level `df` is not shadowed.
    plot_df = pd.DataFrame({'x': list(Y[:, 0]),
                            'y': list(Y[:, 1]),
                            'words': word_labels,
                            'color': color_list})

    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)

    # Scatter only (no regression line); one face colour per point.
    p1 = sns.regplot(data=plot_df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': plot_df['color']})

    # Label every point with its word, in the colour of its group.
    for x_pos, y_pos, label, colour in zip(plot_df['x'], plot_df['y'],
                                           plot_df['words'], plot_df['color']):
        p1.text(x_pos,
                y_pos,
                '  ' + label.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=colour,
                weight='normal')

    # Pad the axes so labels near the border stay visible.
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    plt.title('t-SNE visualization for {}'.format(word.title()))

In [None]:
# Plot 'dell' with its nearest neighbours (blue) against the words returned by
# most_similar(negative=[word]), i.e. those ranked closest to the negated
# query vector (green).
word = 'dell'
tsnescatterplot(model, word, [i[0] for i in model.wv.most_similar(negative=[word])])

In [None]:
# Same plot for another query word.
# NOTE(review): 'xaomi' may be a misspelling of 'xiaomi' — confirm this is the
# intended vocabulary entry; a word missing from the vocabulary cannot be looked up.
word = 'xaomi'
tsnescatterplot(model, word, [i[0] for i in model.wv.most_similar(negative=[word])])

In [None]:
# Same plot for the query word 'carro'.
word = 'carro'
tsnescatterplot(model, word, [i[0] for i in model.wv.most_similar(negative=[word])])

## Save Model

Save the vocabulary and the embeddings.

In [None]:
# Save the full word2vec model object to the Kaggle working directory.
model.save('/kaggle/working/word2vec.model')

In [None]:
# Export only the word vectors, in the binary word2vec format.
model.wv.save_word2vec_format('/kaggle/working/mercadolivre-100d.bin', binary=True)

In [None]:
# Reload the exported vectors and look one word up to check the file is usable.
# NOTE(review): `model` is rebound here to what load_word2vec_format returns
# (a KeyedVectors object, not the trained Word2Vec model), so earlier cells
# that use `model.wv` or `model.train` should not be re-run after this one.
model = gensim.models.KeyedVectors.load_word2vec_format('/kaggle/working/mercadolivre-100d.bin', binary=True)
model['celular']