## Basic packages

In [None]:
import json
import pandas as pd
import string
import numpy as np
import tensorflow as tf
import nltk, re
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from datetime import datetime
from gensim.models import *
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sns.set()
%matplotlib inline

## Helper objects and self-designed functions

In [None]:
# Sentence splitter: the English Punkt model loaded from NLTK's data directory.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

# Matches every character that is NOT an ASCII letter, a digit or a space;
# used to strip punctuation and symbols from each sentence.
special_characters = re.compile("[^A-Za-z0-9 ]")
def convert_to_sentences(data, tokenizer):
    """Split one review into sentences, each given as a list of word tokens.

    Args:
        data: Raw review text (a ``str``).
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences of ``text`` (here, the NLTK Punkt tokenizer).

    Returns:
        A list with one entry per sentence; each entry is the list of
        whitespace-separated words left after lower-casing and removing
        every character matched by ``special_characters``.
    """
    # Normalise the raw text: lower-case it, then turn HTML line breaks and
    # hyphens into spaces and make sure every period is followed by a space.
    text = data.lower()
    for old, new in (("<br />", " "), ("-", " "), (".", ". ")):
        text = text.replace(old, new)
    # Collapse each pair of spaces into one (single pass) and trim the ends.
    text = re.sub("  ", " ", text).strip()

    # Strip the unwanted characters from every sentence the tokenizer finds.
    stripped = (
        special_characters.sub("", chunk.lower())
        for chunk in tokenizer.tokenize(text)
    )

    # Drop sentences that became empty strings and split the rest into words.
    return [sentence.split() for sentence in stripped if sentence]

In [None]:
# Load the review dump: the file holds one JSON object per line.
# The `with` block guarantees the file handle is closed once parsing is done
# (the bare `open()` in a for-loop never closed it), and the explicit UTF-8
# encoding keeps decoding independent of the platform's default locale.
with open('Toys_and_Games_5.json', 'r', encoding='utf-8') as review_file:
    # Blank lines (e.g. a trailing newline at the end of the file) are skipped
    # instead of crashing json.loads.
    review_A = [json.loads(line) for line in review_file if line.strip()]

In [None]:
# Build a table with one row per parsed review record from review_A.
df = pd.DataFrame(review_A)
# Last expression of the cell: shows the first rows as a quick sanity check.
df.head()

In [None]:
# Training corpus WITH stop words: every review is split into sentences and
# all sentences (lists of words) are gathered into a single flat list.
sentences = [
    sentence
    for review in df.reviewText
    for sentence in convert_to_sentences(review, tokenizer)
]

In [None]:
# Show INFO-level log records (timestamp : level : message), so progress
# messages emitted through the logging module become visible.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Hyper-parameters shared by the embedding models trained below.
num_feature = 50        # embedding dimensionality (passed as vector_size)
window_size = 10        # context window (passed as window)
min_word_count = 20     # minimum word frequency (passed as min_count)
down_sampling = 0.001   # down-sampling threshold (passed as sample)
iteration = 12          # number of training passes (passed as epochs)
num_thread = 5          # worker threads (passed as workers)

In [None]:
# Train Word2Vec on the corpus that still contains stop words, using the
# shared hyper-parameters defined above.
model_w2v = word2vec.Word2Vec(
    sentences,
    vector_size=num_feature,
    window=window_size,
    min_count=min_word_count,
    sample=down_sampling,
    epochs=iteration,
    workers=num_thread,
)

In [None]:
# Train FastText on the corpus that still contains stop words.
# `sample` is passed explicitly so this model uses the same down-sampling
# setting as the Word2Vec model; previously it silently relied on the library
# default, so changing `down_sampling` would have affected only Word2Vec.
model_FT = FastText(
    sentences,
    vector_size=num_feature,
    window=window_size,
    min_count=min_word_count,
    sample=down_sampling,
    epochs=iteration,
    workers=num_thread,
)

In [None]:
# Persist both models trained on the corpus WITH stop words; the paths are
# relative, so the files land in the current working directory.
model_w2v.save("gensim_word2vec_withstop")
model_FT.save("gensim_fastText_withstop")

In [None]:
# Side-by-side view of the first 20 vocabulary entries of each model.
# NOTE(review): index_to_key is presumably ordered by descending word
# frequency (gensim's default vocabulary sorting) - confirm.
df_top20 = pd.DataFrame(
    dict(
        w2v=model_w2v.wv.index_to_key[:20],
        FT=model_FT.wv.index_to_key[:20],
    )
)
# Last expression of the cell: renders the comparison table.
df_top20

In [None]:
# Build the English stop-word lookup ONCE. The original expression called
# stopwords.words('english') for every single token and then scanned the
# returned list linearly; a frozenset gives O(1) membership tests.
english_stopwords = frozenset(stopwords.words('english'))

# Lower-case every review and drop stop words in a single pass.
# NOTE: this overwrites df.reviewText in place, so the original (with stop
# words) text is no longer available from `df` after this cell runs.
# NOTE(review): tokens are compared before punctuation is stripped, so a stop
# word with punctuation attached (e.g. "the.") is kept - same as before.
df.reviewText = df.reviewText.apply(
    lambda text: " ".join(
        word for word in text.lower().split() if word not in english_stopwords
    )
)

In [None]:
# Training corpus WITHOUT stop words: same sentence splitting as before, now
# applied to the reviews whose stop words have been removed.
sentences_nostop = [
    sentence
    for review in df.reviewText
    for sentence in convert_to_sentences(review, tokenizer)
]

In [None]:
# Print the first five stop-word-free sentences, each followed by a blank
# line, as a quick visual check of the preprocessing.
for sentence_tokens in sentences_nostop[:5]:
    print("{}\n".format(sentence_tokens))

In [None]:
# Train Word2Vec on the stop-word-free corpus with the same hyper-parameters
# as the model trained on the full corpus.
model_w2v_ns = word2vec.Word2Vec(
    sentences_nostop,
    vector_size=num_feature,
    window=window_size,
    min_count=min_word_count,
    sample=down_sampling,
    epochs=iteration,
    workers=num_thread,
)

In [None]:
# Train FastText on the stop-word-free corpus.
# `sample` is passed explicitly so this model uses the same down-sampling
# setting as the Word2Vec models; previously it silently relied on the library
# default, so changing `down_sampling` would have affected only Word2Vec.
model_FT_ns = FastText(
    sentences_nostop,
    vector_size=num_feature,
    window=window_size,
    min_count=min_word_count,
    sample=down_sampling,
    epochs=iteration,
    workers=num_thread,
)

In [None]:
# Persist both models trained on the corpus WITHOUT stop words; the paths are
# relative, so the files land in the current working directory.
model_w2v_ns.save("gensim_word2vec_nostop")
model_FT_ns.save("gensim_fastText_nostop")