In [7]:
import gensim
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import numpy as np

import warnings
warnings.filterwarnings("ignore")


In [8]:
# Read the preprocessed review table from disk; the bare name on the last line
# makes the notebook render the frame as this cell's output.
df = pd.read_csv(filepath_or_buffer = 'data/preprocessed.csv')
df

Unnamed: 0,reviewText,overall,helpful_ratio,review_length,preprocessed_text,processed_len
0,Installing the game was a struggle (because of...,0,0.666667,779,installing game struggle games windows live bu...,506
1,If you like rally cars get this game you will ...,1,0.000000,304,like rally cars get game fun oriented european...,188
2,1st shipment received a book instead of the ga...,0,0.000000,282,st shipment received book instead game nd ship...,194
3,I had Dirt 2 on Xbox 360 and it was an okay ga...,1,1.000000,294,dirt xbox okay game started playing games lapt...,171
4,"Overall this is a well done racing game, with ...",1,0.000000,715,overall well done racing game very good graphi...,502
...,...,...,...,...,...,...
234093,I've bought 10 of these over the past year to ...,1,1.000000,243,bought over past year give gifts friends club ...,159
234094,"OK, I have to admit, the price of this item di...",0,0.000000,1260,ok admit price item not expectations unfairly ...,806
234095,I love my skull face mask. It makes me outstan...,1,0.000000,202,love skull face mask makes outstand rest fello...,124
234096,This mask of course as described is half-face ...,1,0.000000,149,mask course described half face but long prote...,90


In [9]:

def create_w2v(embedding_dim, words, window, epochs, sg):
    '''
    Train a gensim Word2Vec model on a tokenised corpus and print a short summary.

    Parameters
    ----------
    embedding_dim : int
        Length of each word vector (forwarded as ``vector_size``).
    words : iterable of list of str
        Tokenised documents, one list of tokens per document.
    window : int
        Context window size forwarded to gensim.
    epochs : int
        Number of training passes over ``words``.
    sg : int
        Training-algorithm flag forwarded to gensim (0 = CBOW, 1 = skip-gram).
        The summary line labels sg == 0 as 'Word2Vec' and anything else as 'Skipgram'.

    Returns
    -------
    The trained ``gensim.models.Word2Vec`` instance.
    '''
    # min_count = 1 keeps every token of the corpus in the vocabulary.
    w2v_model = gensim.models.Word2Vec(
        words,
        vector_size = embedding_dim,
        window = window,
        min_count = 1,
        epochs = epochs,
        sg = sg,
    )
    print(f"- {'Word2Vec' if(sg==0) else 'Skipgram'} Created")
    print(f'- Vocabulary count: {len(w2v_model.wv)}')

    # The neighbour listing is only a sanity check: a corpus that lacks the probe
    # word must not raise and throw away a model that has already been trained.
    try:
        neighbours = w2v_model.wv.most_similar('great')
    except KeyError:
        print('- Word "great" is not in the vocabulary; skipping the similar-words check')
    else:
        print(f'- Similar words for word "great":\n{neighbours}')

    return w2v_model
    

def get_sentence_embedding(sent, model, flag):
    '''
    Average the word vectors of one tokenised review into a single vector.

    Parameters
    ----------
    sent : iterable of str
        Tokens of the review.
    model :
        When ``flag`` is truthy, an object exposing ``get_word_vector`` and
        ``get_dimension`` (fastText); otherwise an object whose ``wv`` attribute
        supports ``wv[word]`` and ``wv.vector_size`` (gensim Word2Vec).
    flag : bool
        Truthy indicates fastText, falsy indicates Word2Vec.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the collected word vectors. When no vector could be
        collected (empty review, or every token unknown to the Word2Vec model)
        a zero vector of the model's dimension is returned, so every review
        yields a vector of the same length.
    '''
    list_vectors = []
    for word in sent:
        if flag:
            #indicates fasttext
            list_vectors.append(model.get_word_vector(word))
        else:
            try:
                list_vectors.append(model.wv[word])
            except KeyError:
                # token has no Word2Vec vector: leave it out of the average
                continue

    if not list_vectors:
        # np.array([]).mean(axis=0) would give a scalar NaN here, which cannot be
        # expanded into per-dimension columns alongside the other reviews.
        dim = model.get_dimension() if flag else model.wv.vector_size
        return np.zeros(dim, dtype=np.float32)

    return np.array(list_vectors).mean(axis=0)


# Creating "embeddings" column
def get_embedding_cols(df, embedding_dim, model, flag):
    '''
    Add sentence embeddings to ``df`` and return that same frame.

    The frame is modified in place: it gains an 'embeddings' column holding one
    vector per row of 'preprocessed_text', plus the columns e_1 ... e_<embedding_dim>
    holding the individual vector components. ``flag`` is passed through to
    get_sentence_embedding and indicates whether ``model`` is fasttext.
    '''
    def _embed(text):
        # split the review on whitespace and average its word vectors
        return get_sentence_embedding(text.split(), model, flag)

    df['embeddings'] = df['preprocessed_text'].apply(_embed)

    # one column per vector component, numbered from 1
    column_names = ['e_' + str(k) for k in range(1, embedding_dim + 1)]
    expanded = pd.DataFrame(df['embeddings'].tolist(), index = df.index)
    df[column_names] = expanded

    print('- Embeddings are created.')
    return df


### fastText

In [10]:
# Write only the cleaned review text (no header row, no index column) to the
# plain-text file that the fastText training cell reads.
df[['preprocessed_text']].to_csv('data/text_label.txt', header = None, index = False)

In [11]:
import fasttext

# Train unsupervised fastText vectors on the exported review text.
model = fasttext.train_unsupervised('data/text_label.txt', dim = 100)

print(f'FASTTEXT {model.dim} VECTOR EMBEDDING DIMENSIONS:')
print('=========================================')

# Embed a copy: get_embedding_cols adds its columns to the frame it is given, and
# the shared `df` must stay free of fastText columns because the Word2Vec cells
# below start from `df.copy()`.
ft_df_100 = get_embedding_cols(df.copy(), 100, model, True)

#writing to a new file
ft_df_100.to_csv('data/fasttext_100.csv', index = False)

print('- Fasttext embeddings Created')
print(f'- Vocabulary count: {len(model.words)}')
# Closing quote now sits right after the word instead of after the line break.
print(f'''- Similar words for word "great":\n{model.get_nearest_neighbors('great', k=10)}''')

model.save_model('models/fasttext_model.bin')


Read 24M words
Number of words:  49568
Number of labels: 0
Progress: 100.0% words/sec/thread:   68386 lr:  0.000000 avg.loss:  1.215623 ETA:   0h 0m 0s


FASTTEXT 100 VECTOR EMBEDDING DIMENSIONS:
- Embeddings are created.
- Fasttext embeddings Created
- Vocabulary count: 49568
- Similar words for word "great:
"[(0.8861614465713501, 'good'), (0.8857290148735046, 'excellent'), (0.8426099419593811, 'fantastic'), (0.8311927318572998, 'excelllent'), (0.8246669173240662, 'well'), (0.8161651492118835, 'awesome'), (0.8105279803276062, 'amazing'), (0.8065727949142456, 'nice'), (0.8002344965934753, 'outstanding'), (0.7834881544113159, 'perfect')]


### Word2Vec - CBOW

In [14]:
# Split every preprocessed review on whitespace: one token list per review,
# which is the corpus format handed to create_w2v below.
words = [review.split() for review in df['preprocessed_text'].values]
words[:3]

[['installing',
  'game',
  'struggle',
  'games',
  'windows',
  'live',
  'bugs',
  'championship',
  'races',
  'cars',
  'unlocked',
  'buying',
  'addon',
  'game',
  'paid',
  'nearly',
  'dollars',
  'when',
  'game',
  'new',
  'not',
  'like',
  'idea',
  'keep',
  'paying',
  'keep',
  'playing',
  'noticed',
  'no',
  'improvement',
  'physics',
  'graphics',
  'compared',
  'dirt',
  'tossed',
  'garbage',
  'vowed',
  'never',
  'buy',
  'another',
  'codemasters',
  'game',
  'really',
  'tired',
  'arcade',
  'style',
  'rally',
  'racing',
  'games',
  'anyway',
  'continue',
  'get',
  'fix',
  'richard',
  'burns',
  'rally',
  'http',
  'www',
  'amazon',
  'com',
  'richard',
  'burns',
  'rally',
  'pc',
  'dp',
  'b',
  'c',
  'ref',
  'sr',
  'ie',
  'utf',
  'qid',
  'sr',
  'keywords',
  'richard',
  'burns',
  'reading',
  'review',
  'enjoyed',
  'sure',
  'rate',
  'helpful'],
 ['like',
  'rally',
  'cars',
  'get',
  'game',
  'fun',
  'oriented',
  'europe

In [15]:
 
# Banner for this run.
print('\nWORD2VEC 100 VECTOR EMBEDDING DIMENSIONS:')
print('=========================================')

# Train with sg = 0: 100-dimensional vectors, window of 7, 50 epochs.
cbow_model = create_w2v(embedding_dim = 100, words = words, window = 7, epochs = 50, sg = 0)

# Build the embedding columns on a copy of the review table and export them.
df1 = get_embedding_cols(df.copy(), 100, cbow_model, False)
df1.to_csv('word2vec_100.csv', index = False)

# Keep the trained model on disk.
cbow_model.save('models/cbow.model')



WORD2VEC 100 VECTOR EMBEDDING DIMENSIONS:
- Word2Vec Created
- Vocabulary count: 86922
- Similar words for word "great:
"[('excellent', 0.8629738688468933), ('fantastic', 0.8284962177276611), ('terrific', 0.8072965741157532), ('awesome', 0.797069251537323), ('wonderful', 0.7778352499008179), ('good', 0.7708988785743713), ('outstanding', 0.7545972466468811), ('amazing', 0.754551112651825), ('fabulous', 0.7236035466194153), ('superb', 0.7132914066314697)]
- Embeddings are created.


### Word2Vec - Skip-gram

In [16]:
  
# Banner for this run.
print('\nSKIPGRAM 100 VECTOR EMBEDDING DIMENSIONS:')
print('=========================================')

# Train with sg = 1: 100-dimensional vectors, window of 7, 50 epochs.
sg_model = create_w2v(embedding_dim = 100, words = words, window = 7, epochs = 50, sg = 1)

# Build the embedding columns on a copy of the review table and export them.
df1 = get_embedding_cols(df.copy(), 100, sg_model, False)
df1.to_csv('skipgram_100.csv', index = False)

# Keep the trained model on disk.
sg_model.save('models/skipgram.model')



SKIPGRAM 100 VECTOR EMBEDDING DIMENSIONS:
- Skipgram Created
- Vocabulary count: 86922
- Similar words for word "great:
"[('good', 0.8826044797897339), ('excellent', 0.7908362150192261), ('well', 0.7878409624099731), ('nice', 0.777850329875946), ('really', 0.7529785633087158), ('perfect', 0.7394362092018127), ('love', 0.7147756814956665), ('also', 0.7115322947502136), ('like', 0.709590494632721), ('very', 0.7042110562324524)]
- Embeddings are created.


In [17]:
# Final status line marking the end of the notebook run.
print("Embeddings are created.")

Embeddings are created.
