### Load Gensim Library

In [1]:
#!pip install gensim

In [2]:
import gensim



In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [4]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [5]:
import pandas as pd

#change file path to point to where you have stored the zip file.
df = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter="\t", quoting=3)

print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


### Function to Clean up data

In [6]:
import re, string

def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)	
  except:
    return ""

### Clean the Data using routine above

In [7]:
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


In [8]:
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


### Convert Review to a Word List

In [9]:
#List to hold all words in each review
documents = []

#Iterate over each review
for doc in df['clean_review']:
    documents.append(doc.split(' '))

print(len(documents))
print(documents[0])

50000
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


In [10]:
len(documents[108])

121

### Build the Model

In [11]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPU Cores
                               #size=50,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               epochs=10   #Number of iterations over the text corpus
                              )  

2021-04-25 23:27:24,112 : INFO : collecting all words and their counts
2021-04-25 23:27:24,120 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-25 23:27:25,111 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2021-04-25 23:27:26,065 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2021-04-25 23:27:26,982 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2021-04-25 23:27:28,125 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2021-04-25 23:27:29,102 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2021-04-25 23:27:29,102 : INFO : Creating a fresh vocabulary
2021-04-25 23:27:29,375 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 28322 unique words (28.18698434498751%% of original 100479, drops 72157)', 'datetime': '2021-04-25T23:

2021-04-25 23:28:15,541 : INFO : EPOCH 2 - PROGRESS: at 94.49% examples, 410745 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:28:16,562 : INFO : EPOCH 2 - PROGRESS: at 98.99% examples, 409519 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:28:16,727 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-04-25 23:28:16,735 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-04-25 23:28:16,777 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-04-25 23:28:16,806 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-04-25 23:28:16,808 : INFO : EPOCH - 2 : training on 12084660 raw words (8817243 effective words) took 21.6s, 408959 effective words/s
2021-04-25 23:28:17,836 : INFO : EPOCH 3 - PROGRESS: at 4.43% examples, 388702 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:28:18,866 : INFO : EPOCH 3 - PROGRESS: at 9.42% examples, 403637 words/s, in_qsize 8, out_qsize 0
2021-04-25 23:28:19,872 : INFO : EPOCH 3 - PROG

2021-04-25 23:29:17,034 : INFO : EPOCH 5 - PROGRESS: at 75.85% examples, 413230 words/s, in_qsize 7, out_qsize 1
2021-04-25 23:29:18,050 : INFO : EPOCH 5 - PROGRESS: at 80.78% examples, 413771 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:29:19,066 : INFO : EPOCH 5 - PROGRESS: at 85.69% examples, 414409 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:29:20,082 : INFO : EPOCH 5 - PROGRESS: at 90.53% examples, 414597 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:29:21,082 : INFO : EPOCH 5 - PROGRESS: at 94.42% examples, 411297 words/s, in_qsize 8, out_qsize 0
2021-04-25 23:29:22,087 : INFO : EPOCH 5 - PROGRESS: at 98.40% examples, 408432 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:29:22,409 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-04-25 23:29:22,428 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-04-25 23:29:22,434 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-04-25 23:29:22,447 : INFO : worker thread fi

2021-04-25 23:30:19,993 : INFO : EPOCH 8 - PROGRESS: at 67.11% examples, 415363 words/s, in_qsize 6, out_qsize 1
2021-04-25 23:30:21,009 : INFO : EPOCH 8 - PROGRESS: at 72.16% examples, 416217 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:30:22,017 : INFO : EPOCH 8 - PROGRESS: at 76.94% examples, 416495 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:30:23,038 : INFO : EPOCH 8 - PROGRESS: at 81.82% examples, 416422 words/s, in_qsize 5, out_qsize 2
2021-04-25 23:30:24,064 : INFO : EPOCH 8 - PROGRESS: at 86.69% examples, 416181 words/s, in_qsize 6, out_qsize 1
2021-04-25 23:30:25,065 : INFO : EPOCH 8 - PROGRESS: at 91.48% examples, 416519 words/s, in_qsize 8, out_qsize 0
2021-04-25 23:30:26,087 : INFO : EPOCH 8 - PROGRESS: at 96.24% examples, 416653 words/s, in_qsize 7, out_qsize 0
2021-04-25 23:30:26,822 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-04-25 23:30:26,829 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-04-25 23:30:26,834 : I

2021-04-25 23:31:18,216 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=28322, vector_size=100, alpha=0.025)', 'datetime': '2021-04-25T23:31:18.216138', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'created'}


# Exploring the model

### How many words in the model

In [12]:
#Model size

#model.wv.syn0.shape

In [13]:
# Vocablury of the model

#model.wv.vocab

### Get an embedding for a word

In [14]:
model.wv['flower']

array([-0.06312227,  0.18260437, -0.05638232,  0.5244593 , -0.3575581 ,
       -0.38276857, -0.7436697 , -0.33837998, -0.511507  , -0.00869925,
        0.31246102, -0.400683  ,  0.15150656,  0.0718934 , -0.41129938,
       -0.09452075, -1.0054414 ,  0.359891  , -0.1846446 , -0.28716266,
        0.739708  , -0.76732403,  0.5755478 , -1.2461064 , -0.09302743,
        1.2732005 ,  0.66307765,  0.4528352 , -1.0539004 ,  0.77289325,
       -0.06775813, -0.17348503, -0.10428288, -0.06846291,  0.75830364,
        0.56896067, -0.70130134, -0.19577497, -0.4247881 , -1.2123915 ,
        0.32690609,  0.72450376, -1.2983315 , -0.7555943 , -0.1336665 ,
       -0.50647974, -0.63084537, -0.2899278 ,  0.53106904,  0.22357945,
        0.354643  , -1.1128312 , -0.08121717, -0.3390924 , -0.7056097 ,
       -0.08561002,  0.55693465,  1.0223368 ,  0.682016  ,  0.46192926,
       -0.24727641,  1.2931068 ,  0.20871516, -0.34681737, -0.4415776 ,
       -0.46358335,  0.4294022 ,  0.5363402 , -0.30176607,  0.95

### Finding Words which have similar meaning

In [15]:
model.wv.most_similar('great')

[('wonderful', 0.8400700688362122),
 ('terrific', 0.8392094969749451),
 ('fantastic', 0.8244820833206177),
 ('good', 0.7821651697158813),
 ('brilliant', 0.7649096250534058),
 ('fine', 0.7630594372749329),
 ('superb', 0.7497766613960266),
 ('excellent', 0.7046172022819519),
 ('marvelous', 0.6933190226554871),
 ('outstanding', 0.6811567544937134)]

### Find the word which is not like others

In [16]:
model.wv.doesnt_match("man woman child kitchen".split())

'kitchen'

### Saving the model

In [17]:
model.save('word2vec-movie-50')

2021-04-25 23:31:20,481 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'word2vec-movie-50', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-04-25T23:31:20.481460', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'saving'}
2021-04-25 23:31:20,481 : INFO : not storing attribute cum_table
2021-04-25 23:31:20,703 : INFO : saved word2vec-movie-50


In [18]:
#Load model from memory
model = gensim.models.Word2Vec.load('word2vec-movie-50')

2021-04-25 23:31:20,744 : INFO : loading Word2Vec object from word2vec-movie-50
2021-04-25 23:31:21,471 : INFO : loading wv recursively from word2vec-movie-50.wv.* with mmap=None
2021-04-25 23:31:21,479 : INFO : setting ignored attribute cum_table to None
2021-04-25 23:31:22,154 : INFO : Word2Vec lifecycle event {'fname': 'word2vec-movie-50', 'datetime': '2021-04-25T23:31:22.154588', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'loaded'}


1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [19]:
model.wv.most_similar(positive=['king','man'], negative=['queen'])

[('filmmaker', 0.4742802679538727),
 ('himself', 0.46286138892173767),
 ('guy', 0.4608807861804962),
 ('boy', 0.45530185103416443),
 ('soldier', 0.44970929622650146),
 ('person', 0.44657570123672485),
 ('prophecy', 0.4322931170463562),
 ('buio', 0.42813870310783386),
 ('joker', 0.42198827862739563),
 ('villain', 0.41799196600914)]

In [20]:
model.wv['king'] + model.wv['man'] - model.wv['queen']

array([-1.682169  ,  0.6127608 , -0.32731336,  0.26000518, -2.819154  ,
       -2.4359412 ,  5.1552896 ,  1.242214  , -2.0731087 ,  2.029748  ,
        0.8315679 , -1.8440495 , -1.4567516 ,  0.45687824, -1.1792855 ,
       -1.1972156 , -0.21946037,  0.23759255, -0.20262213, -1.4538592 ,
       -2.8300328 ,  2.065917  ,  1.1782981 ,  0.10611081,  2.3941934 ,
        0.36330163,  0.17988014,  2.2258825 ,  2.116016  ,  3.2797394 ,
        3.8886442 ,  2.8847766 ,  0.6814783 ,  3.2241156 ,  0.8487729 ,
       -1.075227  ,  1.7031952 ,  0.6523506 ,  3.336696  , -0.32288885,
       -2.6585119 , -0.8579759 ,  0.28475416,  0.537746  ,  0.77782214,
       -4.821861  , -5.1000853 ,  1.4074748 , -0.49771336,  1.2564744 ,
       -1.2546132 ,  1.7038641 , -0.7389567 , -3.4827905 , -3.537786  ,
        5.7014923 ,  7.239332  ,  2.5296054 , -1.3442844 , -1.9981059 ,
        2.664707  , -0.9088459 , -0.76554716, -2.0988648 , -1.0591347 ,
        2.5346682 ,  0.84775084, -3.2872267 ,  0.72837913,  1.59