# Load Gensim Library

In [2]:
import pandas as pd
import re, string
import gensim
import logging

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load Text Data

Download unlabeled data from below link:<br>
https://www.kaggle.com/c/word2vec-nlp-tutorial/download/6p5lry6q8vtNVre4DXOg%2Fversions%2FLH6HdfqTrHnAeosA007i%2Ffiles%2FunlabeledTrainData.tsv.zip

In [4]:
df = pd.read_csv('unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)

In [5]:
df.shape

(50000, 2)

In [6]:
df.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


# Function to Clean up data

In [7]:
def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)
  except:
    return ""

Clean the Data using routine above

In [8]:
df['clean_review'] = df['review'].apply(clean_str)

In [9]:
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


# Convert Each Review to a Word List before feeding to Word2Vec

In [10]:
documents = []

In [11]:
for doc in df['clean_review']:
    documents.append(doc.split(' '))

In [16]:
documents[:10]

[['watching',
  'time',
  'chasers',
  'it',
  'obvious',
  'that',
  'it',
  'was',
  'made',
  'by',
  'a',
  'bunch',
  'of',
  'friends',
  'maybe',
  'they',
  'were',
  'sitting',
  'around',
  'one',
  'day',
  'in',
  'film',
  'school',
  'and',
  'said',
  'hey',
  'let',
  's',
  'pool',
  'our',
  'money',
  'together',
  'and',
  'make',
  'a',
  'really',
  'bad',
  'movie',
  'or',
  'something',
  'like',
  'that',
  'what',
  'ever',
  'they',
  'said',
  'they',
  'still',
  'ended',
  'up',
  'making',
  'a',
  'really',
  'bad',
  'movie',
  'dull',
  'story',
  'bad',
  'script',
  'lame',
  'acting',
  'poor',
  'cinematography',
  'bottom',
  'of',
  'the',
  'barrel',
  'stock',
  'music',
  'etc',
  'all',
  'corners',
  'were',
  'cut',
  'except',
  'the',
  'one',
  'that',
  'would',
  'have',
  'prevented',
  'this',
  'film',
  's',
  'release',
  'life',
  's',
  'like',
  'that'],
 ['i',
  'saw',
  'this',
  'film',
  'about',
  'years',
  'ago',
  'and

# Build the Model

In [17]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPUs
                               size=50,  #Embedding size
                               window=5, #Maximum Distance between current and predicted word
                               iter=10   #Number of iterations over the text corpus
                              )  

2020-01-29 20:46:13,747 : INFO : collecting all words and their counts
2020-01-29 20:46:13,747 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-29 20:46:14,732 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2020-01-29 20:46:15,757 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2020-01-29 20:46:16,793 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2020-01-29 20:46:17,751 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2020-01-29 20:46:18,840 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2020-01-29 20:46:18,840 : INFO : Loading a fresh vocabulary
2020-01-29 20:46:19,466 : INFO : effective_min_count=10 retains 28322 unique words (28% of original 100479, drops 72157)
2020-01-29 20:46:19,466 : INFO : effective_min_count=10 leaves 11910457 word cor

2020-01-29 20:47:25,371 : INFO : EPOCH 3 - PROGRESS: at 55.04% examples, 397393 words/s, in_qsize 6, out_qsize 1
2020-01-29 20:47:26,371 : INFO : EPOCH 3 - PROGRESS: at 59.79% examples, 399181 words/s, in_qsize 7, out_qsize 0
2020-01-29 20:47:27,386 : INFO : EPOCH 3 - PROGRESS: at 64.29% examples, 398934 words/s, in_qsize 7, out_qsize 0
2020-01-29 20:47:28,420 : INFO : EPOCH 3 - PROGRESS: at 69.27% examples, 400810 words/s, in_qsize 7, out_qsize 0
2020-01-29 20:47:29,435 : INFO : EPOCH 3 - PROGRESS: at 73.96% examples, 400952 words/s, in_qsize 5, out_qsize 2
2020-01-29 20:47:30,452 : INFO : EPOCH 3 - PROGRESS: at 79.15% examples, 403337 words/s, in_qsize 7, out_qsize 0
2020-01-29 20:47:31,499 : INFO : EPOCH 3 - PROGRESS: at 83.77% examples, 402542 words/s, in_qsize 5, out_qsize 2
2020-01-29 20:47:32,503 : INFO : EPOCH 3 - PROGRESS: at 88.67% examples, 403520 words/s, in_qsize 6, out_qsize 1
2020-01-29 20:47:33,502 : INFO : EPOCH 3 - PROGRESS: at 93.44% examples, 404458 words/s, in_qsiz

2020-01-29 20:48:28,120 : INFO : EPOCH 6 - PROGRESS: at 30.16% examples, 372009 words/s, in_qsize 7, out_qsize 0
2020-01-29 20:48:29,157 : INFO : EPOCH 6 - PROGRESS: at 33.55% examples, 361920 words/s, in_qsize 8, out_qsize 0
2020-01-29 20:48:30,184 : INFO : EPOCH 6 - PROGRESS: at 37.61% examples, 361194 words/s, in_qsize 8, out_qsize 0
2020-01-29 20:48:31,276 : INFO : EPOCH 6 - PROGRESS: at 41.20% examples, 354190 words/s, in_qsize 8, out_qsize 4
2020-01-29 20:48:32,302 : INFO : EPOCH 6 - PROGRESS: at 44.88% examples, 351150 words/s, in_qsize 7, out_qsize 2
2020-01-29 20:48:33,313 : INFO : EPOCH 6 - PROGRESS: at 49.59% examples, 355986 words/s, in_qsize 7, out_qsize 0
2020-01-29 20:48:34,318 : INFO : EPOCH 6 - PROGRESS: at 53.42% examples, 354312 words/s, in_qsize 8, out_qsize 2
2020-01-29 20:48:35,344 : INFO : EPOCH 6 - PROGRESS: at 56.92% examples, 350743 words/s, in_qsize 8, out_qsize 0
2020-01-29 20:48:36,342 : INFO : EPOCH 6 - PROGRESS: at 61.00% examples, 351519 words/s, in_qsiz

2020-01-29 20:49:34,426 : INFO : EPOCH 8 - PROGRESS: at 78.70% examples, 341030 words/s, in_qsize 8, out_qsize 2
2020-01-29 20:49:35,445 : INFO : EPOCH 8 - PROGRESS: at 82.77% examples, 341410 words/s, in_qsize 8, out_qsize 0
2020-01-29 20:49:36,472 : INFO : EPOCH 8 - PROGRESS: at 86.60% examples, 340554 words/s, in_qsize 8, out_qsize 0
2020-01-29 20:49:37,482 : INFO : EPOCH 8 - PROGRESS: at 90.17% examples, 339395 words/s, in_qsize 7, out_qsize 1
2020-01-29 20:49:38,496 : INFO : EPOCH 8 - PROGRESS: at 93.71% examples, 338208 words/s, in_qsize 7, out_qsize 0
2020-01-29 20:49:39,512 : INFO : EPOCH 8 - PROGRESS: at 97.24% examples, 337034 words/s, in_qsize 7, out_qsize 0
2020-01-29 20:49:40,258 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-01-29 20:49:40,300 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-01-29 20:49:40,337 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-01-29 20:49:40,356 : INFO : worker thread fi

# Exploring the model

How many words in the model and how many features

In [18]:
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(28322, 50)

In [19]:
model.wv.vocab

{'watching': <gensim.models.keyedvectors.Vocab at 0x4d402c48>,
 'time': <gensim.models.keyedvectors.Vocab at 0x4d402b48>,
 'chasers': <gensim.models.keyedvectors.Vocab at 0x4d402dc8>,
 'it': <gensim.models.keyedvectors.Vocab at 0x4d402e48>,
 'obvious': <gensim.models.keyedvectors.Vocab at 0x4d402cc8>,
 'that': <gensim.models.keyedvectors.Vocab at 0x4d402848>,
 'was': <gensim.models.keyedvectors.Vocab at 0x4d402248>,
 'made': <gensim.models.keyedvectors.Vocab at 0x4d402a88>,
 'by': <gensim.models.keyedvectors.Vocab at 0x4d402d88>,
 'a': <gensim.models.keyedvectors.Vocab at 0x4d402ec8>,
 'bunch': <gensim.models.keyedvectors.Vocab at 0x4d402988>,
 'of': <gensim.models.keyedvectors.Vocab at 0x4d402748>,
 'friends': <gensim.models.keyedvectors.Vocab at 0x4d402208>,
 'maybe': <gensim.models.keyedvectors.Vocab at 0x4d402348>,
 'they': <gensim.models.keyedvectors.Vocab at 0x4d4028c8>,
 'were': <gensim.models.keyedvectors.Vocab at 0x4d4027c8>,
 'sitting': <gensim.models.keyedvectors.Vocab at 0x

Get an embedding for a word

In [20]:
model.wv['flower']

array([ 1.2386642 , -0.2478108 ,  0.7831918 ,  0.24507585, -0.6027737 ,
        1.5230455 ,  0.21680266,  1.5898254 , -1.0033935 , -0.19790335,
        0.6720689 , -1.379312  ,  0.18390732,  0.10945343, -0.12451898,
        0.5784375 ,  0.34567383,  0.12460408, -0.03341219,  0.78589386,
        0.61388326, -0.22666655,  0.0470451 ,  0.5781915 ,  0.7092055 ,
       -0.7964156 ,  0.55835557,  0.18141693,  0.99103194, -0.4621655 ,
        0.22707151,  1.1972257 , -0.32533967, -0.75320905, -0.5491755 ,
        0.8181654 ,  0.5148832 , -1.0390544 , -0.8947026 ,  0.64440376,
        0.7538887 , -0.09054603, -0.17643897, -0.02508358, -0.70738703,
        0.6233651 ,  0.47373864,  0.8807439 ,  0.6937695 , -1.147586  ],
      dtype=float32)

Saving the model

In [21]:
model.save('word2vec-movie-50')

2020-01-29 21:59:36,571 : INFO : saving Word2Vec object under word2vec-movie-50, separately None
2020-01-29 21:59:36,571 : INFO : not storing attribute vectors_norm
2020-01-29 21:59:36,587 : INFO : not storing attribute cum_table
2020-01-29 21:59:36,957 : INFO : saved word2vec-movie-50


Finding Words which have similar meaning

In [22]:
model.wv.most_similar('great')

2020-01-29 21:59:40,298 : INFO : precomputing L2-norms of word weight vectors


[('fantastic', 0.8896845579147339),
 ('wonderful', 0.8826889991760254),
 ('terrific', 0.8799619078636169),
 ('fine', 0.859294593334198),
 ('good', 0.8358762860298157),
 ('brilliant', 0.814114511013031),
 ('superb', 0.799919068813324),
 ('perfect', 0.770316481590271),
 ('nice', 0.7677238583564758),
 ('marvelous', 0.7524583339691162)]

Find the Word which is not like others

In [23]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [24]:
model.most_similar(positive=['king','man'], negative=['queen'])

  """Entry point for launching an IPython kernel.


[('scientist', 0.6019014716148376),
 ('mercenary', 0.5221081376075745),
 ('vendetta', 0.5215702652931213),
 ('himself', 0.5195237994194031),
 ('aladdin', 0.50232994556427),
 ('creator', 0.5018371343612671),
 ('joker', 0.5013880133628845),
 ('mastermind', 0.49782297015190125),
 ('genius', 0.4939456582069397),
 ('bookie', 0.4919530153274536)]

Loading a model from Memory

In [4]:
model = gensim.models.Word2Vec.load('word2vec-movie-50')

In [7]:
#model.wv['fower'] --> This gives an error as it is out of vocabulary word

<b>Reference for methods to handle OOV words:</b><br>
https://machinelearninginterview.com/topics/natural-language-processing/how-do-you-deal-with-out-of-vocabulary-words-during-run-time-when-you-build-a-language-model/

### Pretrained Models

1. Word2Vec by Google 
    - Trained on Google News Dataset comprising of ~100 billion tokens and 300 dimensions
2. Glove by Stanford
    - Trained on Wikipedia articles with 6 billion tokens and available in 50, 100, 200 and 300 dimensions
    - There are other pretrained models as well - 27B tokens from twitter, 42B and 840B from common web crawl
3. FastText 
    - wiki-news-300d-1M.vec.zip: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
    - wiki-news-300d-1M-subword.vec.zip: 1 million word vectors trained with subword infomation on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
    - crawl-300d-2M.vec.zip: 2 million word vectors trained on Common Crawl (600B tokens).
    - crawl-300d-2M-subword.zip: 2 million word vectors trained with subword information on Common Crawl (600B tokens).

<b>Reference:</b><br>
https://www.kaggle.com/c/word2vec-nlp-tutorial/overview/part-2-word-vectors