In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.options.display.max_colwidth = 200

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Preview at most five files from every directory under the input mount.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    preview = [os.path.join(folder, name) for name in files[:5]]
    if preview:
        print(*preview, sep="\n")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load Gensim Library

In [None]:
import re, string
import gensim

In [None]:
# Adjust the path below if the TSV file lives somewhere else.
# The file is tab-separated with a header row; quoting=3 is csv.QUOTE_NONE,
# so quote characters inside the reviews are read as ordinary text.
df = pd.read_csv(
    '/kaggle/input/nlp-specialization-data/unlabeledTrainData.tsv',
    sep="\t",
    header=0,
    quoting=3,
)
print('Number of examples in Dataset: ', df.shape)
df.head()

### Function to Clean up data

In [None]:
def clean_str(string):
  """
  Normalize raw text into lowercase, space-separated alphabetic tokens.

  Steps: remove URLs, replace every character that is not an ASCII letter
  with a space, lowercase, and collapse whitespace runs into single spaces.

  Parameters
  ----------
  string : str
      Raw text. (The name shadows the ``string`` module inside this
      function; it is kept so existing keyword callers keep working.)

  Returns
  -------
  str
      The cleaned text, or "" when the input is not a ``str`` (for example
      None or NaN coming from a missing cell).
  """
  # Explicit type guard instead of a bare ``except`` so that genuine bugs are
  # not silently converted into empty strings.
  if not isinstance(string, str):
    return ""

  # Strip URLs wherever they occur. The previous pattern was anchored to the
  # start of a line and required a literal "<>" right after "//", so it never
  # matched a real URL and fragments such as "http" and "com" survived.
  string = re.sub(r"https?://\S+", " ", string)
  # Keep ASCII letters only; digits, punctuation and markup become separators.
  string = re.sub(r"[^A-Za-z]", " ", string)
  # split() without an argument already discards empty tokens.
  return " ".join(string.lower().split())

### Clean the Data using routine above

In [None]:
# Run every raw review through clean_str and store the result in a new
# column next to the original text.
df['clean_review'] = df['review'].map(clean_str)
df.head()

In [None]:
# Inspect the cleaned text of the review at row label 0.
df.loc[0, 'clean_review']

### Convert Review to a Word List

In [None]:
# Tokenize every cleaned review into a list of words (one list per review).
# split() is used instead of split(' ') because an empty review would
# otherwise become [''] and inject a bogus empty-string token into the
# corpus; with split() it becomes [] as intended.
documents = [doc.split() for doc in df['clean_review']]

print(len(documents))

In [None]:
# Show the token list built for the first review.
print(documents[0])

In [None]:
# Number of tokens in the first review.
len(documents[0])

### Build the Model

In [None]:
# ?gensim.models.Word2Vec

In [None]:
# Train a Word2Vec model on the tokenized reviews.
# NOTE(review): several worker threads are used, so results are presumably
# not bit-for-bit reproducible between runs — confirm against the gensim docs.
model = gensim.models.Word2Vec(
    sentences=documents,  # one token list per review
    vector_size=50,       # dimensionality of each word vector
    window=5,             # max distance between the current and the predicted word
    min_count=10,         # ignore words whose total frequency is below this
    epochs=10,            # number of passes over the corpus
    workers=4,            # worker threads used for training
)

# Exploring the model

### How many words in the model

In [None]:
# Vocabulary size: number of keys (words) stored in the trained model.
len(model.wv.key_to_index)

In [None]:
# Vocabulary of the model: show ten entries (positions 100-109) of the index-to-word list.
(model.wv.index_to_key)[100:110]

### Get an embedding for a word

In [None]:
# Look up the learned embedding vector for a single word. The cell previously
# called most_similar(), which belongs to the similarity section that follows
# and never displayed an embedding.
model.wv['happy']

### Finding Words which have similar meaning

In [None]:
# List the words the model ranks as most similar to 'great'.
model.wv.most_similar('great')

### Find the word which is not like others

In [None]:
# Ask the model which of the four words fits least with the others.
model.wv.doesnt_match("man woman child king".split())

### Saving the model

In [None]:
# Persist the trained model under the relative path 'word2vec-movie-50'
# (resolved against the current working directory).
model.save('word2vec-movie-50')

In [None]:
# Load the model back from the saved file on disk; this rebinds `model`.
model = gensim.models.Word2Vec.load('word2vec-movie-50')

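1. Analogy written as an equation: king + man = queen + ?, i.e. ? = king + man - queen.
2. In this case there may not be enough training data for the model to resolve this equation, so the answer may not be meaningful.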

In [None]:
# Query for ? = king + man - queen: words in `positive` count towards the
# target, words in `negative` count against it.
model.wv.most_similar(positive=['king','man'], negative=['queen'])

In [None]:
# The same word combination computed directly on the stored vectors;
# displays the resulting array rather than a list of words.
model.wv['king'] + model.wv['man'] - model.wv['queen']

In [None]:
# Embedding vector stored for 'man'.
model.wv['man']

In [None]:
# Embedding vector stored for 'woman'.
model.wv['woman']

In [None]:
# Fetch the vector of every vocabulary word at once, in index_to_key order.
# Presumably a (vocab_size, 50) array given vector_size=50 — the printed shape confirms.
X = model.wv[model.wv.index_to_key]
print(X.shape)

In [None]:
from sklearn.decomposition import PCA
# Reduce the word vectors in X to two principal components so they can be
# drawn on a 2-D scatter plot; `result` has one row per row of X.
pca = PCA(n_components=2)
result = pca.fit_transform(X)
print(result.shape)

In [None]:
import numpy as np

# Pick up to 300 distinct vocabulary positions to plot.
# The upper bound is taken from the model instead of a hard-coded 28321, so
# the cell keeps working (no IndexError, no unreachable words) when the
# vocabulary size changes. Sampling without replacement avoids plotting the
# same word twice, and the fixed seed makes the sample repeatable.
vocab_size = len(model.wv.index_to_key)
rng = np.random.default_rng(42)
index = rng.choice(vocab_size, size=min(300, vocab_size), replace=False)
print(index.shape)
index

In [None]:
# Keep only the 2-D projections of the sampled vocabulary positions.
result_sample = result[index, :]
print(result_sample.shape)

In [None]:
# Check which container type index_to_key is before indexing into it below.
type(model.wv.index_to_key)

In [None]:
# Translate each sampled vocabulary position back into its word.
vocab_sample = [model.wv.index_to_key[position] for position in index.tolist()]
print(len(vocab_sample))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
fif = plt.figure(figsize = (12,7))
plt.scatter(result_sample[:,0], result_sample[:,1])
words = vocab_sample

for i,w in enumerate(words):
    plt.annotate(w, xy = (result_sample[i,0], result_sample[i,1]) ,)
    
plt.show()