# 2. Word2Vec

In [None]:
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import re
import matplotlib

In [None]:
# Select the Tk backend before pyplot is imported.
# NOTE(review): the `%matplotlib inline` magic in the next cell also selects
# a backend — confirm whether this call is still needed.
matplotlib.use("TkAgg")

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Dataset
* Julius Caesar
* Macbeth

In [None]:
# Directory prefix for every input file; the trailing slash matters because
# the file paths below are built by plain string concatenation.
data_dir = '../data/'

In [None]:
# Path to the Macbeth text (data_dir must end with a path separator).
macbeth_file = data_dir + 'macbeth.txt'

In [None]:
# Path to the Julius Caesar text (data_dir must end with a path separator).
caesar_file = data_dir + 'julius_caesar.txt'

### Remove the stopwords

In [None]:
# Path to the stop-word list, read line by line in the next cell.
stopword_file = data_dir + 'long_stopwords.txt'

In [None]:
# Load the stop-word list: one entry per line of the file, with the newline
# dropped and every character other than ASCII letters/digits removed
# (the same character filter that clean() applies to words).
stop_words = []

with open(stopword_file, 'r') as inpFile:
    lines = inpFile.readlines()

stop_words_temp = (re.sub('\n', '', raw_line) for raw_line in lines)
stop_words = [re.sub('[^A-Za-z0-9]+', '', entry) for entry in stop_words_temp]


In [None]:
# Display the loaded stop-word list.
stop_words

In [None]:
# Confirm stop_words is a materialised list (it is built with list(...)),
# not a lazy map object.
type(stop_words)

In [None]:
def clean(word):
    """Normalise a raw token and drop it if it is a stop word.

    The token is stripped of surrounding whitespace, lower-cased, and every
    character other than ASCII letters and digits is removed.

    Returns the normalised token, or '' when the normalised token appears
    in the module-level ``stop_words`` collection.
    """
    normalized = re.sub('[^A-Za-z0-9]+', '', word.strip().lower())
    return '' if normalized in stop_words else normalized

In [None]:
# clean() removes the apostrophe, so this yields 'kings' unless that string
# is in stop_words (in which case it yields '').
clean("king's")

In [None]:
# clean() removes the apostrophe, so this yields 'theyll' unless that string
# is in stop_words (in which case it yields '').
clean("they'll")

In [None]:
line_count = 0  # not referenced in the visible cells; kept in case another cell reads it
sentences = []

# Build the training corpus: one list of cleaned tokens per non-blank line of
# each play. Tokens that clean() maps to '' (stop words, punctuation-only
# tokens) are dropped from their line.
for play_file in (macbeth_file, caesar_file):
    with open(play_file, 'r') as inpFile:
        for line in inpFile:
            # The previous guard, `line is not None or line != '\n'`, was
            # always true, so blank lines were appended as empty sentences.
            # Skip lines that contain only whitespace instead.
            if not line.strip():
                continue
            words = [clean(token) for token in line.split()]
            sentences.append([token for token in words if token])

In [None]:
# Sanity check: sentences was created as a list of token lists above.
type(sentences)

In [None]:
# Peek at ten of the tokenised lines (indices 100-109).
sentences[100:110]

## Word2Vec model

In [None]:
# Train a Word2Vec model on the tokenised lines of both plays.
# NOTE(review): `size` is the gensim < 4 keyword for the vector dimension
# (gensim 4 renamed it `vector_size`) — confirm against the installed version.
model = Word2Vec(sentences, window=5, size=500, workers=4, min_count=5)

In [None]:
# Inspect the vocabulary retained by the model.
# NOTE(review): `wv.vocab` is the gensim < 4 attribute; gensim 4 exposes
# `wv.key_to_index` instead — confirm against the installed version.
model.wv.vocab

In [None]:
# Collect every vocabulary word and its embedding, in matching order, as
# input for the t-SNE projection and plot labels below.
# Vectors are read through `model.wv[...]`: indexing the Word2Vec model
# directly (`model[word]`) is a deprecated forwarder to the same lookup.
labels = list(model.wv.vocab)
tokens = [model.wv[word] for word in labels]
    


### t-SNE plot to visualize word similarity

In [None]:
# Configure a two-component t-SNE projection initialised from PCA.
# NOTE(review): `n_iter` was renamed `max_iter` in newer scikit-learn
# releases — confirm against the installed version.
tsne_model = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)

In [None]:
# Project the word vectors in `tokens`; each row of the result is read
# below as (value[0], value[1]), i.e. one 2-D point per word.
new_values = tsne_model.fit_transform(tokens)

In [None]:
# Split the projected points into separate x and y coordinate lists.
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]

In [None]:
# Draw each word as its own scatter point at its projected coordinates and
# annotate it with the word, offset slightly from the point.
plt.figure(figsize=(16, 12))
for i, (px, py) in enumerate(zip(x, y)):
    plt.scatter(px, py)
    plt.annotate(
        labels[i],
        xy=(px, py),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
plt.show()

### Analogies

In [None]:
# Analogy query: caesar + duncan - scotland.
# Called on `model.wv`; `Word2Vec.most_similar` is a deprecated forwarder to it.
model.wv.most_similar(positive=['caesar', 'duncan'], negative=['scotland'])

In [None]:
# Analogy query: caesar + duncan - macbeth.
# Called on `model.wv`; `Word2Vec.most_similar` is a deprecated forwarder to it.
model.wv.most_similar(positive=['caesar', 'duncan'], negative=['macbeth'])

In [None]:
# Analogy query: caesar + macbeth - banquo.
# Called on `model.wv`; `Word2Vec.most_similar` is a deprecated forwarder to it.
model.wv.most_similar(positive=['caesar', 'macbeth'], negative=['banquo'])

In [None]:
# Analogy query: rome + scotland - banquo.
# Called on `model.wv`; `Word2Vec.most_similar` is a deprecated forwarder to it.
model.wv.most_similar(positive=['rome', 'scotland'], negative=['banquo'])

In [None]:
# Ask which of the four words is least similar to the others.
# Called on `model.wv`; `Word2Vec.doesnt_match` is a deprecated forwarder to it.
model.wv.doesnt_match("duncan macbeth scotland banquo".split())