# Word2Vec

In [21]:
from __future__ import absolute_import, division, print_function
import codecs
import glob
import multiprocessing
import os
import re
import nltk
import gensim.models.word2vec as w2v
import numpy as np

In [2]:
# Fetch the NLTK resources this notebook asks for. `punkt` is the resource the
# sentence tokenizer is loaded from further down; `stopwords` is downloaded but
# not referenced in the cells visible here.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/sankyfox/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sankyfox/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Collect every .txt file in the current directory, sorted so the corpus is
# always assembled in the same order; the bare name displays the list.
raw_data_files = sorted(glob.glob("./*.txt")) 
raw_data_files

['./sherlock_holmes.txt']

In [4]:
# Optional sanity check: confirm the .txt files are UTF-8 encoded (uses the third-party `chardet` package)
# import chardet
# for f in raw_data_files:
#     chardet.detect(open(f).read())

In [5]:
# Read every corpus file as UTF-8 and append it to one unicode string,
# reporting the running corpus size after each file.
raw_corpus = u""
for file_name in raw_data_files:
    print("Reading '{0}' ...".format(file_name))
    with codecs.open(file_name, "r", "utf-8") as f:
        raw_corpus += f.read()
    print("Corpus is now {0} characters long".format(len(raw_corpus)))
    # With `print_function` imported, a bare `print` only names the function
    # and emits nothing; it has to be called to produce the blank separator.
    print()

Reading './sherlock_holmes.txt' ...
Corpus is now 3868122 characters long


In [6]:
# Load the pickled English Punkt tokenizer from the NLTK data directory and
# run it over the whole corpus; the resulting pieces are treated as raw
# sentences by the cells that follow.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(raw_corpus)

In [7]:
def sentence_to_wordlist(raw):
    """Turn a raw sentence into a list of lowercase alphabetic words.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space, so
    digits, punctuation and non-ASCII letters act purely as separators and
    never appear in the result.

    Args:
        raw: The sentence text to split.

    Returns:
        The words of ``raw`` in their original order, lowercased. The list is
        empty when ``raw`` contains no ASCII letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return [token.lower() for token in letters_only.split()]

In [8]:
# Convert each non-empty raw sentence into its word list; the result is a
# list of token lists, one per sentence.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [9]:
# Total number of word tokens across all tokenized sentences.
token_count = sum(len(sentence) for sentence in sentences)
print("The corpus has {0:,} tokens".format(token_count))

The corpus has 668,430 tokens


In [10]:
# hyper parameters
# Each value is handed to the Word2Vec constructor under the keyword noted.

# Reproducibility / parallelism
seed = 2                                   # -> seed
num_workers = multiprocessing.cpu_count()  # -> workers (one per CPU reported)

# Model shape
num_features = 300                         # -> size
context_size = 20                          # -> window

# Vocabulary filtering / sampling
min_word_count = 3                         # -> min_count
downsampling = 1e-3                        # -> sample

In [11]:
# Create the (still untrained) Word2Vec model from the hyper parameters above.
# No corpus is passed here; vocabulary building and training happen in the
# following cells.
model = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
    iter=20,
)

In [12]:
# Build the model's vocabulary from the tokenized sentences.
model.build_vocab(sentences)

In [13]:
# Report the number of entries in the model's vocabulary (model.wv.vocab).
print("Word2Vec vocabulary length: ",len(model.wv.vocab))

Word2Vec vocabulary length:  9603


In [20]:
# Train the model on the tokenized sentences; the value displayed under this
# cell is whatever train() returns.
# NOTE(review): later gensim releases appear to require explicit
# `total_examples` and `epochs` arguments to train() -- confirm against the
# installed gensim version before re-running.
model.train(sentences)

9288329

In [18]:
# save the trained file so we can load it anytime
# The 'trained' directory is created only if it does not exist yet, then the
# model is written to the file model.w2v inside it.
if not os.path.exists('trained'):
    os.mkdir('trained')
model.save(os.path.join("trained","model.w2v"))

In [19]:
# Reload the model from the path the save cell writes to, replacing the
# in-memory `model`; the queries below run against this loaded copy.
model = w2v.Word2Vec.load(os.path.join("trained","model.w2v"))

## Now for some fun

In [17]:
# Demo queries against the trained embeddings.
# most_similar() ranks vocabulary words for the given positive/negative
# word lists and returns (word, similarity) pairs; doesnt_match() returns the
# word that fits least with the others in the list.
# NOTE(review): the printed "A:B :: C: ?" labels and the positive/negative
# lists may not describe the same direction of analogy -- confirm which one
# is intended before relying on the captions.
print()
print("Sherlock:Irene:: Watson: ?")
# [0][0] keeps only the top-ranked word, dropping its similarity score.
print(model.wv.most_similar(positive=['sherlock', 'watson'], negative=['irene'])[0][0])
print("Well we always suspected that ...")
print()

print("Let's find all doctors")
print("Inspector:Lestrade :: Dr: ?")
print(model.wv.most_similar(positive=['inspector','dr'], negative=['lestrade']))
print()

print("How about professors ?")
print("Inspector:Lestrade :: professor: ?")
print(model.wv.most_similar(positive=['inspector','professor'], negative=['lestrade']))
print()


print("Our model recognizes inspectors and knows Watson isn't one")
print("Given : gregson watson lestrade baynes")
print("Output : ", model.wv.doesnt_match("gregson watson lestrade baynes".split()))
print()

print("Wow! it knows the difference between the good guys and the bad guys !")
# The name must be spelled 'moriarty', which is how it appears in the model's
# own most_similar output; the previous spelling 'moritarty' did not match
# that vocabulary entry, so the query was not testing the intended word.
print("Given : moriarty holmes augustus sebastian")
print("Output : ",model.wv.doesnt_match("moriarty holmes augustus sebastian".split()))
print()


Sherlock:Irene:: Watson: ?
holmes
Well we always suspected that ...

Let's find all doctors
Inspector:Lestrade :: Dr: ?
[(u'mortimer', 0.4274638295173645), (u'roylott', 0.4094651937484741), (u'huxtable', 0.3990991711616516), (u'sterndale', 0.3790780305862427), (u'leon', 0.3714893162250519), (u'starr', 0.3627554178237915), (u'reminiscences', 0.34268635511398315), (u'armstrong', 0.3406115174293518), (u'grimesby', 0.33381372690200806), (u'correspondent', 0.3333827257156372)]

How about professors ?
Inspector:Lestrade :: professor: ?
[(u'coram', 0.4137895405292511), (u'moriarty', 0.37892886996269226), (u'presbury', 0.3314708471298218), (u'roy', 0.3195204436779022), (u'lamented', 0.30523109436035156), (u'propped', 0.30168890953063965), (u'drift', 0.2694909870624542), (u'sebastian', 0.25961148738861084), (u'cousin', 0.25885090231895447), (u'solutions', 0.25093913078308105)]

Our model recongnizes inspectors and knows Watson isn't one
Given : gregson watson lestrade baynes
Output :  watson

