In [1]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.manifold
import matplotlib.pyplot as plt
%matplotlib inline

import glob, codecs, multiprocessing
from importlib import reload
from gensim.models import word2vec

In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('words')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Error loading words: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
reference_filenames = sorted(glob.glob("src/part*.txt"))

In [5]:
corpus_raw = u""
for book_filename in reference_filenames:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()

In [6]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(corpus_raw)

In [7]:
def sentence_to_wordlist(raw, remove_stopwords):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.lower().split()
    
    if remove_stopwords ==True:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return words

In [8]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence, remove_stopwords=True))

In [9]:
#sentences[0]
#sentence_to_wordlist(raw_sentences[0],remove_stopwords=True)

In [10]:
num_features = 300             
min_word_count = 40                         
num_workers = 4      
context = 10                                                                                         
downsampling = 1e-3

model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling, seed=1)
model.init_sims(replace=True)

2018-01-16 01:19:20,456 : INFO : collecting all words and their counts
2018-01-16 01:19:20,457 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-16 01:19:20,472 : INFO : PROGRESS: at sentence #10000, processed 69968 words, keeping 8385 word types
2018-01-16 01:19:20,479 : INFO : collected 10696 word types from a corpus of 101286 raw words and 14241 sentences
2018-01-16 01:19:20,480 : INFO : Loading a fresh vocabulary
2018-01-16 01:19:20,485 : INFO : min_count=40 retains 473 unique words (4% of original 10696, drops 10223)
2018-01-16 01:19:20,486 : INFO : min_count=40 leaves 55246 word corpus (54% of original 101286, drops 46040)
2018-01-16 01:19:20,488 : INFO : deleting the raw counts dictionary of 10696 items
2018-01-16 01:19:20,489 : INFO : sample=0.001 downsamples 97 most-common words
2018-01-16 01:19:20,490 : INFO : downsampling leaves estimated 43202 word corpus (78.2% of prior 55246)
2018-01-16 01:19:20,490 : INFO : estimated required memory for 4

In [11]:
model_name = "HarryPotter.w2v"
model.save(model_name)

2018-01-16 01:19:20,770 : INFO : saving Word2Vec object under HarryPotter.w2v, separately None
2018-01-16 01:19:20,772 : INFO : not storing attribute syn0norm
2018-01-16 01:19:20,773 : INFO : not storing attribute cum_table
2018-01-16 01:19:20,786 : INFO : saved HarryPotter.w2v


In [12]:
load_model = word2vec.Word2Vec.load("HarryPotter.w2v")

2018-01-16 01:19:20,914 : INFO : loading Word2Vec object from HarryPotter.w2v
2018-01-16 01:19:20,924 : INFO : loading wv recursively from HarryPotter.w2v.wv.* with mmap=None
2018-01-16 01:19:20,925 : INFO : setting ignored attribute syn0norm to None
2018-01-16 01:19:20,926 : INFO : setting ignored attribute cum_table to None
2018-01-16 01:19:20,926 : INFO : loaded HarryPotter.w2v


In [13]:
load_model.most_similar("snape")

  """Entry point for launching an IPython kernel.
2018-01-16 01:19:21,023 : INFO : precomputing L2-norms of word weight vectors


[('slughorn', 0.9998684525489807),
 ('neville', 0.9998464584350586),
 ('slytherin', 0.9998429417610168),
 ('narcissa', 0.9998394846916199),
 ('cold', 0.9998355507850647),
 ('bellatrix', 0.999834418296814),
 ('potion', 0.9998343586921692),
 ('need', 0.9998303055763245),
 ('round', 0.99982750415802),
 ('madam', 0.9998272657394409)]

In [14]:
load_model.most_similar("hermione")

  """Entry point for launching an IPython kernel.


[('ginny', 0.9997619390487671),
 ('yeah', 0.9997517466545105),
 ('ron', 0.9997482299804688),
 ('right', 0.9997450709342957),
 ('percy', 0.9997421503067017),
 ('hagrid', 0.9997384548187256),
 ('madam', 0.999719500541687),
 ('potion', 0.99971604347229),
 ('malfoy', 0.9997134208679199),
 ('watch', 0.9997129440307617)]

In [15]:
word_vectors = model.wv

In [16]:
word_vectors.similarity('harry', 'hermione')

0.9995719266697956

In [17]:
ron_herm = "I simply want to say that it doesn't matter if Ron loved Hermione or not. She never loved him.".lower().split()
har_herm = "Above all the lies, I won't forget to add that Hermione has loved Harry before they were born".lower().split()

stopwords = nltk.corpus.stopwords.words('english')
ron_herm = [w for w in ron_herm if w not in stopwords]
ron_herm = [w for w in har_herm if w not in stopwords]

distance = model.wmdistance(ron_herm, har_herm)
distance

  
2018-01-16 01:19:21,299 : INFO : Removed 5 and 16 OOV words from document 1 and 2 (respectively).
2018-01-16 01:19:21,300 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-01-16 01:19:21,300 : INFO : built Dictionary(2 unique tokens: ['hermione', 'harry']) from 2 documents (total 4 corpus positions)


0.0

In [18]:
word_vectors.most_similar_to_given('harry', ['snape', 'gryffindor', 'hermione', 'magic'])

'gryffindor'