In [1]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.manifold
import matplotlib.pyplot as plt
%matplotlib inline

import glob, codecs, multiprocessing
from importlib import reload
from gensim.models import word2vec

In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('words')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package words to
[nltk_data]     /home/hell_raider/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/hell_raider/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hell_raider/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
reference_filenames = sorted(glob.glob("src/part*.txt"))

In [5]:
corpus_raw = u""
for book_filename in reference_filenames:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()

In [6]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
raw_sentences = tokenizer.tokenize(corpus_raw)

In [7]:
def sentence_to_wordlist(raw, remove_stopwords):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.lower().split()
    
    if remove_stopwords ==True:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return words

In [8]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence, remove_stopwords=True))

In [9]:
#sentences[0]
#sentence_to_wordlist(raw_sentences[0],remove_stopwords=True)

In [10]:
num_features = 300             
min_word_count = 40                         
num_workers = 4      
context = 10                                                                                         
downsampling = 1e-3

model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling, seed=1)
model.init_sims(replace=True)

2018-01-16 23:57:05,512 : INFO : collecting all words and their counts
2018-01-16 23:57:05,514 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-16 23:57:05,532 : INFO : PROGRESS: at sentence #10000, processed 69968 words, keeping 8385 word types
2018-01-16 23:57:05,540 : INFO : collected 10696 word types from a corpus of 101286 raw words and 14241 sentences
2018-01-16 23:57:05,541 : INFO : Loading a fresh vocabulary
2018-01-16 23:57:05,546 : INFO : min_count=40 retains 473 unique words (4% of original 10696, drops 10223)
2018-01-16 23:57:05,546 : INFO : min_count=40 leaves 55246 word corpus (54% of original 101286, drops 46040)
2018-01-16 23:57:05,548 : INFO : deleting the raw counts dictionary of 10696 items
2018-01-16 23:57:05,550 : INFO : sample=0.001 downsamples 97 most-common words
2018-01-16 23:57:05,551 : INFO : downsampling leaves estimated 43202 word corpus (78.2% of prior 55246)
2018-01-16 23:57:05,552 : INFO : estimated required memory for 4

In [11]:
model_name = "HarryPotter.w2v"
model.save(model_name)

2018-01-16 23:57:05,834 : INFO : saving Word2Vec object under HarryPotter.w2v, separately None
2018-01-16 23:57:05,835 : INFO : not storing attribute syn0norm
2018-01-16 23:57:05,836 : INFO : not storing attribute cum_table
2018-01-16 23:57:05,850 : INFO : saved HarryPotter.w2v


In [12]:
load_model = word2vec.Word2Vec.load("HarryPotter.w2v")

2018-01-16 23:57:05,989 : INFO : loading Word2Vec object from HarryPotter.w2v
2018-01-16 23:57:06,000 : INFO : loading wv recursively from HarryPotter.w2v.wv.* with mmap=None
2018-01-16 23:57:06,001 : INFO : setting ignored attribute syn0norm to None
2018-01-16 23:57:06,002 : INFO : setting ignored attribute cum_table to None
2018-01-16 23:57:06,003 : INFO : loaded HarryPotter.w2v


In [13]:
load_model.most_similar("snape")

  """Entry point for launching an IPython kernel.
2018-01-16 23:57:06,083 : INFO : precomputing L2-norms of word weight vectors


[('neville', 0.9998724460601807),
 ('slughorn', 0.9998594522476196),
 ('sight', 0.9998570680618286),
 ('gryffindor', 0.999855101108551),
 ('flying', 0.9998477101325989),
 ('narcissa', 0.999845564365387),
 ('book', 0.9998432397842407),
 ('taking', 0.9998406171798706),
 ('continued', 0.999840259552002),
 ('desk', 0.9998396635055542)]

In [14]:
load_model.most_similar("hermione")

  """Entry point for launching an IPython kernel.


[('ron', 0.9998289942741394),
 ('percy', 0.9998274445533752),
 ('tonks', 0.9998146891593933),
 ('slughorn', 0.9998093247413635),
 ('hard', 0.9998055696487427),
 ('trunk', 0.9998050332069397),
 ('right', 0.9998040199279785),
 ('potion', 0.9998038411140442),
 ('sight', 0.9997999668121338),
 ('snape', 0.9997997283935547)]

In [15]:
word_vectors = model.wv

In [16]:
word_vectors.similarity('harry', 'hermione')

0.9996987526778015

In [17]:
ron_herm = "I simply want to say that it doesn't matter if Ron loved Hermione or not. She never loved him.".lower().split()
har_herm = "Above all the lies, I won't forget to add that Hermione has loved Harry before they were born".lower().split()

stopwords = nltk.corpus.stopwords.words('english')
ron_herm = [w for w in ron_herm if w not in stopwords]
ron_herm = [w for w in har_herm if w not in stopwords]

distance = model.wmdistance(ron_herm, har_herm)
distance

  
2018-01-16 23:57:06,432 : INFO : Removed 5 and 16 OOV words from document 1 and 2 (respectively).
2018-01-16 23:57:06,433 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-01-16 23:57:06,434 : INFO : built Dictionary(2 unique tokens: ['harry', 'hermione']) from 2 documents (total 4 corpus positions)


0.0

In [18]:
word_vectors.most_similar_to_given('harry', ['snape', 'gryffindor', 'hermione', 'magic'])

'gryffindor'