<a href="https://colab.research.google.com/github/ramkumarr02/Word2Vec-on-Harry-Potter/blob/master/Word2Vec_on_HP_v1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages

In [0]:
import numpy as np
import pandas as pd
import re
import nltk
from time import time
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

from sklearn.decomposition import PCA
from matplotlib import pyplot
from gensim.models import KeyedVectors

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data

##Reading from Github repo

In [3]:
!git clone https://github.com/ramkumarr02/Forked-whiteboard.git

fatal: destination path 'Forked-whiteboard' already exists and is not an empty directory.


In [0]:
hp1 = "/content/Forked-whiteboard/nbviewer/notebooks/data/harrypotter/Book 1 - The Philosopher's Stone.txt"
hp2 = "/content/Forked-whiteboard/nbviewer/notebooks/data/harrypotter/Book 2 - The Chamber of Secrets.txt"
hp3 = "/content/Forked-whiteboard/nbviewer/notebooks/data/harrypotter/Book 3 - The Prisoner of Azkaban.txt"
hp4 = "/content/Forked-whiteboard/nbviewer/notebooks/data/harrypotter/Book 4 - The Goblet of Fire.txt"
hp5 = "/content/Forked-whiteboard/nbviewer/notebooks/data/harrypotter/Book 5 - The Order of the Phoenix.txt"
hp6 = "/content/Forked-whiteboard/nbviewer/notebooks/data/harrypotter/Book 6 - The Half Blood Prince.txt"
hp7 = "/content/Forked-whiteboard/nbviewer/notebooks/data/harrypotter/Book 7 - The Deathly Hallows.txt"

In [0]:
hp_series = [hp1, hp2, hp3, hp4, hp5, hp6, hp7]

In [0]:
def read_txt(file_name):
    with open(file_name, 'r') as f:
        text = f.readlines()    
    text_blob = ''.join(text)
    print("file_name : ", len(text_blob))

    return(text_blob)

## Join all 7 books together

In [7]:
series = ""
for book in hp_series:
    book = read_txt(book)
    book = book.lower()    
    series = series + book
    print(len(series))    

file_name :  474429
474429
file_name :  531708
1006137
file_name :  676978
1683115
file_name :  1187250
2870365
file_name :  1608763
4479128
file_name :  1059022
5538150
file_name :  1227024
6765174


## Data Preprocessing

### Remove the footer from all pages

In [0]:
reg_pattern = r"page \|(.*)"
series = re.sub(pattern = reg_pattern, repl= '', string = series)

In [0]:
with open("temp.txt", "w") as text_file:
    text_file.write(series)

### Tokenize words by sentences

In [0]:
sent_tokens = sent_tokenize(series)

In [11]:
t1 = time()
sents = []
corpus = []
for sent in sent_tokens:
    words = word_tokenize(sent)    
    sents.append([words, "~"])    

t2 = time()
print("Time Elapsed : ",round(((t2- t1)/60),2)," mins")

Time Elapsed :  0.17  mins


In [0]:
flat_tokens = [item for sublist in sents for item in sublist]     
flat_tokens = [item for sublist in flat_tokens for item in sublist]     

### Remove stopwords and lemmatize

In [0]:
lmtzr = WordNetLemmatizer()

In [0]:
additional_stopwords = ['the', 'rowling', 'wa']

In [0]:
stoplist = [stopwords, additional_stopwords]

In [0]:
def cleantext(tokens, stoplist):    
    tokens = [lmtzr.lemmatize(word) for word in tokens if word == "~" or word.isalpha()]
    tokens = [lmtzr.lemmatize(word) for word in tokens if word == "~" or len(word)>1]
     
    for words in stoplist:
        tokens = [lmtzr.lemmatize(word) for word in tokens if word == "~" or not word in words]
    return(tokens)

In [17]:
%%time
clean_tokens = cleantext(flat_tokens, stoplist)

CPU times: user 15.4 s, sys: 89 ms, total: 15.5 s
Wall time: 15.5 s


In [18]:
print(f'{len(flat_tokens)} flat_tokens got reduced to {len(clean_tokens)} clean_tokens')

1478559 flat_tokens got reduced to 622755 clean_tokens


### Create list of lists

In [20]:
%%time

sent = []
corpus = []
for word in clean_tokens:
    if word != "~":
        sent.append(word)
    elif word == "~":
        corpus.append(sent)
        sent = []   

CPU times: user 135 ms, sys: 8.93 ms, total: 144 ms
Wall time: 146 ms


### Bigrams

In [0]:
phrases = Phrases(corpus, min_count=10, threshold = 50)
bigram = Phraser(phrases)

In [0]:
bgrams = []
for sent in corpus:
    bgrams.append(bigram[sent])    

In [0]:
flat_bgrams = [item for sublist in bgrams for item in sublist]     
freq = nltk.FreqDist(flat_bgrams)
freqdf = pd.DataFrame.from_dict(freq,orient='index')
freqdf.to_csv('freq.csv')

In [24]:
i = 0
for sent in bgrams:
    for word in sent:
        if "_" in word:
            print(word)
            i += 1
    if i > 10:
        break

number_four
privet_drive
number_four
privet_drive
privet_drive
young_man
minute_later
five_clock
rooted_spot
number_four
living_room


# Word2Vec (Local)

In [0]:
word_bag = corpus

In [26]:
%%time
model = Word2Vec(size = 300, window=5, min_count = 20, workers = -1)
model.build_vocab(word_bag)

model.train(word_bag, total_examples=model.corpus_count, epochs = 5)

CPU times: user 1 s, sys: 8.92 ms, total: 1.01 s
Wall time: 1.02 s


In [27]:
'''
%%time
model = Word2Vec(word_bag, min_count=20, size = 300, workers = -1)
model.save("word2vec.model")
'''

'\n%%time\nmodel = Word2Vec(word_bag, min_count=20, size = 300, workers = -1)\nmodel.save("word2vec.model")\n'

## Results

In [28]:
list(model.wv.vocab)[0:10]

['boy',
 'lived',
 'dursley',
 'number',
 'four',
 'privet',
 'drive',
 'proud',
 'say',
 'perfectly']

In [29]:
model.wv.most_similar(positive=["harry"])

  if np.issubdtype(vec.dtype, np.int):


[('molly', 0.19838526844978333),
 ('shut', 0.18901100754737854),
 ('original', 0.1792946606874466),
 ('thoroughly', 0.1780172884464264),
 ('nodded', 0.17132221162319183),
 ('recognized', 0.16783234477043152),
 ('without', 0.16622556746006012),
 ('supposed', 0.16585727035999298),
 ('threshold', 0.16030767560005188),
 ('hogwarts', 0.15551047027111053)]

In [30]:
model.wv.most_similar(positive=["potter"])

  if np.issubdtype(vec.dtype, np.int):


[('wide', 0.1999313235282898),
 ('owner', 0.1796487271785736),
 ('yesterday', 0.17470750212669373),
 ('conversation', 0.16851602494716644),
 ('sat', 0.16233819723129272),
 ('feel', 0.15860338509082794),
 ('guarding', 0.15856021642684937),
 ('cheerfully', 0.1579795479774475),
 ('gaunt', 0.1577623337507248),
 ('mistake', 0.1568850427865982)]

In [31]:
model.wv.most_similar(positive=["voldemort"])

  if np.issubdtype(vec.dtype, np.int):


[('advice', 0.22290074825286865),
 ('everyone', 0.20699702203273773),
 ('fond', 0.17761681973934174),
 ('month', 0.17724180221557617),
 ('figure', 0.1729629933834076),
 ('staying', 0.16827571392059326),
 ('disappointment', 0.1682014763355255),
 ('jar', 0.16677328944206238),
 ('hello', 0.16217169165611267),
 ('five', 0.16051316261291504)]

In [32]:
model.wv.most_similar(positive=["hogwarts"])

  if np.issubdtype(vec.dtype, np.int):


[('graveyard', 0.18303154408931732),
 ('midst', 0.18124863505363464),
 ('self', 0.17502057552337646),
 ('beat', 0.17166206240653992),
 ('echoing', 0.17068055272102356),
 ('play', 0.1682247519493103),
 ('torn', 0.1622982621192932),
 ('dead', 0.15949660539627075),
 ('necessary', 0.1590462327003479),
 ('stopped', 0.15715521574020386)]

In [33]:
model.wv.most_similar(positive=["horcrux"])

  if np.issubdtype(vec.dtype, np.int):


[('situation', 0.2367812991142273),
 ('promptly', 0.22015978395938873),
 ('school', 0.17702756822109222),
 ('picture', 0.1733391135931015),
 ('pull', 0.17176119983196259),
 ('young', 0.17109423875808716),
 ('briskly', 0.1700482964515686),
 ('keeper', 0.16915090382099152),
 ('shown', 0.16410572826862335),
 ('snarled', 0.16379988193511963)]

In [34]:
model.wv.similarity("gryffindor", 'slytherin')

  if np.issubdtype(vec.dtype, np.int):


0.0060571204

In [35]:
model.wv.similarity("harry", 'hagrid')

  if np.issubdtype(vec.dtype, np.int):


0.070742436

# Google word2Vec

In [36]:
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-01-03 13:06:21--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.84.205
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.84.205|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [37]:
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz'
google_word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Intersect local Model with Google Word2vec model

In [39]:
%%time
google_model = Word2Vec(size = 300, window=5, min_count = 20, workers = -1)
google_model.build_vocab(word_bag)

#model.intersect_word2vec_format('./word2vec/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)

google_model.intersect_word2vec_format(EMBEDDING_FILE, lockf=1.0, binary=True)

google_model.train(word_bag, total_examples=google_model.corpus_count, epochs = 5)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 1min 25s, sys: 805 ms, total: 1min 26s
Wall time: 1min 26s


## Results

In [40]:
google_model.wv.most_similar(positive=["harry"])

  if np.issubdtype(vec.dtype, np.int):


[('neville', 0.5812051296234131),
 ('mr', 0.555977463722229),
 ('dennis', 0.5308670997619629),
 ('james', 0.5300297737121582),
 ('robert', 0.5231185555458069),
 ('arry', 0.520870566368103),
 ('johnson', 0.5157880783081055),
 ('dont', 0.4983846843242645),
 ('charlie', 0.49837130308151245),
 ('thomas', 0.48531270027160645)]

In [41]:
google_model.wv.most_similar(positive=["potter"])

  if np.issubdtype(vec.dtype, np.int):


[('art', 0.4490913152694702),
 ('painting', 0.4103425443172455),
 ('mead', 0.3641393482685089),
 ('garden', 0.3624359369277954),
 ('flower', 0.35996729135513306),
 ('flint', 0.33174169063568115),
 ('herbology', 0.33140838146209717),
 ('pumpkin', 0.3254821300506592),
 ('willow', 0.32443472743034363),
 ('stone', 0.3094981610774994)]

In [51]:
google_model.wv.most_similar(positive=["voldemort"])

  if np.issubdtype(vec.dtype, np.int):


[('god', 0.6350281238555908),
 ('knight', 0.4713427424430847),
 ('evil', 0.46526503562927246),
 ('heaven', 0.4486253559589386),
 ('sorcerer', 0.42839527130126953),
 ('sir', 0.42287835478782654),
 ('warlock', 0.4114571213722229),
 ('servant', 0.403384268283844),
 ('prince', 0.4027935862541199),
 ('king', 0.3985804319381714)]

In [43]:
google_model.wv.most_similar(positive=["hogwarts"])

  if np.issubdtype(vec.dtype, np.int):


[('advanced', 0.19666540622711182),
 ('place', 0.1936536431312561),
 ('graveyard', 0.18688777089118958),
 ('alive', 0.1808253824710846),
 ('late', 0.1773623377084732),
 ('bell', 0.17182034254074097),
 ('abandoned', 0.1697089523077011),
 ('prime', 0.16884028911590576),
 ('halloween', 0.16830234229564667),
 ('regular', 0.1680608093738556)]

In [44]:
google_model.wv.most_similar(positive=["horcrux"])

  if np.issubdtype(vec.dtype, np.int):


[('horcruxes', 0.6242253184318542),
 ('dementors', 0.6012237071990967),
 ('basilisk', 0.5296100974082947),
 ('dementor', 0.5282191038131714),
 ('locket', 0.4928228259086609),
 ('hippogriff', 0.48730671405792236),
 ('muggle', 0.4694926142692566),
 ('werewolf', 0.460193932056427),
 ('rune', 0.43476802110671997),
 ('hallows', 0.43337249755859375)]

In [45]:
google_model.wv.similarity("voldemort", 'slytherin')

  if np.issubdtype(vec.dtype, np.int):


0.025380021

In [46]:
google_model.wv.similarity("harry", 'hagrid')

  if np.issubdtype(vec.dtype, np.int):


-0.0052783852

In [47]:
google_model.wv.similarity("harry", 'gryffindor')

  if np.issubdtype(vec.dtype, np.int):


-0.055337477

In [48]:
google_model.wv.similarity("lupin", 'werewolf')

  if np.issubdtype(vec.dtype, np.int):


0.0011458562

In [49]:
google_model.wv.most_similar(positive=["basilisk"])

  if np.issubdtype(vec.dtype, np.int):


[('dementors', 0.6436929702758789),
 ('hippogriff', 0.567786455154419),
 ('dementor', 0.5650946497917175),
 ('creature', 0.5508249998092651),
 ('gargoyle', 0.549191415309906),
 ('spider', 0.5463741421699524),
 ('serpent', 0.5414613485336304),
 ('goblin', 0.5300490260124207),
 ('horcrux', 0.5296101570129395),
 ('dragon', 0.502647340297699)]

In [52]:
google_model.wv.most_similar(positive=["quidditch"])

  if np.issubdtype(vec.dtype, np.int):


[('muggle', 0.5214775800704956),
 ('wizarding', 0.47505366802215576),
 ('quaffle', 0.4732670187950134),
 ('dementors', 0.4503976106643677),
 ('muggles', 0.4465000629425049),
 ('ludo', 0.3921332359313965),
 ('dementor', 0.3704177141189575),
 ('werewolf', 0.3695617616176605),
 ('broomstick', 0.3691725432872772),
 ('hippogriff', 0.36648988723754883)]

In [56]:
google_model.wv.most_similar(positive=["wand"])

  if np.issubdtype(vec.dtype, np.int):


[('broom', 0.531521201133728),
 ('broomstick', 0.46065953373908997),
 ('dementors', 0.4059273600578308),
 ('hippogriff', 0.3880380690097809),
 ('basilisk', 0.3835200071334839),
 ('sword', 0.38140869140625),
 ('dementor', 0.3698751628398895),
 ('gently', 0.3696576952934265),
 ('button', 0.3636736273765564),
 ('muggle', 0.3636331856250763)]