# mahabharatha2Vec

## Imports

In [5]:
from __future__ import absolute_import, division, print_function

In [2]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

Using TensorFlow backend.


In [3]:
import codecs
import glob
import multiprocessing
import os
import re

In [4]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message it prints), which can shadow builtins such as
# `sum`. numpy and pyplot are already imported explicitly as np / plt —
# TODO confirm no cell relies on the bare pylab names before removing it.
%pylab inline
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

Populating the interactive namespace from numpy and matplotlib


In [6]:
# Download the NLTK resources: the "punkt" sentence tokenizer models and the
# "stopwords" corpus. The second call is the cell's last expression, so its
# return value is what the notebook displays.
# NOTE(review): "stopwords" is not used by any cell visible here — confirm it
# is needed.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/sanju/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sanju/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Prepare Corpus

**Load books from files**

In [7]:
# Collect every .txt file directly under data/ (relative to the working
# directory), sorted so the books are always read in the same order.
book_filenames = sorted(glob.glob("data/*.txt"))

In [8]:
# Print a label, then leave the list as the last expression so the notebook
# displays the discovered file paths.
print("Found books:")
book_filenames

Found books:


['data/ajaya1.txt', 'data/ajaya2.txt', 'data/asura.txt', 'data/jaya.txt']

**Combine the books into one string**

In [9]:
# Read every book as UTF-8 text and combine them into one unicode string.
# `codecs` must be imported in the notebook's import cell for this to run on
# a fresh kernel.
corpus_parts = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_text = book_file.read()
    corpus_parts.append(book_text)
    # Running character total; equals len() of the corpus built so far.
    corpus_length += len(book_text)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()

# Join once at the end instead of re-copying the growing string per book.
corpus_raw = u"".join(corpus_parts)

Reading 'data/ajaya1.txt'...
Corpus is now 949412 characters long

Reading 'data/ajaya2.txt'...
Corpus is now 1889681 characters long

Reading 'data/asura.txt'...
Corpus is now 2953032 characters long

Reading 'data/jaya.txt'...
Corpus is now 3533460 characters long



**Split the corpus into sentences**

In [10]:
# Load the English Punkt sentence tokenizer from the NLTK data directory
# (the "punkt" package is downloaded earlier in the notebook).
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [11]:
# Split the combined corpus text into a sequence of raw sentence strings.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [12]:
def sentence_to_wordlist(raw):
    """Break a raw sentence into its letter-only words.

    Every character that is not an ASCII letter (digits, punctuation,
    whitespace, non-ASCII characters) is treated as a separator. Case is
    preserved.

    Args:
        raw: The sentence text to split.

    Returns:
        A list of the alphabetic tokens in their original order; an empty
        list when the text contains no ASCII letters.
    """
    non_letter = re.compile("[^a-zA-Z]")
    letters_only = non_letter.sub(" ", raw)
    return letters_only.split()

In [13]:
# Tokenize every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

In [15]:
# Spot-check the tokenization: show one raw sentence followed by the word
# list produced from it. Index 51 is an arbitrary sample and requires the
# corpus to contain at least 52 sentences.
print(raw_sentences[51])
print(sentence_to_wordlist(raw_sentences[51]))

It	made	me	rush	home	to	revisit	the	Mahabharata,	an	epic
that	has	inspired	countless	writers	over	the	centuries.
['It', 'made', 'me', 'rush', 'home', 'to', 'revisit', 'the', 'Mahabharata', 'an', 'epic', 'that', 'has', 'inspired', 'countless', 'writers', 'over', 'the', 'centuries']


In [17]:
# Total number of word tokens across all tokenized sentences.
# NOTE(review): because %pylab was run earlier, `sum` here may be numpy's
# rather than the builtin — keep the list argument (not a generator) unless
# that is confirmed otherwise.
token_count = sum([len(sentence) for sentence in sentences])
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 605,503 tokens


In [146]:
# Number of tokenized sentences that will be fed to the model.
len(sentences)

43384

## Train to obtain Word Vectors

In [56]:
# Dimensionality of the resulting word vectors (passed to Word2Vec as `size`).
num_features = 300

# Minimum word count threshold (passed to Word2Vec as `min_count`).
min_word_count = 3

# Context window length (passed to Word2Vec as `window`).
context_size = 7

# Downsample setting for frequent words (passed to Word2Vec as `sample`);
# the expression evaluates to 5e-6.
downsampling = 0.5*1e-5

In [57]:
# Number of threads to run in parallel: one per CPU reported by the OS.
num_workers = multiprocessing.cpu_count()

# Seed for the RNG, to make the results reproducible.
# NOTE(review): with more than one worker thread, training order is presumably
# not deterministic, so the seed alone may not give identical vectors — verify
# against the gensim documentation.
seed = 1

In [58]:
# Create an untrained Word2Vec model from the hyperparameters defined above.
# No corpus is passed here; the vocabulary is built and the model trained in
# later cells.
# NOTE(review): `sg=1` presumably selects the skip-gram architecture, and the
# `size` keyword is specific to older gensim releases — confirm against the
# installed gensim version.
maha2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [145]:
# Build the model's vocabulary from the tokenized sentences; this must run
# before training.
maha2vec.build_vocab(sentences)

Word2Vec vocabulary length: 10749


In [144]:
# Report how many distinct words are in the model's vocabulary.
print("Word2Vec vocabulary length:", len(maha2vec.wv.vocab))

Word2Vec vocabulary length: 10749


In [81]:
# Train the model on the tokenized sentences, taking the example count and
# the number of epochs from the model's own attributes (corpus_count, iter).
# NOTE(review): `iter` is an attribute name from older gensim releases —
# confirm against the installed version.
maha2vec.train(sentences,total_examples=maha2vec.corpus_count, epochs=maha2vec.iter)

1455651

In [82]:
# Ensure the output directory for the saved model exists (relative to the
# working directory).
if not os.path.exists("trained"):
    os.makedirs("trained")

In [83]:
# Persist the trained model to trained/maha2vec.w2v so it can be reloaded
# without retraining.
maha2vec.save(os.path.join("trained", "maha2vec.w2v"))

## Explore the trained model.

## Loading the trained model

In [6]:
# Reload the model from the path written by the save cell; this rebinds
# `maha2vec`, so the exploration cells below can run without retraining.
maha2vec = w2v.Word2Vec.load(os.path.join("trained", "maha2vec.w2v"))

### Explore semantic similarities between book characters

**Words closest to the given word**

In [19]:
# Words whose vectors are closest to "Karna". Queried through the model's
# `wv` word vectors (the same attribute the vocabulary-length cell uses)
# rather than the deprecated model-level wrapper.
maha2vec.wv.most_similar("Karna")

[(u'Suyodhana', 0.7754979133605957),
 (u'Guru', 0.7687414884567261),
 (u'Drona', 0.7591928243637085),
 (u'Aswathama', 0.7510437965393066),
 (u'Nobody', 0.7374307513237),
 (u'gurudakshina', 0.7326776385307312),
 (u'calmly', 0.7314265966415405),
 (u'smarted', 0.7226190567016602),
 (u'Suta', 0.7209179401397705),
 (u'frowned', 0.7204959988594055)]

In [20]:
# Words whose vectors are closest to "Arjuna". Queried through the model's
# `wv` word vectors (the same attribute the vocabulary-length cell uses)
# rather than the deprecated model-level wrapper.
maha2vec.wv.most_similar("Arjuna")

[(u'Jayadhrata', 0.7607086896896362),
 (u'Gandiva', 0.7360511422157288),
 (u'Ashwatthama', 0.7333036661148071),
 (u'Bhima', 0.7231071591377258),
 (u'behead', 0.717864453792572),
 (u'favourite', 0.7155941724777222),
 (u'Gaya', 0.7155582904815674),
 (u'Realizing', 0.7148381471633911),
 (u'chord', 0.7107759714126587),
 (u'Krishna', 0.7098587155342102)]

In [21]:
# Words whose vectors are closest to "Suyodhana". Queried through the model's
# `wv` word vectors (the same attribute the vocabulary-length cell uses)
# rather than the deprecated model-level wrapper.
maha2vec.wv.most_similar("Suyodhana")

[(u'coldly', 0.8158780336380005),
 (u'calmly', 0.8102086782455444),
 (u'softly', 0.7996382117271423),
 (u'Aswathama', 0.7930370569229126),
 (u'tone', 0.7758366465568542),
 (u'Karna', 0.7754979133605957),
 (u'Gandharan', 0.7745668292045593),
 (u'chuckled', 0.7702316045761108),
 (u'glee', 0.7696663737297058),
 (u'perplexed', 0.7642831802368164)]

**Linear relationships between word pairs**

In [10]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2".

    Queries the global `maha2vec` model's word vectors with
    `most_similar_cosmul`, using `end2` and `start1` as positive terms and
    `end1` as the negative term, and takes the top-ranked word as the answer.
    The completed analogy is printed as a sentence.

    Args:
        start1: First word of the known pair (e.g. a parent).
        end1: Second word of the known pair (e.g. that parent's child).
        end2: Word whose counterpart is being looked up.

    Returns:
        The word ranked first by the similarity query.

    Raises:
        IndexError: If the similarity query returns no results.
    """
    # Go through `.wv` rather than the deprecated model-level wrapper.
    similarities = maha2vec.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    # Each result is a (word, score) pair; keep the word of the first entry.
    start2 = similarities[0][0]
    # Name the fields explicitly instead of unpacking locals(), so renaming or
    # adding a local variable cannot silently change the message.
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2))
    return start2

### Father Relation

In [22]:
# Father analogy: Dhritarashtra is to Suyodhana as ? is to Arjuna.
nearest_similarity_cosmul("Dhritarashtra", "Suyodhana", "Arjuna")

Dhritarashtra is related to Suyodhana, as Pandu is related to Arjuna


u'Pandu'

### Mother Relation

In [23]:
# Mother analogy: Kunti is to Arjuna as ? is to Suyodhana.
nearest_similarity_cosmul("Kunti", "Arjuna", "Suyodhana")

Kunti is related to Arjuna, as Gandhari is related to Suyodhana


u'Gandhari'

### Wife Relation

In [24]:
# Wife analogy: Subhadra is to Arjuna as ? is to Jayadratha.
nearest_similarity_cosmul("Subhadra", "Arjuna", "Jayadratha")

Subhadra is related to Arjuna, as Sushala is related to Jayadratha


u'Sushala'

### Sibling Relation

In [14]:
# Sibling analogy: Balarama is to Krishna as ? is to Nakula.
nearest_similarity_cosmul("Balarama", "Krishna", "Nakula")

Balarama is related to Krishna, as Sahadeva is related to Nakula


u'Sahadeva'

In [15]:
# Sibling analogy: Sahadeva is to Nakula as ? is to Bhima.
nearest_similarity_cosmul("Sahadeva", "Nakula", "Bhima")

Sahadeva is related to Nakula, as Arjuna is related to Bhima


u'Arjuna'

### Some of the results are very wrong...

In [16]:
# Listed by the notebook among the wrong results:
# Subhadra is to Krishna as ? is to Suyodhana.
nearest_similarity_cosmul("Subhadra", "Krishna", "Suyodhana")

Subhadra is related to Krishna, as Bhanumati is related to Suyodhana


u'Bhanumati'

In [17]:
# Listed by the notebook among the wrong results:
# Draupadi is to Arjuna as ? is to Suyodhana.
nearest_similarity_cosmul("Draupadi", "Arjuna", "Suyodhana")

Draupadi is related to Arjuna, as Sabha is related to Suyodhana


u'Sabha'

In [18]:
# Listed by the notebook among the wrong results:
# Karna is to Suyodhana as ? is to Vidhura.
nearest_similarity_cosmul("Karna", "Suyodhana", "Vidhura")

Karna is related to Suyodhana, as unmoved is related to Vidhura


u'unmoved'