In [1]:
import codecs  # for unicode encoding
import glob  # for reading files
import multiprocessing   # to run your model faster
import os  # access the filesystem
import re  # to run regular expression

In [2]:
import nltk
import gensim.models.word2vec as w2v
import numpy as np

In [3]:
# Collect the path of every .txt file under data/ and order the paths
# so the books are always processed in the same sequence.

book_filenames = glob.glob("data/*.txt")
book_filenames.sort()

In [4]:
# Show the sorted list of book paths matched by the glob pattern.
book_filenames

['data/got1.txt',
 'data/got2.txt',
 'data/got3.txt',
 'data/got4.txt',
 'data/got5.txt']

In [5]:
# STEP 1

# Merge the text of every book into a single corpus string.

book_texts = []
running_length = 0

for book_path in book_filenames:
    # Decode each file as UTF-8 while reading.
    with codecs.open(book_path, "r", "utf-8") as handle:
        text = handle.read()
    book_texts.append(text)
    running_length += len(text)
    # Report the cumulative corpus size after each book is added.
    print("Corpus length {0}".format(running_length))

corpus_raw = u"".join(book_texts)

Corpus length 1770659
Corpus length 4071041
Corpus length 6391405
Corpus length 8107945
Corpus length 9719485


In [6]:
# Fetch the NLTK data packages used by this notebook.
# "punkt" backs the tokenizers/punkt/english.pickle tokenizer loaded later.
# NOTE(review): "stopwords" is not referenced in the visible cells —
# confirm whether it is still needed.

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/abhishek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Load the English Punkt tokenizer pickle through nltk.data.load;
# it is used below to split the raw corpus into sentences.

tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

In [8]:
# STEP 2

# Split the combined corpus string into a list of raw sentence strings
# using the Punkt tokenizer loaded above.

raw_sentences = tokenizer.tokenize(corpus_raw)

In [9]:
# Number of sentences the tokenizer produced from the corpus.
len(raw_sentences)

128868

In [10]:
# Spot-check one tokenized sentence (still contains punctuation).
raw_sentences[10]

'It was here the ravens came, after long flight.'

In [11]:
# STEP 3 - cleaning your data

def sentence_to_wordlist(raw):
    """Return the words of ``raw`` as a list of strings.

    Every character that is not an ASCII letter (a-z, A-Z) is replaced by a
    space, then the result is split on whitespace. Digits, punctuation and
    non-ASCII letters therefore act as word separators, and case is kept.
    """
    return re.sub("[^a-zA-Z]"," ", raw).split()

In [12]:
# Convert every non-empty raw sentence into its list of word tokens.

sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]

In [13]:
# Raw sentence, shown for comparison with its cleaned word list.
raw_sentences[10]

'It was here the ravens came, after long flight.'

In [14]:
# Word list produced by sentence_to_wordlist for one sentence.
sentences[10]

['It', 'was', 'here', 'the', 'ravens', 'came', 'after', 'long', 'flight']

In [15]:
# Total number of word tokens across all cleaned sentences.
token_count = sum(map(len, sentences))
token_count

1818103

In [16]:
# Configure the Word2Vec model (training happens in a later cell).
# Parameter meanings below follow the gensim Word2Vec documentation.

word2vec = w2v.Word2Vec(
    # Model shape
    sg=1,          # 1 selects the skip-gram architecture
    size=300,      # dimensionality of the word vectors
    window=7,      # context window length
    # Vocabulary filtering
    min_count=3,   # minimum word count threshold
    sample=1e-3,   # downsampling setting for frequent words
    # Execution
    seed=1,        # seed for the random number generator
    workers=multiprocessing.cpu_count(),  # one worker thread per CPU core
)

In [17]:
# Build the model's vocabulary from the tokenized sentences before training.
word2vec.build_vocab(sentences)

In [18]:
# Number of entries in the model's vocabulary (word2vec.wv.vocab).
len(word2vec.wv.vocab)

17277

In [19]:
# Train the model on the tokenized sentences.
# NOTE(review): later gensim releases appear to require explicit
# total_examples and epochs arguments to train(); this bare call matches
# the older API the notebook was run with — confirm the installed version.
word2vec.train(sentences)

7021780

In [2]:
# Words whose vectors are closest to "Stark".
# NOTE(review): the saved output for this cell is a NameError and its
# execution count is out of sequence, which suggests it was last run on a
# fresh kernel before the model cells. Re-run the notebook top to bottom
# to refresh the output.
word2vec.wv.most_similar("Stark")

NameError: name 'word2vec' is not defined

In [27]:
# Analogy query: <first positive term> + "woman" - "man".
# NOTE(review): the first positive term is an empty string. str.split() in
# sentence_to_wordlist never yields empty tokens, so "" cannot be in the
# vocabulary built from `sentences` and this call would presumably fail.
# The saved output was presumably produced with a real word in its place —
# restore the intended word before re-running.
word2vec.wv.most_similar_cosmul(positive=["","woman"], negative=["man"])

[('Lannister', 1.1300432682037354),
 ('Tyrion', 1.118436336517334),
 ('Kevan', 1.1168137788772583),
 ('Lancel', 1.0900390148162842),
 ('Tywin', 1.0680841207504272),
 ('Sansa', 1.0382449626922607),
 ('Margaery', 1.0335110425949097),
 ('Joff', 1.0157004594802856),
 ('Joffrey', 1.015446424484253),
 ('dwarf', 1.0100629329681396)]