# word2vec txt train and model conversion

Adapted from Thrones2Vec by Yuriy Guts, 2016

## Imports

In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
# NOTE(review): %pylab also star-imports NumPy/pylab names into the namespace,
# which can shadow builtins such as `sum` — confirm this is intended.
%pylab inline

**Set up logging**

In [None]:
# Configure root logging: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

**Download NLTK tokenizer models and stopword lists (only needed the first time)**

In [None]:
# Download the "punkt" resource (its English model is loaded from
# 'tokenizers/punkt/english.pickle' further down) and the "stopwords" resource.
# NOTE(review): "stopwords" is not referenced in the visible part of this
# notebook — confirm it is still needed.
nltk.download("punkt")
nltk.download("stopwords")

## Prepare Corpus

**Load books from files**

In [None]:
# Collect every .txt file directly under data/ and put the paths in sorted
# order so the books are always processed in the same sequence.
book_filenames = glob.glob("data/*.txt")
book_filenames.sort()

In [None]:
# Show which files were picked up. The bare expression on the last line is
# rendered by the notebook as the cell's output.
print("Found books:")
book_filenames

**Combine the books into one string**

In [None]:
# Read every book as UTF-8 and combine them into one unicode string.
# The texts are collected in a list and joined once at the end instead of
# growing the string with `+=` on every iteration; the running character
# count is tracked separately so the progress output is unchanged.
book_texts = []
corpus_length = 0
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        book_text = book_file.read()
    book_texts.append(book_text)
    corpus_length += len(book_text)
    print("Corpus is now {0} characters long".format(corpus_length))
    print()
corpus_raw = u"".join(book_texts)
# Drop the per-book copies so only the combined corpus stays in memory.
del book_texts

**Split the corpus into sentences**

In [None]:
# Load the pre-trained English Punkt model used to split text into sentences.
# NOTE(review): recent NLTK releases appear to ship this resource as
# "punkt_tab" instead of a pickle — confirm against the installed version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Split the combined corpus text into individual sentence strings.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [None]:
def sentence_to_wordlist(raw):
    """Split a raw sentence into a list of alphabetic words.

    Every character that is not an ASCII letter is treated as a separator,
    so punctuation, digits, hyphens and apostrophes never appear in the
    output. Letter case is preserved.

    Args:
        raw: The sentence text.

    Returns:
        The runs of ASCII letters found in ``raw``, in their original order.
    """
    return re.sub("[^a-zA-Z]", " ", raw).split()

In [None]:
# Turn every non-empty raw sentence into its list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if raw_sentence
]

In [None]:
# Sanity check: show one sentence before and after word tokenization.
sample_sentence = raw_sentences[5]
print(sample_sentence)
print(sentence_to_wordlist(sample_sentence))

In [None]:
# Count the words across all tokenized sentences.
# A real list (not a generator) is passed to `sum` on purpose: if
# `%pylab inline` has run, `sum` may resolve to NumPy's version, which
# expects a sequence.
sentence_lengths = [len(sentence) for sentence in sentences]
token_count = sum(sentence_lengths)
print("The book corpus contains {0:,} tokens".format(token_count))

## Train Word2Vec

In [None]:
# Hyperparameters for the Word2Vec model built in the next cell.
# The resulting vectors are typically used for distance, similarity and
# ranking queries between words.

# Dimensionality of the resulting word vectors. Larger vectors can encode
# more information but are more expensive to train and store.
num_features = 300

# Minimum word count threshold (passed to Word2Vec as `min_count`).
min_word_count = 3

# Number of worker threads: one per CPU reported by the OS.
# More workers generally means faster training.
num_workers = multiprocessing.cpu_count()

# Context window length (passed to Word2Vec as `window`).
context_size = 7

# Downsampling threshold for frequent words (passed as `sample`).
# NOTE(review): gensim's documentation gives (0, 1e-5) as the useful range,
# while 1e-3 appears to be the library default — confirm the intended value.
downsampling = 1e-3

# Seed for the random number generator.
# NOTE(review): gensim documents that a fully reproducible run also needs
# workers=1 (and a fixed PYTHONHASHSEED on Python 3); with one worker per
# CPU the results may differ between runs — confirm if that matters.
seed = 1

In [None]:
# Create the Word2Vec model (sg=1 selects skip-gram) from the hyperparameters
# defined above. No corpus is passed here; the vocabulary is built and the
# model trained in the following cells.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0
# (later renamed `vector_size`) — confirm against the installed version.
thrones2vec = w2v.Word2Vec(
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    sg=1,
    workers=num_workers,
    seed=seed,
)

In [None]:
# Build the model's vocabulary from the tokenized sentences.
thrones2vec.build_vocab(sentences)

In [None]:
# Report how many entries the vocabulary holds.
# NOTE(review): `wv.vocab` is the pre-4.0 gensim attribute — confirm against
# the installed version.
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))

**Start training, this might take a minute or two...**

In [None]:
# Train on the same sentences for 100 epochs.
# `corpus_count` is read from the model itself (presumably recorded by
# build_vocab — verify) and tells gensim how many examples one epoch has.
thrones2vec.train(sentences, total_examples=thrones2vec.corpus_count, epochs=100)

**Save to file, can be useful later**

In [None]:
# Make sure the output directory exists. Creating it first and tolerating an
# "already exists" failure avoids the check-then-create race of testing the
# path beforehand, and stays compatible with Python 2 (no `exist_ok`).
try:
    os.makedirs("trained")
except OSError:
    # Only ignore the failure when the directory is really there; anything
    # else (permissions, a plain file named "trained", ...) is re-raised.
    if not os.path.isdir("trained"):
        raise

In [None]:
# Save the trained model to trained/thrones2vec.w2v; it is loaded back from
# this path in the conversion section below.
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))

## Convert the trained model to a txt file for use with GloVe-based models

In [4]:
# Load the model previously saved to trained/thrones2vec.w2v.
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))

In [5]:
from gensim.models.keyedvectors import KeyedVectors

# Export the model's word vectors (`wv`) to trained/thrones2vec.bin in the
# binary word2vec format.
thrones2vec.wv.save_word2vec_format(os.path.join("trained", "thrones2vec.bin"), binary=True)

In [6]:
# Read the binary vectors back as KeyedVectors and write them out again to
# trained/thrones2vec.txt in the text (binary=False) word2vec format.
model = KeyedVectors.load_word2vec_format(os.path.join("trained", "thrones2vec.bin"), binary=True)
model.save_word2vec_format(os.path.join("trained", "thrones2vec.txt"), binary=False)