In [14]:
from google.colab import drive
from google.colab import files
# Mount Google Drive at /content/drive so files stored there can be accessed by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Shell out to list the contents of the NLP_vec folder on the mounted Drive.
!ls /content/drive/'My Drive'/NLP_vec/

got1.txt  got2.txt  got3.txt  got4.txt	got5.txt


In [0]:
import codecs                        ## word encoding
import glob                          ## shell-style filename pattern matching
import multiprocessing               ## concurrency
import pprint                        ## pretty printing
import nltk
import gensim.models.word2vec as w2v ## word2vec model
import sklearn.manifold              ## dimensionality reduction
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import re

In [23]:
## Load books from files
# The working directory is switched to the corpus folder; later cells open the
# books (and save/load the model) using paths relative to it.
directory = "/content/drive/My Drive/NLP_vec/"
os.chdir(directory)

# Select only the text files. Listing the whole directory would also return
# any non-book entry stored here -- the trained model is saved into this same
# working directory -- and the reading loop would then try to decode it as a
# UTF-8 book on the next run.
book_list = sorted(glob.glob("*.txt"))
print(book_list)

['got1.txt', 'got2.txt', 'got3.txt', 'got4.txt', 'got5.txt']


In [24]:
## Concatenate the text of every book into one unicode string,
## reporting the running corpus size after each file.
corpus_raw = u""
for book in book_list:
    print("Reading '{0}'...".format(book))
    with codecs.open(book, mode="r", encoding="utf-8") as handle:
        book_text = handle.read()
    corpus_raw = corpus_raw + book_text
    print("Corpus is now {0} characters long".format(len(corpus_raw)))


Reading 'got1.txt'...
Corpus is now 1768233 characters long
Reading 'got2.txt'...
Corpus is now 3526269 characters long
Reading 'got3.txt'...
Corpus is now 5808346 characters long
Reading 'got4.txt'...
Corpus is now 7469273 characters long
Reading 'got5.txt'...
Corpus is now 9774033 characters long


In [0]:
## NLTK : Tokenizer
# Load the pre-trained English Punkt sentence tokenizer. The Punkt data may
# not be installed on a fresh runtime, in which case nltk.data.load raises
# LookupError; download it once and retry so the cell runs from a clean kernel.
try:
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
except LookupError:
    nltk.download('punkt')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Split the whole corpus into raw sentence strings.
raw_sentences = tokenizer.tokenize(corpus_raw)

In [0]:
# Turn one raw sentence into a plain list of words.
def sentence_to_wordlist(raw):
    """Split ``raw`` into words made only of ASCII letters.

    Every character outside a-z / A-Z acts as a separator, so digits,
    punctuation, hyphens and apostrophes never appear in the result
    (e.g. "don't" yields "don" and "t").

    Returns a list of strings; empty when ``raw`` contains no letters.
    """
    pieces = re.split("[^a-zA-Z]", raw)
    # Consecutive separators produce empty strings between them; discard those.
    return [piece for piece in pieces if piece]

In [31]:
# Word-tokenize every non-empty raw sentence; each entry is a list of words.
sentences = [
    sentence_to_wordlist(raw_sentence)
    for raw_sentence in raw_sentences
    if len(raw_sentence) > 0
]

# Show one sentence before and after word tokenization.
print(raw_sentences[5])
print(sentence_to_wordlist(raw_sentences[5]))

# Total number of word tokens across all sentences.
token_count = sum(map(len, sentences))

print("The corpus has {0:,} tokens".format(token_count))

Now he thought of them as old friends.
['Now', 'he', 'thought', 'of', 'them', 'as', 'old', 'friends']
The corpus has 1,836,474 tokens


In [32]:
# ============================== Train Word2Vec ==============================
#
# Once the corpus is tokenized, step 3 is to build the model.
# Word vectors help with three main tasks: DISTANCE, SIMILARITY, RANKING.

# Dimension of the resulting word vectors (passed as `size`).
# Author's note: more dimensions are more computationally expensive to train
# but also more accurate / more generalized.
num_features = 300

# Minimum word count threshold (passed as `min_count`).
min_word_count = 3

# Number of threads to run in parallel (passed as `workers`): one per CPU
# reported by multiprocessing. More workers, faster training.
num_workers = multiprocessing.cpu_count()

# Context window length (passed as `window`).
context_size = 7

# Downsample setting for frequent words (passed as `sample`).
# Author's note: 0 - 1e-5 is good for this.
downsampling = 1e-3

# Seed for the random number generator, intended to make results reproducible.
# NOTE(review): gensim's documentation says a fully deterministic run also
# needs workers=1 (and a fixed PYTHONHASHSEED) -- confirm if exact
# reproducibility matters here.
seed = 1

# sg=1 selects the skip-gram training algorithm.
got2vec = w2v.Word2Vec(
    sg=1,
    size=num_features,
    window=context_size,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
    seed=seed,
)

# Scan the tokenized sentences to build the vocabulary before training.
got2vec.build_vocab(sentences)
print("Word2Vec vocabulary length:", len(got2vec.wv.vocab))

Word2Vec vocabulary length: 17301


In [35]:
# Train on the tokenized sentences. `epochs` is the supported attribute for
# the number of training passes; the `iter` alias is deprecated and emits a
# warning.
got2vec.train(sentences, total_examples=got2vec.corpus_count, epochs=got2vec.epochs)

# Create the output directory. The message used to be printed without the
# directory ever being made; now it is only printed after a real creation.
if not os.path.isdir('trained_model'):
    os.makedirs('trained_model')
    print('new directory has been created')

# NOTE(review): the model is still written to the working directory because
# the load step reads "got2vec.w2v" from there. If it should live under
# trained_model/, change the save and load paths together.
got2vec.save("got2vec.w2v")

  """Entry point for launching an IPython kernel.


new directory has been created


In [36]:
# Reload the trained model from the working directory.
got2vec = w2v.Word2Vec.load("got2vec.w2v")

# t-SNE projection down to 2 components, with a fixed random_state.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)

# Matrix of all word vectors. `wv.vectors` replaces the deprecated `wv.syn0`
# alias, which emitted a deprecation warning.
all_word_vectors_matrix = got2vec.wv.vectors

all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)

# One row per vocabulary word: the word plus its 2-D coordinates, looked up
# through the word's row index in the vector matrix.
points = pd.DataFrame(
    [
        (
            word,
            all_word_vectors_matrix_2d[vocab_entry.index][0],
            all_word_vectors_matrix_2d[vocab_entry.index][1],
        )
        for word, vocab_entry in got2vec.wv.vocab.items()
    ],
    columns=["word", "x", "y"],
)


  """


In [37]:
# Words most similar to "blood". The query goes through the model's
# KeyedVectors (`wv`); calling most_similar_cosmul on the model object itself
# is deprecated and emits a warning.
got2vec.wv.most_similar_cosmul("blood")

  """Entry point for launching an IPython kernel.


[('veins', 0.7431859970092773),
 ('brains', 0.7431262135505676),
 ('smeared', 0.7329888343811035),
 ('Blood', 0.7278515100479126),
 ('sap', 0.7251682281494141),
 ('trickling', 0.7224265933036804),
 ('valonqar', 0.7211002111434937),
 ('venom', 0.7201265692710876),
 ('salty', 0.7200968861579895),
 ('excrement', 0.7197314500808716)]