In [91]:
# Import libraries
import numpy as np
from keras.preprocessing.text import Tokenizer

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence

import gensim
import re

In [92]:
# File name and extracting data from it
file_name = r"movie_lines.txt"
lines = []
with open(file_name) as f:
    doc = f.readlines()

for line in doc:
    lines.append(line.split(" +++$+++ ")[-1].replace("\n",""))

# lines = " ".join(lines)
    
del doc

In [93]:
# Parameters
min_words = 5
window_size = 2

In [94]:
# Remove special characters
lines = [re.sub('[^a-zA-Z0-9 ]', '', line) for line in lines]

# Consider only those sentences with more than 2 words
lines = [text for text in lines if text.count(' ') >= min_words]

# Consider a subset
lines = lines[:1000]

# Tokenizing the words 
vectorize = Tokenizer()
vectorize.fit_on_texts(lines)
vectorized_lines = vectorize.texts_to_sequences(lines)

# Dataset based stats
total_vocab = sum(len(s) for s in vectorized_lines)
word_count = len(vectorize.word_index) + 1

In [95]:
def cbow_model(data, window_size, total_vocab):
    # Total length of the window considering the target word
    total_length = window_size*2
    
    # Individually considering each sentence
    for text in data:
        text_len = len(text)
        for idx, word in enumerate(text):
            
            context_word = []
            target   = []            
            begin = idx - window_size
            end = idx + window_size + 1
            
            # Consider surrounding words of the target words. Special condition to avoid negative indexing
            context_word.append([text[i] for i in range(begin, end) if 0 <= i < text_len and i != idx])
            target.append(word)
            
            # Padding sequences in case of starting words and ending words of sentences
            contextual = sequence.pad_sequences(context_word, maxlen=total_length)
            final_target = np_utils.to_categorical(target, total_vocab)
            yield(contextual, final_target)

In [97]:
# Simple Model with Embedding Layer with Categorical Crossentropy loss and Adam Optimizer
model = Sequential()
model.add(Embedding(input_dim=total_vocab, output_dim=100, input_length=window_size*2))
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)))
model.add(Dense(total_vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
for i in range(10):
    cost = 0
    for contextual, final_target in cbow_model(vectorized_lines, window_size, total_vocab):
        cost += model.train_on_batch(contextual, final_target)
    print(i, cost)

0 96778.89892441034
1 81283.45058482885
2 74931.77163121104
3 68754.67902367562
4 62681.44745000824
5 56809.88444168866
6 51211.7634006273
7 46085.82725789177
8 41575.64151547686
9 37674.032532867684


In [102]:
# Must match dimension of model output
dimensions = 100
weights = model.get_weights()[0]

# Storing Word Vectors with weights onto disk
with open('vectors.txt' ,'w') as vect_file:
    vect_file.write('{} {}\n'.format(len(vectorize.word_index.items()), dimensions))
    for text, i in vectorize.word_index.items():
        
        # Special condition for last word embeddeding to avoid adding extra empty line
        if i == len(vectorize.word_index):
            final_vec = ' '.join(map(str, list(weights[i, :])))
            vect_file.write('{} {}'.format(text, final_vec))
            continue
            
        final_vec = ' '.join(map(str, list(weights[i, :])))
        vect_file.write('{} {}\n'.format(text, final_vec))

In [103]:
# Load Word Vector into Gensim Word2Vec model
cbow_output = gensim.models.KeyedVectors.load_word2vec_format('vectors.txt', binary=False)

# Random word checked
cbow_output.most_similar(positive=['good'])

[('gay', 0.3722064197063446),
 ('gonna', 0.32852739095687866),
 ('cop', 0.31805941462516785),
 ('introduction', 0.315818190574646),
 ('plan', 0.3115581274032593),
 ('heres', 0.3040046691894531),
 ('sitting', 0.29972460865974426),
 ('ending', 0.29890140891075134),
 ('spare', 0.2936362028121948),
 ('body', 0.291808158159256)]