## adapted from: https://github.com/nzw0301/keras-examples/blob/master/CBoW.ipynb

In [1]:
import os,re,sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import pickle

np.random.seed(42)

from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords

import keras.backend as K
from keras.models import Sequential,Model
from keras.layers import Dense, Embedding, Lambda, Input, Concatenate, Average
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from tqdm import *

import gensim

Using TensorFlow backend.


In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 1

In [3]:
src_dir = os.path.join(os.getcwd(), os.pardir, '../src')
sys.path.append(src_dir)

In [4]:
%aimport data.movielens_20m_imdb
%aimport utils.dataframes

from data.movielens_20m_imdb import load_or_get_from_cache
from utils.dataframes import sample_rows

In [5]:
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/movielens-ml20m-imdb/")
ML_ROOT = "/media/felipe/SAMSUNG/movielens/ml-20m/"
IMDB_ROOT = "/media/felipe/SAMSUNG/imdb/"

PATH_TO_SAVED_MODELS = os.path.abspath('../../models/')

PATH_TO_MOVIES = ML_ROOT + "/movies.csv"
PATH_TO_TAG_ASSIGNMENTS = ML_ROOT + "/tags.csv"
PATH_TO_MOVIE_PLOTS = IMDB_ROOT+"/plot.list"

In [6]:
MAX_TEXT_LENGTH = 1000
MAX_NB_WORDS = 500
NB_DOCS = 50
DIM = 50
CONTEXT_WINDOW_SIZE = 2
# STOPWORDS = nltk_stopwords.words('english')
STOPWORDS = None
TOKENIZER_FILTERS = string.punctuation+"'"
NB_EPOCHS = 10

In [7]:
docs_df = load_or_get_from_cache(PATH_TO_MOVIES,PATH_TO_TAG_ASSIGNMENTS,PATH_TO_MOVIE_PLOTS,INTERIM_DATA_ROOT)

In [8]:
docs_df = sample_rows(docs_df,NB_DOCS)

In [9]:
corpus = docs_df['plot'].values

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

In [11]:
sequences = tokenizer.texts_to_sequences(corpus)

In [12]:
nb_samples = sum(len(seq) for seq in sequences)

In [13]:
# total size of the corpus, in words
nb_samples

11519

In [14]:
def generate_data(document_sequences, window_size, vocabulary_size):
    
    maxlen = window_size*2
    
    for word_sequence in document_sequences:
               
        text_length = len(word_sequence)
        
        for index_in_document,word in enumerate(word_sequence):
            
            # these are the words in the context
            contexts = []
            # these are the target words (which we'll predict using the context)
            labels = []
                        
            context_start = index_in_document - window_size           
            context_end   = index_in_document + window_size
            
            
            for index_in_context in range(context_start, context_end+1):
                if index_in_document != index_in_context: # index_in_context is the target word
                    if index_in_context >= 0 and index_in_context < text_length: # inner_index must be a valid index
                        context_word = word_sequence[index_in_context]
                        contexts.append([context_word])
                       
            labels.append(word)
            
            x = sequence.pad_sequences(contexts, maxlen=maxlen)
            y = np_utils.to_categorical(labels,vocabulary_size)
            
            
            yield (x,y) 

In [None]:
def generate_data2(document_sequences, window_size, vocabulary_size):
    maxlen = window_size*2
    for words in document_sequences:
        L = len(words)
        for index, word in enumerate(words):
            contexts = []
            labels   = []            
            s = index - window_size
            e = index + window_size + 1
            
            contexts.append([words[i] for i in range(s, e) if 0 <= i < L and i != index])
            labels.append(word)

            x = sequence.pad_sequences(contexts, maxlen=maxlen)
            y = np_utils.to_categorical(labels, vocabulary_size)
            yield (x, y)

In [15]:
VOCAB_SIZE = len(tokenizer.word_index)+1

input1 = Input(shape=(CONTEXT_WINDOW_SIZE*2,))
x1 = Embedding(VOCAB_SIZE,output_dim=DIM,name="word_embeddings")(input1)
x1 = Lambda(lambda x: K.mean(x, axis=1), output_shape=(DIM,))(x1)

outputs = Dense(VOCAB_SIZE,activation='softmax')(x1)

cbow = Model(inputs=input1,outputs=outputs)


In [16]:
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [None]:
SVG(model_to_dot(cbow,show_shapes=True).create(prog='dot', format='svg'))

In [17]:
# VOCAB_SIZE = len(tokenizer.word_index)+1
# cbow2 = Sequential()
# cbow2.add(Embedding(input_dim=VOCAB_SIZE, output_dim=DIM, input_length=CONTEXT_WINDOW_SIZE*2))
# cbow2.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(DIM,)))
# cbow2.add(Dense(VOCAB_SIZE, activation='softmax'))
# cbow2.compile(loss='categorical_crossentropy', optimizer='adadelta')
# SVG(model_to_dot(cbow2,show_shapes=True).create(prog='dot', format='svg'))

In [19]:
for epoch in range(NB_EPOCHS):
    loss = 0.
    for x, y in tqdm(generate_data(document_sequences=sequences, window_size=CONTEXT_WINDOW_SIZE, vocabulary_size=VOCAB_SIZE)):
        loss += cbow.train_on_batch(x, y)
    print(epoch, loss)

0it [00:00, ?it/s]


ValueError: Input arrays should have the same number of samples as target arrays. Found 2 input samples and 1 target samples.

In [None]:
f = open('vectors.txt' ,'w')
f.write(' '.join([str(MAX_NB_WORDS-1), str(DIM)]))
f.write('\n')

In [None]:
pairs = [ (word,i) for word,i in tokenizer.word_index.items() if i < MAX_NB_WORDS]

pairs = sorted(pairs,key=lambda tpl: tpl[1])

In [None]:
actually_used_word_index = dict(pairs)

In [None]:
vectors = cbow.get_weights()[0]
vectors.shape

In [None]:
for word, i in actually_used_word_index.items():
    f.write(word)
    f.write(' ')
    f.write(' '.join(map(str, list(vectors[i, :]))))
    f.write('\n')
f.close()

In [None]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [None]:
w2v.most_similar(positive=['i'])

In [None]:
w2v.most_similar(positive=['he'])

In [None]:
w2v.most_similar(positive=['man'])

In [None]:
w2v.most_similar(positive=['can'])

In [None]:
path = PATH_TO_SAVED_MODELS+"/word2vec-cbow.p"

In [None]:
cbow.save(path)