## adapted from: https://github.com/nzw0301/keras-examples/blob/master/CBoW.ipynb

In [1]:
import os,re,sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import pickle

np.random.seed(42)

from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords

import keras.backend as K
from keras.models import Sequential,Model
from keras.layers import Dense, Embedding, Lambda, Input, Concatenate, Average
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from tqdm import *

import gensim

%matplotlib inline
%load_ext autoreload
%autoreload 1

# Three directory levels above the current working directory (os.pardir plus
# '../../'); appended to sys.path so the `src.*` imports below can resolve.
src_dir = os.path.join(os.getcwd(), os.pardir, '../../')
sys.path.append(src_dir)

%aimport src.data.movielens_20m_imdb
%aimport src.helpers.labels,src.helpers.neighbours, src.helpers.segments
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics

from src.data.movielens_20m_imdb import load_df_or_get_from_cache
from src.helpers.labels import truncate_labels
from src.helpers.neighbours import get_predicted_labels_from_neighbours
from src.helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from src.utils.dataframes import sample_rows
from src.utils.metrics import ranking

# Filesystem locations, resolved to absolute paths relative to the current
# working directory at the time this cell runs.
MODELS_ROOT = os.path.abspath("../../../models/ranking/")
INTERIM_DATA_ROOT = os.path.abspath("../../../data/interim/movielens-ml20m-imdb/")
PATH_TO_PROCESSED_FILE = os.path.abspath('../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-13.csv')

# CONFIGS

SEED= 42

# Re-seed NumPy's global RNG with the configured seed.
np.random.seed(SEED)

# NOTE(review): presumably mirrors the Keras Tokenizer default `filters` string — confirm.
DEFAULT_KERAS_FILTER = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

MAX_TEXT_LENGTH = 1000  # not referenced by any cell visible in this notebook
NB_DOCS = 5  # passed to sample_rows() when the documents are loaded
DIM = 100  # embedding width of the functional CBoW model
CONTEXT_WINDOW_SIZE = 2  # the functional model's input length is twice this value
STOPWORDS = None  # not referenced by any cell visible in this notebook
# Default filter plus the apostrophe.
# NOTE(review): the Tokenizer in this notebook is constructed without it — confirm intent.
TOKENIZER_FILTERS = DEFAULT_KERAS_FILTER+"'"
NB_EPOCHS = 20  # number of passes made by the training loop

# Load the documents through the project helper, then reduce them with
# sample_rows (presumably to NB_DOCS rows — confirm against the helper).
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE, INTERIM_DATA_ROOT)
docs_df = sample_rows(docs_df, NB_DOCS)

# Raw synopsis texts, one entry per document.
corpus = docs_df['synopsis'].values

# NOTE(review): TOKENIZER_FILTERS is defined in the config block but is not
# passed here, so the Tokenizer runs with its default filters — confirm which
# behaviour is intended before changing it (it alters the vocabulary).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# From here on `corpus` holds one list of integer word indices per document.
corpus = tokenizer.texts_to_sequences(corpus)

# Total token count; the data generators yield one training pair per token.
nb_samples = sum(len(s) for s in corpus)

# +1 because tokenizer word indices start at 1; index 0 is what
# pad_sequences fills short contexts with.
V = len(tokenizer.word_index) + 1

# Take the hyper-parameters from the config block instead of repeating the
# literals, so the baseline model cannot silently drift from the config.
dim = DIM
window_size = CONTEXT_WINDOW_SIZE



# Baseline CBoW network (Sequential API): embed the 2*window_size context
# words, average the embeddings over the context axis, then score every
# vocabulary entry with a softmax layer.
cbow_orig = Sequential([
    Embedding(input_dim=V, output_dim=dim, input_length=window_size*2),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)),
    Dense(V, activation='softmax'),
])

cbow_orig.compile(optimizer='adagrad', loss='categorical_crossentropy')

def generate_data_orig(corpus, window_size, V):
    """Yield one CBoW training pair for every word of every document.

    Args:
        corpus: iterable of documents, each an indexable sequence of integer
            word indices.
        window_size: number of neighbouring positions considered on each side
            of the target word.
        V: number of classes used for the one-hot target.

    Yields:
        ``(x, y)`` where ``x`` is the single context row produced by
        ``pad_sequences`` with ``maxlen=2 * window_size`` and ``y`` is the
        single target row produced by ``to_categorical`` over ``V`` classes.
    """
    padded_length = window_size * 2
    for document in corpus:
        document_length = len(document)
        for target_position, target_word in enumerate(document):
            # Clamp the window to the document so only valid positions are visited.
            first = max(target_position - window_size, 0)
            stop = min(target_position + window_size + 1, document_length)

            neighbours = []
            for position in range(first, stop):
                if position == target_position:
                    # The target itself is never part of its own context.
                    continue
                neighbours.append(document[position])

            x = sequence.pad_sequences([neighbours], maxlen=padded_length)
            y = np_utils.to_categorical([target_word], V)
            yield (x, y)
 
  
    
# Train the baseline model one (context, target) pair at a time and print the
# summed per-batch loss after each epoch.
for epoch in range(NB_EPOCHS):
    loss = 0.
    batches = generate_data_orig(corpus, window_size, V)
    for contexts, targets in batches:
        loss += cbow_orig.train_on_batch(contexts, targets)
    print(epoch, loss)


Using TensorFlow backend.


0 49553.3056538
1 45672.2164713
2 44471.2225741
3 43741.6566956
4 43209.3532488
5 42782.9492111
6 42421.432866
7 42103.1705788
8 41815.5590726
9 41550.73807
10 41303.5579368
11 41070.5108548
12 40849.1290367
13 40637.6234328
14 40434.6575589
15 40239.2054163
16 40050.4582493
17 39867.7609636
18 39690.5722414
19 39518.4367354


In [None]:
# Show the integer-encoded corpus: one sequence of word indices per document.
corpus

In [None]:
# Show the word-index sequence of the first document.
corpus[0]

In [None]:
# VOCAB_SIZE is not defined anywhere else in this notebook (NameError on a
# fresh run); bind it to the vocabulary size computed from the fitted tokenizer.
VOCAB_SIZE = V

# Functional-API CBoW: the input is the 2*CONTEXT_WINDOW_SIZE context word
# indices, which are embedded, averaged over the context axis and fed to a
# softmax over the vocabulary.
input1 = Input(shape=(CONTEXT_WINDOW_SIZE*2,))
x1 = Embedding(VOCAB_SIZE,output_dim=DIM,name="word_embeddings")(input1)
x1 = Lambda(lambda x: K.mean(x, axis=1), output_shape=(DIM,))(x1)

outputs = Dense(VOCAB_SIZE,activation='softmax')(x1)

cbow = Model(inputs=input1,outputs=outputs)


In [None]:
# The functional model in this notebook is bound to `cbow`; `cbow2` is never
# defined, so compiling it raised NameError.
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [None]:
# Render the architecture of the functional model. It is bound to `cbow`;
# `cbow2` is never defined in this notebook.
SVG(model_to_dot(cbow,show_shapes=True).create(prog='dot', format='svg'))

In [None]:
def generate_data(document_sequences, window_size, vocabulary_size):
    """Yield one (context, target) training pair per word of every document.

    Args:
        document_sequences: iterable of documents, each an indexable sequence
            of integer word indices.
        window_size: number of positions looked at on each side of the target.
        vocabulary_size: number of classes used for the one-hot target.

    Yields:
        ``(x, y)``: ``x`` is the context of the current word as returned by
        ``pad_sequences`` (one row, ``maxlen=2 * window_size``); ``y`` is the
        current word as returned by ``to_categorical`` (one row over
        ``vocabulary_size`` classes).
    """
    maxlen = window_size * 2

    for word_sequence in document_sequences:
        text_length = len(word_sequence)

        for target_index, target_word in enumerate(word_sequence):
            window = range(target_index - window_size,
                           target_index + window_size + 1)

            # Keep every in-bounds neighbour, skipping the target position itself.
            context = [
                word_sequence[position]
                for position in window
                if position != target_index and 0 <= position < text_length
            ]

            x = sequence.pad_sequences([context], maxlen=maxlen)
            y = np_utils.to_categorical([target_word], vocabulary_size)

            yield (x, y)

In [None]:
# Header line of the word2vec text format: "<number of vectors> <dimension>".
# `MAX_NB_WORDS` is not defined in this notebook; the tokenizer was fitted
# without a vocabulary cap, so the bound is V, and V-1 words get a row
# (no word maps to index 0).
# utf-8 is requested explicitly so the file does not depend on the locale
# encoding when it is read back.
# The handle is left open on purpose: a later cell writes the rows and closes it.
f = open('vectors.txt', 'w', encoding='utf-8')
f.write(' '.join([str(V-1), str(DIM)]))
f.write('\n')

In [None]:
# (word, index) pairs ordered by index. `MAX_NB_WORDS` is not defined in this
# notebook; the tokenizer was fitted without a vocabulary cap, so the upper
# bound on usable indices is the vocabulary size V.
pairs = [(word, i) for word, i in tokenizer.word_index.items() if i < V]

pairs = sorted(pairs, key=lambda tpl: tpl[1])

In [None]:
# Word -> index lookup built from the sorted (word, index) pairs.
actually_used_word_index = dict(pairs)

In [None]:
# First weight array of the functional model — presumably the
# `word_embeddings` matrix, one row per word index (check the shape below).
# NOTE(review): no cell visible in this notebook trains `cbow` (only
# `cbow_orig` is trained) — confirm the weights are trained before exporting.
vectors = cbow.get_weights()[0]
vectors.shape

In [None]:
# One line per word: the token followed by the components of its vector,
# all separated by single spaces. `f` is the handle opened by the header
# cell; it is closed in `finally` so a failed write does not leak it.
try:
    for word, i in actually_used_word_index.items():
        row = ' '.join(map(str, vectors[i, :]))
        f.write(word + ' ' + row + '\n')
finally:
    f.close()

In [None]:
# Read the exported text file back as gensim KeyedVectors for similarity queries.
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [None]:
# Sanity check: words whose exported vectors are closest to 'i'.
w2v.most_similar(positive=['i'])

In [None]:
# Sanity check: words whose exported vectors are closest to 'he'.
w2v.most_similar(positive=['he'])

In [None]:
# Sanity check: words whose exported vectors are closest to 'man'.
w2v.most_similar(positive=['man'])

In [None]:
# Sanity check: words whose exported vectors are closest to 'can'.
w2v.most_similar(positive=['can'])

In [None]:
# `PATH_TO_SAVED_MODELS` is not defined in this notebook; MODELS_ROOT is the
# models directory declared in the config cell. os.path.join replaces the
# manual "/" concatenation.
path = os.path.join(MODELS_ROOT, "word2vec-cbow.p")

In [None]:
# Persist the functional model to `path`.
cbow.save(path)