# LP4 Assignment 5
## Sudeep Mangalvedhekar
## 43147

In [20]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.utils import pad_sequences
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
import numpy as np
import pandas as pd

In [21]:
words = """Machine learning is a field of inquiry devoted to understanding and building methods that learn, that is, methods that leverage data to improve performance on some set of tasks. It is seen as a part of artificial intelligence. Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.
A subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers, but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning. Some implementations of machine learning use data and neural networks in a way that mimics the working of a biological brain. In its application across business problems, machine learning is also referred to as predictive analytics."""


In [22]:
data = words.split()
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)
word_id = tokenizer.word_index

In [23]:
word_id['PAD'] = 0
id_word = {v:k for k, v in word_id.items()}
wids = [[word_id[w] for w in text.text_to_word_sequence(doc)] for doc in data]

vocab_size = len(word_id)
embed_size = 100
window_size = 2 

In [32]:
def context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)

In [33]:
cbow = Sequential()
cbow.add(Embedding(input_dim = vocab_size, output_dim = embed_size, input_length = window_size * 2))
cbow.add(Lambda(lambda x: K.mean(x, axis = 1), output_shape = (embed_size,)))
cbow.add(Dense(vocab_size, activation = 'softmax'))
cbow.compile(loss = 'categorical_crossentropy', optimizer = 'rmsprop')

cbow.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 4, 100)            11400     
                                                                 
 lambda_4 (Lambda)           (None, 100)               0         
                                                                 
 dense_4 (Dense)             (None, 114)               11514     
                                                                 
Total params: 22,914
Trainable params: 22,914
Non-trainable params: 0
_________________________________________________________________


In [34]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in context_word_pairs(corpus = wids, window_size = window_size, vocab_size = vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)

Epoch: 1 	Loss: 930.3021101951599
Epoch: 2 	Loss: 895.9225301742554
Epoch: 3 	Loss: 883.9017984867096
Epoch: 4 	Loss: 880.0481750965118
Epoch: 5 	Loss: 877.5646753311157


In [35]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index = list(id_word.values())[1:]).head()

(113, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,-0.048941,0.000117,-0.037539,0.048601,-0.016828,-0.026413,-0.010593,0.030688,-0.026337,-0.010564,...,-0.010782,-0.02405,-0.045778,0.029327,-0.025733,-0.016876,0.001123,-0.020288,0.014119,-0.017256
to,-0.026185,-0.041119,0.009876,-0.010092,-0.011144,-0.027401,0.017925,-0.033435,0.046328,0.009438,...,-0.01689,-0.044985,-0.04178,-0.049175,0.042389,-0.007941,0.030295,0.029686,0.000743,-0.004645
machine,0.017866,-0.046512,0.031775,0.039631,-0.019082,-0.007789,-0.005886,-0.002875,-0.01163,0.031239,...,-0.033641,0.038276,-0.047828,0.041901,0.017795,0.026543,-0.028124,0.043737,-0.039543,-0.006502
is,0.023588,-0.042624,-0.033366,-0.036219,0.021758,-0.033087,0.041991,-0.049666,0.00193,-0.009174,...,-0.048147,0.038943,0.008366,-0.035012,-0.013942,-0.036681,0.045836,0.045886,-0.035563,0.022609
a,0.010195,0.046708,0.014745,0.02207,-0.002705,-0.043659,-0.031714,-0.00577,0.009226,0.038061,...,-0.008277,-0.031105,0.044923,0.018262,0.034401,-0.018318,0.026816,0.022182,0.026933,0.019538


In [41]:
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [id_word[idx] for idx in distance_matrix[word_id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['statistics']}

similar_words

(113, 113)


{'statistics': ['use', 'across', 'unfeasible', 'delivers', 'biological']}