# Assignment 1


In [32]:
import numpy as np
np.random.seed(13) #TODO Check if this is used for sgd
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Lambda
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot
from keras.preprocessing import sequence
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
from __future__ import division

In [33]:
# DO NOT Modify the lines in this cell
path = 'alice.txt'
corpus = open(path).readlines()[0:700]

corpus = [sentence for sentence in corpus if sentence.count(" ") >= 2]

tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
nb_samples = sum(len(s) for s in corpus)
V = len(tokenizer.word_index) + 1

# Is this something they need to change?
dim = 100
window_size = 2 #use this window size for Skipgram, CBOW, and the model with the additional hidden layer
window_size_corpus = 4 #use this window size for the co-occurrence matrix

## Question 1

### Co-occurrence Matrix
Use the provided code to load the "Alice in Wonderland" text document. 
1. Implement the word-word co-occurrence matrix for “Alice in Wonderland”
2. Normalize the words such that every value lies within a range of 0 and 1
3. Compute the cosine distance between the given words:
    - Alice 
    - Dinah
    - Rabbit
4. List the 5 closest words to 'Alice'. Discuss the results.
5. Discuss what the main drawbacks are of a term-term co-occurence matrix solutions?


In [34]:
#create co-occurrence matrix
import pandas as pd
from sklearn import preprocessing

#dataset with unique words as index
words = list(tokenizer.word_index.keys())
matrix = pd.DataFrame(words, columns=['words'])
matrix.set_index('words', inplace=True)
columns = pd.DataFrame(columns=words)
matrix = pd.concat([matrix,columns])
matrix = matrix.reindex(columns.columns, axis=1)
matrix.fillna(0, inplace=True)

#inverse index to get word by code
inverse_index = dict((v,k) for k, v in tokenizer.word_index.items())

#compute score for every word-word couple
for line in corpus:
    for i, word_code in enumerate(line):
        word = inverse_index.get(word_code)
        for j in range(max(0, i-window_size_corpus), min(len(line), i+window_size_corpus+1)):
            if word_code != line[j]:
                matrix[word][inverse_index.get(line[j])] += 1

#normalize each sample
normalized_values = preprocessing.normalize(matrix.values)
matrix = pd.DataFrame(normalized_values, index=matrix.index, columns=matrix.columns)

matrix.head()

Unnamed: 0,grin,says,either,ma,he,mixed,flowers,well,conclusion,thinking,...,wasting,throat,minute,ears,mistake,gently,cupboards,edgar,nurse,sorrowful
grin,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
says,0.0,0.0,0.0,0.0,0.267261,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
either,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119523,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ma,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
he,0.108465,0.108465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
#find cosine similarity to Alice, Dinah and Rabbit

alice_vector = matrix['alice'].values.reshape(1, -1)
rabbit_vector = matrix['rabbit'].values.reshape(1, -1)
dinah_vector = matrix['dinah'].values.reshape(1, -1)

alice_vs_rabbit = cosine_similarity(alice_vector, rabbit_vector)
alice_vs_dinah = cosine_similarity(alice_vector, dinah_vector)
dinah_vs_rabbit = cosine_similarity(dinah_vector, rabbit_vector)

print("Cosine similarity between Alice and Rabbit: " + str(alice_vs_rabbit[0][0]))
print("Cosine similarity between Alice and Dinah: " + str(alice_vs_dinah[0][0]))
print("Cosine similarity between Dinah and Rabbit: " + str(dinah_vs_rabbit[0][0]))

Cosine similarity between Alice and Rabbit: 0.054617805224162
Cosine similarity between Alice and Dinah: 0.05073318700003422
Cosine similarity between Dinah and Rabbit: 0.03424344802534649


In [36]:
#find the closest words to Alice (nearest neighbors)

neigh = nn()
neigh.fit(matrix.values)

neighbors = neigh.kneighbors(alice_vector, 6)

for i, n in enumerate(neighbors[1][0]):
    if matrix.index[n] != 'alice':
        print("Word: {}\nDistance: {}\n".format(matrix.index[n], neighbors[0][0][i]))

Word: i
Distance: 3.6581715798218215

Word: she
Distance: 3.6627351402754194

Word: a
Distance: 3.664105104961176

Word: very
Distance: 3.66585512451628

Word: it
Distance: 3.6687653879055233



The five closest words to "Alice" appear to be: "I", "she", "a", "very" and "it". It makes sense that the personal pronouns "she" and "i" are the closest ones to "Alice", since they are used in very similiar contexts and they are probably often followed by the same verbs. E.g. "Alice sees the rabbit", "she sees the rabbit" or "I see the rabbit", in direct dialogs. As for the other three words, they do not bring valuable information, as they are all quite common in the english language. A possible way to avoid this would be to remove stopwords from the corpus, in order to focus only on more meaningful terms.

Discussion of the drawbacks:
- **Sparse matrix:** As can be seen in the output of the matrix, the word pair 'particular' and 'suddenly' never appear close to each other in the corpus, but there is still a cell reserved in the matrix for that pair. The same goes for many other pairs in the matrix, resulting in many cells with value 0. Obviously, this is not very efficient for storage.
- **Large matrix:** As can be seen in the word co-occurrence matrix for Alice In Wonderland that was created above, the matrix size is 1182 rows x 1182 columns, where 1182 is the number of tokens in the corpus. It is easy to imagine that this will become a problem for a bigger corpus, or for a corpus which has dynamic content, e.g. a web search engine.
- **Non-discriminative results:** As shown in the nearest neighbors calculation above, the words 'a', 'it', and 'very' do not give very meaningful correlation. However, they are among the most common words used in English language, so it makes sense that they appear often near the term 'Alice' (or any other terms in the corpus). Therefore, stopwords filter or even a different weighting score other than word occurence count should be used in order to find words correlation that is more meaningful.

In [37]:
#Save your all the vector representations of your word embeddings in this way
#Change when necessary the sizes of the vocabulary/embedding dimension

f = open('vectors_co_occurrence.txt',"w")
f.write(" ".join([str(V-1),str(V-1)]))
f.write("\n")

#vectors = your word co-occurrence matrix
vectors = matrix.values
for i, word in enumerate(tokenizer.word_index.keys()): 
    f.write(word)
    f.write(" ")
    f.write(" ".join(map(str, list(vectors[i,:]))))
    f.write("\n")
f.close()

In [38]:
#reopen your file as follows

co_occurrence = KeyedVectors.load_word2vec_format('./vectors_co_occurrence.txt', binary=False)

## Question 2

### Word embeddings
Build embeddings with a keras implementation where the embedding vector is of length 50, 150 and 300. Use the Alice in Wonderland text book for training.
1. Using the CBOW model
2. Using Skipgram model
3. Add extra hidden dense layer to CBow and Skipgram implementations. Choose an activation function for that layer and justify your answer.
4. Analyze the four different word embeddings
    - Implement your own function to perform the analogy task with. Do not use existing libraries for this task such as Gensim. Your function should be able to answer whether an anaology as in the example given in the pdf-file is true.
    - Compare the performance on the analogy task between the word embeddings that you have trained in 2.1, 2.2 and 2.3.  
    - Visualize your results and interpret your results
5. Use the word co-occurence matrix from Question 1. Compare the performance on the analogy task with the performance of your trained word embeddings.  
6. Discuss:
    - What are the main advantages of CBOW and Skipgram?
    - What is the advantage of negative sampling?
    - What are the main drawbacks of CBOW and Skipgram?
7. Load pre-trained embeddings on large corpuses (see the pdf file). You only have to consider the word embeddings with an embedding size of 300
    - Compare performance on the analogy task with your own trained embeddings from "Alice in Wonderland". You can limit yourself to the vocabulary of Alice in Wonderland. Visualize the pre-trained word embeddings and compare these with the results of your own trained word embeddings. 


In [77]:
#function definitions for CBOW

#generate data for CBOW
def generate_data_cbow(corpus, window_size, V):
    maxlen = window_size*2
    all_in = []
    all_out = []
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            p = index - window_size
            n = index + window_size + 1
                    
            out_words = []
            labels = []
            for i in range(p, n):
                if i != index and 0 <= i < L:
                    out_words.append([word])
                    labels.append(words[i])
            if out_words != []:
                print(out_words)
                all_in.append(np.array(labels,dtype=np.int32))
                all_out.append(np_utils.to_categorical(out_words, V))
    return (all_in,all_out)

#load the preprocessed CBOW data
def generate_data_cbow_from_file():
    f = open('data_cbow.txt' ,'r')
    for row in f:
        inputs,outputs = row.split(",")
        inputs = np.fromstring(inputs, dtype=int, sep=' ')
        inputs = np.asarray(np.split(inputs, len(inputs)))
        outputs = np.fromstring(outputs, dtype=float, sep=' ')
        outputs = np.asarray(np.split(outputs, len(inputs)))
        yield (inputs,outputs)

In [78]:
#prepare data for cbow
   
#get x and y's for data
x,y = generate_data_cbow(corpus,window_size,V)

#save the preprocessed data of Skipgram
f = open('data_cbow.txt' ,'w')

for input,outcome  in zip(x,y):
    print(input, outcome)
    input = np.concatenate(input)
    f.write(" ".join(map(str, list(input))))
    f.write(",")
    outcome = np.concatenate(outcome)
    f.write(" ".join(map(str,list(outcome))))
    f.write("\n")
f.close()

[[242], [242]]
[[6], [6], [6]]
[[26], [26], [26], [26]]
[[1], [1], [1], [1]]
[[63], [63], [63]]
[[243], [243]]
[[11], [11]]
[[9], [9], [9]]
[[584], [584], [584], [584]]
[[3], [3], [3], [3]]
[[67], [67], [67], [67]]
[[27], [27], [27], [27]]
[[244], [244], [244], [244]]
[[8], [8], [8], [8]]
[[585], [585], [585], [585]]
[[71], [71], [71], [71]]
[[14], [14], [14], [14]]
[[380], [380], [380], [380]]
[[21], [21], [21]]
[[1], [1]]
[[381], [381]]
[[2], [2], [2]]
[[8], [8], [8], [8]]
[[245], [245], [245], [245]]
[[112], [112], [112], [112]]
[[3], [3], [3], [3]]
[[49], [49], [49], [49]]
[[98], [98], [98], [98]]
[[57], [57], [57], [57]]
[[586], [586], [586], [586]]
[[4], [4], [4], [4]]
[[17], [17], [17], [17]]
[[587], [587], [587], [587]]
[[72], [72], [72]]
[[1], [1]]
[[205], [205]]
[[14], [14], [14]]
[[380], [380], [380], [380]]
[[9], [9], [9], [9]]
[[588], [588], [588], [588]]
[[19], [19], [19], [19]]
[[7], [7], [7], [7]]
[[17], [17], [17], [17]]
[[50], [50], [50], [50]]
[[246], [246], [246], [

[[26], [26], [26], [26]]
[[48], [48], [48], [48]]
[[441], [441], [441], [441]]
[[2], [2], [2], [2]]
[[43], [43], [43], [43]]
[[1], [1], [1], [1]]
[[221], [221], [221], [221]]
[[187], [187], [187]]
[[330], [330]]
[[95], [95]]
[[4], [4], [4]]
[[442], [442], [442], [442]]
[[443], [443], [443], [443]]
[[26], [26], [26], [26]]
[[1], [1], [1], [1]]
[[444], [444], [444], [444]]
[[672], [672], [672], [672]]
[[38], [38], [38], [38]]
[[4], [4], [4], [4]]
[[9], [9], [9], [9]]
[[119], [119], [119]]
[[3], [3]]
[[67], [67]]
[[31], [31]]
[[54], [54]]
[[175], [175]]
[[4], [4], [4]]
[[163], [163], [163], [163]]
[[142], [142], [142], [142]]
[[5], [5], [5], [5]]
[[24], [24], [24], [24]]
[[331], [331], [331], [331]]
[[673], [673], [673], [673]]
[[120], [120], [120], [120]]
[[22], [22], [22], [22]]
[[206], [206], [206], [206]]
[[8], [8], [8]]
[[674], [674]]
[[222], [222]]
[[55], [55], [55]]
[[9], [9], [9], [9]]
[[112], [112], [112], [112]]
[[21], [21], [21], [21]]
[[7], [7], [7], [7]]
[[675], [675], [675],

[[6], [6], [6]]
[[218], [218], [218], [218]]
[[20], [20], [20], [20]]
[[47], [47], [47], [47]]
[[79], [79], [79], [79]]
[[2], [2], [2], [2]]
[[4], [4], [4], [4]]
[[211], [211], [211], [211]]
[[3], [3], [3], [3]]
[[321], [321], [321], [321]]
[[44], [44], [44], [44]]
[[1], [1], [1], [1]]
[[744], [744], [744], [744]]
[[8], [8], [8]]
[[5], [5]]
[[347], [347]]
[[33], [33], [33]]
[[47], [47], [47], [47]]
[[102], [102], [102], [102]]
[[1], [1], [1], [1]]
[[347], [347], [347], [347]]
[[33], [33], [33], [33]]
[[745], [745], [745], [745]]
[[31], [31], [31], [31]]
[[18], [18], [18], [18]]
[[4], [4], [4], [4]]
[[58], [58], [58], [58]]
[[29], [29], [29]]
[[219], [219]]
[[119], [119]]
[[245], [245], [245]]
[[208], [208], [208], [208]]
[[74], [74], [74], [74]]
[[5], [5], [5]]
[[89], [89]]
[[102], [102]]
[[5], [5], [5]]
[[192], [192], [192], [192]]
[[462], [462], [462], [462]]
[[13], [13], [13], [13]]
[[112], [112], [112], [112]]
[[123], [123], [123], [123]]
[[336], [336], [336], [336]]
[[4], [4], [4]

[[1], [1], [1], [1]]
[[221], [221], [221], [221]]
[[111], [111], [111], [111]]
[[163], [163], [163], [163]]
[[497], [497], [497], [497]]
[[334], [334], [334], [334]]
[[10], [10], [10], [10]]
[[5], [5], [5]]
[[117], [117]]
[[339], [339]]
[[498], [498], [498]]
[[3], [3], [3], [3]]
[[811], [811], [811], [811]]
[[15], [15], [15], [15]]
[[111], [111], [111], [111]]
[[163], [163], [163], [163]]
[[51], [51], [51], [51]]
[[1], [1], [1], [1]]
[[280], [280], [280], [280]]
[[1], [1], [1]]
[[280], [280]]
[[51], [51]]
[[172], [172], [172]]
[[35], [35], [35], [35]]
[[4], [4], [4], [4]]
[[20], [20], [20], [20]]
[[812], [812], [812], [812]]
[[32], [32], [32], [32]]
[[6], [6], [6], [6]]
[[143], [143], [143], [143]]
[[281], [281], [281], [281]]
[[14], [14], [14], [14]]
[[452], [452], [452], [452]]
[[11], [11], [11], [11]]
[[184], [184], [184]]
[[30], [30]]
[[813], [813]]
[[13], [13], [13]]
[[4], [4], [4], [4]]
[[9], [9], [9], [9]]
[[232], [232], [232], [232]]
[[3], [3], [3], [3]]
[[263], [263], [263], [

[[81], [81]]
[[40], [40]]
[[227], [227], [227]]
[[101], [101], [101], [101]]
[[168], [168], [168], [168]]
[[2], [2], [2], [2]]
[[9], [9], [9], [9]]
[[116], [116], [116], [116]]
[[21], [21], [21], [21]]
[[517], [517], [517], [517]]
[[873], [873], [873], [873]]
[[4], [4], [4], [4]]
[[76], [76], [76]]
[[73], [73]]
[[31], [31]]
[[13], [13], [13]]
[[1], [1], [1], [1]]
[[364], [364], [364], [364]]
[[8], [8], [8], [8]]
[[28], [28], [28], [28]]
[[9], [9], [9], [9]]
[[1], [1], [1], [1]]
[[126], [126], [126], [126]]
[[4], [4], [4], [4]]
[[9], [9], [9], [9]]
[[482], [482], [482], [482]]
[[2], [2], [2], [2]]
[[4], [4], [4]]
[[358], [358]]
[[7], [7]]
[[279], [279], [279]]
[[141], [141], [141], [141]]
[[10], [10], [10], [10]]
[[66], [66], [66], [66]]
[[3], [3], [3], [3]]
[[874], [874], [874], [874]]
[[517], [517], [517], [517]]
[[165], [165], [165]]
[[468], [468]]
[[13], [13]]
[[9], [9], [9]]
[[5], [5], [5], [5]]
[[875], [875], [875], [875]]
[[876], [876], [876], [876]]
[[16], [16], [16], [16]]
[[11

[[11], [11], [11], [11]]
[[10], [10], [10], [10]]
[[5], [5], [5], [5]]
[[117], [117], [117], [117]]
[[339], [339], [339], [339]]
[[3], [3], [3], [3]]
[[519], [519], [519], [519]]
[[1], [1], [1], [1]]
[[539], [539], [539]]
[[8], [8]]
[[944], [944]]
[[106], [106], [106]]
[[12], [12], [12], [12]]
[[106], [106], [106], [106]]
[[12], [12], [12], [12]]
[[478], [478], [478], [478]]
[[8], [8], [8], [8]]
[[8], [8], [8], [8]]
[[370], [370], [370], [370]]
[[1], [1], [1], [1]]
[[25], [25], [25], [25]]
[[99], [99], [99]]
[[29], [29]]
[[326], [326]]
[[30], [30], [30]]
[[11], [11], [11], [11]]
[[52], [52], [52], [52]]
[[21], [21], [21], [21]]
[[371], [371], [371], [371]]
[[55], [55], [55], [55]]
[[33], [33], [33], [33]]
[[74], [74], [74], [74]]
[[5], [5], [5], [5]]
[[215], [215], [215], [215]]
[[24], [24], [24], [24]]
[[945], [945], [945]]
[[259], [259]]
[[197], [197]]
[[156], [156], [156]]
[[6], [6], [6], [6]]
[[218], [218], [218], [218]]
[[47], [47], [47], [47]]
[[3], [3], [3], [3]]
[[535], [535], 

[[12], [12], [12], [12]]
[[62], [62], [62]]
[[44], [44]]
[[6], [6]]
[[62], [62], [62]]
[[44], [44], [44], [44]]
[[7], [7], [7], [7]]
[[1007], [1007], [1007], [1007]]
[[127], [127], [127], [127]]
[[275], [275], [275], [275]]
[[46], [46], [46], [46]]
[[6], [6], [6], [6]]
[[150], [150], [150], [150]]
[[5], [5], [5], [5]]
[[89], [89], [89], [89]]
[[16], [16], [16]]
[[1], [1]]
[[374], [374]]
[[7], [7], [7]]
[[36], [36], [36], [36]]
[[348], [348], [348], [348]]
[[5], [5], [5], [5]]
[[1008], [1008], [1008], [1008]]
[[57], [57], [57], [57]]
[[5], [5], [5], [5]]
[[1009], [1009], [1009], [1009]]
[[1], [1], [1], [1]]
[[162], [162], [162], [162]]
[[33], [33], [33], [33]]
[[44], [44], [44], [44]]
[[99], [99], [99]]
[[1], [1]]
[[1], [1]]
[[25], [25], [25]]
[[99], [99], [99], [99]]
[[29], [29], [29], [29]]
[[1010], [1010], [1010], [1010]]
[[28], [28], [28], [28]]
[[162], [162], [162], [162]]
[[19], [19], [19], [19]]
[[1011], [1011], [1011], [1011]]
[[52], [52], [52], [52]]
[[21], [21], [21]]
[[73], [

[[100], [100], [100], [100]]
[[68], [68], [68], [68]]
[[16], [16], [16], [16]]
[[1], [1], [1]]
[[90], [90]]
[[79], [79]]
[[39], [39], [39]]
[[22], [22], [22], [22]]
[[289], [289], [289], [289]]
[[108], [108], [108], [108]]
[[14], [14], [14], [14]]
[[98], [98], [98], [98]]
[[123], [123], [123], [123]]
[[192], [192], [192], [192]]
[[1], [1], [1], [1]]
[[90], [90], [90]]
[[560], [560]]
[[1062], [1062]]
[[1], [1], [1]]
[[294], [294], [294], [294]]
[[183], [183], [183], [183]]
[[239], [239], [239], [239]]
[[195], [195], [195], [195]]
[[88], [88], [88], [88]]
[[1063], [1063], [1063], [1063]]
[[8], [8], [8], [8]]
[[28], [28], [28]]
[[1064], [1064]]
[[294], [294]]
[[2], [2], [2]]
[[46], [46], [46], [46]]
[[7], [7], [7], [7]]
[[17], [17], [17], [17]]
[[344], [344], [344], [344]]
[[28], [28], [28], [28]]
[[1065], [1065], [1065], [1065]]
[[567], [567], [567], [567]]
[[39], [39], [39], [39]]
[[22], [22], [22]]
[[1066], [1066]]
[[11], [11]]
[[53], [53], [53]]
[[1], [1], [1], [1]]
[[290], [290], [29

[[63], [63], [63], [63]]
[[306], [306], [306], [306]]
[[11], [11], [11], [11]]
[[15], [15], [15], [15]]
[[4], [4], [4], [4]]
[[52], [52], [52], [52]]
[[579], [579], [579], [579]]
[[40], [40], [40]]
[[2], [2]]
[[199], [199]]
[[31], [31], [31]]
[[3], [3], [3], [3]]
[[14], [14], [14], [14]]
[[10], [10], [10], [10]]
[[133], [133], [133], [133]]
[[534], [534], [534], [534]]
[[196], [196], [196], [196]]
[[132], [132], [132], [132]]
[[580], [580], [580], [580]]
[[581], [581], [581], [581]]
[[44], [44], [44], [44]]
[[106], [106], [106], [106]]
[[12], [12], [12]]
[[1159], [1159]]
[[31], [31]]
[[68], [68], [68]]
[[1160], [1160], [1160], [1160]]
[[308], [308], [308], [308]]
[[28], [28], [28], [28]]
[[131], [131], [131], [131]]
[[2], [2], [2], [2]]
[[372], [372], [372], [372]]
[[42], [42], [42], [42]]
[[5], [5], [5], [5]]
[[229], [229], [229], [229]]
[[8], [8], [8], [8]]
[[125], [125], [125], [125]]
[[2], [2], [2], [2]]
[[5], [5], [5]]
[[126], [126]]
[[1161], [1161]]
[[81], [81], [81]]
[[2], [2], 

ValueError: zero-dimensional arrays cannot be concatenated

In [None]:
for x, y in generate_data_cbow_from_file():
        print(x,y)

In [40]:
#create CBOW model


In [41]:
#define loss function


In [42]:
#train model


In [None]:
#function definitions for Skipgram

#generate data for Skipgram
def generate_data_skipgram(corpus, window_size, V):
    maxlen = window_size*2
    all_in = []
    all_out = []
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            p = index - window_size
            n = index + window_size + 1
                    
            in_words = []
            labels = []
            for i in range(p, n):
                if i != index and 0 <= i < L:
                    in_words.append([word])
                    labels.append(words[i])
            if in_words != []:
                all_in.append(np.array(in_words,dtype=np.int32))
                all_out.append(np_utils.to_categorical(labels, V))
    return (all_in,all_out)

#load the preprocessed Skipgram data
def generate_data_skipgram_from_file():
    f = open('data_skipgram.txt' ,'r')
    for row in f:
        inputs,outputs = row.split(",")
        inputs = np.fromstring(inputs, dtype=int, sep=' ')
        inputs = np.asarray(np.split(inputs, len(inputs)))
        outputs = np.fromstring(outputs, dtype=float, sep=' ')
        outputs = np.asarray(np.split(outputs, len(inputs)))
        yield (inputs,outputs)

In [43]:
#prepare data for Skipgram
   
#get x and y's for data
x,y = generate_data_skipgram(corpus,window_size,V)

#save the preprocessed data of Skipgram
f = open('data_skipgram.txt' ,'w')

for input,outcome  in zip(x,y):
    input = np.concatenate(input)
    f.write(" ".join(map(str, list(input))))
    f.write(",")
    outcome = np.concatenate(outcome)
    f.write(" ".join(map(str,list(outcome))))
    f.write("\n")
f.close()

In [44]:
for dim in [50, 150, 300]:
    #create Skipgram model
    skipgram = Sequential()
    skipgram.add(Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1))
    skipgram.add(Reshape((dim, )))
    skipgram.add(Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'))
    
    #define loss function for Skipgram
    skipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')
    
    #train skipgram model
    print("\nTraining skipgram for dim="+str(dim))
    for ite in range(5):
        loss = 0.
        for x, y in generate_data_skipgram_from_file():
            loss += skipgram.train_on_batch(x, y)
        print(ite, loss)
    
    #save vector representation to file
    f = open('vectors_skipgram_'+str(dim)+'.txt' ,'w')
    f.write(" ".join([str(V-1),str(dim)]))
    f.write("\n")

    vectors = skipgram.get_weights()[0]
    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        f.write(" ".join(map(str, list(vectors[i,:]))))
        f.write("\n")
    f.close()

Training skipgram for dim=50
0 41271.35245156288
1 39126.84041810036
2 39288.233939647675
3 39363.8857254982
4 39432.4152007103
Training skipgram for dim=150
0 41222.643466711044
1 38923.35916996002
2 38988.32216048241
3 39005.6442643404
4 39038.848814845085
Training skipgram for dim=300
0 41159.682378292084
1 38730.84592962265
2 38713.53452014923
3 38676.75279080868
4 38648.01530623436


In [45]:
#create CBOW model with additional dense layer


In [46]:
#define loss function for CBOW + dense


In [47]:
#train model for CBOW + dense


In [50]:
for dim in [50, 150, 300]:
    #create Skipgram model with additional dense layer
    skipgram = Sequential()
    skipgram.add(Embedding(input_dim=V, output_dim=dim, embeddings_initializer='glorot_uniform', input_length=1))
    skipgram.add(Reshape((dim, )))
    skipgram.add(Dense(128, activation="relu"))
    skipgram.add(Dense(input_dim=dim, units=V, kernel_initializer='uniform', activation='softmax'))
    
    #define loss function for Skipgram + dense
    skipgram.compile(loss='categorical_crossentropy', optimizer='adadelta')
    
    #train model for Skipgram + dense
    print("\nTraining skipgram with dense layer for dim="+str(dim))
    for ite in range(5):
        loss = 0.
        for x, y in generate_data_skipgram_from_file():
            loss += skipgram.train_on_batch(x, y)
        print(ite, loss)
    
    #save vector representation to file
    f = open('vectors_skipgram_dense_'+str(dim)+'.txt' ,'w')
    f.write(" ".join([str(V-1),str(dim)]))
    f.write("\n")

    vectors = skipgram.get_weights()[0]
    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        f.write(" ".join(map(str, list(vectors[i,:]))))
        f.write("\n")
    f.close()


Training skipgram with dense layer for dim=50
0 39380.52455711365
1 38751.52722334862
2 38633.536863565445
3 38469.259721159935
4 38317.923087358475

Training skipgram with dense layer for dim=150
0 39399.51644325256
1 38765.91266536713
2 38614.264612317085
3 38465.81636416912
4 38303.918350696564

Training skipgram with dense layer for dim=300
0 39321.48041725159
1 38606.570843577385
2 38408.09201443195
3 38207.7414662838
4 38027.33973431587


In [74]:
#load vectors from files
cbows = []
cbows_dense = []
skipgrams = []
skipgrams_dense = []
    
for dim in [50, 150, 300]:
    f = open('vectors_skipgram_'+str(dim)+'.txt' ,'r')
    d = {}
    for row in f.readlines()[1:]:
        line = row.split()
        d[line[0]] = line[1:]
    skipgrams.append(d)
    
    f = open('vectors_skipgram_dense_'+str(dim)+'.txt' ,'r')
    d = {}
    for row in f.readlines()[1:]:
        line = row.split()
        d[line[0]] = line[1:]
    skipgrams_dense.append(d)
    
#save as separate dictionaries
skipgram_vectors_50 = skipgrams[0]
skipgram_vectors_150 = skipgrams[1]
skipgram_vectors_300 = skipgrams[2]
skipgram_dense_vectors_50 = skipgrams_dense[0]
skipgram_dense_vectors_150 = skipgrams_dense[1]
skipgram_dense_vectors_300 = skipgrams_dense[2]

In [None]:
#Implement your own analogy function
#use cosine similarity & nearest neighbor?

Comparison performance:

In [None]:
#Visualization results trained word embeddings


Interpretation results of the visualization

Compare the results of the trained word embeddings with the word-word co-occurrence matrix

Discussion of the advantages of CBOW and Skipgram, the advantages of negative sampling and drawbacks of CBOW and Skipgram

In [None]:
#load pretrained word embeddings of word2vec

path_word2vec = "/GoogleNews-vectors-negative300.bin"

#word2vec = KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
#load pretraind word embeddings of Glove

path = "/glove.6B/glove.6B.300d_converted.txt"

#convert GloVe into word2vec format
gensim.scripts.glove2word2vec.get_glove_info(path)
gensim.scripts.glove2word2vec.glove2word2vec(path, "glove_converted.txt")

#glove = KeyedVectors.load_word2vec_format(path, binary=False)

In [None]:
#Visualize the pre-trained word embeddings

Comparison performance with your own trained word embeddings