### CBOW-Method using Gensim

In [18]:
from gensim.models import word2vec
import os
import logging
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings("ignore")
# Emit log records at INFO level and above as "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class Text8Sentences(object):
    """Re-iterable stream of fixed-length token lists read from a text file.

    The file is split on any whitespace and the resulting tokens are grouped
    into consecutive lists of at most ``maxlen`` tokens. Each call to
    ``iter()`` re-reads the file, so the object can be iterated repeatedly.

    Parameters
    ----------
    fname : str
        Path of the text file to read.
    maxlen : int
        Maximum number of tokens per yielded list.
    """

    def __init__(self, fname, maxlen):
        self.fname = fname
        self.maxlen = maxlen

    def __iter__(self):
        """Yield lists of at most ``maxlen`` tokens; the last one may be shorter."""
        # Read everything up front so the file handle is closed before the
        # generator is first suspended at a ``yield``.
        with open(self.fname, "r") as ftext:
            # ``split()`` with no argument splits on any whitespace run, so
            # leading/repeated spaces do not produce empty-string tokens and
            # newlines do not stay attached to words.
            tokens = ftext.read().split()

        words = []
        for word in tokens:
            if len(words) >= self.maxlen:
                yield words
                words = []
            words.append(word)
        # Do not emit an empty trailing chunk (e.g. for an empty file).
        if words:
            yield words


# Directory that holds the corpus file.
data = "./"
corpus_path = os.path.join(data, "text8.txt")

# Feed the corpus to gensim in 50-token chunks and train 300-dimensional
# vectors, passing min_count=30 as the word-frequency cutoff.
# NOTE(review): `size` is the gensim < 4.0 keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before changing it.
sentences = Text8Sentences(corpus_path, maxlen=50)
model = word2vec.Word2Vec(sentences=sentences, size=300, min_count=30)



2018-12-01 14:17:31,467 : INFO : collecting all words and their counts
2018-12-01 14:18:25,393 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-12-01 14:18:26,611 : INFO : PROGRESS: at sentence #10000, processed 500000 words, keeping 33464 word types
2018-12-01 14:18:26,740 : INFO : PROGRESS: at sentence #20000, processed 1000000 words, keeping 52755 word types
2018-12-01 14:18:26,869 : INFO : PROGRESS: at sentence #30000, processed 1500000 words, keeping 65589 word types
2018-12-01 14:18:27,000 : INFO : PROGRESS: at sentence #40000, processed 2000000 words, keeping 78383 word types
2018-12-01 14:18:27,133 : INFO : PROGRESS: at sentence #50000, processed 2500000 words, keeping 88008 word types
2018-12-01 14:18:27,264 : INFO : PROGRESS: at sentence #60000, processed 3000000 words, keeping 96645 word types
2018-12-01 14:18:27,395 : INFO : PROGRESS: at sentence #70000, processed 3500000 words, keeping 104309 word types
2018-12-01 14:18:27,526 : INFO : PROGRE

2018-12-01 14:19:20,799 : INFO : EPOCH 1 - PROGRESS: at 99.56% examples, 280543 words/s, in_qsize 6, out_qsize 1
2018-12-01 14:19:20,904 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-12-01 14:19:20,911 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-12-01 14:19:20,917 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-12-01 14:19:20,919 : INFO : EPOCH - 1 : training on 17005208 raw words (11929472 effective words) took 42.5s, 281013 effective words/s
2018-12-01 14:19:24,815 : INFO : EPOCH 2 - PROGRESS: at 0.06% examples, 1960 words/s, in_qsize 0, out_qsize 0
2018-12-01 14:19:25,821 : INFO : EPOCH 2 - PROGRESS: at 5.88% examples, 144665 words/s, in_qsize 6, out_qsize 0
2018-12-01 14:19:26,843 : INFO : EPOCH 2 - PROGRESS: at 11.00% examples, 221885 words/s, in_qsize 5, out_qsize 0
2018-12-01 14:19:27,851 : INFO : EPOCH 2 - PROGRESS: at 14.64% examples, 252640 words/s, in_qsize 5, out_qsize 0
2018-12-01 14:19:28,852 : 

2018-12-01 14:20:30,560 : INFO : EPOCH 4 - PROGRESS: at 14.35% examples, 279776 words/s, in_qsize 5, out_qsize 0
2018-12-01 14:20:31,564 : INFO : EPOCH 4 - PROGRESS: at 17.94% examples, 300835 words/s, in_qsize 6, out_qsize 0
2018-12-01 14:20:32,569 : INFO : EPOCH 4 - PROGRESS: at 21.46% examples, 315418 words/s, in_qsize 6, out_qsize 0
2018-12-01 14:20:33,583 : INFO : EPOCH 4 - PROGRESS: at 24.99% examples, 326970 words/s, in_qsize 5, out_qsize 0
2018-12-01 14:20:34,592 : INFO : EPOCH 4 - PROGRESS: at 28.46% examples, 335973 words/s, in_qsize 6, out_qsize 0
2018-12-01 14:20:35,608 : INFO : EPOCH 4 - PROGRESS: at 32.17% examples, 345404 words/s, in_qsize 6, out_qsize 0
2018-12-01 14:20:36,622 : INFO : EPOCH 4 - PROGRESS: at 35.75% examples, 352219 words/s, in_qsize 5, out_qsize 0
2018-12-01 14:20:37,641 : INFO : EPOCH 4 - PROGRESS: at 39.46% examples, 358642 words/s, in_qsize 6, out_qsize 0
2018-12-01 14:20:38,649 : INFO : EPOCH 4 - PROGRESS: at 43.05% examples, 363516 words/s, in_qsiz

In [19]:
# Ten nearest neighbours of "woman" by cosine similarity.
# Query through `model.wv` (the KeyedVectors): calling most_similar directly
# on the Word2Vec object is deprecated and was removed in gensim 4.
print(model.wv.most_similar("woman"))

2018-12-01 14:22:18,970 : INFO : precomputing L2-norms of word weight vectors


[('girl', 0.7142775654792786), ('child', 0.6974591016769409), ('man', 0.6733200550079346), ('herself', 0.6407644748687744), ('lady', 0.6313808560371399), ('baby', 0.630010187625885), ('person', 0.6156458854675293), ('lover', 0.6108782291412354), ('prostitute', 0.6080262660980225), ('mother', 0.5998956561088562)]


In [20]:
# Analogy query: most_similar(positive=["woman", "king"], negative=["man"], topn=10)
# i.e. "king" - "man" + "woman". Uses `model.wv` because the method on the
# Word2Vec object itself is deprecated and was removed in gensim 4.
print(model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=10))
  

[('queen', 0.6132511496543884), ('princess', 0.5554727911949158), ('empress', 0.5432103276252747), ('daughter', 0.5391123294830322), ('throne', 0.5376384854316711), ('isabella', 0.5371761322021484), ('prince', 0.5333335399627686), ('elizabeth', 0.5304224491119385), ('pharaoh', 0.5212265253067017), ('consort', 0.515372633934021)]


In [21]:
# Cosine similarity for a few hand-picked word pairs, printed one per line in
# this order. Uses `model.wv.similarity`: the method on the Word2Vec object
# itself is deprecated and was removed in gensim 4.
word_pairs = [
    ("girl", "woman"),
    ("girl", "man"),
    ("girl", "car"),
    ("bus", "car"),
]
for first_word, second_word in word_pairs:
    print(model.wv.similarity(first_word, second_word))


0.7142774113026622
0.5774182894329365
0.2978809985081157
0.48169954077933447


### Skipgrams-Method using Keras

In [1]:
from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams

text = "It's easy to get started with Chart.js. All that's required is the script included in your page."

# Build the word <-> id lookup tables from the sample text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}

wids = [word2id[w] for w in text_to_word_sequence(text)]

# Keras word ids start at 1 (0 is reserved), so the vocabulary size expected
# by skipgrams() is "largest id + 1". Passing len(word2id) would make the
# highest-id word unreachable when negative samples are drawn.
pairs, labels = skipgrams(wids, len(word2id) + 1)

# Show up to ten (target, context) -> label examples; slicing avoids an
# IndexError if fewer than ten pairs were generated.
for (target_id, context_id), label in list(zip(pairs, labels))[:10]:
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        id2word[target_id], target_id, id2word[context_id], context_id, label))

Using TensorFlow backend.


(is (12), script (14)) -> 1
(script (14), the (13)) -> 0
(chart (7), js (8)) -> 1
(started (5), easy (2)) -> 0
(the (13), easy (2)) -> 0
(all (9), chart (7)) -> 1
(the (13), all (9)) -> 1
(is (12), all (9)) -> 0
(to (3), required (11)) -> 0
(page (18), included (15)) -> 0


### CBOW model: extracting embedding weights with Keras

In [2]:
from keras.models import Sequential
from keras.layers.core import Dense, Lambda,Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer, one_hot
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_distances
import keras.backend as K
import nltk
import numpy as np
import operator

In [3]:
# Model dimensions: vocabulary size, embedding width, and the number of
# context words taken on each side of the centre word.
vocab_size = 1579
embed_size = 300
window_size = 1

# CBOW-style network: embed the 2 * window_size context ids, average the
# embeddings along axis 1, then apply a softmax layer over the vocabulary.
context_len = window_size * 2
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embed_size,
        embeddings_initializer='glorot_uniform',
        input_length=context_len,
    ),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)),
    Dense(vocab_size, kernel_initializer='glorot_uniform', activation='softmax'),
])
model.compile(optimizer="adadelta", loss='categorical_crossentropy')

# Embedding matrix of the first layer.
# NOTE(review): no fit() call happens before this line, so these look like the
# freshly initialised (untrained) weights — confirm that is intended.
weights = model.layers[0].get_weights()[0]

In [13]:
# Read the book, dropping blank lines and surrounding whitespace. The `with`
# block guarantees the file is closed even if reading raises.
with open("./alice_in_wonderland.txt", "r") as fin:
    stripped_lines = (raw_line.strip() for raw_line in fin)
    lines = [line for line in stripped_lines if len(line) > 0]

# Re-join into one string so sentences that span several physical lines are
# handed to the sentence tokenizer as continuous text.
sents = nltk.sent_tokenize(" ".join(lines))

tokenizer = Tokenizer(20000)  # num_words=20000: cap on how many of the most frequent words are used
tokenizer.fit_on_texts(sents)  # fits in place and returns None, so nothing is assigned

vocab_size = len(tokenizer.word_index) + 1  # +1 because Keras word ids start at 1
w_lefts, w_centers, w_rights = [], [], []
# Use the fitted tokenizer to turn each sentence into word ids. The previous
# `one_hot()` call hashed words into arbitrary buckets, so its ids did not
# correspond to `tokenizer.word_index` (which is what `word2idx` exposes below).
for word_ids in tokenizer.texts_to_sequences(sents):
    # Every (left, centre, right) window of three consecutive words.
    triples = list(nltk.trigrams(word_ids))
    w_lefts.extend([x[0] for x in triples])
    w_centers.extend([x[1] for x in triples])
    w_rights.extend([x[2] for x in triples])

# NOTE(review): `n_values` was deprecated in scikit-learn 0.20 and removed in
# 0.22; newer versions need `categories=[range(vocab_size)]` — confirm the
# installed version before changing it.
ohe = OneHotEncoder(n_values=vocab_size)
Xleft = ohe.fit_transform(np.array(w_lefts).reshape(-1, 1)).todense()
Xright = ohe.fit_transform(np.array(w_rights).reshape(-1, 1)).todense()
# Context input: the average of the two neighbours' one-hot rows.
X = (Xleft + Xright) / 2.0
# Target: one-hot row of the centre word.
Y = ohe.fit_transform(np.array(w_centers).reshape(-1, 1)).todense()
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}

# Map every word id to its embedding, kept as a (1, embed_dim) row so it can
# be passed straight to cosine_distances.
#
# Multiplying a one-hot row by `weights` simply selects row `wid`, so index
# the matrix directly instead of refitting a OneHotEncoder for every word.
#
# NOTE(review): `weights` is read from the Embedding layer right after
# compile(); no fit() call is visible, so these vectors look untrained — confirm.
if weights.shape[0] < vocab_size:
    raise ValueError(
        "embedding matrix has {:d} rows but the vocabulary needs {:d}".format(
            weights.shape[0], vocab_size))

idx2emb = {wid: weights[wid].reshape(1, -1) for wid in word2idx.values()}

# For each query word, print the ten vocabulary words whose embeddings have
# the smallest cosine distance to it (the query word itself is excluded).
for word in ["opportunity", "Catch"]:
    query_id = word2idx[word.lower()]
    query_emb = idx2emb[query_id]
    scored = [
        ((query_id, other_id), cosine_distances(query_emb, idx2emb[other_id]))
        for other_id in range(1, vocab_size)
        if other_id != query_id
    ]
    # Ascending distance: the closest words come first.
    nearest = sorted(scored, key=lambda entry: entry[1])[:10]
    predictions = [idx2word[id_pair[1]] for id_pair, _ in nearest]
    print("{:s} => {:s}".format(word, ", ".join(predictions)))


opportunity => refused, ago, he’ll, ‘hold, along, thousand, falling, dodo, ‘mine, usual
Catch => nearly, conversations, garden, expecting, ‘w, where, pointing, become, cunning, how
