In [None]:
# Download sarcasm.json into the current working directory (IPython shell escape).
!wget https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json

--2021-01-07 03:21:34--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.188.128, 64.233.189.128, 108.177.97.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.188.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json’


2021-01-07 03:21:34 (102 MB/s) - ‘sarcasm.json’ saved [5643545/5643545]



In [None]:
import json

In [None]:
# Parse the downloaded dataset into `data`.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the read does
# not depend on the platform's default locale encoding.
with open('sarcasm.json', encoding='utf-8') as f:
  data = json.load(f)

In [None]:
# words = []
# for item in data:
#   sentence = item['headline'].split()
#   for word in sentence:
#     words.append(word)

# Toy corpus: eight short English sentences used as the training documents
# for the tokenizer and the CBOW model below.
corpus = [
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
    'I love green eggs, ham, sausages and bacon!',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
]

In [None]:

from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

# Fit a Keras tokenizer on the toy corpus to obtain a word -> integer id mapping.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(corpus)

# Work on a copy: `tokenizer.word_index` is the tokenizer's own dict, so adding
# the placeholder entry to it directly would also modify the fitted tokenizer.
word2id = dict(tokenizer.word_index)

# build vocabulary of unique words
# Real words start at id 1 (see the printed sample); id 0 gets a placeholder label
# so that every id from 0 to vocab_size - 1 has a name.
# NOTE(review): id 0 also appears to be the padding value in the context windows
# built later, so this label effectively marks "padding" - confirm the name.
word2id['<OOV>'] = 0
id2word = {v:k for k, v in word2id.items()}
# Encode every document as the list of ids of its tokens.
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in corpus]

vocab_size = len(word2id)  # counts the id-0 placeholder as well
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 31
Vocabulary Sample: [('the', 1), ('is', 2), ('and', 3), ('sky', 4), ('blue', 5), ('beautiful', 6), ('quick', 7), ('brown', 8), ('fox', 9), ('lazy', 10)]


In [None]:
# Display the vocabulary size (len(word2id), which includes the id-0 entry).
vocab_size

31

In [None]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one (context, target) training pair per word position.

    For every position in every document of ``corpus``, the word at that
    position is the target and the words at most ``window_size`` positions
    away on either side (clipped to the document bounds) form the context.

    Args:
        corpus: iterable of documents, each an indexable sequence of word ids.
        window_size: number of neighbours taken on each side of the target.
        vocab_size: number of classes passed to ``np_utils.to_categorical``.

    Yields:
        ``(x, y)`` where ``x`` is the single context run through
        ``sequence.pad_sequences`` with ``maxlen=2 * window_size`` and ``y`` is
        the target id run through ``np_utils.to_categorical``.
    """
    context_length = 2 * window_size
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            # Positions inside the window, excluding the target itself and
            # anything that falls off either end of the document.
            neighbour_positions = [
                pos
                for pos in range(index - window_size, index + window_size + 1)
                if pos != index and 0 <= pos < sentence_length
            ]
            context = [words[pos] for pos in neighbour_positions]

            # Both helpers take a batch, so wrap the single sample in a list.
            x = sequence.pad_sequences([context], maxlen=context_length)
            y = np_utils.to_categorical([word], vocab_size)
            yield (x, y)

In [None]:
# Create the (context, target) generator over the encoded corpus; nothing is
# computed until it is iterated. NOTE(review): `m` is not consumed in the cells
# visible here - confirm whether it is still needed.
m = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)

In [None]:
import numpy as np

In [None]:
import numpy as np
# Preview some training pairs, decoded back to words. Only contexts without
# id 0 are shown; printing stops once the counter reaches 10.
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 in x[0]:
        # Context contains id 0 - skip it.
        continue

    print(x, y)
    context_tokens = [id2word[w] for w in x[0]]
    # Position of the first non-zero entry of y[0] is the target word id.
    target_token = id2word[np.argwhere(y[0])[0][0]]
    print('Context (X):', context_tokens, '-> Target (Y):', target_token)

    if i == 10:
        break
    i += 1

[[1 4 5 3]] [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]]
Context (X): ['the', 'sky', 'blue', 'and'] -> Target (Y): is
[[4 2 3 6]] [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]]
Context (X): ['sky', 'is', 'and', 'beautiful'] -> Target (Y): blue
[[12 18  3  6]] [[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]]
Context (X): ['love', 'this', 'and', 'beautiful'] -> Target (Y): blue
[[18  5  6  4]] [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]]
Context (X): ['this', 'blue', 'beautiful', 'sky'] -> Target (Y): and
[[ 1  7  9 19]] [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]]
Context (X): ['the', 'quick', 'fox', 'jumps'] -> Target (Y): brown
[[ 7  8 19 20]] [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0

In [None]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

# build CBOW architecture
# Embedding maps each of the window_size*2 context ids to an embed_size vector,
# the Lambda layer averages those vectors over the context axis, and the Dense
# softmax layer scores every vocabulary id as the target word.
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
cbow.summary()

# visualize model structure
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# SVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False,
#                  rankdir='TB').create(prog='dot', format='svg'))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 100)            3100      
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 31)                3131      
Total params: 6,231
Trainable params: 6,231
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train for five epochs, one (context, target) pair per train_on_batch call.
# The reported loss is the sum of the per-pair losses over the epoch.
for epoch in range(1, 6):
    loss = 0.
    i = 0
    training_pairs = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
    for i, (x, y) in enumerate(training_pairs, start=1):
        loss += cbow.train_on_batch(x, y)
        # Progress message every 100000 pairs.
        if not i % 100000:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 249.22476840019226

Epoch: 2 	Loss: 244.29651021957397

Epoch: 3 	Loss: 238.61979484558105

Epoch: 4 	Loss: 231.98311734199524

Epoch: 5 	Loss: 224.84529900550842



Get word embeddings

In [None]:
import pandas as pd
# The first weight array of the model is the Embedding layer's matrix,
# with one row per word id.
weights = cbow.get_weights()[0]
# Drop row 0 (the id-0 placeholder), keeping the rows for ids 1, 2, ...
weights = weights[1:]
print(weights.shape)

# Label each remaining row by looking its id up in id2word. Slicing
# list(id2word.values()) is not safe here: the dict is in insertion order, not
# id order (the id-0 entry was added last), so the labels would shift by one word.
row_labels = [id2word[word_id] for word_id in range(1, weights.shape[0] + 1)]
pd.DataFrame(weights, index=row_labels).head()

(30, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
is,0.013609,-0.032137,-6.5e-05,0.047261,-0.009963,-0.044314,0.089682,-0.044716,-0.015753,0.00382,0.089259,0.014464,-0.072743,0.040941,0.049278,-0.030565,-0.075274,0.075153,0.116088,0.046165,0.040315,-0.056376,0.003478,-0.064788,0.081267,0.084865,0.158836,-0.02445,-0.104121,0.047796,0.033062,-0.117632,-0.025403,-0.003541,-0.029908,-0.021853,0.004391,0.077666,0.030391,-0.089214,...,-0.064084,0.086601,0.046815,-0.053698,-0.094445,-0.096322,-0.060326,0.027118,0.094518,-0.057134,-0.114091,0.099296,0.113362,0.124804,-0.164358,-0.004215,-0.019065,-0.101196,0.126299,-0.019366,-0.051125,0.017947,-0.027973,-0.029675,-0.000232,0.02161,-0.044013,-0.025915,0.073418,0.020599,0.034937,-0.057472,-0.01644,0.06024,-0.111583,0.01484,0.025445,0.052402,0.108445,-0.056267
and,0.049377,-0.027838,0.015197,0.006753,-0.043233,0.003731,0.063299,-0.011158,-0.043826,-0.027655,0.07008,-0.095323,-0.021016,0.010222,0.031694,0.111622,-0.017936,0.167583,0.058612,-0.054926,-0.083427,0.019647,-0.074944,0.033637,0.133263,0.122115,0.150956,0.046901,-0.037556,0.017226,-0.007645,-0.192247,0.020542,0.023762,0.063457,-0.006984,-0.005556,-0.045251,-0.004793,-0.128716,...,0.055682,0.052756,0.005755,-0.012581,-0.085692,0.031243,-0.189547,-0.052133,0.064666,0.022855,-0.136566,0.147207,-0.033157,0.124872,-0.158923,-0.031937,0.020225,-0.143979,0.033135,-0.128276,0.045056,0.045573,0.058274,0.027056,0.010672,0.10961,-0.098183,-0.03428,-0.021019,-0.078572,0.13015,-0.014698,0.01517,0.045523,-0.052377,0.074783,0.024158,0.082695,0.096788,-0.148033
sky,-0.047619,-0.004521,0.047736,0.051597,-0.040568,-0.036433,-0.04305,-0.035473,0.04057,0.03668,-0.032274,0.017493,-0.09267,0.019125,0.083728,-0.004147,-0.024924,0.053147,-0.096777,0.049828,-0.025013,0.012599,-0.035668,0.021227,0.007407,0.09622,-0.044012,0.08968,-0.078431,-0.021086,-0.045373,-0.05157,0.02308,0.110826,0.023991,0.045958,-0.034563,-0.06459,0.172536,-0.076358,...,0.02664,0.047782,-0.106801,-0.137894,-0.106051,0.048949,-0.035913,-0.07712,0.014776,-0.013382,-0.028097,0.070353,-0.001867,0.066408,-0.084372,-0.009027,0.115463,-0.089123,0.009185,-0.0408,0.057902,0.0356,0.010122,-0.010443,-0.057313,0.038158,0.067805,-0.02046,0.093607,-0.063971,0.014185,-0.114461,0.018495,0.023319,-0.022966,-0.004722,-0.157348,0.030449,0.041477,-0.112779
blue,0.031101,-0.099991,-0.030097,-0.079334,-0.016191,0.026309,0.092936,-0.116854,0.010214,0.024819,0.10158,-0.010311,-0.001757,0.03139,-0.059304,0.048388,-0.034365,0.100786,-0.090596,0.009688,0.011457,0.012149,-0.047851,0.028066,0.080952,0.074833,0.032647,0.059118,-0.080724,-0.068052,0.047845,0.014943,-0.05421,0.000131,-0.011119,0.032273,0.02446,0.005226,0.06882,-0.067838,...,-0.033143,0.065103,-0.015388,-0.046106,-0.047739,0.004802,-0.012149,-0.042606,0.128592,0.005653,-0.062891,0.049102,0.023543,0.086408,-0.054378,-0.005833,0.104714,-0.044502,0.01413,0.032922,0.081082,0.071517,0.00072,-0.031947,-0.071755,0.080765,-0.057218,0.048139,0.047477,-0.075243,0.023457,-0.052746,0.049936,-0.01975,0.088061,0.076369,-0.010598,0.067158,0.046689,-0.077704
beautiful,0.038142,-0.075563,-0.102879,-0.034029,-0.07097,-0.020237,0.126548,-0.069474,0.028606,0.053788,-0.023214,-0.013661,0.011275,0.043212,0.010316,0.050936,-0.074132,0.045547,0.014853,0.064265,-0.028447,-0.114416,-0.134876,0.085434,0.096157,0.155479,0.053282,0.065637,-0.065389,0.012172,0.016194,-0.015911,-0.004113,0.001674,-0.047587,0.011952,0.047722,-0.04642,0.007582,-0.073806,...,-0.126377,0.076087,0.075423,-0.07602,-0.109253,-0.107206,0.023038,-0.030609,0.037746,-0.009969,-0.050189,0.095965,0.011138,0.100169,-0.120571,0.037149,0.138914,-0.003005,0.067102,-0.032687,-0.003647,0.045546,0.005078,0.037723,-0.038577,-0.030945,-0.051795,0.083669,0.10246,-0.090864,-0.021799,-0.114391,-0.004188,0.013578,0.029328,-0.021246,-0.014065,0.054861,0.008146,-0.092477


In [None]:
# Re-read the model's first weight array in full (row 0 included this time).
weights = cbow.get_weights()[0]

In [None]:
# Display the weight array assigned in the previous cell.
weights

array([[-3.1561736e-02, -1.7814040e-02, -6.8779618e-02, ...,
        -9.5186057e-05,  2.9679265e-02, -6.3402772e-02],
       [ 1.3608841e-02, -3.2136969e-02, -6.5173721e-05, ...,
         5.2401550e-02,  1.0844480e-01, -5.6267262e-02],
       [ 4.9376521e-02, -2.7838325e-02,  1.5196926e-02, ...,
         8.2694858e-02,  9.6787691e-02, -1.4803250e-01],
       ...,
       [ 9.1258630e-02,  3.9431006e-02,  8.3381096e-03, ...,
         4.6373010e-02,  4.6702214e-03,  4.4937957e-02],
       [ 3.7448078e-02, -3.2936670e-02, -2.6228743e-02, ...,
        -5.6585975e-02, -2.4718527e-02, -2.9869976e-02],
       [-1.4059506e-03, -5.5466793e-02, -1.9173015e-02, ...,
         7.0485704e-02,  2.0090012e-02,  8.7448070e-03]], dtype=float32)

Using Gensim