Dataset: https://www.kaggle.com/datafiniti/grammar-and-online-product-reviews/data

In [1]:
from functools import partial
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from utility.preprocessing import preprocessing
# Rebind the imported `preprocessing` helper with HYPHEN_HANDLE=2 pre-filled,
# so later calls only need to pass the text. What the value 2 selects is
# defined in utility.preprocessing, which is not visible in this notebook.
preprocessing = partial(preprocessing, HYPHEN_HANDLE=2, )

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load only the review text column, discard duplicated and missing reviews,
# then run every remaining review through the pre-configured preprocessing
# helper. Written as a chain so no frame is mutated in place.
df_data = (
    pd.read_csv('./data/GrammarandProductReviews.csv', usecols=['reviews.text'])
    .drop_duplicates()
    .dropna()
)
df_data = df_data.assign(**{'reviews.text': df_data['reviews.text'].apply(preprocessing)})

In [3]:
# Flatten the reviews into one list of sentence fragments by splitting each
# preprocessed review on "." (empty fragments are kept, as before).
corpus = [
    fragment
    for review in df_data['reviews.text']
    for fragment in review.split(".")
]

In [8]:
# Keep only the first 1000 sentence fragments; everything built below
# (vocabulary, one-hot matrices, model width) is derived from this subset.
# NOTE(review): presumably a cap to keep the dense one-hot matrices and the
# training time small -- consider naming the limit as a constant.
corpus = corpus[:1000]

In [9]:
def generate_data(corpus, _slice=3):
    """Build one-hot skip-gram style training matrices from sentences.

    A Keras ``Tokenizer`` is fitted on ``corpus`` and every text is turned
    into a sequence of integer word ids. A window of ``_slice`` consecutive
    ids is slid over each sequence one position at a time. Within a window
    the id at position ``_slice // 2`` is the input word and every other id
    in the window is a target word. Sequences shorter than ``_slice``
    contribute no samples.

    Parameters
    ----------
    corpus : list of str
        Texts to tokenize (passed to ``Tokenizer.fit_on_texts`` and
        ``Tokenizer.texts_to_sequences``).
    _slice : int, default 3
        Width of the sliding window, centre word included. Must be >= 1.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_samples, len(tokenizer.word_index) + 1)``; each row has
        a 1 in the column of the window's centre word id.
    y : numpy.ndarray
        Same shape as ``X``; each row has a 1 in the column of every other
        word id of the window.
    tokenizer : Tokenizer
        The fitted tokenizer, needed to map columns back to words.

    Raises
    ------
    ValueError
        If ``_slice`` is smaller than 1.
    """
    if _slice < 1:
        raise ValueError(
            "_slice must be a positive integer, got {!r}".format(_slice)
        )

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    sequences = tokenizer.texts_to_sequences(corpus)

    # Position of the input word inside every window; it does not depend on
    # the sentence, so compute it once.
    center = _slice // 2

    centers = []
    contexts = []
    for sentence in tqdm(sequences):
        # range() is empty when the sentence is shorter than the window.
        for start in range(len(sentence) - _slice + 1):
            window = sentence[start:start + _slice]
            centers.append(window[center])
            contexts.append(window[:center] + window[center + 1:])

    # One column per word id plus column 0, which no Keras word id uses
    # (Tokenizer ids start at 1).
    n_columns = len(tokenizer.word_index) + 1
    X = np.zeros((len(centers), n_columns))
    y = np.zeros((len(centers), n_columns))
    for row, (center_id, context_ids) in enumerate(zip(centers, contexts)):
        X[row, center_id] = 1
        if context_ids:  # empty only when _slice == 1
            y[row, context_ids] = 1

    print("X_shape:", X.shape)
    print("y_shape:", y.shape)
    print("# Words:", len(tokenizer.word_index))

    return X, y, tokenizer

In [10]:
# Build the training matrices with a 5-word window
# (centre word plus two words on each side).
X, y, tokenizer = generate_data(corpus, _slice=5)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


X_shape: (37741, 2811)
y_shape: (37741, 2811)
# Words: 2810


In [11]:
# One-hot word in -> 2 linear units (the embedding to be plotted later)
# -> vocabulary-sized layer -> softmax over all word ids.
model = Sequential()
model.add(Dense(2, input_shape=(X.shape[1],)))
model.add(Dense(X.shape[1]))
model.add(Activation('softmax'))

In [12]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2)                 5624      
_________________________________________________________________
dense_2 (Dense)              (None, 2811)              8433      
_________________________________________________________________
activation_1 (Activation)    (None, 2811)              0         
Total params: 14,057
Trainable params: 14,057
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Configure training: cross-entropy against the one-hot target rows,
# optimised with RMSprop, reporting accuracy alongside the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [14]:
# Train on the one-hot centre words (X) against their context words (y).
# NOTE(review): 10000 epochs is effectively "train until interrupted by
# hand"; consider a smaller epoch count or an early-stopping callback so a
# Restart & Run All can finish unattended.
model.fit(X, y, epochs=10000, verbose=1)

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000

KeyboardInterrupt: 

In [None]:
# First layer of the model, i.e. the first layer passed to Sequential.
w = model.layers[0]

In [None]:
# Inspect that layer's weight arrays as returned by Keras.
w.get_weights()

In [None]:
# First weight array of the first layer. For a Keras Dense layer this is the
# kernel, so there is one row per input column (word id) and each row is that
# word's learned vector.
points = model.layers[0].get_weights()[0]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Show the weights transposed: one row per embedding dimension.
points.T

In [None]:
# Map every vocabulary word to its embedding row. Each row is looked up by
# the word's own tokenizer id, so the mapping stays correct even if the
# iteration order of word_index does not follow the ids. Row 0 is never
# referenced because Keras word ids start at 1.
word_embedding = {word: points[idx] for word, idx in tokenizer.word_index.items()}

In [None]:
# Display the whole word -> vector mapping (one entry per vocabulary word,
# so the output is long).
word_embedding

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# cosine_similarity expects 2-D inputs, so give each vector a leading axis
# of length 1 before comparing the two words.
money_vec = word_embedding['money'].reshape(1, -1)
economy_vec = word_embedding['economy'].reshape(1, -1)
cosine_similarity(money_vec, economy_vec)

In [None]:
# Look up a single word's vector; raises KeyError if the word is not a key
# of word_embedding.
word_embedding['india']

In [None]:
# Scatter every word's 2-D vector and label each point with its word.
# Row 0 of `points` is skipped because Keras word ids start at 1.
plt_x = points[1:, 0]
plt_y = points[1:, 1]

# Very tall canvas so the per-word labels have room to spread vertically.
fig, ax = plt.subplots(figsize=(10, 150))
ax.scatter(plt_x, plt_y)

# Fetch each label's coordinates by the word's own id instead of relying on
# the iteration order of word_index lining up with the rows of `points`.
for word, idx in tokenizer.word_index.items():
    ax.annotate(word, (points[idx, 0], points[idx, 1]))

ax.set_title("2-D word embeddings (first-layer weights)")
ax.set_xlabel("embedding dimension 1")
ax.set_ylabel("embedding dimension 2")

plt.show()