Datasets:
- https://www.kaggle.com/datafiniti/grammar-and-online-product-reviews/data
- https://www.kaggle.com/arathee2/demonetization-in-india-twitter-data/data

In [None]:
from functools import partial
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm
from utility.preprocessing import preprocessing
# Pre-bind the hyphen-handling option so every later call to
# `preprocessing(text)` uses the same setting.
# NOTE(review): the meaning of HYPHEN_HANDLE=2 is defined in
# utility.preprocessing and is not visible here -- confirm.
preprocessing = partial(preprocessing, HYPHEN_HANDLE=2)

In [None]:
def generate_data(corpus, _slice=3):
    """Build (centre word -> context words) training matrices from raw text.

    A Keras ``Tokenizer`` is fitted on ``corpus`` and every text is turned
    into a sequence of integer word indices. A window of ``_slice`` tokens is
    then slid over each sequence one token at a time; the token at position
    ``_slice // 2`` of the window is the input word and the remaining tokens
    of the window are its targets.

    Args:
        corpus: Iterable of strings (one sentence / fragment per item).
        _slice: Window width in tokens. Must be at least 1. Sequences shorter
            than ``_slice`` contribute no training rows.

    Returns:
        A tuple ``(X, y, tokenizer)`` where

        * ``X`` is a dense array of shape ``(n_windows, vocab + 1)`` with a
          single 1 per row marking the centre word,
        * ``y`` has the same shape with a 1 for every context word that
          occurs in the window (multi-hot),
        * ``tokenizer`` is the fitted ``Tokenizer``.

        Column 0 is never set because the tokenizer's word indices start
        at 1; the extra column keeps word index == column index.

    Raises:
        ValueError: If ``_slice`` is smaller than 1.
    """
    if _slice < 1:
        # Previously this surfaced as an opaque IndexError deep in the loop.
        raise ValueError("_slice must be a positive integer, got %r" % (_slice,))

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    sequences = tokenizer.texts_to_sequences(corpus)

    # The centre offset depends only on the window width, so compute it once.
    center = _slice // 2

    data = []
    targets = []
    for sentence in tqdm(sequences):
        # range() is empty when the sentence is shorter than the window.
        for start in range(len(sentence) - _slice + 1):
            window = sentence[start:start + _slice]
            data.append([window[center]])
            # Everything in the window except the centre token.
            targets.append(window[:center] + window[center + 1:])

    # +1 because word indices start at 1 and are used directly as columns.
    vocab_columns = len(tokenizer.word_index) + 1
    X = np.zeros((len(data), vocab_columns))
    y = np.zeros((len(data), vocab_columns))
    for row, (center_word, context_words) in enumerate(zip(data, targets)):
        X[row, center_word] = 1
        y[row, context_words] = 1

    print("X_shape:", X.shape)
    print("y_shape:", y.shape)
    print("# Words:", len(tokenizer.word_index))

    return X, y, tokenizer

In [None]:
# Load only the tweet text column, then discard repeated and missing rows.
df_data = pd.read_csv(
    './data/demonetization-tweets.csv',
    usecols=['text'],
    encoding='latin-1',
)
df_data.drop_duplicates(inplace=True)
df_data.dropna(inplace=True)

# Clean every tweet with the pre-configured project preprocessing helper.
df_data['text'] = df_data['text'].apply(preprocessing)

# Split each cleaned tweet on "." so every fragment is treated as its own
# sentence when the sliding windows are generated.
corpus = [
    fragment
    for tweet in df_data['text']
    for fragment in tweet.split(".")
]

# Five-token windows: the middle token is the input, the other four the targets.
X, y, tokenizer = generate_data(corpus, _slice=5)

In [None]:
# One-hot word in -> 2 linear units -> softmax over the whole vocabulary.
# The 2-unit layer's weights are read back later as 2-D word embeddings.
model = Sequential()
model.add(Dense(2, input_shape=(X.shape[1],)))
model.add(Dense(X.shape[1]))
model.add(Activation('softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for up to 100 epochs. Interrupting the kernel stops training early
# and prints a short notice instead of a traceback; in that case `h` is not
# assigned by this cell.
try:
    h = model.fit(x=X, y=y, epochs=100, verbose=1)
except KeyboardInterrupt:
    print('\n\nExited by User')

In [None]:
# Kernel of the first Dense layer: one row per input column, i.e. one 2-D
# vector per word index (row 0 corresponds to the unused index 0).
points = model.layers[0].get_weights()[0]

# word -> embedding vector; rows are offset by one because word indices
# start at 1.
word_embedding = dict(zip(tokenizer.word_index, points[1:]))

# word index -> word, for turning row numbers back into vocabulary entries.
inverse_idx = {index: token for token, index in tokenizer.word_index.items()}

In [None]:
def closest(word, _top=5):
    """Return the ``_top`` vocabulary words most similar to ``word``.

    Similarity is the cosine similarity between the learned embedding of
    ``word`` and every row of ``points``. The query word itself is part of
    the vocabulary and is therefore included in the ranking.

    Args:
        word: A word present in ``word_embedding``.
        _top: Number of words to return.

    Returns:
        A list of words ordered from most to least similar. It holds exactly
        ``_top`` entries whenever the vocabulary has at least that many words.

    Raises:
        KeyError: If ``word`` is not in the vocabulary.
    """
    query = word_embedding[word].reshape(1, -1)
    scores = cosine_similarity(query, points)[0]

    # Row indices ordered from highest to lowest similarity.
    ranked = scores.argsort()[::-1]

    # Drop rows that do not map to a word (row 0) *before* truncating, so an
    # unused row that happens to score highly cannot take one of the `_top`
    # slots and shorten the result.
    neighbours = [inverse_idx[row] for row in ranked if row in inverse_idx]
    return neighbours[:_top]

def similarity(word_1, word_2):
    """Return the cosine similarity between the embeddings of two words.

    Both words must be keys of ``word_embedding``; an unknown word raises
    ``KeyError``.
    """
    # cosine_similarity expects 2-D inputs, so present each vector as one row.
    first = word_embedding[word_1].reshape(1, -1)
    second = word_embedding[word_2].reshape(1, -1)
    # The result is a 1x1 matrix; return its single entry.
    return cosine_similarity(first, second)[0, 0]

In [None]:
# Spot-check the learned embeddings on a pair of words from the tweets.
similarity(word_1='atm', word_2='bank')

In [None]:
import matplotlib.pyplot as plt

# Coordinates of every real word (row 0 is skipped because word indices
# start at 1): first embedding dimension on x, second on y.
plt_x = points[1:, 0]
plt_y = points[1:, 1]

fig, ax = plt.subplots(figsize=(10, 250))
ax.scatter(plt_x, plt_y)

# Label only every fifth word to keep the annotations readable.
for i, txt in enumerate(tokenizer.word_index):
    if i % 5:
        continue
    ax.annotate(txt, (plt_x[i], plt_y[i]))

plt.show()