In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the IMDB reviews CSV inside the Kaggle input directory.
IMDB_CSV_PATH = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'

# Later cells read the `review` and `sentiment` columns of this frame.
review_df = pd.read_csv(IMDB_CSV_PATH)

# Preview the first rows (head() shows five by default).
review_df.head()

In [None]:
def normalize_word(word):
    """Return ``word`` lower-cased with leading and trailing whitespace removed."""
    return word.lower().strip()

def get_tokens(sentence):
    """Split ``sentence`` on single spaces and return its normalized, non-empty tokens.

    Empty pieces (produced by consecutive spaces) are dropped before
    normalization, and pieces that become empty after normalization
    (whitespace-only pieces such as a lone tab) are dropped afterwards.
    """
    pieces = filter(None, sentence.split(' '))
    return [word for word in map(normalize_word, pieces) if word]

# Build the vocabulary: every distinct normalized token that occurs in any review.
all_reviews = review_df.review.to_list()

# Accumulate directly into a set so memory grows with the number of distinct
# tokens instead of the total number of tokens in the corpus.
vocab_set = set()
for review in all_reviews:
    vocab_set.update(get_tokens(review))

# Sort the tokens so each word lands at the same position on every run.
# String hashing is randomized per Python process, so list(set(...)) would
# order the words differently after each kernel restart; the word -> row
# mapping derived from `vocab` (and therefore the trained model) would then
# not be reproducible even with a fixed NumPy seed.
vocab = sorted(vocab_set)
print('total vocab:', len(vocab))


In [None]:
# Map every vocabulary word to its position in `vocab`; that position is used
# as the word's row index into the embedding matrix.
word2index = {word: index for index, word in enumerate(vocab)}


In [None]:
# Encode each review as the list of vocabulary indices of its tokens, in
# order of appearance. A word that occurs several times in a review
# contributes its index once per occurrence.
input_data = [
    [word2index[token] for token in get_tokens(review)]
    for review in all_reviews
]

In [None]:
# Binary targets: 1 for a 'positive' sentiment label, 0 for anything else.
labels = review_df.sentiment.to_list()
target_data = [int(label == 'positive') for label in labels]

In [None]:
# Sanity check: number of encoded reviews (one entry in input_data per review).
print('input data length:', len(input_data))

In [None]:
# Seed NumPy's global RNG so the random weight initialisation below is repeatable.
np.random.seed(1)

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The direct formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``. This version only ever
    evaluates ``exp(-|x|)``, which lies in (0, 1], and picks the
    algebraically equivalent expression for each sign of ``x``.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s). Non-floating inputs are converted to float.

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray of the same shape
    as ``x``, with every value in [0, 1].
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(float)

    # exp(-|x|) can underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(x))

    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both equal sigmoid(x).
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    # Indexing with () unwraps a 0-d result into a NumPy scalar and leaves
    # arrays of higher rank untouched.
    return result[()]


In [None]:
import sys

# Hyper-parameters: learning rate, number of passes over the training set and
# width of the hidden (embedding) layer.
alpha = 0.01
iterations = 2
hidden_size = 100

# Weights are drawn uniformly from [-0.1, 0.1).
# weights_0_1 holds one embedding row per vocabulary word; weights_1_2 maps
# the hidden layer to the single output unit.
weights_0_1 = np.random.random((len(vocab), hidden_size)) * 0.2 - 0.1
weights_1_2 = np.random.random((hidden_size, 1)) * 0.2 - 0.1

# Every review except the last 1000 is used for training.
train_size = len(input_data) - 1000

# Running accuracy counters; they are not reset between iterations, so the
# reported training accuracy is cumulative over all updates so far.
correct = 0
total = 0
for iteration in range(iterations):
    for i in range(train_size):
        x = input_data[i]
        y = target_data[i]

        # embed + sigmoid: sum the embedding rows of the review's tokens
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))

        # linear + sigmoid: single output unit
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        # Output error, then the error propagated back to the hidden layer.
        # NOTE(review): the hidden delta is used without multiplying by the
        # sigmoid derivative layer_1 * (1 - layer_1).
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)

        # NOTE(review): x holds one index per token occurrence, so it can
        # contain repeats. NumPy applies an in-place fancy-indexed `-=` once
        # per distinct row, whereas the forward sum above counts every
        # occurrence.
        weights_0_1[x] -= layer_1_delta * alpha
        weights_1_2 -= np.outer(layer_1, layer_2_delta) * alpha

        # A prediction counts as correct when it is within 0.5 of the label.
        total += 1
        if np.abs(layer_2_delta) < 0.5:
            correct += 1

        # Rewrite the status line in place after every tenth example.
        if i % 10 == 9:
            progress = str(i / float(train_size) * 100)
            accuracy = str(correct / float(total) * 100)[0:5]
            sys.stdout.write('\rIter:' + str(iteration)
                             + ' Progress:' + progress
                             + '% Training Accuracy:'
                             + accuracy + '%')

In [None]:
# Score the model on the final 1000 reviews, which the training loop's index
# range does not cover.
holdout_start = len(input_data) - 1000
correct = 0
total = 0
for i in range(holdout_start, len(input_data)):
    x = input_data[i]
    y = target_data[i]

    # Forward pass only: no weight updates during evaluation.
    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

    total += 1
    # A prediction counts as correct when it is within 0.5 of the 0/1 label.
    if np.abs(layer_2 - y) < 0.5:
        correct += 1

# Reported as a fraction in [0, 1], not as a percentage.
print("Test Accuracy:" + str(correct / float(total)))
    

In [None]:
from collections import Counter
import math

def get_word_similarity(target_word, top_n=10):
    """Return the vocabulary words whose embeddings are closest to ``target_word``.

    Similarity is the negated Euclidean distance between rows of
    ``weights_0_1``, so larger (closer to zero) scores mean more similar.
    The target word itself is always included, with score -0.0.

    Parameters
    ----------
    target_word : str
        Word to look up. It is passed through ``normalize_word`` first so it
        matches the form in which vocabulary keys were stored.
    top_n : int, optional
        Number of (word, score) pairs to return. Defaults to 10.

    Returns
    -------
    list of (str, float)
        The ``top_n`` most similar words, best first.

    Raises
    ------
    KeyError
        If the normalized word is not in the vocabulary.
    """
    key = normalize_word(target_word)
    if key not in word2index:
        raise KeyError('{!r} is not in the vocabulary'.format(target_word))
    target_vector = weights_0_1[word2index[key]]

    # One vectorised pass over the embedding matrix instead of a Python-level
    # sum for every word. This allocates a temporary array the same size as
    # weights_0_1.
    distances = np.linalg.norm(weights_0_1 - target_vector, axis=1).tolist()

    scores = Counter()
    for word, index in word2index.items():
        scores[word] = -distances[index]
    return scores.most_common(top_n)

In [None]:
# Words whose learned embedding rows are nearest to that of 'beautiful' (shown as the cell output).
get_word_similarity('beautiful')

In [None]:
# Words whose learned embedding rows are nearest to that of 'terrible' (shown as the cell output).
get_word_similarity('terrible')

In [None]:
# Words whose learned embedding rows are nearest to that of 'cool' (shown as the cell output).
get_word_similarity('cool')