In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the IMDB reviews; later cells read the `review` and `sentiment` columns
IMDB_CSV_PATH = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
reviews_df = pd.read_csv(IMDB_CSV_PATH)

In [None]:
# Peek at the first five rows (five is the default for head())
reviews_df.head()

In [None]:
def get_tokens(sentence):
    """Split `sentence` on single space characters and return the non-empty pieces.

    Only the space character is a separator, so tabs and newlines stay inside
    tokens. Runs of spaces produce empty strings, which are dropped.
    """
    return list(filter(None, sentence.split(' ')))

# Store each review's space-separated tokens in a new `tokens` column
reviews_df['tokens'] = reviews_df['review'].apply(get_tokens)

In [None]:
def clean_token(token):
    """Return `token` lower-cased with leading and trailing whitespace removed."""
    return token.lower().strip()

# Collect the distinct cleaned tokens of the whole corpus.
# A set is filled review by review, so the full token stream is never
# materialised in memory.
unique_tokens = set()
for tokens in reviews_df.tokens.to_list():
    unique_tokens.update(clean_token(token) for token in tokens)

# Tokens that were only whitespace clean down to '' and are not real words
unique_tokens.discard('')

# Sort so every word gets the same position on every run: iteration order of a
# set of strings depends on per-process hash randomization, which would otherwise
# change word2index (and everything trained on it) between kernel restarts.
vocab = sorted(unique_tokens)
print('total vocabulary: ', len(vocab))

In [None]:
# Map every vocabulary word to its position in `vocab`
word2index = {word: position for position, word in enumerate(vocab)}

In [None]:
# Encode the labels as integers: 'positive' -> 1, anything else -> 0
target_data = [int(label == 'positive') for label in reviews_df.sentiment.to_list()]
target_data[:5]

In [None]:
# Turn every review into the list of vocabulary indices of its cleaned tokens
# (tokens that clean down to an empty string are skipped)
reviews_tokens = reviews_df.tokens.to_list()
input_data = []
for raw_tokens in reviews_tokens:
    cleaned_tokens = (clean_token(raw) for raw in raw_tokens)
    input_data.append([word2index[cleaned] for cleaned in cleaned_tokens if cleaned])
input_data[0][:4]

In [None]:
# Hyper-parameters of the sentiment network
alpha = 0.01        # learning rate
iterations = 2      # passes over the training reviews
hidden_size = 100   # width of the embedding / hidden layer

# Fix NumPy's global random state so the weight initialisation is repeatable
np.random.seed(1)

def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

#2 layer network: an embedding matrix with one row per vocabulary word,
#followed by a single sigmoid output unit that predicts the sentiment label
weights_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1 

correct, error, total = (0, 0, 0)

# The last 1000 reviews are left out of training; the evaluation cell scores them.
for iteration in range(iterations):
    for i in range(len(input_data) - 1000):
        x, y = (input_data[i], target_data[i])

        #word embedding: sum the embedding rows of every token in the review
        #(a word that occurs k times contributes its row k times)
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))

        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))

        delta_2 = layer_2 - y
        # NOTE(review): delta_1 is not multiplied by the sigmoid derivative
        # layer_1 * (1 - layer_1) -- confirm this simplification is intended.
        delta_1 = delta_2.dot(weights_1_2.T)

        # running accuracy over every training step of every iteration
        if np.abs(delta_2) < 0.5:
            correct += 1  
        total += 1

        error += delta_2**2

        # `weights_0_1[x] -= ...` would update a repeated word only once, because
        # fancy-index assignment does not accumulate over duplicate indices.
        # Scale each distinct row's update by its occurrence count so the update
        # matches the summed forward pass.
        rows, counts = np.unique(np.asarray(x, dtype=np.intp), return_counts=True)
        weights_0_1[rows] -= np.outer(counts, delta_1) * alpha
        weights_1_2 -= np.outer(layer_1, delta_2) * alpha

print('Train Accuracy: ', (correct/total) * 100)

In [None]:
# Score the last 1000 reviews, which the training loop skipped
held_out_start = len(input_data) - 1000
total, correct = (0, 0)
for i in range(held_out_start, len(input_data)):
    x = input_data[i]
    y = target_data[i]

    # same forward pass as in training, without any weight update
    hidden = sigmoid(np.sum(weights_0_1[x], axis=0))
    prediction = sigmoid(np.dot(hidden, weights_1_2))

    # a prediction counts as correct when it is within 0.5 of the label
    if np.abs(prediction - y) < 0.5:
        correct += 1
    total += 1

print('Test Accuracy:', (correct/total)* 100)

In [None]:
from collections import Counter
import math

def get_similar_words(target_word, top_n=20):
    """Return the words whose embedding rows are closest to `target_word`'s.

    Closeness is the Euclidean distance between rows of the module-level
    `weights_0_1`, looked up through the module-level `word2index`.

    Parameters
    ----------
    target_word : str
        A key of `word2index`.
    top_n : int or None, default 20
        Number of neighbours to return; None returns every word.

    Returns
    -------
    list of (word, score) tuples
        Sorted from most to least similar, where score is the negated distance
        (so the target word itself comes first with a score of zero). Ties keep
        the iteration order of `word2index`.

    Raises
    ------
    KeyError
        If `target_word` is not in `word2index`.
    """
    if target_word not in word2index:
        raise KeyError(f"{target_word!r} is not in the vocabulary")
    target_vector = weights_0_1[word2index[target_word]]

    # Keep words and their row numbers aligned in dictionary order
    words = list(word2index)
    rows = np.fromiter(word2index.values(), dtype=np.intp, count=len(words))

    # One vectorised pass instead of a Python-level loop over the whole vocabulary
    differences = weights_0_1[rows] - target_vector
    distances = np.sqrt(np.sum(differences**2, axis=1))

    # A stable sort keeps dictionary order among equally distant words
    closest = np.argsort(distances, kind='stable')[:top_n]
    return [(words[i], -float(distances[i])) for i in closest]
 

In [None]:
# Nearest neighbours of 'beautiful' among the rows of weights_0_1 trained on the sentiment labels
get_similar_words('beautiful')

In [None]:
# Nearest neighbours of 'terrible' among the rows of weights_0_1 trained on the sentiment labels
get_similar_words('terrible')

In [None]:
# Nearest neighbours of 'money' among the rows of weights_0_1 trained on the sentiment labels
get_similar_words('money')

**Fill in the blanks**

In [None]:
import random

# Seed Python's RNG so the shuffle below is repeatable
random.seed(1)

# input_dataset: one list of word indices per review
# contactenated: the word indices of all reviews joined into one flat sequence
input_dataset = []
contactenated = []
for review_tokens in reviews_df.tokens.to_list():
    cleaned_tokens = (clean_token(raw) for raw in review_tokens)
    sentence2index = [word2index[cleaned] for cleaned in cleaned_tokens if cleaned]
    input_dataset.append(sentence2index)
    contactenated.extend(sentence2index)

contactenated = np.array(contactenated)

random.shuffle(input_dataset)

In [None]:
# Fill-in-the-blank training: predict each word from the words around it.
# alpha: learning rate, iteration: passes over the data,
# window: context words taken on each side of the target,
# negative: random words contrasted with the real target at every step
alpha, iteration = (0.05, 2)
hidden_size, window, negative = (50, 2, 5)

# Input embeddings start as small random values, output embeddings at zero
weights_0_1 = (np.random.rand(len(vocab), hidden_size) - 0.5) * 0.2
weights_1_2 = np.zeros((len(vocab), hidden_size))

# Label vector for one training step: slot 0 is the real word (1),
# the remaining `negative` slots are the randomly drawn words (0)
layer_2_target = np.zeros(negative + 1)
layer_2_target[0] = 1

for _ in range(iteration):
    for review in input_dataset:
        for target_i in range(len(review)):
            # Up to `window` words on each side of the target. Slicing clamps at
            # the end of the review, so the right edge needs no explicit bound.
            left_context = review[max(0, target_i - window): target_i]
            right_context = review[target_i + 1: target_i + 1 + window]
            context = left_context + right_context

            # A one-word review has no context; the mean of nothing is NaN and
            # would be written into the weights, so skip the step.
            if not context:
                continue

            # The real word followed by `negative` words picked uniformly from the
            # flattened corpus, so frequent words are drawn more often
            target_samples = [review[target_i]] + list(contactenated[(np.random.rand(negative)*len(contactenated)).astype('int').tolist()])

            layer_1 = np.mean(weights_0_1[context], axis=0)
            layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))

            delta_2 = layer_2 - layer_2_target
            delta_1 = delta_2.dot(weights_1_2[target_samples])

            weights_0_1[context] -= delta_1 * alpha
            weights_1_2[target_samples] -= np.outer(delta_2, layer_1) * alpha
         
        
        

In [None]:
# Same query as earlier, now against the weights_0_1 rows learned by the fill-in-the-blank training
get_similar_words('terrible')