# NN + Cosine Distance on IMDB Movie Review Dataset

In [1]:
import gzip
import os.path as op

import numpy as np
import pandas as pd

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset

In [2]:
if not op.isfile("train.csv") and op.isfile("val.csv") and op.isfile("test.csv"):
    download_dataset()

    df = load_dataset_into_to_dataframe()
    partition_dataset(df)

In [3]:
df_train = pd.read_csv("train.csv")
df_val = pd.read_csv("val.csv")
df_test = pd.read_csv("test.csv")

In [4]:
from sklearn.feature_extraction.text import CountVectorizer


cv = CountVectorizer(lowercase=True, max_features=10_000, stop_words="english")

cv.fit(df_train["text"])

X_train = cv.transform(df_train["text"])
X_val = cv.transform(df_val["text"])
X_test = cv.transform(df_test["text"])

In [37]:
from tqdm import tqdm
from numpy.linalg import norm

k = 2

predicted_classes = []

for i in tqdm(range(df_test.shape[0]), total=df_test.shape[0]):

    test_vec = X_test[i].toarray().reshape(-1)
    test_label = df_test.iloc[i]["label"]
    distance_from_test_instance = []
    
    for j in range(df_train.shape[0]):
        train_vec = X_train[j].toarray().reshape(-1)
        train_label = df_train.iloc[j]["label"]
        
        cosine = 1 - np.dot(test_vec, train_vec)/(norm(test_vec)*norm(train_vec))
        distance_from_test_instance.append(cosine)
        
    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    
    top_k_class = df_train.iloc[sorted_idx[:k]]["label"].values
    predicted_class = np.argmax(np.bincount(top_k_class))
    
    predicted_classes.append(predicted_class)
        
print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["label"].values))

100%|██████████████████████████████████| 10000/10000 [19:01:07<00:00,  6.85s/it]

Accuracy: 0.651



