# COSINE SIMILARITY

In [2]:
import numpy as np
import pandas as pd
import math

In [None]:
#UPDATE FILENAME PATH AND VALUES AS NEEDED
# filename: training file handed to data_import().
filename = "train.txt"
# M: one past the highest movie id (movie columns are built as range(1, M)).
# T: number of active (test) users, i.e. rows of the test matrix and of `sims`.
# N: number of training users each active user is compared against.
M, T, N = 1001, 100, 200
# K: maximum number of neighbours used for a single prediction.
# f: selects the test file; data_import() handles the values 5, 10 and 20.
K, f = 200, 20

In [1]:
def data_import(path, flag):
    """Load the training matrix and the test file selected by ``flag``.

    Parameters
    ----------
    path : str
        Tab-separated training file without a header row. Its columns are
        labelled 1..1000.
    flag : int
        Which test set to load: 5 -> ``test5.txt``, 10 -> ``test10.txt``,
        20 -> ``test20.txt`` (space-separated, read from the working
        directory, columns named ``UID``, ``MID``, ``R``).

    Returns
    -------
    tuple
        ``(training_data, testing_data, start, end)`` where ``start``/``end``
        are the bounds paired with that test file (201/301, 301/401 or
        401/501); the notebook uses ``range(start, end)`` as the user ids.

    Raises
    ------
    ValueError
        If ``flag`` is not 5, 10 or 20. Without this check the function would
        fall through and return ``None``, which surfaces at the call site as
        an unrelated unpacking error.
    """
    # flag -> (test file, first bound, second bound)
    test_sets = {
        5: ('test5.txt', 201, 301),
        10: ('test10.txt', 301, 401),
        20: ('test20.txt', 401, 501),
    }
    if flag not in test_sets:
        raise ValueError(
            "Unsupported flag " + repr(flag) + "; expected one of "
            + str(sorted(test_sets))
        )
    test_path, start, end = test_sets[flag]

    print("Importing Data...")

    cols = list(range(1, 1001))
    training_data = pd.read_csv(path, sep="\t", header=None, names=cols)

    colnames = ['UID', 'MID', 'R']
    testing_data = pd.read_csv(test_path, sep=" ", header=None, names=colnames)
    print("Test File " + str(flag) + " Imported.")

    return training_data, testing_data, start, end


In [2]:
def remove_zeros(v1, v2):
    """Keep only the positions where both vectors are non-zero.

    The vectors are walked in lockstep (stopping at the shorter one); a pair
    is dropped as soon as either member equals 0.

    Returns two lists holding the surviving entries of ``v1`` and ``v2``.
    """
    kept_pairs = [
        (left, right)
        for left, right in zip(v1, v2)
        if not (left == 0 or right == 0)
    ]

    kept_v1 = [left for left, _ in kept_pairs]
    kept_v2 = [right for _, right in kept_pairs]

    return kept_v1, kept_v2

def cosine_sim(v1, v2):
    """Cosine similarity of two rating vectors over their co-rated items.

    Positions where either vector is 0 are ignored. If at most one co-rated
    position remains the similarity is reported as 0, because a single shared
    item always yields a cosine of 1 and carries no information.

    Parameters
    ----------
    v1, v2 : sequence of numbers (list, ndarray, pandas Series)
        Rating vectors; if the lengths differ the longer one is truncated.

    Returns
    -------
    float
        Similarity over the co-rated positions, or 0.0 when there are fewer
        than two of them.
    """
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)

    # Compare only the common prefix, the same way zip() pairs elements.
    common = min(len(a), len(b))
    a, b = a[:common], b[:common]

    # One vectorised mask instead of a Python-level element loop.
    co_rated = (a != 0) & (b != 0)
    a, b = a[co_rated], b[co_rated]

    if a.size <= 1:
        return 0.0

    numerator = np.dot(a, b)
    denominator = math.sqrt(np.dot(a, a)) * math.sqrt(np.dot(b, b))
    if denominator == 0:
        # Only reachable through floating-point underflow; avoid dividing by 0.
        return 0.0

    return float(numerator / denominator)

In [3]:
def weighted_average(n_weights, n_ratings):
    """Weighted mean of ``n_ratings`` using ``n_weights`` as the weights.

    Weights and ratings are paired positionally (extra entries of the longer
    input are ignored). Returns 0 when either the weighted sum or the total
    weight is 0, otherwise ``sum(w * r) / sum(w)``.
    """
    pairs = list(zip(n_weights, n_ratings))

    weighted_sum = sum(weight * rating for weight, rating in pairs)
    weight_total = sum(weight for weight, _ in pairs)

    if weighted_sum != 0 and weight_total != 0:
        return weighted_sum / weight_total
    return 0

def neighbour_rating(training_data, nid, mid):
    """Return the entry of ``training_data`` at row label ``nid``, column label ``mid``."""
    neighbour_row = training_data.loc[nid]
    return neighbour_row[mid]

def average_user_rating(test_data, uid):
    """Mean of the non-zero entries in row ``uid`` of ``test_data``.

    Zeros are excluded from the mean. If the row holds no non-zero entry,
    ``np.mean`` is applied to an empty list and the result is NaN.
    """
    user_row = test_data.loc[uid]
    rated = [value for value in user_row if value != 0]

    return np.mean(rated)

def round_predictions(pred):
    """Clamp ``pred`` to the 1..5 rating scale and round it to an integer.

    Values below 1 become 1 and values above 5 become 5. Anything in between
    goes through the built-in ``round``, which rounds exact halves to the
    nearest even integer (2.5 -> 2, 3.5 -> 4).
    """
    clamped = min(max(pred, 1), 5)
    return int(round(clamped))
    
def output_file(prediction, path):
    """Write predictions to ``path``, one ``"<user> <movie> <rating>"`` line each.

    Parameters
    ----------
    prediction : iterable of sequences
        Each item supplies the user id at index 0, the movie id at index 1 and
        the raw predicted rating at index 2; the rating is passed through
        ``round_predictions`` before being written.
    path : str
        Destination file; it is created or overwritten.
    """
    # The context manager flushes and closes the handle even if a row fails
    # to format, so the file is never left open or partially buffered.
    with open(path, 'w') as out:
        for p in prediction:
            out.write(str(p[0]) + " " + str(p[1]) + " " + str(round_predictions(p[2])) + "\n")

    print("Prediction file written at " + path)

In [8]:
train, test, start, end = data_import(filename, f)

print("Converting data...")

# Active users are numbered start..end-1. Row (uid - start) of the dense test
# matrix holds that user's known ratings; the column label is the movie id.
mids = list(range(1, M))
users = list(range(start, end))  # e.g. 201..300 for the 5-rating test file

# Fill a NumPy array and wrap it in a DataFrame once. Writing through
# `frame.loc[row][col] = value` is chained assignment: pandas may apply it to
# a temporary copy and leave the frame untouched, and growing a frame one row
# at a time is slow.
known_ratings = np.zeros((T, len(mids)), dtype=int)
for uid, mid, rating in zip(test['UID'], test['MID'], test['R']):
    if not start <= uid < end:
        continue
    if not 1 <= mid < M:
        raise ValueError("Movie id " + str(mid) + " is outside 1.." + str(M - 1))
    known_ratings[uid - start, mid - 1] = int(rating)  # movie ids are 1-based
newtestdf = pd.DataFrame(known_ratings, columns=mids)

print("Calculating Similarity Values...")
sims = np.zeros((T, N))

# Pull each training row out of the frame once instead of once per pair.
train_rows = [train.loc[j].to_numpy() for j in range(N)]

for i in range(T):
    active_ratings = newtestdf.loc[i].to_numpy()
    for j in range(N):
        sims[i][j] = cosine_sim(active_ratings, train_rows[j])

print(sims.shape)

print("Making predictions...")

predictions = []

test_dropped = test.set_index('UID', drop=True)

# Minimum similarity a training user needs to count as a neighbour.
threshold = 0.5
for uid, row in test_dropped.iterrows():

    # Rows with R == 0 are the ratings that have to be predicted.
    if row['R'] != 0:
        continue

    active_user, target_movie = uid, row['MID']

    # Training users ordered from most to least similar; sorted() is stable,
    # so ties keep ascending user order.
    ranked = sorted(
        enumerate(sims[active_user - start]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep at most K neighbours that rated the target movie and are similar
    # enough.
    weights, n_r = [], []
    for nid, sim in ranked:
        if len(weights) == K:
            break
        if not threshold <= sim:
            continue
        rating = neighbour_rating(train, nid, target_movie)
        if rating != 0:
            weights.append(sim)
            n_r.append(rating)

    p_rating = weighted_average(weights, n_r)
    if p_rating == 0:
        # No usable neighbour: fall back to the active user's own mean rating.
        # TODO: decide whether the movie's mean rating is the better fallback.
        avg_user_r = average_user_rating(newtestdf, active_user - start)
        predictions.append([active_user, target_movie, avg_user_r])
    else:
        predictions.append([active_user, target_movie, p_rating])

output_file(predictions, "New_Cosine_result" + str(f) + "_" + str(K) + ".txt")

Importing Data...
Test File 20 Imported.
Converting data...
Calculating Similarity Values...
(100, 200)
Making predictions...
Prediction file written at New_Cosine_result20_200.txt
