In [48]:
import numpy as np
import pandas as pd
import math

In [49]:
def data_import(path, flag):
    """Load the training ratings and the test file selected by ``flag``.

    Parameters
    ----------
    path : str
        Location of the training file.
    flag : int
        ``5``, ``10`` or ``20`` select ``test5.txt``, ``test10.txt`` or
        ``test20.txt``; the training file is then read as tab-separated with
        columns labelled 1..1000. Any other value selects the sample set:
        the training file is read as space-separated with columns labelled
        1..5 and ``sample_test.txt`` is used as the test file.

    Returns
    -------
    tuple
        ``(training_data, testing_data, start, end)`` where the two frames
        come from ``pd.read_csv`` and ``start``/``end`` are the user-id
        bounds associated with the chosen test file (callers use them as
        ``range(start, end)``).
    """
    print("Importing Data...")

    colnames = ['UID', 'MID', 'R']

    # flag -> (test file, first user id, one past the last user id)
    known_sets = {
        5: ('test5.txt', 201, 301),
        10: ('test10.txt', 301, 401),
        20: ('test20.txt', 401, 501),
    }

    if flag in known_sets:
        test_path, start, end = known_sets[flag]
        train_sep, n_train_cols = "\t", 1000
    else:
        test_path, start, end = 'sample_test.txt', 177, 177 + 23
        train_sep, n_train_cols = " ", 5

    # Read the training file exactly once, with the layout that matches the
    # selected data set (no throw-away parse with the wrong separator).
    train_cols = list(range(1, n_train_cols + 1))
    training_data = pd.read_csv(path, sep=train_sep, header=None, names=train_cols)
    testing_data = pd.read_csv(test_path, sep=" ", header=None, names=colnames)

    print("Test File " + str(flag) + " Imported.")
    return training_data, testing_data, start, end

In [50]:
def remove_zeros(v1, v2):
    """Keep only the positions where both vectors hold a non-zero value.

    The vectors are walked pairwise (stopping at the shorter one); a pair is
    dropped as soon as either member equals 0. Returns two new lists with the
    surviving values of ``v1`` and ``v2`` in their original order.
    """
    surviving_pairs = [
        (left, right)
        for left, right in zip(v1, v2)
        if not (left == 0 or right == 0)
    ]

    kept_v1 = [left for left, _ in surviving_pairs]
    kept_v2 = [right for _, right in surviving_pairs]
    return kept_v1, kept_v2

  
def pearsons_cor(x, y):
    """Pearson-style correlation between two rating vectors (0 = unrated).

    Each vector's mean is taken over all of its own non-zero entries (0 when
    it has none). The sums, however, run only over positions where both
    vectors are non-zero. Returns 0 when at most one such position exists or
    when the normalising term is zero.
    """
    # Means use every rated entry of each vector, not just the overlap.
    rated_x = [value for value in x if value != 0]
    rated_y = [value for value in y if value != 0]
    mean_x = np.mean(rated_x) if rated_x else 0
    mean_y = np.mean(rated_y) if rated_y else 0

    common_x, common_y = remove_zeros(x, y)

    # A single co-rated item (or none) cannot define a correlation.
    if len(common_x) <= 1:
        return 0

    cross_sum, sq_dev_x, sq_dev_y = 0, 0, 0
    for a, b in zip(common_x, common_y):
        dev_a = a - mean_x
        dev_b = b - mean_y
        cross_sum += dev_a * dev_b
        sq_dev_x += dev_a ** 2
        sq_dev_y += dev_b ** 2

    norm = (sq_dev_x * sq_dev_y) ** 0.5
    if norm == 0:
        return 0
    return cross_sum / norm
    
def pearson_average(n_weights, n_ratings, a_avg, n_avg_ratings):
    """Predict a rating as the active user's mean plus a weighted deviation.

    For each neighbour the deviation of its rating from its own average is
    scaled by its weight; the sum is normalised by the total absolute weight
    and added to ``a_avg``. Returns 0 when the total absolute weight is 0
    (including when no neighbours are supplied).
    """
    weighted_deviation = 0
    total_abs_weight = 0

    for weight, rating, neighbour_mean in zip(n_weights, n_ratings, n_avg_ratings):
        deviation = rating - neighbour_mean
        weighted_deviation += weight * deviation
        total_abs_weight += abs(weight)

    if total_abs_weight != 0:
        return a_avg + (weighted_deviation / total_abs_weight)

    # No neighbour carried any weight: signal "no prediction" with 0.
    return 0

In [1]:
def neighbour_rating(training_data, nid, mid):
    """Return the entry of ``training_data`` at row label ``nid``, column label ``mid``."""
    neighbour_row = training_data.loc[nid]
    return neighbour_row[mid]

def average_neighbour_rating(training_data, nid):
    """Mean of the non-zero entries in row ``nid`` of ``training_data``.

    Zeros are treated as "not rated" and excluded from the mean. When the row
    has no non-zero entry the function returns 0.0 rather than letting
    ``np.mean`` produce NaN (with a RuntimeWarning) on an empty list; a NaN
    would otherwise flow into the prediction and break integer rounding.
    """
    ratings = [value for value in training_data.loc[nid] if value != 0]

    if not ratings:
        return 0.0

    return np.mean(ratings)

def average_user_rating(test_data, uid):
    """Mean of the non-zero entries in row ``uid`` of ``test_data``.

    Zeros are treated as "not rated" and excluded from the mean. When the row
    has no non-zero entry the function returns 0.0 rather than letting
    ``np.mean`` produce NaN (with a RuntimeWarning) on an empty list; a NaN
    would otherwise flow into the prediction and break integer rounding.
    """
    ratings = [value for value in test_data.loc[uid] if value != 0]

    if not ratings:
        return 0.0

    return np.mean(ratings)

def round_predictions(pred):
    """Clamp ``pred`` to the 1..5 rating scale and round it to an integer.

    Values below 1 become 1, values above 5 become 5, everything else is
    rounded with the built-in ``round`` and converted to ``int``.
    """
    clamped = min(max(pred, 1), 5)
    return int(round(clamped))
    
def output_file(prediction, path):
    """Write predictions to ``path``, one space-separated record per line.

    Parameters
    ----------
    prediction : iterable
        Records indexable as ``p[0]``, ``p[1]``, ``p[2]``; the third item is
        passed through ``round_predictions`` before being written.
    path : str
        Destination file; it is opened in write mode, so existing content is
        replaced.
    """
    # The context manager flushes and closes the handle even if formatting a
    # record raises, so the file is never left open or partially buffered.
    with open(path, 'w') as out:
        for p in prediction:
            out.write(str(p[0]) + " " + str(p[1]) + " " + str(round_predictions(p[2])) + "\n")

    print("Prediction file written at " + path)

In [55]:
# Problem dimensions used by the cells below:
#   M - movie ids run over range(1, M)
#   T - number of rows in the test-user matrix / similarity matrix
#   N - number of training rows compared against each test user
M = 1001
T = 100
N = 200
# Small debugging setup: M, T, N = 6, 1, 3 together with filename = "minitrain3.txt"

filename = "train.txt"

# K is the maximum number of neighbours per prediction; f is the flag handed
# to data_import to pick the test file (and appears in the output file name).
K = 50
f = 20

train, test, start, end = data_import(filename, f)

print("Converting data...")

# Test ratings arrive as (UID, MID, R) triples; regroup them per user so they
# can be laid out as one row per user, one column per movie id.
group = test.groupby('UID')

mids = list(range(1, M))

# Allocate the whole T x len(mids) zero matrix in one go instead of appending
# rows one at a time (each append reallocates the frame).
newtestdf = pd.DataFrame(0, index=range(T), columns=mids)

# User ids covered by the selected test file; row j of newtestdf holds users[j].
users = list(range(start, end))

for j, uid in enumerate(users):
    ug = group.get_group(uid)
    for mid, rating in zip(ug['MID'], ug['R']):
        # One .loc[row, col] call writes into the frame itself. The chained
        # form .loc[j][mid] = ... assigns to an intermediate row object and is
        # silently lost under pandas Copy-on-Write.
        newtestdf.loc[j, mid] = int(rating)


print("Calculating Similarity Values...")
# sims[i][j] = similarity between test-matrix row i and training row j.
sims = np.zeros((T, N))

# Row -> list conversion does not depend on the other loop variable, so do it
# once per training row (N times) rather than once per (test, train) pair.
train_rows = [list(train.loc[j]) for j in range(N)]

for i in range(T):
    u1 = list(newtestdf.loc[i])
    for j, n1 in enumerate(train_rows):
        sims[i][j] = pearsons_cor(u1, n1)


print("Making predictions...")
predictions = []

test_dropped = test.set_index('UID', drop=True)

# The neighbour ranking depends only on the active user's similarity row, so
# it is sorted once per user and reused for every movie that user needs.
ranked_neighbours = {}

for active_user, row in test_dropped.iterrows():

    # Only rows whose rating is 0 are requests for a prediction.
    if row['R'] != 0:
        continue

    target_movie = row['MID']
    user_pos = active_user - start  # row of this user in sims / newtestdf

    if user_pos not in ranked_neighbours:
        # Strongest correlations first, regardless of sign.
        ranked_neighbours[user_pos] = sorted(
            enumerate(sims[user_pos]), key=lambda pair: abs(pair[1]), reverse=True
        )

    # Collect up to K neighbours that actually rated the target movie. Plain
    # lists are used so that fewer than K matches do not leave zero-padded
    # slots pointing at training row 0.
    weights, n_r, n_ar = [], [], []
    for nid, weight in ranked_neighbours[user_pos]:
        if len(weights) == K:
            break
        rating = neighbour_rating(train, nid, target_movie)
        if rating != 0:
            weights.append(weight)
            n_r.append(rating)
            n_ar.append(average_neighbour_rating(train, nid))

    avg_user_r = average_user_rating(newtestdf, user_pos)
    pc_rating = pearson_average(weights, n_r, avg_user_r, n_ar)

    # pearson_average returns 0 when no neighbour carried weight; fall back to
    # the user's own average in that case.
    ## Either return avg user or avg of movie?
    predicted = avg_user_r if pc_rating == 0 else pc_rating
    predictions.append([active_user, target_movie, predicted])

output_file(predictions, "V2_PC_result" + str(f) + "_" + str(K) + ".txt")


Importing Data...
Test File 20 Imported.
Converting data...
Calculating Similarity Values...
Making predictions...
Prediction file written at V2_PC_result20_50.txt
