# ITEM-BASED

In [138]:
import numpy as np
import pandas as pd
import math

import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [139]:
def data_import(path, flag, M):
    """Load the training matrix and the test file selected by ``flag``.

    Args:
        path: Path of the tab-separated training file (no header row).
        flag: Which test set to load; one of 5, 10 or 20. It selects
            ``test5.txt``, ``test10.txt`` or ``test20.txt`` in the current
            working directory.
        M: One more than the number of columns in the training file; the
            training columns are labelled ``1 .. M-1``.

    Returns:
        A tuple ``(training_data, testing_data, start, end)`` where
        ``training_data`` and ``testing_data`` are DataFrames (the latter with
        columns ``UID``, ``MID``, ``R``) and ``start``/``end`` are the bounds
        associated with the chosen test file (used by the caller as
        ``range(start, end)`` of user ids).

    Raises:
        ValueError: If ``flag`` is not one of the supported test sets.
    """
    # flag -> (test file, first user id, one past the last user id)
    test_sets = {
        5: ('test5.txt', 201, 301),
        10: ('test10.txt', 301, 401),
        20: ('test20.txt', 401, 501),
    }

    # Fail fast with a clear message instead of silently returning None,
    # which would only surface later as an unpacking error in the caller.
    if flag not in test_sets:
        raise ValueError(
            "Unsupported test flag " + repr(flag)
            + "; expected one of " + str(sorted(test_sets))
        )

    print("Importing Data...")

    test_path, start, end = test_sets[flag]

    cols = list(range(1, M))
    training_data = pd.read_csv(path, sep="\t", header=None, names=cols)

    colnames = ['UID', 'MID', 'R']
    testing_data = pd.read_csv(test_path, sep=" ", header=None, names=colnames)

    print("Test File " + str(flag) + " Imported.")
    return training_data, testing_data, start, end
    

In [144]:
def remove_zeros(v1, v2):
    """Drop every position where either vector holds a zero.

    The two inputs are walked in lockstep (stopping at the shorter one) and
    only the pairs in which both entries are non-zero are kept.

    Returns:
        Two lists of equal length with the surviving entries of ``v1`` and
        ``v2`` respectively, in their original order.
    """
    kept_pairs = [
        (left, right)
        for left, right in zip(v1, v2)
        if not (left == 0 or right == 0)
    ]

    kept_v1 = [left for left, _ in kept_pairs]
    kept_v2 = [right for _, right in kept_pairs]
    return kept_v1, kept_v2


def user_rating(test_data, uid):
    """Collect the non-zero entries of ``test_data[uid]``.

    Returns:
        A pair ``(ratings, indexes)``: the non-zero values in order, and the
        1-based position of each of those values within ``test_data[uid]``.
    """
    column = list(test_data[uid])

    # Number positions from 1 and keep only the entries that are non-zero.
    rated = [
        (position, value)
        for position, value in enumerate(column, start=1)
        if value != 0
    ]

    kept_ratings = [value for _, value in rated]
    kept_positions = [position for position, _ in rated]
    return kept_ratings, kept_positions


def adj_cosine_sim(df1, df2, m_id, t_id, u_id, avg_list):
    """Adjusted cosine similarity between the rows ``m_id`` and ``t_id``.

    Each row of ``df1`` is extended with the entry of column ``u_id`` from the
    same row of ``df2``. Only positions where both extended rows are non-zero
    take part. Every rating is centred on the mean rating of the user that
    produced it before the cosine is computed.

    Args:
        df1: DataFrame whose rows are looked up with ``.loc[m_id]`` and
            ``.loc[t_id]``; one entry per training user.
        df2: DataFrame holding the active user's ratings, addressed as
            ``.loc[row, u_id]``.
        m_id: Row label of the first item.
        t_id: Row label of the second (target) item.
        u_id: Column label of the active user in ``df2``.
        avg_list: Per-user mean ratings. Position ``k`` is the mean of the
            user in position ``k`` of a ``df1`` row; the active user's mean is
            read from position ``len(df1 row) + u_id``.
            NOTE(review): this layout (training users first, then test users)
            matches how the calling script builds the array - confirm if the
            function is reused elsewhere.

    Returns:
        ``0`` when fewer than two users rated both items or when the
        denominator vanishes; otherwise the absolute value of the adjusted
        cosine similarity.
    """
    train_m = list(df1.loc[m_id])
    train_t = list(df1.loc[t_id])
    n_train = len(train_m)

    m1 = np.asarray(train_m + [df2.loc[m_id, u_id]], dtype=float)
    m2 = np.asarray(train_t + [df2.loc[t_id, u_id]], dtype=float)

    # Remember which user each position belongs to *before* filtering, so the
    # correct per-user mean is still known once the zero entries are dropped.
    user_index = np.arange(n_train + 1)
    user_index[-1] = n_train + u_id

    co_rated = (m1 != 0) & (m2 != 0)
    if np.count_nonzero(co_rated) <= 1:
        return 0

    user_means = np.asarray(avg_list, dtype=float)[user_index[co_rated]]
    dev1 = m1[co_rated] - user_means
    dev2 = m2[co_rated] - user_means

    numerator = float(np.dot(dev1, dev2))
    denominator = math.sqrt(float(np.dot(dev1, dev1))) * math.sqrt(float(np.dot(dev2, dev2)))

    if denominator == 0:
        return 0
    return abs(numerator / denominator)
        
def weighted_average(n_weights, u_ratings):
    """Return the average of ``u_ratings`` weighted by ``n_weights``.

    The inputs are paired position by position. The result is ``0`` when
    either the weighted sum or the sum of the weights is zero (which also
    covers empty input).
    """
    pairs = list(zip(n_weights, u_ratings))

    weighted_sum = sum(weight * rating for weight, rating in pairs)
    weight_total = sum(weight for weight, _ in pairs)

    # Guard clause: nothing meaningful to average, and avoids dividing by zero.
    if weighted_sum == 0 or weight_total == 0:
        return 0
    return weighted_sum / weight_total

def round_predictions(pred):
    """Map a raw prediction onto an integer rating between 1 and 5.

    Values below 1 become 1, values above 5 become 5, and anything in between
    is rounded with the built-in ``round`` and converted to ``int``.
    """
    lowest, highest = 1, 5

    if pred < lowest:
        return lowest
    if pred > highest:
        return highest
    return int(round(pred))
    
def output_file(prediction, path):
    """Write the predictions to ``path``, one per line.

    Each element of ``prediction`` is indexed as ``p[0]``, ``p[1]``, ``p[2]``;
    the first two are written as-is and the third is passed through
    ``round_predictions`` before being written. Fields are separated by a
    single space.

    Args:
        prediction: Iterable of indexable triples.
        path: Destination file path; an existing file is overwritten.
    """
    # The context manager guarantees the file is flushed and closed even if
    # formatting one of the rows raises.
    with open(path, 'w') as f:
        for p in prediction:
            f.write(str(p[0]) + " " + str(p[1]) + " " + str(round_predictions(p[2])) + "\n")

    print("Prediction file written at " + path)
            

In [147]:
# K: number of most-similar rated items used for the weighted average.
# f: which test file to load (passed to data_import as the flag).
# MAX_NEI: maximum number of rated items for which a similarity is computed.
K, f, MAX_NEI = 20, 20, 20
# M: one more than the number of movies (movie ids are 1 .. M-1).
# T: number of test users. N: number of training users.
M, T, N = 1001, 100, 200
filename = "train.txt"

train, test, start, end = data_import(filename, f, M)

# After transposing, rows are movies (labels 1 .. M-1) and columns are users.
train = train.T

all_movies = train.copy()

print("Converting data...")

group = test.groupby('UID')

mids = list(range(1, M))
# One all-zero row per test user, one column per movie id.
newtestdf = pd.DataFrame(0, index=range(T), columns=mids)

users = list(range(start, end))

for x, uid in enumerate(users):
    # Test user number x is stored as column N + x, after the training users.
    j = N + x
    ug = group.get_group(uid)
    all_movies[j] = 0
    for _, r in ug.iterrows():
        # Single .loc[row, col] writes: chained indexing (df[j][k] = v) may
        # assign into a temporary copy instead of the frame itself.
        all_movies.loc[r['MID'], j] = int(r['R'])
        newtestdf.loc[x, r['MID']] = int(r['R'])

# Mean of the non-zero ratings of every user (training users, then test users).
avg_user_ratings = np.zeros(N + T)

for i in range(N + T):
    u_r = [r for r in all_movies[i] if r != 0]
    avg_user_ratings[i] = np.mean(u_r)

# Rows become movies and columns become test-user offsets (0 .. T-1).
newtestdf = newtestdf.T

print("Making predictions...")

predictions = []

test_dropped = test.set_index('UID', drop=True)

# Number of (user, movie) pairs for which a prediction was requested.
total = 0

for active_user, row in test_dropped.iterrows():

    # Only rows with a zero rating are prediction requests.
    if row['R'] != 0:
        continue

    total += 1
    target_movie = row['MID']
    user_offset = active_user - start
    active_user_ratings, active_user_movie_ids = user_rating(newtestdf, user_offset)

    # Similarity of the target movie to (at most MAX_NEI of) the movies the
    # active user has already rated.
    candidate_ids = active_user_movie_ids[:MAX_NEI]
    sims = [
        adj_cosine_sim(train, newtestdf, movie_id, target_movie, user_offset, avg_user_ratings)
        for movie_id in candidate_ids
    ]

    # Keep the K candidates with the largest absolute similarity.
    ranked = sorted(
        zip(candidate_ids, sims, active_user_ratings),
        key=lambda entry: abs(entry[1]),
        reverse=True,
    )[:K]
    weights = [sim for _, sim, _ in ranked]
    ratings = [rating for _, _, rating in ranked]

    p_rating = weighted_average(weights, ratings)
    if p_rating == 0:
        # No usable neighbours: fall back to the active user's mean rating.
        avg_user_r = np.mean(active_user_ratings)
        predictions.append([active_user, target_movie, avg_user_r])
    else:
        predictions.append([active_user, target_movie, p_rating])


output_file(predictions, "V5_ItemBased_result" + str(f) + "_" + str(K) + ".txt")
#output_file(predictions, "SAMPLE_V2_ItemBased_result" + str(f) + "_" + str(K) + ".txt")


Importing Data...
Test File 20 Imported.
Converting data...
Making predictions...
Prediction file written at V5_ItemBased_result20_20.txt
