# MY ALGORITHM

In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
def data_import(path, flag):
    
    print("Importing Data...")
    
    cols = [i for i in range(1, 1001)]
    training_data = pd.read_csv(path, sep="\t", header=None, names = cols)
    
    colnames = ['UID', 'MID', 'R']
    
    if flag == 5:
        testing_data5 = pd.read_csv('test5.txt', sep=" ", header=None, names = colnames) 
        start, end = 201, 301
        print("Test File " + str(flag) + " Imported.")
        return training_data, testing_data5, start, end
    
    elif flag == 10:
        testing_data10 = pd.read_csv('test10.txt', sep=" ", header=None, names = colnames)
        start, end = 301, 401
        print("Test File " + str(flag) + " Imported.")
        return training_data, testing_data10, start, end
    
    elif flag == 20:
        testing_data20 = pd.read_csv('test20.txt', sep=" ", header=None, names = colnames)
        start, end = 401, 501
        print("Test File " + str(flag) + " Imported.")
        return training_data, testing_data20, start, end
    

In [3]:
def remove_zeros(v1, v2):
    
    new_v1, new_v2 = [], []
    for i, j in zip(v1, v2):
        if i == 0:
            continue
        if j == 0:
            continue
        else:
            new_v1.append(i)
            new_v2.append(j)
            
    return new_v1, new_v2

def cosine_sim(v1, v2):
    
    v1, v2 = remove_zeros(v1, v2)
    v1, v2 = np.array(v1), np.array(v2)
    
    if len(v1) <= 1:
        return 0
    else:
        numerator = sum(v1*v2)
        denominator = math.sqrt(sum(v1*v1)) * math.sqrt(sum(v2*v2))

        result = numerator / denominator

        return result
    
def weighted_average(n_weights, n_ratings):
    
    num, denom = 0,0
    
    for i, j in zip(n_weights, n_ratings):
        num += i*j
        denom += i
    
    if num == 0 or denom == 0:
        return 0
    else:
        return num / denom
    
def pearsons_cor(x, y):
    
    x_up = [i for i in x if i != 0]
    y_up = [i for i in y if i != 0]
    
    x_bar, y_bar = 0, 0
    if x_up: x_bar = np.mean(x_up)
    if y_up: y_bar = np.mean(y_up)
    
    x, y = remove_zeros(x, y)

    if len(x) <= 1: return 0
    
    else:
    
        num, sqx, sqy = 0, 0, 0

        for i, j in zip(x, y):
            num += (i-x_bar)*(j-y_bar)
            sqx += (i-x_bar)**2
            sqy += (j-y_bar)**2

        if (sqx*sqy)** 0.5 == 0: return 0
        else: return num / ((sqx*sqy )** 0.5)
    
def pearson_average(n_weights, n_ratings, a_avg, n_avg_ratings):
    
    num, denom = 0,0
    
    for i, j, k in zip(n_weights, n_ratings, n_avg_ratings):
        num += i*(j-k)
        denom += abs(i)
    
    if denom == 0: return 0
    else:
        result = a_avg + (num / denom)
        return result
    

def adj_cosine_sim(df1, df2, m_id, t_id, u_id, avg_list): 
    
    m1 = list(df1.loc[m_id])
    m2 = list(df1.loc[t_id])
    
    m1.append(df2.loc[m_id][u_id])
    m2.append(df2.loc[t_id][u_id])

    m1, m2 = remove_zeros(m1, m2)
    
    if len(m1) <= 1:
        return 0
    else:
        m1, m2 = np.array(m1), np.array(m2)
        numerator, denom1, denom2 = 0, 0, 0
        
        for i in range(0, len(m1)):
            r_u = avg_list[i]
            
            numerator += (m1[i]-r_u)*(m2[i]-r_u)
            denom1 += math.pow(m1[i]-r_u, 2)
            denom2 += math.pow(m2[i]-r_u, 2)

        denominator = math.sqrt(denom1) * math.sqrt(denom2)
        
        if denominator == 0: return 0
        else: return abs(numerator / denominator)

In [4]:
def user_rating(test_data, uid):
    
    ratings = list(test_data[uid])
    indexes = [i+1 for i, r in enumerate(ratings) if r != 0]
    ratings = [r for r in ratings if r != 0]
    
    return ratings, indexes

def get_total_ratings(training_data, test_data, mid):
    
    tr = list(training_data[mid])
    ur = list(test_data[mid])
    
    ratings = tr + ur
    ratings = [i for i in ratings if i != 0]
        
    return len(ratings)

def neighbour_rating(training_data, nid, mid):
    
    return training_data.loc[nid][mid] 

def average_neighbour_rating(training_data, nid):
    
    ratings = list(training_data.loc[nid])
    ratings = [i for i in ratings if i != 0]
    
    if ratings: return np.mean(ratings)
    else: return 0
    
def average_user_rating(test_data, uid):
    
    ratings = list(test_data.loc[uid])
    ratings = [i for i in ratings if i != 0]
    
    if ratings: return np.mean(ratings)
    else: return 0

def average_movie_rating(training_data, test_data, mid):
    
    tr = list(training_data[mid])
    ur = list(test_data[mid])

    ratings = tr+ur
    ratings = [i for i in ratings if i != 0]
    
    if ratings: 
        return np.mean(ratings)
    else: 
        return 0

def get_iuf(m_i, T_U):
    
    if m_i == 0:
        return math.log(T_U, 10)
    else:
        return math.log(T_U/m_j, 10)
    

def round_predictions(pred):
    
    if pred < 1:
        return 1
    elif pred > 5:
        return 5
    else: 
        return int(round(pred))
    
def output_file(prediction, path):
    
    f = open(path, 'w')

    for p in prediction:
        f.write(str(p[0]) + " " + str(p[1]) + " " + str(round_predictions(p[2])) + "\n")
    
    print("Prediction file written at " + path)

In [10]:
for f in [5, 10, 20]:

    M, T, N = 1001, 100, 200
    Total_number_of_Users = T + N

    filename = "train.txt"
    K = 50
    MAX_NEI = 5

    train, test, start, end = data_import(filename, f)

    print("Converting data...")

    group = test.groupby('UID')

    mids = [i for i in range(1,M)]
    newtestdf = pd.DataFrame(columns = mids)

    z = [0 for i in range(1,M)]
    for i in range(0, T):
        newtestdf.loc[i] = z

    users = [i for i in range(start, end)] #201, 301
    
    all_movies = (train.T).copy()
    
    j, x = N, 0
    for uid in users:
        ug = group.get_group(uid)
        all_movies[j] = [0 for i in range(1, M)]
        for i, r in ug.iterrows():
            newtestdf.loc[x][r['MID']] = int(r['R'])
            all_movies[j][r['MID']] = int(r['R'])
        j += 1
        x += 1
        
    avg_user_ratings = np.zeros((N+T))

    for i in range(0, N+T):

        u_r = [r for r in all_movies[i] if r != 0]
        avg_user_ratings[i] = np.mean(u_r)

    iuf_train = train.copy()
    iuf_test = newtestdf.copy()

    for i in range(1, M):
        m_j = get_total_ratings(train, newtestdf, i)
        iuf = get_iuf(m_j, Total_number_of_Users)
        for k in range(0, N):
            iuf_train[i][k] = iuf_train[i][k] * iuf
        for l in range(0, T):
            iuf_test[i][l] = iuf_test[i][l] * iuf

    print("Calculating Similarity Values...")
    sims_cos = np.zeros((T, N))
    sims_pc = np.zeros((T, N))

    for i in range(0, T):
        v1 = iuf_test.loc[i]
        for j in range(0, N):
            v2 = iuf_train.loc[j]    
            sims_cos[i][j] = cosine_sim(v1, v2)
            sims_pc[i][j] = pearsons_cor(v1, v2)

    print(sims_cos.shape)
    
    ib_train = (train.T).copy()
    ib_newtestdf = (newtestdf.T).copy()

    f = open("Orig_Preds" + str(f) + str(K), 'w') 
    
    print("Making predictions...")

    predictions = []

    for uid in users:

        active_user = uid

        ids1, temp1 = zip(*sorted(enumerate(sims_cos[uid-start]), key=lambda i: i[1], reverse = True))
        ids2, temp2 = zip(*sorted(enumerate(sims_pc[uid-start]), key=lambda i: abs(i[1]), reverse = True))
        avg_user_r = average_user_rating(newtestdf, active_user-start)
        active_user_ratings, active_user_movie_ids = user_rating(ib_newtestdf, active_user-start)

        ug = group.get_group(uid)
        for i, row in ug.iterrows():
            if row['R'] == 0:

                target_movie = row['MID']

                cos_weights = np.zeros((K))
                cos_neighbours = np.zeros((K))

                pc_weights = np.zeros((K))
                pc_neighbours = np.zeros((K))
                
                ib_sims = np.zeros((MAX_NEI))
                ib_weights = np.zeros((MAX_NEI))
                ib_neighbours = np.zeros((MAX_NEI))
                ib_ratings = np.zeros((MAX_NEI))
                
                x = 0
                for movie_id in active_user_movie_ids:
                    if x == MAX_NEI:
                        break
                    ib_sims[x] = adj_cosine_sim(ib_train, ib_newtestdf, movie_id, target_movie, active_user-start, avg_user_ratings)
                    x += 1

                ib_ids, ib_temp, ib_r = zip(*sorted(zip(active_user_movie_ids, ib_sims, active_user_ratings), key=lambda i: abs(i[1]), reverse = True))
                ib_neighbours = ib_ids[:K]
                ib_weights = ib_temp[:K]
                ib_ratings = ib_r[:K]

                check1, check2 = 0, 0
                for s in range(0, N):
                    if train.loc[ids1[s]][target_movie] != 0 and check1 < K and s < len(temp1):
                        cos_weights[check1] = temp1[s]
                        cos_neighbours[check1] = ids1[s] 
                        check1 += 1

                    if train.loc[ids2[s]][target_movie] != 0 and check2 < K and s < len(temp2):
                        pc_weights[check2] = temp2[s]
                        pc_neighbours[check2] = ids2[s] 
                        check2 += 1

                n_cos, n_pc, n_ar =[], [], []

                for n in cos_neighbours:
                    n_cos.append(neighbour_rating(train, n, target_movie))

                for n in pc_neighbours:
                    n_pc.append(neighbour_rating(train, n, target_movie))
                    n_ar.append(average_neighbour_rating(train, n))
                
                cos_rating = weighted_average(cos_weights, n_cos)
                pc_rating = pearson_average(pc_weights, n_pc, avg_user_r, n_ar)
                ib_rating = weighted_average(ib_weights, ib_ratings)

                f.write(str(active_user)+ " " + str(target_movie)+ " " + str(cos_rating) + " " + str(pc_rating) + " " + str(ib_rating) + "\n")
                
                final_rating = 0.5*pc_rating + 0.4*cos_rating + 0.1*ib_rating

                if final_rating == 0:
                    a_m_r = average_movie_rating(train, newtestdf, target_movie)
                    if a_m_r != 0: 
                        predictions.append([active_user, target_movie, a_m_r]) 
                    else: 
                        predictions.append([active_user, target_movie, avg_user_r]) 
                else: predictions.append([active_user, target_movie, final_rating])

    output_file(predictions, "V4_MyOwn" + str(f) + "_" + str(K) + ".txt")


Importing Data...
Test File 5 Imported.
Converting data...
Calculating Similarity Values...
(100, 200)
Making predictions...
Prediction file written at V4_MyOwn<_io.TextIOWrapper name='Orig_Preds550' mode='w' encoding='UTF-8'>_50.txt
Importing Data...
Test File 10 Imported.
Converting data...
Calculating Similarity Values...
(100, 200)
Making predictions...
Prediction file written at V4_MyOwn<_io.TextIOWrapper name='Orig_Preds1050' mode='w' encoding='UTF-8'>_50.txt
Importing Data...
Test File 20 Imported.
Converting data...
Calculating Similarity Values...
(100, 200)
Making predictions...
Prediction file written at V4_MyOwn<_io.TextIOWrapper name='Orig_Preds2050' mode='w' encoding='UTF-8'>_50.txt
