In [62]:
import os

import numpy as np
from numpy import linalg as la

#### Load data from train and test files

In [63]:
def get_train_data():
    """Load the training ratings from ``traindata/train.txt``.

    Every non-blank line is split on tab characters and each field is parsed
    as an integer, producing one row per line.

    Returns:
        list: one 1-D integer ``numpy`` array per non-blank line, in file
        order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field is not an integer literal.
    """
    train = []
    # `with` releases the handle even when a malformed field aborts parsing.
    with open('traindata/train.txt', 'r') as file:
        for line in file:
            line = line.rstrip('\n')
            if not line.strip():
                # A blank line (typically the one ending the file) holds no row.
                continue
            train.append(np.array([int(e) for e in line.split('\t')]))
    return train

In [64]:
def get_test_data(filename):
    """Load one test set from ``testdata/<filename>.txt``.

    Every non-blank line holds space-separated integers, read here as
    ``user item rating`` with a 1-based item number.  A rating of 0 marks a
    row whose rating still has to be predicted.  The rows of one user are
    expected to be contiguous; the matrix row of a user is its id minus the
    id found on the first line of the file.

    Args:
        filename: name of the test file, without directory or ``.txt`` suffix.

    Returns:
        tuple: ``(original_test_data, indexes, testdata)`` where

        * ``original_test_data`` is the list of parsed rows (lists of ints)
          in file order,
        * ``indexes`` holds one integer array per user, in file order, with
          the zero-based item columns to predict (empty when a user has
          none, so positions stay aligned with the users),
        * ``testdata`` is a 100 x 1000 float matrix containing the known
          ratings, ``-1`` where a rating must be predicted and 0 elsewhere.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field is not an integer literal.
        IndexError: if a user offset or item column falls outside the matrix.
    """
    testdata = np.zeros([100, 1000])
    original_test_data = []
    indexes = []
    pending = []          # columns to predict for the user being read
    first_user = None     # id on the first row; defines matrix row 0
    current_user = None
    # `with` releases the handle even when a malformed line aborts parsing.
    with open('testdata/' + filename + '.txt', 'r') as file:
        for line in file:
            line = line.rstrip('\n')
            if not line.strip():
                # A blank line (typically the one ending the file) holds no row.
                continue
            row = [int(e) for e in line.split(' ')]
            original_test_data.append(row)
            user, column, rating = row[0], row[1] - 1, row[2]
            if first_user is None:
                first_user = user
            elif user != current_user:
                # Close the previous user unconditionally so that indexes[k]
                # always belongs to the k-th user of the file.
                indexes.append(np.array(pending, dtype=int))
                pending = []
            current_user = user
            if rating == 0:
                # Recorded on every row, including the first row of a user,
                # so the -1 markers and `indexes` never disagree.
                pending.append(column)
                testdata[user - first_user][column] = -1
            else:
                testdata[user - first_user][column] = rating
    indexes.append(np.array(pending, dtype=int))
    return original_test_data, indexes, testdata

#### Methods to find similar users and predict ratings

In [65]:
# Check whether the train user has a rating stored for the requested movie
def does_rating_exist_to_predict(train,test_index):
    """Tell whether ``train`` holds a rating at position ``test_index``.

    A stored value of 0 counts as "no rating"; any other value counts as a
    rating.  Always returns a plain ``bool``.
    """
    has_rating = train[test_index] != 0
    return bool(has_rating)

In [66]:
# Count the positions at which both vectors hold a rating
def num_coexisting_ratings(train,test):
    """Number of positions where ``train`` and ``test`` are both non-zero.

    Positions are taken from ``train``; ``test`` is indexed with the same
    positions, so it must be at least as long as ``train``.
    """
    return sum(
        1
        for position in range(len(train))
        if train[position] != 0 and test[position] != 0
    )

In [67]:
# Strip the columns that still have to be predicted from train and test ratings
def return_filtered_vectors(train, test, test_indexes):
    """Remove the ``test_indexes`` positions from both rating sets.

    Args:
        train: 2-D collection of ratings; the listed columns are removed.
        test: 1-D ratings of the test user; the listed entries are removed.
        test_indexes: positions to drop.

    Returns:
        tuple: ``(train_without_columns, test_without_entries)`` as new arrays.
    """
    return (
        np.delete(train, test_indexes, axis=1),
        np.delete(test, test_indexes),
    )

In [68]:
# Return cosine similarity between the mean-centred ratings of test and train user
def find_cosine_similarity(vec1,vec2):
    """Cosine of the angle between two rating vectors after mean-centring.

    The value of ``return_average`` for each vector is subtracted from every
    entry of that vector (entries equal to 0 included) before the cosine is
    computed.

    Args:
        vec1: 1-D numeric array of ratings.
        vec2: 1-D numeric array of ratings, same length as ``vec1``.

    Returns:
        The similarity, or ``0.0`` when it is undefined because a centred
        vector has zero norm or the average is not a finite number.
    """
    centred1 = vec1 - return_average(vec1)
    centred2 = vec2 - return_average(vec2)
    denominator = la.norm(centred1) * la.norm(centred2)
    # A constant vector centres to all zeros and a vector without ratings has
    # no finite average; dividing would yield NaN, which breaks the integer
    # rounding done on the prediction later on.
    if denominator == 0 or not np.isfinite(denominator):
        return 0.0
    return np.dot(centred1, centred2) / denominator

In [69]:
# Return average of the given ratings, counting only the non-zero entries
def return_average(test):
    """Mean of the non-zero entries of ``test``.

    Args:
        test: numeric array-like of ratings.

    Returns:
        ``sum(test) / number_of_non_zero_entries``, or ``0.0`` when there is
        no non-zero entry (instead of dividing by zero).
    """
    rated = np.count_nonzero(test)
    if rated == 0:
        # Nothing to average; a NaN here would crash callers that round the
        # result to an int.
        return 0.0
    return np.sum(test) / rated

In [70]:
# Predict how far the rating of one item deviates from the test user's average
def predict_rating(train, train_new, similarities, itemid, test):
    """Similarity-weighted deviation of the neighbours' rating for ``itemid``.

    For every neighbour the difference between its rating of ``itemid`` and
    its own average (``return_average`` of its row in ``train_new``) is
    weighted by the similarity; the sum is normalised by the sum of the
    absolute similarities.

    Args:
        train: rating rows of the train users, indexed ``train[user][itemid]``.
        train_new: rows of the same users used to compute each neighbour's
            average.
        similarities: sequence of ``[similarity, train_user_index]`` entries.
        itemid: column of ``train`` whose rating is being predicted.
        test: ratings of the test user; no longer read, kept so existing
            calls keep working.

    Returns:
        int: the rounded weighted deviation, meant to be added to the test
        user's own average.  ``0`` when no neighbour contributes a usable
        weight; the caller already adds the average, so returning the average
        here as well would count it twice.
    """
    weighted_sum = 0
    weight_total = 0
    for entry in similarities:
        weight, neighbour = entry[0], entry[1]
        deviation = train[neighbour][itemid] - return_average(train_new[neighbour])
        # An undefined similarity or neighbour average would turn the whole
        # sum into NaN and make the int() conversion below raise.
        if not (np.isfinite(weight) and np.isfinite(deviation)):
            continue
        weighted_sum += weight * deviation
        weight_total += abs(weight)
    if weight_total == 0:
        return 0
    return int(round(weighted_sum / weight_total))

In [71]:
# Get the predicted ratings for the unrated items of one test user
def get_ratings_of_items(train, test, test_indexes):
    """Predict a rating for every position listed in ``test_indexes``.

    The positions to predict are first removed from both ``train`` and
    ``test``; similarities and the test user's average are computed on what
    remains.  For each position, every train user holding a non-zero rating
    there becomes a neighbour, and the value returned by ``predict_rating``
    is added to the test user's rounded average.  When no train user rated
    the position, the rounded average itself is used.

    Args:
        train: rating rows of all train users.
        test: rating row of the test user.
        test_indexes: positions of ``test`` that need a prediction.

    Returns:
        list: one predicted rating per entry of ``test_indexes``, same order.

    NOTE(review): predictions are not clamped to any rating scale - confirm
    whether the consumer of the output expects a bounded range.
    """
    rating = []
    train_new, test_new = return_filtered_vectors(train, test, test_indexes)
    avg = int(round(return_average(test_new)))
    # The similarity to a train user does not depend on the item being
    # predicted, so it is computed once per train user and reused.
    similarity_of = {}
    for i in test_indexes:
        sim = []
        # Walk every train user actually supplied instead of a fixed count.
        for j in range(len(train)):
            if does_rating_exist_to_predict(train[j], i):
                if j not in similarity_of:
                    similarity_of[j] = find_cosine_similarity(train_new[j], test_new)
                sim.append([similarity_of[j], j])
        sim = sorted(sim)
        if len(sim) > 0:
            r = avg + predict_rating(train, train_new, sim, i, test_new)
            rating.append(r)
        else:
            rating.append(avg)
    return rating

In [72]:
# Write the predicted ratings into consecutive rows of the original test data
def insert_predicted_ratings(org_data, startindex, ratingsarray):
    """Store each predicted rating in the third field of consecutive rows.

    Args:
        org_data: rows of the test file; modified in place.
        startindex: row that receives the first rating.
        ratingsarray: predicted ratings, in row order.

    Returns:
        tuple: ``(index of the row after the last one written, org_data)``.
    """
    cursor = startindex
    for predicted in ratingsarray:
        org_data[cursor][2] = predicted
        cursor += 1
    return cursor, org_data

## Generate result files

In [73]:
# Create the destination folder first: np.savetxt does not create missing
# directories, and failing only after the whole prediction loop has run
# would throw the computed ratings away.
os.makedirs('output', exist_ok=True)
traindata = get_train_data()
org_test_data, test_indexes, testdata = get_test_data('test5')
ind = 0  # row cursor into org_test_data
for i in range(100):
    rat = get_ratings_of_items(traindata, testdata[i], test_indexes[i])
    # Writing starts 5 rows after the cursor - presumably the 5 already-rated
    # rows every user of this set comes with (confirm against the data file).
    ind, org_test_data = insert_predicted_ratings(org_test_data, ind+5, rat)
np.savetxt('output/test5.txt', org_test_data, fmt='%d')

In [74]:
# Create the destination folder first: np.savetxt does not create missing
# directories, and failing only after the whole prediction loop has run
# would throw the computed ratings away.
os.makedirs('output', exist_ok=True)
traindata = get_train_data()
org_test_data, test_indexes, testdata = get_test_data('test10')
ind = 0  # row cursor into org_test_data
for i in range(100):
    rat = get_ratings_of_items(traindata, testdata[i], test_indexes[i])
    # Writing starts 10 rows after the cursor - presumably the 10 already-rated
    # rows every user of this set comes with (confirm against the data file).
    ind, org_test_data = insert_predicted_ratings(org_test_data, ind+10, rat)
np.savetxt('output/test10.txt', org_test_data, fmt='%d')

In [75]:
# Create the destination folder first: np.savetxt does not create missing
# directories, and failing only after the whole prediction loop has run
# would throw the computed ratings away.
os.makedirs('output', exist_ok=True)
traindata = get_train_data()
org_test_data, test_indexes, testdata = get_test_data('test20')
ind = 0  # row cursor into org_test_data
for i in range(100):
    rat = get_ratings_of_items(traindata, testdata[i], test_indexes[i])
    # Writing starts 20 rows after the cursor - presumably the 20 already-rated
    # rows every user of this set comes with (confirm against the data file).
    ind, org_test_data = insert_predicted_ratings(org_test_data, ind+20, rat)
np.savetxt('output/test20.txt', org_test_data, fmt='%d')