In [56]:
import numpy as np
from numpy import linalg as la

#### Load data from train and test files

In [57]:
def get_train_data():
    """Load the training ratings from ``traindata/train.txt``.

    Every non-blank line of the file is split on tab characters and each
    field is parsed as an integer.

    Returns:
        list: one 1-D integer ``numpy`` array per non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field is not an integer literal.
    """
    train = []
    # The context manager closes the handle even if parsing fails part-way.
    with open('traindata/train.txt', 'r') as file:
        for line in file:
            # A blank line (typically the trailing newline at end of file)
            # carries no values; int('') would otherwise raise ValueError.
            if not line.strip():
                continue
            values = [int(e) for e in line.rstrip('\n').split('\t')]
            train.append(np.array(values))
    return train

In [58]:
def get_test_data(filename, num_users=100, num_items=1000):
    """Load one test split from ``testdata/<filename>.txt``.

    Each non-blank line holds whitespace-separated integers; the first three
    are used as (row key, 1-based column, value) -- presumably user id,
    movie id and rating, with a value of 0 marking an entry to be predicted.
    Rows belonging to the same key are expected to be contiguous, and keys
    are expected to increase from the first key seen, because the matrix row
    is computed as ``key - first_key``.

    Args:
        filename: base name of the file inside ``testdata/`` (no extension).
        num_users: number of rows of the returned matrix (default 100).
        num_items: number of columns of the returned matrix (default 1000).

    Returns:
        tuple: ``(original_test_data, indexes, testdata)`` where
            * ``original_test_data`` is the list of parsed rows (lists of int),
            * ``indexes`` holds, per key in file order, an integer array with
              the 0-based columns whose value is 0,
            * ``testdata`` is a ``num_users x num_items`` float matrix filled
              with the values read from the file.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field is not an integer literal.
    """
    testdata = np.zeros([num_users, num_items])
    original_test_data = []
    indexes = []
    pending = []          # 0-based columns with value 0 for the current key
    first_key = None      # key on the first data row; becomes matrix row 0
    current_key = None
    # The context manager closes the handle even if parsing fails part-way.
    with open('testdata/'+filename+'.txt', 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank lines carry no data.
                continue
            row = [int(e) for e in fields]
            original_test_data.append(row)
            key, column, value = row[0], row[1] - 1, row[2]
            if first_key is None:
                first_key = key
            if key != current_key:
                # Close the previous key's group even when it is empty, so
                # that indexes[k] always lines up with testdata[k].
                if current_key is not None:
                    indexes.append(np.array(pending, dtype=int))
                    pending = []
                current_key = key
            # Checked for every row, including the first row of a new key.
            if value == 0:
                pending.append(column)
            testdata[key - first_key][column] = value
    if current_key is not None:
        indexes.append(np.array(pending, dtype=int))
    return original_test_data, indexes, testdata

#### Methods to find similar users

In [59]:
# Count the positions at which both rating vectors hold a non-zero entry
def num_coexisting_ratings(train1,train2):
    """Return how many positions are non-zero in both ``train1`` and ``train2``.

    Positions are taken from ``range(len(train1))``; ``train2`` is only read
    at positions where ``train1`` is non-zero.
    """
    return sum(
        1
        for pos in range(len(train1))
        if train1[pos] != 0 and train2[pos] != 0
    )

In [60]:
# Collect the positions of the non-zero entries of a rating vector
def get_rated_indices(test):
    """Return a 1-D array with the indices ``i`` for which ``test[i] != 0``.

    The indices are in increasing order; the array is empty when every entry
    of ``test`` is zero.
    """
    return np.array([pos for pos in range(len(test)) if test[pos] != 0])

In [61]:
# Return the average of the non-zero entries of the given ratings
def return_average(test):
    """Return ``sum(test)`` divided by the number of non-zero entries.

    Zero entries are left out of the denominator (the rest of this notebook
    uses 0 for a missing rating).

    Returns:
        The mean of the non-zero entries, or ``0.0`` when there are none
        (instead of dividing by zero, which would yield NaN and a warning).
    """
    rated = np.count_nonzero(test)
    if rated == 0:
        return 0.0
    return np.sum(test)/rated

In [62]:
# Return the train matrix with each row's average subtracted from its non-zero entries
def modified_train_matrix(train):
    """Subtract every row's average from that row's non-zero entries, in place.

    The average of a row is obtained from ``return_average`` before any entry
    of the row is changed; zero entries are left untouched. The matrix
    dimensions are taken from ``train`` itself rather than being fixed.

    Args:
        train: mutable 2-D container indexed as ``train[i][j]``.

    Returns:
        The same ``train`` object, after modification.

    NOTE(review): if the rows are integer numpy arrays, the fractional result
    is cast back to an integer on assignment -- pass float data if the
    fractional part matters; confirm the intended dtype with the caller.
    """
    for i in range(len(train)):
        avg = return_average(train[i])
        for j in range(len(train[i])):
            if train[i][j] != 0:
                train[i][j] -= avg
    return train

In [63]:
# Return the cosine similarity between two rating vectors
def find_cosine_similarity(vec1,vec2):
    """Return ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``.

    Returns:
        The cosine of the angle between the two vectors, or ``0.0`` when
        either vector has zero norm (the ratio is undefined there and would
        otherwise come out as NaN).
    """
    denom = la.norm(vec1)*la.norm(vec2)
    if denom == 0:
        return 0.0
    return np.dot(vec1,vec2)/denom

In [64]:
# Predict rating
def predict_rating_for_one_item(test, weights):
    """Predict one rating as a weighted average of entries of ``test``.

    Args:
        test: rating vector, indexed by the positions stored in ``weights``.
        weights: sequence of ``[weight, index]`` pairs; ``test[index]`` is
            weighted by ``weight``.

    Returns:
        int: the rounded weighted average, clamped to the range 1..5. When
        ``weights`` is empty or the weights sum to zero (so the weighted
        average is undefined), the rounded result of ``return_average(test)``
        is used instead, clamped to the same range.
    """
    nr = 0
    dr = 0
    for entry in weights:
        nr += entry[0]*test[entry[1]]
        dr += entry[0]
    if dr != 0:
        x = int(round(nr/dr))
    else:
        # No usable neighbours: dividing by a zero weight sum would fail,
        # so fall back to the average of the available ratings.
        x = int(round(return_average(test)))
    # Keep the prediction inside the 1..5 rating range on every path.
    return min(5, max(1, x))

In [65]:
# Predict a rating for every requested position of one test vector
def get_ratings_of_items(train, test, test_indexes):
    """Return predicted ratings for the positions listed in ``test_indexes``.

    For each target position, every position that is non-zero in ``test`` is
    considered a neighbour if ``train[target]`` and ``train[neighbour]`` share
    at least one co-existing non-zero entry; the neighbour is then weighted
    by the cosine similarity of those two rows of ``train``. The resulting
    ``[weight, index]`` pairs are handed to ``predict_rating_for_one_item``.

    Args:
        train: matrix whose rows are indexed by the same positions as ``test``.
        test: rating vector of one test user (0 means not rated).
        test_indexes: positions of ``test`` to predict.

    Returns:
        numpy array with one prediction per entry of ``test_indexes``, in order.
    """
    rated_positions = get_rated_indices(test)
    predictions = []
    for target in test_indexes:
        neighbours = [
            [find_cosine_similarity(train[known], train[target]), known]
            for known in rated_positions
            if num_coexisting_ratings(train[target], train[known]) > 0
        ]
        predictions.append(predict_rating_for_one_item(test, neighbours))
    return np.array(predictions)

In [66]:
# Write predicted ratings into the third column of consecutive rows
def insert_predicted_ratings(org_data, startindex, ratingsarray):
    """Store ``ratingsarray`` in ``org_data[startindex:...][2]``, in place.

    The k-th value of ``ratingsarray`` overwrites element 2 of row
    ``startindex + k``; the previous content of that element is not checked.

    Returns:
        tuple: ``(next_index, org_data)`` where ``next_index`` is
        ``startindex + len(ratingsarray)`` and ``org_data`` is the same
        (mutated) object that was passed in.
    """
    row = startindex
    for value in ratingsarray:
        org_data[row][2] = value
        row += 1
    return row, org_data

## Generate result files

In [67]:
# Predict the zero-valued ratings of the 'test5' split and write the result file.
traindata = get_train_data()
org_test_data, test_indexes, testdata = get_test_data('test5')
# The transposed training matrix does not depend on the loop variable, so it
# is built once here instead of on every iteration.
train_transposed = np.transpose(traindata)
ind = 0
for i in range(100):
    rat = get_ratings_of_items(train_transposed, testdata[i], test_indexes[i])
    # Skip 5 rows (presumably this user's given ratings -- confirm against the
    # file layout), then overwrite the next len(rat) rows; `ind` comes back
    # pointing just past them.
    ind, org_test_data = insert_predicted_ratings(org_test_data, ind+5, rat)
np.savetxt('output/test5.txt', org_test_data, fmt='%d')

In [68]:
# Predict the zero-valued ratings of the 'test10' split and write the result file.
traindata = get_train_data()
org_test_data, test_indexes, testdata = get_test_data('test10')
# The transposed training matrix does not depend on the loop variable, so it
# is built once here instead of on every iteration.
train_transposed = np.transpose(traindata)
ind = 0
for i in range(100):
    rat = get_ratings_of_items(train_transposed, testdata[i], test_indexes[i])
    # Skip 10 rows (presumably this user's given ratings -- confirm against the
    # file layout), then overwrite the next len(rat) rows; `ind` comes back
    # pointing just past them.
    ind, org_test_data = insert_predicted_ratings(org_test_data, ind+10, rat)
np.savetxt('output/test10.txt', org_test_data, fmt='%d')

In [69]:
# Predict the zero-valued ratings of the 'test20' split and write the result file.
traindata = get_train_data()
org_test_data, test_indexes, testdata = get_test_data('test20')
# The transposed training matrix does not depend on the loop variable, so it
# is built once here instead of on every iteration.
train_transposed = np.transpose(traindata)
ind = 0
for i in range(100):
    rat = get_ratings_of_items(train_transposed, testdata[i], test_indexes[i])
    # Skip 20 rows (presumably this user's given ratings -- confirm against the
    # file layout), then overwrite the next len(rat) rows; `ind` comes back
    # pointing just past them.
    ind, org_test_data = insert_predicted_ratings(org_test_data, ind+20, rat)
np.savetxt('output/test20.txt', org_test_data, fmt='%d')