In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
import operator

In [2]:
def rating_mat(file_name):
        df = pd.read_csv(file_name)
        df_pivot = df.pivot(index='userId', columns='movieId', values='rating')
        mat = df_pivot.values
        return mat

In [4]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [5]:
mat = rating_mat('ml-latest-small/ratings.csv')

In [6]:
def dataSplit(mat, split = 0.9):
        n = split
        all_indices = np.argwhere(~np.isnan(mat))
        np.random.shuffle(all_indices)
        train_ind, test_ind = all_indices[:int(n*len(all_indices))], all_indices[int(n*len(all_indices))+1:]
        train = np.empty(mat.shape)
        test = np.empty(mat.shape)
        train[:] = np.nan
        test[:] = np.nan
        for i in train_ind:
            train[i[0], i[1]] = mat[i[0], i[1]]
        for j in test_ind:
            test[j[0], j[1]] = mat[j[0], j[1]]
        
        return train, test

In [7]:
train, test = dataSplit(mat)

In [8]:
def ComputeSimilarity(a, b): ## a = arr([0.0, 3.0, nan...])
    q = np.argwhere(~np.isnan(a))
    q = set([x[0] for x in q])
    
    r = np.argwhere(~np.isnan(b))
    r = set([y[0] for y in r])
    t = r & q
    t = list(t)
    if len(t) > 0:
        userDistance = 1-spatial.distance.cosine(a[t], b[t])
        return userDistance
    else:
        return 0

In [9]:
def getNeighbors(userID, K):
    distances = []
    for user in userDict:
        if (user != userID):
            dist = ComputeSimilarity(userDict[user], userDict[userID])
            distances.append((user, dist))
    distances.sort(key=operator.itemgetter(1), reverse = True)
    neighbors = []
    for x in range(K):
        neighbors.append(distances[x][0])
    return neighbors

In [30]:
getNeighbors(1,10)

[2, 9, 12, 77, 85, 87, 120, 148, 184, 259]

In [10]:
userID = np.array(range(len(train)+1))

In [11]:
userID = userID[1:]

In [12]:
userDict = {}
for i in range(len(train)):
    userDict[i+1] = train[i]

In [13]:
def meanRating(i):
    return np.nanmean(i)

In [14]:
def predictRating(test_user, NN, j):   ## test_user = arr()
    mean_test_user = meanRating(test_user)
    
    dist_NN = 0
    for i in NN:
        dist_NN += ComputeSimilarity(test_user, userDict[i])
    
    if dist_NN == 0:
        K = 0
    else:
        K = 1 / dist_NN
    temp = 0
    for l in NN:
        a = (ComputeSimilarity(test_user, userDict[l]))*(userDict[l][j] - meanRating(userDict[l]))
        if np.isnan(a):
            temp+=0
        else:
            temp+=a

        
    prediction = mean_test_user + K*temp
    
    return prediction

In [15]:
test_userDict = {}
for i in range(len(test)):
    test_userDict[i+1] = test[i]

In [16]:
test_userDict

{1: array([nan, nan, nan, ..., nan, nan, nan]),
 2: array([nan, nan, nan, ..., nan, nan, nan]),
 3: array([nan, nan, nan, ..., nan, nan, nan]),
 4: array([nan, nan, nan, ..., nan, nan, nan]),
 5: array([nan, nan, nan, ..., nan, nan, nan]),
 6: array([nan, nan, nan, ..., nan, nan, nan]),
 7: array([nan, nan, nan, ..., nan, nan, nan]),
 8: array([nan,  4., nan, ..., nan, nan, nan]),
 9: array([nan, nan, nan, ..., nan, nan, nan]),
 10: array([nan, nan, nan, ..., nan, nan, nan]),
 11: array([nan, nan, nan, ..., nan, nan, nan]),
 12: array([nan, nan, nan, ..., nan, nan, nan]),
 13: array([nan, nan, nan, ..., nan, nan, nan]),
 14: array([nan, nan, nan, ..., nan, nan, nan]),
 15: array([nan, nan, nan, ..., nan, nan, nan]),
 16: array([nan, nan, nan, ..., nan, nan, nan]),
 17: array([4.5, nan, nan, ..., nan, nan, nan]),
 18: array([nan, nan, nan, ..., nan, nan, nan]),
 19: array([nan, nan, nan, ..., nan, nan, nan]),
 20: array([nan, nan, nan, ..., nan, nan, nan]),
 21: array([nan, nan, nan, ..

In [28]:
test_ind = np.argwhere(~np.isnan(test))

pred_ij = np.empty(test.shape)
for user in test_userDict:
    NN = getNeighbors(user, 5)
    test_arr = np.argwhere(~np.isnan(test_userDict[user]))
    for j in test_arr:
        pred_ij[user-1, j] = predictRating(test_userDict[user], NN, j)
#     for item in test_ind:
#         pred_ij = predictRating(test_userDict[user], NN, item[1])
#         error = (test[item[0],item[1]] - pred_ij)**2
#         SE += error

# MSE = (SE/len(test_ind))

In [29]:
AE=0
SE=0
for i in test_ind:
    error = (test[i[0], i[1]] - pred_ij[i[0], i[1]])**2
    SE += error
    AE+=abs((test[i[0], i[1]] - pred_ij[i[0], i[1]]))
MSE = SE / len(test_ind) 
MAE=AE/ len(test_ind)
print("MAE=",MAE)   
print("MSE=",MSE)

MAE= 0.7117387312640245
MSE= 0.8487472494263083
