In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error 
from sklearn.neighbors import NearestNeighbors


In [2]:
TRAINING_PATH = './netflix/TrainingRatings.txt'
TEST_PATH = './netflix/TestingRatings.txt'

dfTest = pd.read_csv(TEST_PATH)
dfTest.columns = ['movie', 'user', 'rating']

dfTrain = pd.read_csv(TRAINING_PATH)
dfTrain.columns = ['movie', 'user', 'rating']
dfTest.head()

Unnamed: 0,movie,user,rating
0,8,2149668,3.0
1,8,1089184,3.0
2,8,2465894,3.0
3,8,534508,1.0
4,8,992921,4.0


In [3]:
dfTrain0 = dfTrain.pivot_table(index = 'user', columns = 'movie', values = 'rating', fill_value= float(0)).astype('float32')
dfTrainNa = dfTrain.pivot_table(index = 'user', columns = 'movie', values = 'rating').astype('float32')

In [4]:
def to_boolean(df):
    return (df > 0).astype('float32')

def row_mean(df):
    return np.nan_to_num(np.nanmean(df, axis=1).reshape((len(df), 1)))

def mean_matrix(df):
    global dfTrainMean

    dfTrainMean = row_mean(df)
    return dfTrainMean * to_boolean(df)

def normalize_rating(df0, dfNa):
    return df0 - mean_matrix(dfNa)

boolean_matrix = to_boolean(dfTrainNa)

dfTrainNormalized = normalize_rating(dfTrain0, dfTrainNa)
dfTrainNormalized

movie,8,28,43,48,61,64,66,92,96,111,...,17654,17660,17689,17693,17706,17725,17728,17734,17741,17742
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,1.096154,0.096154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0
79,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0
199,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056338,...,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0
481,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.648649,...,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0
769,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648869,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.375,0.0,0.0,0.0,0.0
2648885,0.000000,0.051471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0
2649120,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0
2649267,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.851852,...,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0


In [5]:
train_arr = np.asarray(dfTrain0)
normalized_train_arr = np.asarray(dfTrainNormalized)
index_arr = dfTrainNormalized.index # contains user index

In [6]:
index_arr

Int64Index([      7,      79,     199,     481,     769,     906,    1310,
               1333,    1427,    1442,
            ...
            2648572, 2648589, 2648730, 2648734, 2648853, 2648869, 2648885,
            2649120, 2649267, 2649285],
           dtype='int64', name='user', length=28978)

In [120]:
nbrs = NearestNeighbors(n_neighbors = 3, algorithm = 'ball_tree').fit(train_arr)

In [8]:
def nearest_neighbors(user_arr):
    distances, indices = nbrs.kneighbors(user_arr)

    return indices[0][:]

In [138]:
def get_weights(active_user, other_users):
    
    dot_product = np.multiply(np.asarray(active_user), np.asarray(other_users))
    numerator = np.sum(dot_product, axis = 1)

    i, j = np.nonzero(dot_product)
    
    active_user = np.tile(active_user, (len(other_users), 1))
    other_users = np.asarray(other_users)
    common_active_user = [active_user[x, j[x == i]] for x in range(active_user.shape[0])]
    common_other_users = [other_users[x, j[x == i]] for x in range(other_users.shape[0])] 

    sq_cau = np.array([np.square(row) for row in common_active_user], dtype = object).reshape(len(other_users), -1)
    sq_cou = np.array([np.square(row) for row in common_other_users], dtype = object).reshape(len(other_users), -1)

    

    sq_cau_sum = np.array([sum(row[0]) for row in sq_cau])
    sq_cou_sum = np.array([sum(row[0]) for row in sq_cou])

    # denominator = np.sqrt(np.multiply(sq_cau_sum, sq_cou_sum))

    # weights = numerator / denominator
    return np.nan_to_num(weights) 

In [132]:
# weights needs to be a (1, n) array 
# find the dot product between the users, get the index of the non zero elements. Use these elements to get the denominator. 
# find the mean matrix and the normalized rating matrix 
def predict(df, user_id, movie_id):
    total = 0.0
    kappa = 0.0
    user_mean = dfTrainMean[list(index_arr).index(user_id)][0]

    prediction = user_mean

    # print(sum(df.loc[user_id, : ]))
    weights = get_weights(df.loc[user_id, :], df)
    kappa = np.sum(np.abs(weights))

    movie_ratings = np.array(df[movie_id])
    total = np.dot(weights.T, movie_ratings)
    
    if kappa != 0:
        prediction = user_mean + (total / kappa)
    
    return prediction

In [11]:
dfTest.head()

Unnamed: 0,movie,user,rating
0,8,2149668,3.0
1,8,1089184,3.0
2,8,2465894,3.0
3,8,534508,1.0
4,8,992921,4.0


In [139]:
count = 0
predictions = [] 
for row in dfTest.itertuples():
    active_user = getattr(row, 'user')
    indices_nn = nearest_neighbors(dfTrain0.loc[active_user].values.reshape(1, -1))

    dfNN = pd.DataFrame(normalized_train_arr[indices_nn], index = index_arr[indices_nn], columns = dfTrain0.columns)

    prediction = predict(dfNN, getattr(row, 'user'), getattr(row, 'movie'))
    predictions.append(prediction)
    
    count += 1
    if count == 1:
        break

[[array([0.19360004, 0.19360004, 5.9536004 , 0.19360004, 2.4335997 ,
         0.19360004, 0.31359994, 0.19360004, 0.19360004, 0.19360004,
         2.4335997 , 2.4335997 , 0.31359994, 0.31359994, 2.0736    ,
         2.4335997 , 2.4335997 , 0.31359994, 0.19360004, 0.19360004,
         0.19360004, 0.19360004, 0.19360004, 2.0736    , 2.4335997 ,
         2.0736    , 0.19360004, 0.31359994, 0.31359994, 0.19360004,
         0.19360004, 2.4335997 , 2.4335997 , 0.19360004, 0.31359994,
         0.31359994, 2.0736    , 0.19360004, 2.0736    , 2.0736    ,
         0.19360004, 2.4335997 , 0.31359994, 0.19360004, 2.4335997 ,
         0.31359994, 2.0736    , 0.19360004, 0.19360004, 2.4335997 ,
         2.4335997 , 0.19360004, 0.19360004, 2.4335997 , 5.9536004 ,
         0.19360004, 0.31359994, 2.4335997 , 0.31359994, 2.0736    ,
         0.19360004, 0.31359994, 0.19360004, 0.31359994, 2.4335997 ,
         2.0736    , 2.0736    , 0.19360004, 0.31359994, 0.19360004,
         2.0736    , 5.9536004 , 2

NameError: name 'weights' is not defined

In [None]:
mae = mean_absolute_error(list(dfTest.rating.values), predictions)
rmse = mean_squared_error(list(dfTest.rating.values), predictions, squared = False)
print(mae)
print(rmse)

In [35]:
arr1 = np.array([[1, 2], [3, 4, 5], [6, 7, 8, 9]])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.