In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error 
from sklearn.neighbors import NearestNeighbors

In [47]:
TRAINING_PATH = './netflix/TrainingRatings.txt'
TEST_PATH = './netflix/TestingRatings.txt'

dfTest = pd.read_csv(TEST_PATH)
dfTest.columns = ['movie', 'user', 'rating']

dfTrain = pd.read_csv(TRAINING_PATH)
dfTrain.columns = ['movie', 'user', 'rating']
print(len(dfTest))

100477


In [4]:
dfTrain0 = dfTrain.pivot_table(index = 'user', columns = 'movie', values = 'rating', fill_value= float(0)).astype('float32')
dfTrainNa = dfTrain.pivot_table(index = 'user', columns = 'movie', values = 'rating').astype('float32')

In [5]:
def to_boolean(df):
    return (df > 0).astype('float32')

def row_mean(df):
    return np.nan_to_num(np.nanmean(df, axis=1).reshape((len(df), 1)))

def mean_matrix(df):
    global dfTrainMean

    dfTrainMean = row_mean(df)
    return dfTrainMean * to_boolean(df)

def normalize_rating(df0, dfNa):
    return df0 - mean_matrix(dfNa)

boolean_matrix = to_boolean(dfTrainNa)

dfTrainNormalized = normalize_rating(dfTrain0, dfTrainNa)

In [6]:
train_arr = np.asarray(dfTrain0)
normalized_train_arr = np.asarray(dfTrainNormalized)
index_arr = dfTrainNormalized.index # contains user index

In [7]:
nbrs = NearestNeighbors(n_neighbors = 20, algorithm = 'ball_tree').fit(normalized_train_arr)

In [8]:
def nearest_neighbors(user_arr):
    distances, indices = nbrs.kneighbors(user_arr)

    return indices[0][:]

In [45]:
computed_weights = dict() 

In [46]:
def get_weights(active_user, other_users, active_user_id):
    
    if active_user in computed_weights: 
        return computed_weights[active_user_id]

    dot_product = np.multiply(np.asarray(active_user), np.asarray(other_users))
    numerator = np.sum(dot_product, axis = 1)

    i, j = np.nonzero(dot_product)

    if len(i) + len(j) == 0: 
        return np.zeros(len(other_users))
       
    active_user = np.tile(active_user, (len(other_users), 1))
    other_users = np.asarray(other_users)
    common_active_user = [active_user[x, j[x == i]] for x in range(active_user.shape[0])]
    common_other_users = [other_users[x, j[x == i]] for x in range(other_users.shape[0])] 

    sq_cau = np.array([np.square(row) for row in common_active_user], dtype = object).reshape(len(other_users), -1)
    sq_cou = np.array([np.square(row) for row in common_other_users], dtype = object).reshape(len(other_users), -1)

    sq_cau_sum = np.array([sum(sum(row)) for row in sq_cau])
    sq_cou_sum = np.array([sum(sum(row)) for row in sq_cou])
   
    denominator = np.sqrt(np.multiply(sq_cau_sum, sq_cou_sum))
    with np.errstate(divide='ignore', invalid='ignore'):
        weights = np.divide(numerator, denominator)

    computed_weights[active_user_id] = np.nan_to_num(weights)
    return np.nan_to_num(weights) 

In [10]:
# weights needs to be a (1, n) array 
# find the dot product between the users, get the index of the non zero elements. Use these elements to get the denominator. 
def predict(df, user_id, movie_id):
    total = 0.0
    kappa = 0.0
    user_mean = dfTrainMean[list(index_arr).index(user_id)][0]

    prediction = user_mean

    # print(sum(df.loc[user_id, : ]))
    weights = get_weights(df.loc[user_id, :], df, user_id)
    kappa = np.sum(np.abs(weights))

    movie_ratings = np.array(df[movie_id])
    total = np.dot(weights.T, movie_ratings)
    
    if kappa != 0:
        prediction = user_mean + (total / kappa)
    
    return prediction

In [48]:
dfTest = dfTest.sample(frac = 0.01, random_state=42).reset_index(drop=True)
len(dfTest)

1005

In [49]:
len(np.unique(dfTest['user'].values))

988

In [42]:
computed_neighbors = dict()

In [50]:
count = 0
predictions = [] 
for row in dfTest.itertuples():
    active_user = getattr(row, 'user')
    
    if active_user in computed_neighbors:
        indices_nn = computed_neighbors[active_user]
    else:
        indices_nn = nearest_neighbors(dfTrainNormalized.loc[active_user].values.reshape(1, -1))
        computed_neighbors[active_user] = indices_nn

    dfNN = pd.DataFrame(normalized_train_arr[indices_nn], index = index_arr[indices_nn], columns = dfTrain0.columns)

    prediction = predict(dfNN, getattr(row, 'user'), getattr(row, 'movie'))
    predictions.append(prediction)
    
    count += 1
    if count % 1000 == 0:
        print(f'Done with {count} predictions')

TypeError: unhashable type: 'Series'

In [21]:
mae = mean_absolute_error(list(dfTest.rating.values), predictions)
rmse = mean_squared_error(list(dfTest.rating.values), predictions, squared = False)
print(mae)
print(rmse)

0.7776659040906202
0.9795850185615131
