In [12]:
import pandas as pd
import numpy as np
import os


from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from itertools import product

In [478]:
class KNN():
    
    def compute_similarity(self, ratings):
        self.similarities = cosine_similarity(ratings)
    
    def __init__(self, ratings):
        self.ratings = ratings
        self.compute_similarity(self.ratings)
        
    def predict(self, user_id, movie_id, k):
        
        #get nearest neighbors
        idx = np.flip(np.argsort(self.similarities[user_id]))
        idx = idx[1:] #remove self index
        user_similarities = self.similarities[user_id]
        top_neighbors = idx[:k]
        
        #user ratings
        user_ratings = self.ratings[user_id]
        user_ratings[movie_id] = np.nan
        user_ratings[user_ratings[:]==0] = np.nan
        
        #user neigbor similarities
        user_neighbor_similarity = user_similarities[top_neighbors]
        
        #neighbor ratings
        neighbor_ratings = self.ratings[top_neighbors]
        neighbor_ratings[neighbor_ratings[:]==0] = np.nan #convert unrated items (0) to nans
        neighbor_ratings_ex = neighbor_ratings.copy()
        neighbor_ratings_ex[:, movie_id] = np.nan
        neighbor_rating_mean = np.nanmean(neighbor_ratings_ex, axis=1)
        neighbor_ratings[np.isnan(neighbor_ratings[:])] = 0
        neighbor_target_movie_rating = neighbor_ratings[:,movie_id]
        
        #compute prediction score
        pred_rating = neighbor_target_movie_rating - neighbor_rating_mean
        pred_rating = (np.nansum(pred_rating * user_neighbor_similarity)) / abs(user_neighbor_similarity).sum() 
        pred_rating = np.nansum([pred_rating, np.nanmean(user_ratings)])
        
        return pred_rating
    
    #def recommend():
    #    pass

In [312]:
path = 'ml-20m'
genome_scores = pd.read_csv(os.path.join(path,'genome-scores.csv'))
genome_tags = pd.read_csv(os.path.join(path,'genome-tags.csv'))
tags = pd.read_csv(os.path.join(path,'tags.csv'))
movies = pd.read_csv(os.path.join(path,'movies.csv'))
ratings_full = pd.read_csv(os.path.join(path,'ratings.csv'))


#remove movies without ratings
#movies = movies[movies['movieId'].isin(ratings_full['movieId'].unique())]
#genome_scores = genome_scores[genome_scores['movieId'].isin(ratings_full['movieId'].unique())]

#remove on deployment
ratings = ratings_full.head(1000000)

#remap ids to continuous integers
user_ids = np.sort(np.unique(ratings['userId']))
userid2idx = {o:i for i,o in enumerate(user_ids) }

movie_ids = np.sort(np.unique(ratings['movieId']))
movieid2idx = {o:i for i,o in enumerate(movie_ids) }

ratings['userId'] = ratings['userId'].apply(lambda x : userid2idx[x])
ratings['movieId'] = ratings['movieId'].apply(lambda x : movieid2idx[x])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['userId'] = ratings['userId'].apply(lambda x : userid2idx[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['movieId'] = ratings['movieId'].apply(lambda x : movieid2idx[x])


In [326]:
ratings['is_train'] = np.random.random(len(ratings)) < 0.95
training_data = ratings[ratings['is_train']]
test_data = ratings[~ratings['is_train']]
test_data = test_data[test_data['userId'].isin(training_data['userId'])]
test_data = test_data[test_data['movieId'].isin(training_data['movieId'])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['is_train'] = np.random.random(len(ratings)) < 0.95


In [329]:
#user_movie_pair = list(product(userid2idx.values(), movieid2idx.values()))
#user_movie_pair_df = pd.DataFrame(user_movie_pair, columns =['userId', 'movieId'])
#m = user_movie_pair_df.merge(training_data, on=['userId', 'movieId'], how='left')
ratings_pvt = ratings.pivot_table(index='userId', columns=['movieId'], values='rating', dropna=False, fill_value=0)


In [485]:
test_data.shape[0]

49997

In [479]:
model = KNN(ratings_pvt.values)

In [520]:
def compute_mae(model, test_data, k):
    
    test_data = test_data.copy()
    
    error = 0
    test_data['predicted'] = pd.Series()
    test_data['mae'] = pd.Series()
    
    #'(ColA=="7") & (ColB=="3") & (ColC=="alpha") & (ColD=="yu")'   
    
    
    for test_item in test_data.values:    
        user_id = test_item[0]
        movie_id = test_item[1]
        rating = test_item[2]
        prediction = model.predict(user_id, movie_id, k)
        abs_diff = abs( prediction - rating)
        
        query = f'userId=={user_id} & movieId == {movie_id}'
        test_data.loc[test_data.query(query).index,'predicted'] = prediction
        test_data.loc[test_data.query(query).index,'mae'] = abs_diff
        
        error += abs_diff
    
    test_data.to_csv('test_result.csv')
    mae = error / test_data.shape[0]
    
    return mae


In [521]:
compute_mae(model, test_data, 3)

  test_data['predicted'] = pd.Series()
  test_data['mae'] = pd.Series()


1.9896388318036877

In [511]:
test_data

Unnamed: 0,userId,movieId,rating,timestamp,is_train
8,0,247,4.0,1112484940,False
15,0,531,4.0,1112484603,False
17,0,581,3.5,1112484661,False
121,0,4227,4.0,1112485822,False
131,0,4734,5.0,1112484682,False
...,...,...,...,...,...
999866,6742,373,3.5,1117760565,False
999919,6742,722,4.5,1117760974,False
999969,6742,1218,3.5,1117761366,False
999984,6742,1414,4.0,1117761156,False


In [498]:



x = np.array([[3,5],[5,2],[4, 1]])
x

#cosine_similarity

array([[3, 5],
       [5, 2],
       [4, 1]])

In [499]:
cosine_similarity(x)

array([[1.        , 0.79616219, 0.70710678],
       [0.79616219, 1.        , 0.99083017],
       [0.70710678, 0.99083017, 1.        ]])

In [500]:
df = pd.DataFrame({'id': [0,1,2],'value':[0.1,0.2,0.3]})
df

Unnamed: 0,id,value
0,0,0.1
1,1,0.2
2,2,0.3


In [504]:
df['new'] = pd.Series()

  df['new'] = pd.Series()


In [505]:
df

Unnamed: 0,id,value,new
0,0,0.1,
1,1,0.2,
2,2,0.3,


In [508]:
df

Unnamed: 0,id,value,new
0,0,0.1,
1,2,0.2,
2,2,0.3,


In [506]:
query = '(id==1)'
