In [1]:
import pandas as pd
from scipy.stats import pearsonr

In [2]:
movies_df = pd.read_csv(r"ml-latest-small/ratings.csv")

In [3]:
movies_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


#### Tasks

1. Select a user with the movies the user has watched
2. Based on his rating to movies, find the top X neighbours
3. Combine similarity of neighbour and his movie ratings into a score  
4. Recommend the movies with the highest score

#### 1. Select movies watched by user

In [44]:
# first we need to specify one user, 
# for whom we wanna find recommondation

user_id = 1
num_user_ratings = movies_df.loc[movies_df['userId']==user_id].shape[0]
user_ratings = movies_df.loc[movies_df['userId']==user_id]['movieId'].values
rated_movies = movies_df.loc[movies_df['movieId'].isin(user_ratings)]
rated_movies

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
99742,610,3671,5.0,1493850253
99748,610,3703,4.5,1493850272
99752,610,3740,4.5,1479543090
99753,610,3744,3.0,1493847444


#### 2. Find Top X Neighbours

In [79]:
# select top 100 users based on number of same movies rated
top_100 = rated_movies['userId'].value_counts()[:100].index

# view ratings by top 100 users
rated_movies.loc[rated_movies['userId'].isin(top_100)]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
99742,610,3671,5.0,1493850253
99748,610,3703,4.5,1493850272
99752,610,3740,4.5,1479543090
99753,610,3744,3.0,1493847444


In [88]:
# group top 100 by user
top_100_user_groups = rated_movies.loc[rated_movies['userId'].isin(top_100)].groupby(['userId'])

# output all user ids inside top 100 selection
top_100_user_groups.groups.keys()

dict_keys([1, 18, 19, 21, 28, 39, 42, 45, 57, 63, 64, 66, 68, 91, 103, 122, 132, 135, 140, 156, 160, 166, 177, 182, 186, 198, 199, 200, 202, 217, 219, 220, 221, 226, 232, 234, 239, 249, 266, 274, 275, 282, 288, 292, 294, 298, 304, 305, 307, 312, 313, 318, 328, 330, 332, 354, 356, 357, 368, 372, 380, 381, 385, 387, 391, 414, 425, 428, 434, 438, 448, 452, 453, 462, 469, 474, 477, 480, 483, 489, 514, 517, 525, 534, 555, 560, 561, 570, 573, 577, 580, 590, 597, 599, 600, 603, 606, 607, 608, 610])

In [89]:
# example output for user 18
top_100_user_groups.get_group(18)

Unnamed: 0,userId,movieId,rating,timestamp
1772,18,1,3.5,1455209816
1774,18,6,4.0,1460138360
1779,18,47,4.5,1455050013
1780,18,50,5.0,1455049343
1781,18,70,3.5,1455735732
...,...,...,...,...
1948,18,3147,4.0,1455231412
1949,18,3253,3.0,1512850503
1953,18,3578,4.5,1455209857
1955,18,3702,3.5,1457382557


In [114]:
# create correlation dict, where key is user
# and value is the correlation value
user_a = top_100_user_groups.get_group(user_id)[['movieId', 'rating']]
correlation={}

for name, user_b in top_100_user_groups:
    # first we need to find out, which movies BOTH have seen
    # we call this movie intersection
    user_b_movies = user_b['movieId'] 
    movie_intersection = pd.Series(list(set(user_a['movieId']).intersection(set(user_b_movies))))
    
    # now we select ratings of user_a and user_b for all films, that both has seen (movie intersections)
    user_a_ratings = user_a.loc[user_a['movieId'].isin(movie_intersection)].rating
    user_b_ratings = user_b.loc[user_b['movieId'].isin(movie_intersection)].rating
    
    # now we calculate pearson correlation between their ratings
    correlation[name]=pearsonr(user_a_ratings, user_b_ratings)[0]

# in the end we remove user_a from dict, since the correlation between user_a and himself should always be 1
del correlation[user_id]

In [123]:
# print top 10 nearest neighbours
sorted(correlation.items(), key=lambda item: item[1], reverse=True)[:10]

[(597, 0.42702914061318026),
 (414, 0.4118641035209051),
 (477, 0.4053425045684784),
 (198, 0.37747610449530933),
 (266, 0.35651099088706956),
 (312, 0.3564457659159248),
 (57, 0.354259896241516),
 (199, 0.35267370641266577),
 (448, 0.3451787982895458),
 (122, 0.3280370267624435)]

#### 3. Combine similarity of neighbour and his movie ratings into a score  

In [128]:
u_id = 18
num_of_ratings = {}
for index, row in top_100_user_groups.get_group(u_id).iterrows():
    movie_value = correlation[name] * row['rating']
    movie_exists = add_value_to_dict(row['movieId'], movie_value, movie_scores)
    if (movie exists):
        add_value_to_dict(row['movieId'], 1, num_of_ratings)

Unnamed: 0,userId,movieId,rating,timestamp
1772,18,1,3.5,1455209816
1774,18,6,4.0,1460138360
1779,18,47,4.5,1455050013
1780,18,50,5.0,1455049343
1781,18,70,3.5,1455735732
...,...,...,...,...
1948,18,3147,4.0,1455231412
1949,18,3253,3.0,1512850503
1953,18,3578,4.5,1455209857
1955,18,3702,3.5,1457382557


In [144]:
def add_value_to_dict(key, value, dictionary):
    if (key in dictionary):
        dictionary[key] += value
        return 1
    else:
        dictionary[key] = value
        return 0

In [146]:
# calculate score for movies, which was watched by neighbours
# every movie get stored in a dict, with a certain score
# the score gets calculated by:
# correlation of the neighbour, who watched the movie * rating
# if multiple neighbours had rated the same movie, we will build the mean value
# among all neighbours, who watched it
# for this pupose i create num of ratings dict,
# which get incremented by, whenever another "collision" happens

movie_scores={}
num_of_ratings = {}
for user, features in top_100_user_groups:
    if (user!=1):
        for index, row in top_100_user_groups.get_group(user).iterrows():
            movie_value = correlation[user] * row['rating']
            movie_existed_before = add_value_to_dict(row['movieId'], movie_value, movie_scores)
            if (movie_existed_before):
                add_value_to_dict(row['movieId'], 1, num_of_ratings)

# calculate mean values
for movie, count in num_of_ratings:
    movie_scores[movie] = movie_scores[movie] / count

#### 4.Recommend the movies with the highest score

In [157]:
# print top 10 movies
print("Movie_ID    Score")
for movie, score in sorted(movie_scores.items(), key=lambda item: item[1], reverse=True)[:10]:
    print(f"{movie:.0f}:     {score:.2f}")

Movie_ID    Score
1196:     69.32
260:     68.97
1210:     65.71
2571:     64.68
356:     63.38
296:     62.79
480:     56.75
1198:     56.66
2959:     56.07
1265:     55.93
