In [1]:
import os
import numpy as np
import pandas as pd
from itertools import combinations
from collections import Counter

In [2]:
# Switch to the folder that holds movies.csv, ratings.csv and tags.csv.
# Set the RECSYS_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
os.chdir(os.environ.get("RECSYS_DATA_DIR", "C:\\recommender_system\\course1\\week2"))

In [3]:
# Load the three input tables from the current working directory.
# Encoding issues with movies and tags data: those two files are read with
# engine='python' as a workaround, while ratings.csv uses the default parser.
movie_data = pd.read_csv("movies.csv", engine='python')
rating_data = pd.read_csv("ratings.csv")
tag_data = pd.read_csv("tags.csv", engine='python')

In [4]:
# Preview the first rows of the movie table (movieId, title, genres).
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first rows of the rating table (userId, movieId, rating, timestamp).
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,12882,1,4.0,1147195252
1,12882,32,3.5,1147195307
2,12882,47,5.0,1147195343
3,12882,50,5.0,1147185499
4,12882,110,4.5,1147195239


In [6]:
# Preview the first rows of the tag table (movieId, userId, tag, timestamp).
tag_data.head()

Unnamed: 0,movieId,userId,tag,timestamp
0,3916,12882,sports,1147195545
1,4085,12882,Eddie Murphy,1147195966
2,33660,12882,boxing,1147195514
3,1197,320,must show,1145964801
4,1396,320,must show,1145964810


In [7]:
# Try several methods to see if duplicate records exist
# movie_data
# If the row count, the number of unique movie IDs, the number of unique titles
# and the number of unique (movieId, title) pairs all agree, no movie is duplicated.
print('There are {:,} records in movie_data.'.format(movie_data.shape[0]))
print('There are {:,} unique movie ID in movie_data.'.format(movie_data.movieId.nunique()))
print('There are {:,} unique title in movie_data.'.format(movie_data.title.nunique()))
print('There are {:,} unique movie ID & title combinations in movie_data.'.format(movie_data.groupby(['movieId','title']).ngroups))

There are 2,500 records in rating_data.
There are 2,500 unique movie ID in rating_data.
There are 2,500 unique title in rating_data.
There are 2,500 unqiue movie ID & title combinations in rating_data.


In [8]:
# Try several methods to see if duplicate records exist
# rating_data
# If the row count equals the number of unique (userId, movieId) pairs,
# no user has rated the same movie more than once.
print('There are {:,} records in rating_data.'.format(rating_data.shape[0]))
print('There are {:,} unique user ID in rating_data.'.format(rating_data.userId.nunique()))
print('There are {:,} unique movie ID in rating_data.'.format(rating_data.movieId.nunique()))
print('There are {:,} unique user ID & movie ID combinations in rating_data.'.format(rating_data.groupby(['userId','movieId']).ngroups))

There are 264,505 records in rating_data.
There are 862 unique user ID in rating_data.
There are 2,500 unique movie ID in rating_data.
There are 264,505 unqiue user ID & movie ID combinations in rating_data.


### Mean-Based Recommendation

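The mean and the damped mean of the ratings will be calculated for each movie ID. The damped mean pulls each movie's average toward the global mean rating $\mu$: $(\sum_{u} r_{ui} + \alpha \mu) / (n_i + \alpha)$, where $n_i$ is the number of ratings of movie $i$ and $\alpha$ is the damping weight.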

###### Rating Mean Calculation

In [9]:
# Average rating per movie, rounded to three decimals.
# movieId is kept as an ordinary column rather than as the index.
movie_mean = (
    rating_data
    .drop(columns=['userId', 'timestamp'])
    .groupby('movieId', as_index=False)
    .mean()
    .round(3)
)

In [10]:
# Spot-check the plain mean for movies 2959 and 1203.
movie_mean.loc[lambda frame: frame['movieId'].isin([2959, 1203])]

Unnamed: 0,movieId,rating
591,1203,4.246
1355,2959,4.259


###### Rating Damped Mean Calculation

In [11]:
# Parameters
# Mean of every rating in rating_data; the value each movie's mean is damped toward.
global_mean = rating_data['rating'].mean()
# Damping weight: multiplies global_mean in the numerator of the damped mean
# and is added to the rating count in its denominator.
alpha = 5
# Damped mean function
def damped_mean_func(x):
    """Return the damped mean of the ratings held in ``x``.

    Computes ``(x.sum() + alpha * global_mean) / (x.count() + alpha)``, which is
    the mean of the group after adding ``alpha`` pseudo-ratings whose value is
    ``global_mean``. ``alpha`` and ``global_mean`` are looked up in the
    notebook namespace at call time, so they must be defined before the
    function is called.

    Parameters
    ----------
    x : object exposing ``sum()`` and ``count()``
        Typically the ratings of one movie (a pandas Series or DataFrame group).

    Returns
    -------
    The damped mean, in whatever type the arithmetic on ``x.sum()`` and
    ``x.count()`` produces (a scalar for a Series, a Series for a DataFrame).
    """
    return (x.sum() + alpha * global_mean) / (x.count() + alpha)

In [12]:
# Damped mean rating per movie, rounded to three decimals.
# The function is applied to the 'rating' column of each movie group only, so
# the grouping column movieId is never passed through damped_mean_func; the
# result has the same two columns (movieId, rating) as before.
movie_damped_mean = (
    rating_data
    .groupby('movieId')['rating']
    .agg(damped_mean_func)
    .reset_index()
    .round(3)
)

In [13]:
# Spot-check the damped mean for movies 2959 and 1203.
movie_damped_mean.loc[lambda frame: frame['movieId'].isin([2959, 1203])]

Unnamed: 0,movieId,rating
591,1203,4.227
1355,2959,4.252


In [14]:
# Put the plain mean and the damped mean side by side, one row per movie.
# Both inputs carry a 'rating' column, so the merge suffixes them with _x / _y
# before they are renamed to descriptive names.
all_mean = (
    movie_mean
    .merge(movie_damped_mean, how='inner', on='movieId')
    .rename(columns={'rating_x': 'mean', 'rating_y': 'damped_mean'})
)

In [15]:
# Matches the example output for movies 2959 and 1203.
all_mean.loc[lambda frame: frame['movieId'].isin([2959, 1203])]

Unnamed: 0,movieId,mean,damped_mean
591,1203,4.246,4.227
1355,2959,4.259,4.252


In [16]:
# Preview the rating table again before building the association rules.
rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,12882,1,4.0,1147195252
1,12882,32,3.5,1147195307
2,12882,47,5.0,1147195343
3,12882,50,5.0,1147185499
4,12882,110,4.5,1147195239


### Association Rules

In [17]:
# Step 1 - for every user, list all unordered pairs of the movies they rated.
#          Ratings are sorted by movieId first, so each pair is stored as
#          (smaller movie ID, larger movie ID).
user_movies_sorted = rating_data[['userId', 'movieId']].sort_values(['userId', 'movieId'])
movie_pair1 = (
    user_movies_sorted
    .groupby('userId')['movieId']
    .apply(lambda movie_ids: list(combinations(movie_ids, 2)))
)
# Step 2 - concatenate the per-user pair lists into one flat list.
movie_pair2 = []
for user_pairs in movie_pair1:
    movie_pair2.extend(user_pairs)

In [18]:
# Counts needed by the association formulas:
#   user_cnt       - number of distinct users
#   movie_cnt      - {movieId: number of ratings of that movie}
#   movie_pair_cnt - {(movieId, movieId): number of users who rated both}
user_cnt = rating_data['userId'].nunique()
movie_cnt = rating_data.groupby('movieId').size().to_dict()
movie_pair_cnt = Counter(movie_pair2)

###### Basic Association Model

In [19]:
# Test cases for the basic association model (reference item = 260).
# Movie pairs are keyed as (smaller movie ID, larger movie ID), so each
# lookup key is built with min/max instead of assuming the reference comes first.
test_reference1 = 260
test_target1 = [2571, 1196, 4993, 1210, 356, 5952, 7153, 296, 1198, 480]
test_case1 = [
    (min(test_reference1, target_id), max(test_reference1, target_id))
    for target_id in test_target1
]

In [20]:
# Test results - same as shown in the example outputs.
# Basic association score: (users who rated both) / (users who rated the reference).
reference_cnt1 = movie_cnt[test_reference1]
for target_id, pair_key in zip(test_target1, test_case1):
    target_title = movie_data.loc[movie_data["movieId"] == target_id, "title"].values[0]
    score = round(movie_pair_cnt[pair_key] / reference_cnt1, 3)
    print("{} ({}): {}".format(target_id, target_title, score))

2571 (Matrix, The (1999)): 0.916
1196 (Star Wars: Episode V - The Empire Strikes Back (1980)): 0.899
4993 (Lord of the Rings: The Fellowship of the Ring, The (2001)): 0.892
1210 (Star Wars: Episode VI - Return of the Jedi (1983)): 0.847
356 (Forrest Gump (1994)): 0.843
5952 (Lord of the Rings: The Two Towers, The (2002)): 0.841
7153 (Lord of the Rings: The Return of the King, The (2003)): 0.83
296 (Pulp Fiction (1994)): 0.828
1198 (Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)): 0.791
480 (Jurassic Park (1993)): 0.789


###### Lift Association Model

In [21]:
# Test cases for the lift association model (reference item = 2761).
# Movie pairs are keyed as (smaller movie ID, larger movie ID); some targets
# have a smaller ID than the reference, so each key is built with min/max.
test_reference2 = 2761
test_target2 = [631, 2532, 3615, 1649, 340, 1016, 2439, 332, 2736, 3213]
test_case2 = [
    (min(test_reference2, target_id), max(test_reference2, target_id))
    for target_id in test_target2
]

In [22]:
# Test results - same as shown in the example outputs.
# Lift score: (users who rated both) * (all users)
#             / ((users who rated the reference) * (users who rated the target)).
for target_id, pair_key in zip(test_target2, test_case2):
    target_title = movie_data.loc[movie_data["movieId"] == target_id, "title"].values[0]
    lift = round(
        movie_pair_cnt[pair_key] * user_cnt / (movie_cnt[test_reference2] * movie_cnt[target_id]),
        3,
    )
    print("{} ({}): {}".format(target_id, target_title, lift))

631 (All Dogs Go to Heaven 2 (1996)): 4.898
2532 (Conquest of the Planet of the Apes (1972)): 4.81
3615 (Dinosaur (2000)): 4.546
1649 (Fast, Cheap & Out of Control (1997)): 4.49
340 (War, The (1994)): 4.49
1016 (Shaggy Dog, The (1959)): 4.49
2439 (Affliction (1997)): 4.49
332 (Village of the Damned (1995)): 4.377
2736 (Brighton Beach Memoirs (1986)): 4.329
3213 (Batman: Mask of the Phantasm (1993)): 4.317
