<h2>Collaborative Filtering Recommendation System </h2>

Dataset source: https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
moviesds = pd.read_csv('movies.csv')
ratingsds = pd.read_csv('ratings.csv')
moviesds.shape, ratingsds.shape

((34208, 3), (22884377, 4))

In [3]:
moviesds.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Formatting the data types 
moviesds['year'] = moviesds['title'].str.extract('(\(\d\d\d\d\))', expand=False)
moviesds['year'] = moviesds['year'].str.extract('(\d\d\d\d)', expand=False)
moviesds['title'] = moviesds['title'].str.extract(r'(.*)\s\(\d{4}\)')
moviesds.drop('genres', axis=1, inplace=True)

moviesds.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [5]:
ratingsds.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [6]:
# Dropping timestamp as we don't need it
ratingsds.drop('timestamp', axis=1, inplace=True)
ratingsds.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [7]:
# Creating a hypothetical user to recommend movies to. 
# The movies already viewed and rated by the user are as follows
userinput = [
    {'title': 'Breakfast Club, The', 'rating': 5},
    {'title': 'Toy Story', 'rating': 3.5},
    {'title': 'Jumanji', 'rating': 2},
    {'title': 'Pulp Fiction', 'rating': 5},
    {'title': 'Akira', 'rating': 4.5},
]
userinput = pd.DataFrame(userinput)
userinput

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [8]:
# Extracting the IDs of the rated movies from the movies dataset
inputid = moviesds[moviesds['title'].isin(userinput['title'].tolist())]
inputid.drop('year', axis=1, inplace=True)

inputid

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputid.drop('year', axis=1, inplace=True)


Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji
293,296,Pulp Fiction
1246,1274,Akira
1885,1968,"Breakfast Club, The"


In [9]:
# Merging to add movie IDs to the ratings dataset
userinput = pd.merge(inputid, userinput)
userinput

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [10]:
# Extracting the ratings of the users who rated the same movies
subset = ratingsds[ratingsds['movieId'].isin(userinput['movieId'].tolist())]
subset.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [11]:
# Grouping the ratings subset by user IDs
groupedsubset = subset.groupby('userId')
groupedsubset.get_group(1130)

Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5


In [12]:
# Sorting the grouped subset so the users that share the most movies 
# in common with the input user have higher priority.

groupedsubset = sorted(groupedsubset, key=lambda x: len(x[1]), reverse=True)
groupedsubset = groupedsubset[:100]
groupedsubset[0:3]

[(75,
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 (106,
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 (686,
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0)]

In [13]:
# Calculating Pearson's correlation values to find users with similar preferences

from math import sqrt
pearson = {}

for name, group in groupedsubset:
    group = group.sort_values(by='movieId')
    userinput = userinput.sort_values(by='movieId')

    n = len(group)

    temp_df = userinput[userinput['movieId'].isin(group['movieId'].tolist())]
    temp_rating = temp_df['rating'].tolist()
    temp_grouprating = group['rating'].tolist()

    Sxx = sum([i**2 for i in temp_rating]) - pow(sum(temp_rating), 2)/float(n)
    Syy = sum([i**2 for i in temp_grouprating]) - pow(sum(temp_grouprating), 2)/float(n)
    Sxy = sum( i*j for i, j in zip(temp_rating, temp_grouprating)) - sum(temp_rating)*sum(temp_grouprating)/float(n)
    
    pearson[name] = (Sxy / sqrt(Sxx * Syy)) if Sxx != 0 and Syy != 0 else 0

pearson.items()

dict_items([(75, 0.8272781516947562), (106, 0.5860090386731182), (686, 0.8320502943378437), (815, 0.5765566601970551), (1040, 0.9434563530497265), (1130, 0.2891574659831201), (1502, 0.8770580193070299), (1599, 0.4385290096535153), (1625, 0.716114874039432), (1950, 0.179028718509858), (2065, 0.4385290096535153), (2128, 0.5860090386731196), (2432, 0.1386750490563073), (2791, 0.8770580193070299), (2839, 0.8204126541423674), (2948, -0.11720180773462392), (3025, 0.45124262819713973), (3040, 0.89514359254929), (3186, 0.6784622064861935), (3271, 0.26989594817970664), (3429, 0.0), (3734, -0.15041420939904673), (4099, 0.05860090386731196), (4208, 0.29417420270727607), (4282, -0.4385290096535115), (4292, 0.6564386345361464), (4415, -0.11183835382312353), (4586, -0.9024852563942795), (4725, -0.08006407690254357), (4818, 0.4885967564883424), (5104, 0.7674257668936507), (5165, -0.4385290096535153), (5547, 0.17200522903844556), (6082, -0.04728779924109591), (6207, 0.9615384615384616), (6366, 0.65779

In [14]:
# Formatting the correlation data
pearson = pd.DataFrame.from_dict(pearson, orient='index')
pearson.columns = ['coef']
pearson['userId'] = pearson.index
pearson.reset_index()
pearson

Unnamed: 0,coef,userId
75,0.827278,75
106,0.586009,106
686,0.832050,686
815,0.576557,815
1040,0.943456,1040
...,...,...
17854,0.537086,17854
17897,0.877058,17897
17944,0.271385,17944
18301,0.298381,18301


In [15]:
# Extracting users with the top 20 correlation values
topusers = pearson.sort_values(by='coef', ascending=False)[:20]
topusers.head()

Unnamed: 0,coef,userId
12325,0.961678,12325
6207,0.961538,6207
10707,0.961538,10707
13053,0.960769,13053
1040,0.943456,1040


In [16]:
# Finding out what other movies the users with similar taste profiles liked by
# adding those ratings to the user correlation data
topuserrating = topusers.merge(ratingsds, left_on='userId', right_on='userId', how='inner')
topuserrating[:10]

Unnamed: 0,coef,userId,movieId,rating
0,0.961678,12325,1,3.5
1,0.961678,12325,2,1.5
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5
5,0.961678,12325,7,3.0
6,0.961678,12325,10,3.0
7,0.961678,12325,11,2.5
8,0.961678,12325,17,4.0
9,0.961678,12325,19,1.0


In [17]:
# Calculating the weighted rating by multiplying with similarity coefficient
topuserrating['weighted_rating'] = topuserrating['coef'] * topuserrating['rating']
topuserrating[:10]

Unnamed: 0,coef,userId,movieId,rating,weighted_rating
0,0.961678,12325,1,3.5,3.365874
1,0.961678,12325,2,1.5,1.442517
2,0.961678,12325,3,3.0,2.885035
3,0.961678,12325,5,0.5,0.480839
4,0.961678,12325,6,2.5,2.404196
5,0.961678,12325,7,3.0,2.885035
6,0.961678,12325,10,3.0,2.885035
7,0.961678,12325,11,2.5,2.404196
8,0.961678,12325,17,4.0,3.846713
9,0.961678,12325,19,1.0,0.961678


In [18]:
# Adding the weighted ratings for each movie
temp_toprating = topuserrating.groupby('movieId').sum()
temp_toprating = temp_toprating[['coef', 'weighted_rating']]
temp_toprating.columns = ['sum_coef', 'sum_weights']
temp_toprating.head()

Unnamed: 0_level_0,sum_coef,sum_weights
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,17.885064,67.12999
2,17.885064,45.099094
3,5.43459,14.616637
4,0.929294,2.787882
5,5.498128,10.346701


In [19]:
# Calculating the weighted recommendation score based on the weighted ratings
# and sum of similarity coefficients
recs = pd.DataFrame()
recs['weighted_rec_score'] = temp_toprating['sum_weights'] / temp_toprating['sum_coef']
recs.head()

Unnamed: 0_level_0,weighted_rec_score
movieId,Unnamed: 1_level_1
1,3.753411
2,2.521607
3,2.689556
4,3.0
5,1.881859


In [20]:
# Sorting to extract the top 20 recommendations
recs = recs.sort_values(by='weighted_rec_score', ascending=False).head(20)
recs

Unnamed: 0_level_0,weighted_rec_score
movieId,Unnamed: 1_level_1
4381,5.0
73587,5.0
71264,5.0
3730,5.0
3759,5.0
3769,5.0
3775,5.0
3776,5.0
65685,5.0
65514,5.0


In [21]:
# Getting the movie information from recommended IDs
movierecs = []

for rec_id in recs.index:
    name = moviesds[moviesds['movieId'] == rec_id]
    name = [name['title'].tolist()[0], name['year'].tolist()[0]]
    movierecs.append(name)

movierecs

[['Closet, The (Placard, Le)', '2001'],
 ['Soul Kitchen', '2009'],
 ['Cloudy with a Chance of Meatballs', '2009'],
 ['Conversation, The', '1974'],
 ['Fun and Fancy Free', '1947'],
 ['Thunderbolt and Lightfoot', '1974'],
 ['Make Mine Music', '1946'],
 ['Melody Time', '1948'],
 ['Inkheart', '2008'],
 ['Ip Man', '2008'],
 ['Adventures of Ichabod and Mr. Toad, The', '1949'],
 ["It's the Great Pumpkin, Charlie Brown", '1966'],
 ["Charlie Wilson's War", '2007'],
 ['Condorman', '1981'],
 ['Requiem', '2006'],
 ['Death at a Funeral', '2007'],
 ['In the Mood For Love (Fa yeung nin wa)', '2000'],
 ['Widow of St. Pierre, The (Veuve de Saint-Pierre, La)', '2000'],
 ["Paris, I Love You (Paris, je t'aime)", '2006'],
 ['Shaft', '1971']]