<h2>Collaborative Filtering Recommendation System</h2>

Dataset source: https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the movie catalogue and the ratings table from the working directory,
# then show both shapes as a quick sanity check.
moviesds, ratingsds = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))
(moviesds.shape, ratingsds.shape)

In [None]:
# Preview the first five rows of the raw movie catalogue.
moviesds.head(n=5)

In [None]:
# Split the "Title (YYYY)" strings into a clean title and a separate year column,
# and drop the genres column.
# Raw strings keep the regex backslashes from being read as (invalid) string escapes.
moviesds['year'] = moviesds['title'].str.extract(r'\((\d{4})\)', expand=False)

# Titles that carry no "(YYYY)" suffix do not match the pattern; keep their original
# text instead of letting them turn into NaN. Stripping removes stray surrounding
# whitespace so later exact-title lookups are not defeated by it.
title_without_year = moviesds['title'].str.extract(r'(.*)\s\(\d{4}\)', expand=False)
moviesds['title'] = title_without_year.fillna(moviesds['title']).str.strip()

# errors='ignore' lets the cell be re-run after the column is already gone.
moviesds = moviesds.drop(columns='genres', errors='ignore')

moviesds.head()

In [None]:
# Preview the first five rows of the raw ratings table.
ratingsds.head(n=5)

In [None]:
# Dropping timestamp as we don't need it.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the missing column.
ratingsds = ratingsds.drop(columns='timestamp', errors='ignore')
ratingsds.head()

In [None]:
# Hypothetical user we want recommendations for: the titles they have already
# watched, paired with the rating they gave each one.
userinput = pd.DataFrame({
    'title': [
        'Breakfast Club, The',
        'Toy Story',
        'Jumanji',
        'Pulp Fiction',
        'Akira',
    ],
    'rating': [5, 3.5, 2, 5, 4.5],
})
userinput

In [None]:
# Extracting the IDs of the rated movies from the movies dataset.
# The lookup is an exact match on the cleaned title.
# NOTE(review): if the catalogue holds several movies with the same title, each of
# them is returned here — confirm against the data.
rated_title_mask = moviesds['title'].isin(userinput['title'])

# Build a new frame with .loc + drop() instead of dropping in place on a filtered
# slice of moviesds, which triggers SettingWithCopyWarning.
inputid = moviesds.loc[rated_title_mask].drop(columns='year')

inputid

In [None]:
# Attach the movie IDs to the input user's ratings. No key is given, so the join
# uses the columns the two frames have in common.
userinput = inputid.merge(userinput)
userinput

In [None]:
# Keep only the ratings that other users gave to the movies the input user rated.
rated_movie_ids = userinput['movieId'].tolist()
is_rated_movie = ratingsds['movieId'].isin(rated_movie_ids)
subset = ratingsds[is_rated_movie]
subset.head()

In [None]:
# One group per user, holding that user's ratings of the shared movies.
groupedsubset = subset.groupby(by='userId')

# Peek at a single user's group as an example (get_group raises KeyError when the
# requested user has no rows in the subset).
groupedsubset.get_group(1130)

In [None]:
# Rank the per-user groups by how many of the input user's movies they rated,
# largest overlap first, and keep only the first 100 (userId, ratings) pairs.
groupedsubset = list(groupedsubset)
groupedsubset.sort(key=lambda pair: pair[1].shape[0], reverse=True)
groupedsubset = groupedsubset[:100]
groupedsubset[:3]

In [None]:
# Calculating Pearson's correlation values to find users with similar preferences.
# For every candidate user the coefficient is computed over the movies that both
# the candidate and the input user rated; the result is stored as {userId: coef}.

from math import sqrt

# Loop-invariant work: sort the input user's ratings once, not on every iteration.
userinput = userinput.sort_values(by='movieId')
input_ratings = userinput[['movieId', 'rating']]

pearson = {}

for name, group in groupedsubset:
    # Pair the two users' ratings explicitly by movieId. Relying on two separately
    # sorted lists lining up silently mis-pairs ratings if either side ever has a
    # missing or duplicated movie.
    paired = group.sort_values(by='movieId').merge(
        input_ratings, on='movieId', suffixes=('_other', '_input')
    )
    n = len(paired)

    # No movie in common: correlation is undefined, treat as "no similarity".
    if n == 0:
        pearson[name] = 0
        continue

    temp_rating = paired['rating_input'].tolist()
    temp_grouprating = paired['rating_other'].tolist()

    sum_input = sum(temp_rating)
    sum_group = sum(temp_grouprating)

    Sxx = sum(i**2 for i in temp_rating) - sum_input**2 / n
    Syy = sum(i**2 for i in temp_grouprating) - sum_group**2 / n
    Sxy = sum(i*j for i, j in zip(temp_rating, temp_grouprating)) - sum_input*sum_group / n

    # A user whose ratings are all equal has zero variance, so the coefficient is
    # undefined; use 0. Testing "> 0" (not "!= 0") also protects sqrt() from a tiny
    # negative variance produced by floating-point rounding.
    pearson[name] = Sxy / sqrt(Sxx * Syy) if Sxx > 0 and Syy > 0 else 0

pearson.items()

In [None]:
# Formatting the correlation data: one row per user with the similarity
# coefficient ('coef') and the user's ID ('userId') as regular columns.
pearson = pd.DataFrame.from_dict(pearson, orient='index')
pearson.columns = ['coef']
pearson['userId'] = pearson.index

# reset_index() returns a new frame; the result has to be assigned or the call does
# nothing. drop=True discards the old index, which is already stored in 'userId'.
pearson = pearson.reset_index(drop=True)
pearson

In [None]:
# Keep the 20 users whose correlation with the input user is highest.
ranked_users = pearson.sort_values('coef', ascending=False)
topusers = ranked_users.iloc[:20]
topusers.head()

In [None]:
# Pull in every rating made by the most similar users, so each row carries both
# the user's similarity coefficient and one of their movie ratings.
topuserrating = pd.merge(topusers, ratingsds, how='inner', on='userId')
topuserrating.head(10)

In [None]:
# Weight every rating by how similar its author is to the input user.
topuserrating['weighted_rating'] = topuserrating['coef'].mul(topuserrating['rating'])
topuserrating.head(10)

In [None]:
# Per movie: total similarity of the users who rated it, and the total of their
# similarity-weighted ratings.
temp_toprating = (
    topuserrating
    .groupby('movieId')[['coef', 'weighted_rating']]
    .sum()
    .rename(columns={'coef': 'sum_coef', 'weighted_rating': 'sum_weights'})
)
temp_toprating.head()

In [None]:
# Calculating the weighted recommendation score for each movie: the sum of the
# similarity-weighted ratings divided by the sum of the similarity coefficients.
recs = pd.DataFrame()

# When the coefficients of a movie's raters add up to exactly zero, the division
# would produce +/-inf and a +inf score would rank first. Mask those totals to NaN
# so the score is NaN instead (sort_values places NaN last by default).
sum_coef = temp_toprating['sum_coef']
recs['weighted_rec_score'] = temp_toprating['sum_weights'] / sum_coef.where(sum_coef != 0)
recs.head()

In [None]:
# Order the movies from best to worst score and keep the 20 best.
ranked_recs = recs.sort_values('weighted_rec_score', ascending=False)
recs = ranked_recs.iloc[:20]
recs

In [None]:
# Translate the recommended movie IDs back into [title, year] pairs, in ranking order.
movierecs = []

for rec_id in recs.index:
    # First catalogue row whose movieId equals the recommended ID.
    name = moviesds[moviesds['movieId'] == rec_id]
    titles, years = name['title'].tolist(), name['year'].tolist()
    name = [titles[0], years[0]]
    movierecs += [name]

movierecs