<h2>Content Based Recommendation System </h2>

Dataset source: https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
moviesds = pd.read_csv('movies.csv')
moviesds.shape

(34208, 3)

In [3]:
moviesds.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Formatting the data types 
moviesds['year'] = moviesds['title'].str.extract('(\(\d\d\d\d\))', expand=False)
moviesds['year'] = moviesds['year'].str.extract('(\d\d\d\d)', expand=False)
moviesds['title'] = moviesds['title'].str.extract(r'(.*)\s\(\d{4}\)')
moviesds['genres'] = moviesds['genres'].str.split('|')

moviesds.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [5]:
# Encoding the genres with One Hot Encoding, will be necessary for probability
# computations with genre matrix.

moviegenre = moviesds.copy()

for index, row in moviesds.iterrows():
    for i in row['genres']:
        moviegenre.at[index, i] = 1

moviegenre = moviegenre.fillna(0)
moviegenre.drop('genres', axis=1, inplace=True)

moviegenre.head() 

Unnamed: 0,movieId,title,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,1995,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,1995,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,1995,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,1995,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,1995,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Creating a hypothetical user to recommend movies to. 
# The movies already viewed and rated by the user are as follows
userinput = [
    {'title': 'Breakfast Club, The', 'rating': 5},
    {'title': 'Toy Story', 'rating': 3.5},
    {'title': 'Jumanji', 'rating': 2},
    {'title': 'Pulp Fiction', 'rating': 5},
    {'title': 'Akira', 'rating': 4.5},
]
userinput = pd.DataFrame(userinput)
userinput

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [7]:
# Extracting the IDs of the rated movies from the movies dataset
inputid = moviesds[moviesds['title'].isin(userinput['title'].tolist())]
inputid.drop(['genres', 'year'], axis=1, inplace=True)

inputid

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inputid.drop(['genres', 'year'], axis=1, inplace=True)


Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji
293,296,Pulp Fiction
1246,1274,Akira
1885,1968,"Breakfast Club, The"


In [8]:
# Merging to add movie IDs to the ratings dataset
userinput = pd.merge(inputid, userinput)
userinput

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [9]:
# Extracting information about the rated movies from the movies dataset
usermovies = moviegenre[moviegenre['movieId'].isin(userinput['movieId'].tolist())]
usermovies

Unnamed: 0,movieId,title,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,1995,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,1995,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
293,296,Pulp Fiction,1994,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1246,1274,Akira,1988,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1885,1968,"Breakfast Club, The",1985,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Creating the genre matrix for the rated movies
usermovies.reset_index(drop=True, inplace=True)
genremtx = usermovies.drop(['movieId', 'title', 'year'], axis=1)
genremtx

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
userinput['rating']

0    3.5
1    2.0
2    5.0
3    4.5
4    5.0
Name: rating, dtype: float64

In [12]:
# Calculating user profile based on rated movies and movie genres
userprofile = genremtx.transpose().dot(userinput['rating'])
userprofile

Adventure             10.0
Animation              8.0
Children               5.5
Comedy                13.5
Fantasy                5.5
Romance                0.0
Drama                 10.0
Action                 4.5
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 4.5
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [13]:
# Extracting the genre matrix from the hot-encoded movie dataset
genretable = moviegenre.set_index(moviegenre['movieId'])
genretable.drop(['movieId', 'title', 'year'], axis=1, inplace=True)
genretable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Calculating scores of movies to recommend based on user profile and movie genres
recs = (genretable * userprofile).sum(axis=1) / userprofile.sum()

# Sorting to find the top 20 movies to recommend
recs = recs.sort_values(ascending=False).head(20)

recs

movieId
5018      0.748252
26093     0.734266
27344     0.720280
148775    0.685315
6902      0.678322
117646    0.678322
122787    0.671329
64645     0.671329
81132     0.671329
2987      0.664336
91335     0.657343
673       0.657343
51632     0.657343
51939     0.657343
32031     0.657343
26340     0.657343
1907      0.657343
108932    0.657343
108540    0.657343
146305    0.657343
dtype: float64

In [15]:
# Getting the movie information from recommended IDs
movierecs = []

for rec_id in recs.keys():
    name = moviesds[moviesds['movieId'] == rec_id]
    name = [name['title'].tolist()[0], name['year'].tolist()[0]]
    movierecs.append(name)

movierecs

[['Motorama', '1991'],
 ['Wonderful World of the Brothers Grimm, The', '1962'],
 ['Revolutionary Girl Utena: Adolescence of Utena (a.k.a. Revolutionary Girl Utena the Movie) (Shoujo kakumei Utena: Adolescence mokushiroku)',
  '1999'],
 ['Wizards of Waverly Place: The Movie', '2009'],
 ['Interstate 60', '2002'],
 ['Dragonheart 2: A New Beginning', '2000'],
 ['The 39 Steps', '1959'],
 ['The Wrecking Crew', '1968'],
 ['Rubber', '2010'],
 ['Who Framed Roger Rabbit?', '1988'],
 ['Gruffalo, The', '2009'],
 ['Space Jam', '1996'],
 ["Atlantis: Milo's Return", '2003'],
 ['TMNT (Teenage Mutant Ninja Turtles)', '2007'],
 ['Robots', '2005'],
 ["Twelve Tasks of Asterix, The (Les douze travaux d'Astérix)", '1976'],
 ['Mulan', '1998'],
 ['The Lego Movie', '2014'],
 ['Ernest & Célestine (Ernest et Célestine)', '2012'],
 ['Princes and Princesses', '2000']]