#### Movie Recommender system - https://github.com/rounakbanik/movies/blob/master/movies_recommender.ipynb

#### Intro to Recommender system - https://towardsdatascience.com/intro-to-recommender-system-collaborative-filtering-64a238194a26

#### Dataset : https://grouplens.org/datasets/movielens/100k/


## Simple Recommender System
 
 ##### -- IMDb
 ##### -- movie_metadata dataset
 ##### -- metric
 ##### -- evaluation

In [6]:
import pandas as pd
import numpy as np

In [62]:
# Load the movie metadata CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path; it must be adjusted to run this notebook elsewhere.
df = pd.read_csv('/home/exoper/ML_Dwarka_June19/datasets/movie_metadata.csv')

In [63]:
# Preview the first 10 rows of the metadata frame.
df.head(10)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,738.0,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000
6,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,1902.0,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0
7,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
8,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,1117.0,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000
9,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,973.0,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000


In [87]:
# List the column names available in the metadata frame.
df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [64]:
# (rows, columns) of the metadata frame.
df.shape

(5043, 28)

###### Weighted Rating (WR) = $\frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$

* v is the number of votes for the movie

* m is the minimum votes required

* R is the average rating of the movie

* C is the mean rating across all movies in the dataset
 

In [66]:
# C in the weighted-rating formula: the mean imdb_score over every movie in df.
C = df['imdb_score'].mean()

In [67]:
# Show the computed mean score C.
print(C)

6.442137616498116


In [88]:
# m in the weighted-rating formula: the minimum vote count a movie needs to qualify.
# It is taken as the 0.60 quantile of num_voted_users.
m = df['num_voted_users'].quantile(0.60)
m

53178.19999999998

In [89]:
# Keep only the movies that have at least m votes.
# Filtering first and copying afterwards duplicates just the qualifying rows
# (not the whole frame) and leaves df2 an independent frame that can safely
# receive new columns later.
df2 = df.loc[df['num_voted_users'] >= m].copy()

In [90]:
# (rows, columns) of the frame that remains after the vote-count filter.
df2.shape

(2017, 28)

In [91]:
# Weighted rating (IMDB formula) for one movie
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating WR = v/(v+m) * R + m/(v+m) * C.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide 'num_voted_users' (v) and 'imdb_score' (R).
    m : number, optional
        Minimum vote count; defaults to the value of ``m`` at definition time.
    C : number, optional
        Mean score; defaults to the value of ``C`` at definition time.
    """
    votes = x['num_voted_users']
    rating = x['imdb_score']
    # Both terms share the same denominator (votes + m).
    total = votes + m
    own_part = votes / total * rating
    prior_part = m / total * C
    return own_part + prior_part

In [92]:
# Add a 'score' column holding the weighted rating of every qualifying movie.
# weighted_rating only performs column lookups and arithmetic, so it can be
# evaluated on the whole frame at once; this produces the same values as a
# row-by-row apply(axis=1) without the per-row overhead.
df2['score'] = weighted_rating(df2)

In [93]:
# Rank the movies from highest to lowest weighted score.
df2 = df2.sort_values('score', ascending=False)

In [94]:
# Top 15 movies by weighted score, shown next to the vote count and rating that produced it.
df2[['movie_title', 'num_voted_users', 'imdb_score', 'score']].head(15)

Unnamed: 0,movie_title,num_voted_users,imdb_score,score
1937,The Shawshank Redemption,1689764,9.3,9.212805
3466,The Godfather,1155770,9.2,9.078689
66,The Dark Knight,1676169,9.0,8.921345
2837,The Godfather: Part II,790926,9.0,8.838856
3355,Pulp Fiction,1324680,8.9,8.805139
339,The Lord of the Rings: The Return of the King,1215718,8.9,8.796993
1874,Schindler's List,865020,8.9,8.757651
97,Inception,1468200,8.8,8.717583
683,Fight Club,1347461,8.8,8.710479
836,Forrest Gump,1251222,8.8,8.703874


## Collaborative Filtering item-item based

### Loading Dataset

In [104]:
# Location of the MovieLens 100k files, defined once so both reads stay in sync.
# NOTE(review): absolute, machine-specific path; change ML_100K_DIR to run elsewhere.
ML_100K_DIR = '/home/exoper/Documents/data/ml-100k'

# u.data is tab-separated; only its first three columns are read.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(ML_100K_DIR + '/u.data', sep='\t', names=r_cols, usecols=range(3))

# u.item is pipe-separated and ISO-8859-1 encoded; only its first two columns are read.
m_cols = ['movie_id', 'title']
movies = pd.read_csv(ML_100K_DIR + '/u.item', sep='|', encoding="ISO-8859-1", names=m_cols, usecols=range(2))

# Attach the title to every rating. The join key is spelled out instead of being
# inferred from the columns the two frames happen to share.
ratings = pd.merge(movies, ratings, on='movie_id')

In [105]:
# Preview the merged ratings table (one row per rating, with the movie title attached).
ratings.head(10)

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3
5,1,Toy Story (1995),5,4
6,1,Toy Story (1995),109,4
7,1,Toy Story (1995),181,3
8,1,Toy Story (1995),95,5
9,1,Toy Story (1995),268,3


In [202]:
# Reshape the long ratings table into a user x movie matrix: one row per user_id,
# one column per title, each cell holding the rating (NaN where there is no rating).
# pivot_table aggregates with the mean by default, so repeated (user_id, title)
# pairs are averaged into one cell.
userRatings = ratings.pivot_table(index=['user_id'],columns=['title'],values='rating')
userRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [203]:
# (number of users, number of distinct titles) in the rating matrix.
userRatings.shape

(943, 1664)

In [204]:
# Preview the first rows of the user x movie rating matrix.
userRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [205]:
# Pairwise correlation between the movie columns of the rating matrix, using
# DataFrame.corr with its default settings (missing ratings are excluded pairwise).
corrMatrix = userRatings.corr()
corrMatrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),1.0,,-1.0,-0.5,-0.5,0.522233,,-0.426401,,,...,,,,,,,,,,
1-900 (1994),,1.0,,,,,,-0.981981,,,...,,,,-0.944911,,,,,,
101 Dalmatians (1996),-1.0,,1.0,-0.04989,0.269191,0.048973,0.266928,-0.043407,,0.111111,...,,-1.0,,0.15884,0.119234,0.680414,0.0,0.707107,,
12 Angry Men (1957),-0.5,,-0.04989,1.0,0.666667,0.256625,0.274772,0.178848,,0.457176,...,,,,0.096546,0.068944,-0.361961,0.144338,1.0,1.0,
187 (1997),-0.5,,0.269191,0.666667,1.0,0.596644,,-0.5547,,1.0,...,,0.866025,,0.455233,-0.5,0.5,0.475327,,,


In [206]:
# Recompute the movie-movie Pearson correlations, overwriting corrMatrix.
# min_periods=100 requires at least 100 common observations per movie pair;
# pairs with fewer come out as NaN.
corrMatrix = userRatings.corr(method='pearson', min_periods=100)
corrMatrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),,,1.0,,,,,,,,...,,,,,,,,,,
12 Angry Men (1957),,,,1.0,,,,,,,...,,,,,,,,,,
187 (1997),,,,,,,,,,,...,,,,,,,,,,


In [275]:
# User to build recommendations for; change USER_ID to get recommendations for
# another user. The message below is derived from it so the two cannot disagree.
USER_ID = 2
print("Movies and their ratings for custom user " + str(USER_ID))
# Ratings given by that user, keeping only the movies they actually rated.
myRatings = userRatings.loc[USER_ID].dropna()
myRatings.head(10)

Movies and their ratings for custom user 1


title
3 Ninjas: High Noon At Mega Mountain (1998)    1.0
Absolute Power (1997)                          3.0
Air Force One (1997)                           4.0
Antonia's Line (1995)                          3.0
Apt Pupil (1998)                               1.0
As Good As It Gets (1997)                      5.0
Bed of Roses (1996)                            3.0
Birdcage, The (1996)                           4.0
Breakdown (1997)                               4.0
Contact (1997)                                 3.0
Name: 2, dtype: float64

In [276]:
# Build the pool of recommendation candidates: for every movie the user rated,
# take the movies correlated with it and weight each correlation by the user's
# rating of that movie.
simParts = []
for title, rating in myRatings.items():
    print("adding sims for " + title + "...")
    # Correlations of every other movie with this one (NaN entries dropped).
    sims = corrMatrix[title].dropna()
    # Scale the similarities by how much the user liked this movie.
    simParts.append(sims * rating)

# Concatenate once at the end instead of growing a Series inside the loop
# (Series.append was removed in pandas 2.0 and is quadratic). An explicit dtype
# on the empty fallback avoids the "dtype not specified" warning.
simCandidates = pd.concat(simParts) if simParts else pd.Series(dtype='float64')

print("sorting in decreasing order of similarity score: ")
simCandidates = simCandidates.sort_values(ascending=False)
simCandidates.head(10)

adding sims for 3 Ninjas: High Noon At Mega Mountain (1998)...
adding sims for Absolute Power (1997)...
adding sims for Air Force One (1997)...
adding sims for Antonia's Line (1995)...
adding sims for Apt Pupil (1998)...
adding sims for As Good As It Gets (1997)...
adding sims for Bed of Roses (1996)...
adding sims for Birdcage, The (1996)...
adding sims for Breakdown (1997)...
adding sims for Contact (1997)...
adding sims for Deceiver (1997)...
adding sims for Devil's Advocate, The (1997)...
adding sims for Donnie Brasco (1997)...
adding sims for Emma (1996)...
adding sims for English Patient, The (1996)...
adding sims for Evita (1996)...
adding sims for Face/Off (1997)...
adding sims for FairyTale: A True Story (1997)...
adding sims for Fargo (1996)...
adding sims for Fierce Creatures (1997)...
adding sims for Fly Away Home (1996)...
adding sims for Full Monty, The (1997)...
adding sims for Godfather, The (1972)...
adding sims for Good Will Hunting (1997)...
adding sims for Heat (199

Titanic (1997)                  5.0
Secrets & Lies (1996)           5.0
L.A. Confidential (1997)        5.0
Good Will Hunting (1997)        5.0
Godfather, The (1972)           5.0
Fargo (1996)                    5.0
Emma (1996)                     5.0
Kolya (1996)                    5.0
Sense and Sensibility (1995)    5.0
Star Wars (1977)                5.0
dtype: float64

In [281]:
# Number of candidate entries collected. A title is listed once for every rated
# movie it correlates with, so the same title can appear several times.
simCandidates.shape

(2703,)

In [274]:
# Collapse repeated titles: entries sharing the same index label (movie title)
# are summed into a single score per title.
simCandidates = simCandidates.groupby(simCandidates.index).sum()
simCandidates.head()

12 Angry Men (1957)             0.921447
2001: A Space Odyssey (1968)    5.057778
Absolute Power (1997)           4.527559
Abyss, The (1989)               2.811822
African Queen, The (1951)       3.762254
dtype: float64

In [298]:
print("Adding up the similarity scores of duplicate values:")
# Rank the candidates from highest to lowest score. The result is reassigned
# rather than sorted in place, so the cell does not mutate an object that other
# cells have already displayed.
simCandidates = simCandidates.sort_values(ascending=False)
simCandidates.head(10)

Adding up the similarity scores of duplicate values:


Titanic (1997)                  5.0
Emma (1996)                     5.0
L.A. Confidential (1997)        5.0
Good Will Hunting (1997)        5.0
Godfather, The (1972)           5.0
As Good As It Gets (1997)       5.0
Secrets & Lies (1996)           5.0
Kolya (1996)                    5.0
Sense and Sensibility (1995)    5.0
Star Wars (1977)                5.0
dtype: float64

In [269]:
# Top 10 recommendations: remove the titles the user has already rated.
# errors='ignore' skips rated titles that are not present among the candidates.
print("Filtering the result to remove already rated movies:")
filteredSims = simCandidates.drop(myRatings.index, errors='ignore')
filteredSims.head(10)

Filtering the result to remove already rated movies:


Rock, The (1996)                             21.250240
Raiders of the Lost Ark (1981)               18.483661
Return of the Jedi (1983)                    17.947584
Independence Day (ID4) (1996)                17.730422
Back to the Future (1985)                    16.597338
Braveheart (1995)                            16.313714
Indiana Jones and the Last Crusade (1989)    15.855371
Mission: Impossible (1996)                   15.805824
Fugitive, The (1993)                         15.220034
E.T. the Extra-Terrestrial (1982)            15.146415
dtype: float64

(29,)