# Movie Recommender system
# MDS201812
# MovieLens data

In [1]:
import numpy as np
import pandas as pd
from math import sqrt

The data consists of -
<br>943 users
<br>1682 items
<br>100000 ratings (1-5)
<br>Each user has rated at least 20 movies.

# Dataset

In [2]:
data = pd.read_csv("http://files.grouplens.org/datasets/movielens/ml-100k/u.data", delimiter="\t", header = None)
data.columns = ['user_id','item_id','rating','timestamp']
data.head(5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
del data['timestamp']  #delete the column timestamp as it is not needed
data.head(5)

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [3]:
#movie data
movie_raw= pd.read_csv("http://files.grouplens.org/datasets/movielens/ml-100k/u.item",
                       delimiter="|", header = None,encoding='latin-1')
movie_raw.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
movie=movie_raw.iloc[:,[0,1,2,4]]
movie.columns=["Movie_id","Movie_title","Release_date","IMDb URL"]
movie.head(5)

Unnamed: 0,Movie_id,Movie_title,Release_date,IMDb URL
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995)


# Creating dictionary of movie critics and their ratings of a set of movies
working with movie_ids(item_ids),later the movie names are extracted while recommending

In [5]:
critics = {}
for index,row in data.iterrows():
    try:
        critics[row['user_id']][row['item_id']] = row['rating']
    except Exception:
        critics[row['user_id']] = {row['item_id']:row['rating']}

In [6]:
critics

{196: {242: 3,
  393: 4,
  381: 4,
  251: 3,
  655: 5,
  67: 5,
  306: 4,
  238: 4,
  663: 5,
  111: 4,
  580: 2,
  25: 4,
  286: 5,
  94: 3,
  692: 5,
  8: 5,
  428: 4,
  1118: 4,
  70: 3,
  66: 3,
  257: 2,
  108: 4,
  202: 3,
  340: 3,
  287: 3,
  116: 3,
  382: 4,
  285: 5,
  1241: 3,
  1007: 4,
  411: 4,
  153: 5,
  13: 2,
  762: 3,
  173: 2,
  1022: 4,
  845: 4,
  269: 3,
  110: 1},
 186: {302: 3,
  566: 5,
  250: 1,
  148: 4,
  263: 3,
  470: 5,
  983: 3,
  281: 4,
  385: 4,
  588: 4,
  406: 1,
  925: 5,
  977: 3,
  322: 5,
  53: 1,
  333: 3,
  591: 4,
  742: 3,
  770: 2,
  550: 4,
  237: 2,
  1277: 4,
  1253: 4,
  71: 5,
  554: 1,
  257: 4,
  44: 5,
  117: 5,
  327: 3,
  288: 1,
  225: 4,
  988: 4,
  31: 4,
  939: 5,
  546: 4,
  100: 4,
  338: 3,
  717: 3,
  118: 2,
  226: 5,
  300: 5,
  299: 3,
  596: 4,
  95: 3,
  243: 2,
  1016: 5,
  79: 5,
  306: 4,
  106: 2,
  829: 4,
  934: 3,
  1399: 2,
  754: 2,
  595: 3,
  121: 2,
  568: 4,
  303: 3,
  332: 4,
  540: 4,
  1046: 3,
  98

We will now find the similarity between 2 users based on their movie ratings. There are different functions to find similarity between users. 3 types of similarity functions are given here.
<br>
# Finding Similarity using Euclidean distance 

The basis of many measures of similarity and dissimilarity is euclidean distance.Euclidean distance is the square root of the sum of squared differences between corresponding elements of the two vectors.Here two vectors are two persons and elements of the vectors are the ratings given by them on the common movies.

In [7]:
def sim_distance(prefs,user_1,user_2):  # Get the list of shared_items  
    si={}       #stores 1 if the movie is rated by both the users, else stores 0
    for item in prefs[user_1]:    
        if item in prefs[user_2]:       
            si[item]=1
    # if they have no ratings in common, return 0  
    if len(si)==0: 
        return 0
    # Add up the squares of all the differences  
    for item in prefs[user_1]:
        if item in prefs[user_2]:
            sum_of_squares=sum([pow(prefs[user_1][item]-prefs[user_2][item],2)])                      
    return(1/(1+sum_of_squares)) 

# Finding Similarity using Jaccard's Distance

The Jaccard index, also known as Intersection over Union and the Jaccard similarity coefficient (originally coined coefficient de communautÃ© by Paul Jaccard), is a statistic used for comparing the similarity and diversity of sample sets. The Jaccard coefficient measures similarity between finite sample sets, and is defined as the size of the intersection divided by the size of the union of the sample sets.

In [8]:
def sim_jaccard(prefs, p1, p2):
    numerator = 0
    difference = 0
    for movie in prefs[p1]:
        if movie in prefs[p2]:
            numerator = numerator + 1
        else:
            difference = difference + 1
    denominator = len(prefs[p1].keys()) + difference        
    return numerator/denominator 

# Finding Similarity using Pearson's correlation coefficient

A slightly more sophisticated way to determine the similarity between peopleâ€™s inter-
ests is to use a Pearson correlation coefficient. The correlation coefficient is a mea-
sure of how well two sets of data fit on a straight line. The formula for this is more
complicated than the Euclidean distance score, but it tends to give better results in
situations where the data isnâ€™t well normalizedâ€”for example, if criticsâ€™ movie rank-
ings are routinely more harsh than average.

In [9]:
def sim_pearson(prefs,p1,p2):  # Get the list of mutually rated items  
    si={}      #stores 1 if the movie is rated by both the users, else stores 0
    for item in prefs[p1]:    
        if item in prefs[p2]: 
            si[item]=1
    # Find the number of elements  
    n=len(si)
    # if they are no ratings in common, return 0  
    if n==0: 
        return 0
    # Add up all the preferences  
    sum1=sum([prefs[p1][it] for it in si])  
    sum2=sum([prefs[p2][it] for it in si])
    # Sum up the squares  
    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])  
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si])
    # Sum up the products  
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    # Calculate Pearson score  
    num=pSum-(sum1*sum2/n)  
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))  
    if den==0: 
        return 0
    r=num/den
    return r

In [10]:
sim_distance(critics,451,266) #Similarity score(Euclidean) of users having userid 451 and userid 266

0.5

In [11]:
sim_jaccard(critics,451,266)  #Similarity score(Jacard) of users having userid 451 and userid 266

0.0425531914893617

In [12]:
sim_pearson(critics,451,266)  #Similarity score(Pearson) of users having userid 451 and userid 266

-0.46188021535170054

# Function to find the top matching users for a particular user

In [13]:
# Number of results and similarity function are optional params. 
def topMatches(prefs,person,n,similarity):  # n = No. of outputs needed
    scores=[(similarity(prefs,person,other),other) for other in prefs if other!=person]
  # Sort the list so the highest scores appear at the top  
    scores.sort()  
    scores.reverse()  
    return scores[0:n]

In [14]:
topMatches(critics,2,5,sim_pearson) # Top matches for user with userid 196

[(1.0, 914), (1.0, 607), (1.0, 426), (1.0, 187), (1.0, 167)]

# Function to get recommendations of top 10 unwatched movies 

In [16]:
def getRecommendations(prefs,person,similarity):
    totals={}
    simSums={}
    for other in prefs:
        # don't compare me to myself
        if other==person: continue
        sim=similarity(prefs,person,other)
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*sim
                # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=sim
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items( )]
    # Return the sorted list
    rankings.sort( )
    rankings.reverse( )
    return rankings[:10]

In [17]:
critics[166].keys()  #Movie_id of watched(rated) movies for user with userid 166

dict_keys([346, 328, 322, 288, 258, 343, 300, 894, 323, 748, 751, 984, 294, 286, 243, 347, 313, 688, 315, 687])

In [18]:
getRecommendations(critics,166,sim_pearson) 
#gives 10 recommended movies(movie ids) with rating for the user with user id 166

[(5.0, 1656),
 (5.0, 1629),
 (5.0, 1607),
 (5.0, 1592),
 (5.0, 1529),
 (5.0, 1500),
 (5.0, 1467),
 (5.0, 1462),
 (5.0, 1448),
 (5.0, 1429)]

In [19]:
def Recommendations(user_id):  # Function to find Recommended movie names for the given user id
 l=[]   
 for rating,item_id in getRecommendations(critics,user_id,sim_pearson):
    l.append((movie['Movie_title'][item_id-1],rating))
 return l   
Recommendations(166) #gives 10 recommended movies(movie names) with rating for the user with user id 166

[('Little City (1998)', 5.0),
 ('Nico Icon (1995)', 5.0),
 ('Hurricane Streets (1998)', 5.0),
 ('Magic Hour, The (1998)', 5.0),
 ('Underground (1995)', 5.0),
 ('Santa with Muscles (1996)', 5.0),
 ('Saint of Fort Washington, The (1993)', 5.0),
 ('Thieves (Voleurs, Les) (1996)', 5.0),
 ('My Favorite Season (1993)', 5.0),
 ('Sliding Doors (1998)', 5.0)]