### This project is to build a movie recommender model based on the MovieLens dataset. 

In [4]:
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr

 Load Movielist and Ratings from **Movielens dataset**

In [78]:
movies =pd.read_csv('https://raw.githubusercontent.com/srajeev1/MSDA-IS643/master/projects/project1/ml-latest-small/movies.csv')
ratings =pd.read_csv('https://raw.githubusercontent.com/srajeev1/MSDA-IS643/master/projects/project1/ml-latest-small/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [6]:
n_users = ratings.userId.unique().shape[0]
n_items = ratings.movieId.unique().shape[0]
print str(n_users) + ' users'
print str(n_items) + ' items'

668 users
10325 items


Replace MovieID with name for clarity

In [10]:
def replace_name(x):
        return movies[movies['movieId']==x].title.values[0]
ratings.movieId= map(replace_name,ratings.movieId)


In [69]:
M=ratings.pivot_table(index=['userId'],columns=['movieId'],values='rating')
M.shape

(668, 10323)

Keep a copy of the original dataframe

In [12]:
M_copy =M.copy()
M_copy.shape

(668, 10323)

Similarity between two products can be given by the correlation existing
between their variables. **Pearson's correlation coefficient** is a popular correlation
coefficient calculated between two variables as the covariance of the two variables
divided by the product of their standard deviations.

In [62]:
def pearson(s1,s2):
    #print s1.mean()
    #print s2.mean()
    s1_c = s1-s1.mean()
    #print s1_c
    #print s2-s2.mean()
    
    s2_c = s2-s2.mean()
    return np.sum(s1_c * s2_c)/np.sqrt(np.sum(s1_c **2) * np.sum(s2_c **2))

In [63]:
pearson(M['Usual Suspects, The (1995)'],M['Casino (1995)'])

0.29069554555262817

In [64]:
pearson(M['Usual Suspects, The (1995)'],M['Powder (1995)'])

0.056083498644980023

*Using Scipy pearsonr function

In [66]:
pearsonr(M['Usual Suspects, The (1995)'],M['Powder (1995)'])

(0.056083498644980023, 0.1476332553826446)

In [76]:
# get recommends for the selected movie
def get_recs(movie_name,M,num):
    import numpy as np
    reviews =[]
    for title in M.columns:
        if title== movie_name:
            continue
        cor = pearson(M[movie_name],M[title])
        if np.isnan(cor):
            continue
        else:
            reviews.append((title,cor))
                
    reviews.sort(key= lambda tup: tup[1], reverse =True)
    return reviews[:num]

In [77]:
recs =get_recs('Mission: Impossible (1996)', M,10)
recs

[('Independence Day (a.k.a. ID4) (1996)', 0.56787052106043145),
 ('Rock, The (1996)', 0.55509403159985216),
 ('Twister (1996)', 0.52861718815040704),
 ('Eraser (1996)', 0.47034986078645913),
 ('Broken Arrow (1996)', 0.43399207967286502),
 ('Nutty Professor, The (1996)', 0.42605946764517272),
 ('Star Trek: First Contact (1996)', 0.41781114898052496),
 ('Executive Decision (1996)', 0.41079617539506708),
 ('Star Wars: Episode VI - Return of the Jedi (1983)', 0.39886539941422233),
 ('Ransom (1996)', 0.39124400082342853)]

Calculating Sparsity of the data frame

In [67]:
# total non zero elements in the dataframe
count1 =M_copy.count(axis=1, level=None, numeric_only=True)
sparsity= count1.sum()
print sparsity
sparsity /= (M.shape[0] * M.shape[1])
sparsity *= 100
print 'Sparsity: {:4.2f}%'.format(sparsity)

105335
Sparsity: 0.00%


sparsity is almost 0%

In [72]:
#convert all Nan's to 0's
M[np.isnan(M)]=0


In [None]:
Since sparsity is almost 0% Cosine similarity would be a better choice for recommendation

In [73]:
def cosine_similarity(ratings, kind='user'):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) 
       
    elif kind == 'item':
        sim = ratings.T.dot(ratings)
    norms = np.array([np.sqrt(np.diagonal(sim))])
    #print sim
    #print norms   
    print sim/(norms * norms.T)
    #return (sim / norms / norms.T)
    

In [59]:
#M.shape

In [74]:
s1= cosine_similarity(M, kind='item')
s1


movieId                                             '71 (2014)  \
movieId                                                          
'71 (2014)                                            1.000000   
'Hellboy': The Seeds of Creation (2004)               0.000000   
'Round Midnight (1986)                                0.000000   
'Til There Was You (1997)                             0.000000   
'burbs, The (1989)                                    0.000000   
'night Mother (1986)                                  0.000000   
(500) Days of Summer (2009)                           0.000000   
*batteries not included (1987)                        0.000000   
...And Justice for All (1979)                         0.342682   
10 (1979)                                             0.000000   
10 Items or Less (2006)                               0.000000   
10 Things I Hate About You (1999)                     0.000000   
10 to Midnight (1983)                                 0.000000   
10,000 BC 

In [75]:
s1= cosine_similarity(M, kind='user')
s1

userId       1         2         3         4         5         6         7    \
userId                                                                         
1       1.000000  0.101113  0.210044  0.128766  0.057896  0.077130  0.358090   
2       0.101113  1.000000  0.115559  0.034610  0.032705  0.028305  0.062914   
3       0.210044  0.115559  1.000000  0.058208  0.044426  0.012816  0.084522   
4       0.128766  0.034610  0.058208  1.000000  0.019298  0.005781  0.059089   
5       0.057896  0.032705  0.044426  0.019298  1.000000  0.053378  0.080822   
6       0.077130  0.028305  0.012816  0.005781  0.053378  1.000000  0.099185   
7       0.358090  0.062914  0.084522  0.059089  0.080822  0.099185  1.000000   
8       0.097434  0.471918  0.066620  0.024420  0.041536  0.024964  0.095269   
9       0.239189  0.194232  0.459703  0.050572  0.023168  0.010270  0.175259   
10      0.026663  0.000000  0.068454  0.000000  0.011915  0.054996  0.023508   
11      0.175067  0.159791  0.230443  0.