In [303]:
# We use MovieRatings dataset that was generated by the class for this project. This system recommends movies for the active user (the user whom the prediction is for)

# We make an attempt to use (user-item) collaborative filtering technique for project-1

# Typically, the workflow of a collaborative filtering system is: (Source: Wikipedia)

# A user expresses his or her preferences by rating items (e.g. books, movies or CDs) of the system. These ratings can be viewed as an approximate representation of the user's interest in the corresponding domain.
# The system matches this user’s ratings against other users’ and finds the people with most "similar" tastes.
# With similar users, the system recommends items that the similar users have rated highly but not yet being rated by this user (presumably the absence of rating is often considered as the unfamiliarity of an item)

In [130]:
#Load data into a pandas dataframe
import csv
#import os
import pandas as pd
import numpy as np
#os.chdir('C:\\Users\ppadebettu\Documents\GitHub\IS-643-Recommender Systems\Week_1')
url = 'https://raw.githubusercontent.com/ppadebettu/CUNY/Master/IS_643_Recommender_Systems/Project_1/MovieRatings.csv'
df = pd.read_csv(url, sep = "," , header = 0, na_values='NaN')

In [131]:
#Display data
df
#df = df.fillna(0)

Unnamed: 0,Critic,CaptainAmerica,Deadpool,Frozen,JungleBook,PitchPerfect2,StarWarsForce
0,Burton,,,,4.0,,4.0
1,Charley,4.0,5.0,4.0,3.0,2.0,3.0
2,Dan,,5.0,,,,5.0
3,Dieudonne,5.0,4.0,,,,5.0
4,Matt,4.0,,2.0,,2.0,5.0
5,Mauricio,4.0,,3.0,3.0,4.0,
6,Max,4.0,4.0,4.0,2.0,2.0,4.0
7,Nathan,,,,,,4.0
8,Param,4.0,4.0,1.0,,,5.0
9,Parshu,4.0,3.0,5.0,5.0,2.0,3.0


In [132]:
#Convert the dataframe into dictionary format
#file.set_index('CaptainAmerica').to_dict()
movie_dict = df.set_index('Critic').T.to_dict('dict')

In [133]:
#Display data in the dictionary format
movie_dict['Parshu']

{'CaptainAmerica': 4.0,
 'Deadpool': 3.0,
 'Frozen': 5.0,
 'JungleBook': 5.0,
 'PitchPerfect2': 2.0,
 'StarWarsForce': 3.0}

In [134]:
from math import sqrt

#Function that returns a distance-base similarity score for person1 and person2

def sim_distance(dict, p1, p2):
#Get the list of mutually rated items, ignore items that were not rated by either person
 si = {}
 for item in movie_dict[p1]:
       if (item in movie_dict[p2]) and not(np.isnan(movie_dict[p2][item])) and not(np.isnan(movie_dict[p1][item])) :
          si[item] = 1
#If they have no rating in common, return 0
 if len(si) == 0: 
  return 0

#Add up the squares of all differences for common ratings
 sum_of_squares = sum([pow(movie_dict[p1][item]-movie_dict[p2][item],2) for item in si.keys()])

 return 1 / (1 + sum_of_squares)

In [135]:
# Test the distance-based similarity function for two users
sim_distance(movie_dict, 'Prashanth','Parshu')

0.14285714285714285

In [136]:
#Function that returns the Pearson correlation coefficient for person1 and person2 
def sim_pearson(dict,p1,p2):
 #Get the list of mutually rated items, ignore items that were not rated by either person
 si = {}
 for item in movie_dict[p1]:
  if item in movie_dict[p2] and not(np.isnan(movie_dict[p2][item])) and not(np.isnan(movie_dict[p1][item])): 
   si[item] = 1

 #if they are no rating in common, return 0
 if len(si) == 0:
  return 0

 #sum calculations
 n = len(si)
            
 #sum of all preferences
 sum1 = sum([movie_dict[p1][it] for it in si])
 sum2 = sum([movie_dict[p2][it] for it in si])
 #Sum of the squares
 sum1Sq = sum([pow(movie_dict[p1][it],2) for it in si])
 sum2Sq = sum([pow(movie_dict[p2][it],2) for it in si])
 #Sum of the products
 pSum = sum([movie_dict[p1][it] * movie_dict[p2][it] for it in si])
 #Calculate r (Pearson score)
 num = pSum - (sum1 * sum2)/n
 den = sqrt((sum1Sq - pow(sum1,2)/n) * (sum2Sq - pow(sum2,2)/n))
 if den == 0:
  return 0

 r = num/den

 return r


In [119]:
# Test the Pearson-based similarity function
sim_pearson(movie_dict,'Prashanth','Parshu')

0.55901699437494845

In [120]:
#Function which returns the top 5 best matches for the person from the movie dictionary
#Number of the results and similiraty function are optional parameters
def topMatches(dict,person,n=5,similarity=sim_distance):
     scores = [(similarity(dict,person,other),other) for other in dict if other != person]
     scores.sort()
     scores.reverse()
     return scores[0:n]

In [137]:
# Test top best matches for a person from the movie dictionary
topMatches(movie_dict, 'Parshu', similarity=sim_pearson)

[(0.86602540378443904, 'Shipra'),
 (0.55901699437494845, 'Prashanth'),
 (0.50000000000000266, 'Dieudonne'),
 (0.31491832864888686, 'Charley'),
 (0.10660035817780605, 'Max')]

In [138]:
#Function that gets recommendations for a person by using a weighted average of every other user's rankings

def getRecommendations(dict,person,similarity=sim_distance):
 totals = {}
 simSums = {}

 for other in dict:
  #don't compare me to myself
  if other == person:
   continue
  sim = similarity(dict,person,other)
  #ignore scores of zero or lower
  if sim <= 0: 
   continue
  for item in dict[other]:
   #only score movies I haven't rated yet
   if (item not in dict[person]) or (dict[person][item] == 0) or (np.isnan(dict[person][item])) :
    #Similarity * score
    totals.setdefault(item,0)
    totals[item] += not(np.isnan(dict[other][item] * sim))
    #Sum of similarities
    simSums.setdefault(item,0)
    simSums[item] += sim

    #Create the normalized list
 rankings = [(total/simSums[item],item) for item,total in totals.items()]

 #Return the sorted list
 rankings.sort()
 rankings.reverse()
 return rankings


In [139]:
#Test getRecommendations function
getRecommendations(movie_dict,'Mauricio',similarity=sim_distance)

[(2.9148736532177177, 'StarWarsForce'), (2.1199081114310676, 'Deadpool')]

In [140]:
#Now we validate the results with python graphlab package
#now reshape from wide to long format
#df_long = pd.wide_to_long(df, ['Rating'], i = 'Critic', j = 'Movie')
df_long = pd.melt(df, id_vars=['Critic'])
df_long.columns = ['Critic', 'Movie', 'Rating']
#Remove NaN's as graph lab doesn't seem to like NaN's or zeroes
df_long = df_long.dropna()
df_long["Rating"] = df_long["Rating"].astype(int)

In [141]:
#Display dataframe in long format
df_long.head()

Unnamed: 0,Critic,Movie,Rating
1,Charley,CaptainAmerica,4
3,Dieudonne,CaptainAmerica,5
4,Matt,CaptainAmerica,4
5,Mauricio,CaptainAmerica,4
6,Max,CaptainAmerica,4


In [142]:
#Let's use graphlab package
import graphlab
from graphlab import SFrame
#Convert pandas dataframe to graphlab SFrame object
sf = graphlab.SFrame(df_long)

In [143]:
#Display SFrame data
sf.head()

Critic,Movie,Rating
Charley,CaptainAmerica,4
Dieudonne,CaptainAmerica,5
Matt,CaptainAmerica,4
Mauricio,CaptainAmerica,4
Max,CaptainAmerica,4
Param,CaptainAmerica,4
Parshu,CaptainAmerica,4
Prashanth,CaptainAmerica,5
Sreejaya,CaptainAmerica,5
Steve,CaptainAmerica,4


In [144]:
#Create recommender model with default similarity_type = jaccard
model = graphlab.recommender.create(sf, user_id="Critic", item_id="Movie")

In [145]:
#Display model
model

Class                           : ItemSimilarityRecommender

Schema
------
User ID                         : Critic
Item ID                         : Movie
Target                          : None
Additional observation features : 0
Number of user side features    : 0
Number of item side features    : 0

Statistics
----------
Number of observations          : 61
Number of users                 : 16
Number of items                 : 6

Training summary
----------------
Training time                   : 0.003

Model Parameters
----------------
Model class                     : ItemSimilarityRecommender
threshold                       : 0.001
similarity_type                 : jaccard
training_method                 : auto

Other Settings
--------------
degree_approximation_threshold  : 4096
sparse_density_estimation_sample_size: 4096
max_data_passes                 : 4096
target_memory_usage             : 8589934592
seed_item_set_size              : 50
nearest_neighbors_interaction_proporti

In [146]:
##Create recommender model with similarity_type = cosine
model_cosine = graphlab.item_similarity_recommender.create(sf, user_id="Critic", item_id="Movie", target="Rating",
                                                           similarity_type="cosine")

In [147]:
#Display model
model_cosine

Class                           : ItemSimilarityRecommender

Schema
------
User ID                         : Critic
Item ID                         : Movie
Target                          : Rating
Additional observation features : 0
Number of user side features    : 0
Number of item side features    : 0

Statistics
----------
Number of observations          : 61
Number of users                 : 16
Number of items                 : 6

Training summary
----------------
Training time                   : 0.002

Model Parameters
----------------
Model class                     : ItemSimilarityRecommender
threshold                       : 0.001
similarity_type                 : cosine
training_method                 : auto

Other Settings
--------------
degree_approximation_threshold  : 4096
sparse_density_estimation_sample_size: 4096
max_data_passes                 : 4096
target_memory_usage             : 8589934592
seed_item_set_size              : 50
nearest_neighbors_interaction_proport

In [148]:
#Make some recommedations using survey data from the class
recommend_result = model.recommend(users=None, k=10)
recommend_result.head()

Critic,Movie,score,rank
Dieudonne,Frozen,0.576923092206,1
Dieudonne,PitchPerfect2,0.474747459094,2
Dieudonne,JungleBook,0.466346144676,3
Matt,JungleBook,0.558238640428,1
Matt,Deadpool,0.557775571942,2
Mauricio,Deadpool,0.530303031206,1
Mauricio,StarWarsForce,0.467708334327,2
Param,JungleBook,0.537259608507,1
Param,PitchPerfect2,0.515151500702,2
Prashanth,PitchPerfect2,0.521212112904,1
