# User-based collaborative filtering method 

## Read in the GroupLens MovieLens 100K dataset and explore its properties

In [1]:
# Directory holding the MovieLens 100K files, relative to the notebook's working directory.
DATA_DIR='Desktop/ml-100k'

# movies: item id (string) -> movie title, taken from the first two '|'-separated fields of u.item.
# NOTE(review): u.item presumably contains non-ASCII titles; under Python 3 the file may need an
# explicit encoding (e.g. ISO-8859-1) -- confirm against the dataset README.
movies={}
with open(DATA_DIR+'/u.item') as item_file:
    for line in item_file:
        (item_id,title)=line.split('|')[0:2]
        movies[item_id]=title

# rate:   user id -> {movie title -> rating}
# mvrate: movie title -> {user id -> rating}
# Both views are filled in one pass over u.data. Because they are keyed by title,
# item ids that share a title end up merged under that title.
# mvrate only gets an entry for a movie once at least one rating for it is read.
rate={}
mvrate={}
with open(DATA_DIR+'/u.data') as data_file:
    for line in data_file:
        (userid,itemid,rating,timestamp)=line.split('\t')
        title=movies[itemid]
        score=float(rating)
        rate.setdefault(userid,{})[title]=score
        mvrate.setdefault(title,{})[userid]=score

In [7]:
import matplotlib.pyplot as plt

#scatter the ratings of user 344 against those of user 345: one point per movie that both users rated
ratings_344=rate['344']
ratings_345=rate['345']

#titles rated by both users, in the iteration order of user 344's ratings
mvname=[title for title in ratings_344 if title in ratings_345]
x=[ratings_344[title] for title in mvname]
y=[ratings_345[title] for title in mvname]

fig=plt.figure()
plt.scatter(x,y)
plt.xlabel('user 344')
plt.ylabel('user 345')
plt.axis([0,6,0,6])
#label every point with the movie it stands for
for (px,py,label) in zip(x,y,mvname):
    plt.annotate(label,(px,py))
plt.plot()

[]

In [3]:
#scatter the ratings of two movies (item ids '340' and '345') against each other: one point per user who rated both
first_title=movies['340']
second_title=movies['345']

#users who rated both movies, in the iteration order of the first movie's raters
username=[uid for uid in mvrate[first_title] if uid in mvrate[second_title]]
a=[mvrate[first_title][uid] for uid in username]
b=[mvrate[second_title][uid] for uid in username]

fig=plt.figure()
plt.scatter(a,b)
plt.xlabel(first_title)
plt.ylabel(second_title)
plt.axis([0,6,0,6])
#label every point with the id of the user it stands for
for (pa,pb,label) in zip(a,b,username):
    plt.annotate(label,(pa,pb))
plt.plot()

[]

## Define methods to find 'neighbors' for a certain user based on several distance metrics

In [4]:
#the method parameter is a scoring function such as pearson or euclid, both defined below in this cell
def simiuser(dict,person,n,method):
    """Print and return the n users with the highest score relative to `person`.

    dict   -- 2d dictionary keyed by user id; passed unchanged to `method`
    person -- id of the user whose neighbours are wanted
    n      -- maximum number of neighbours to keep
    method -- callable method(dict, person, other) returning a numeric score

    Returns a list of (user id, score) tuples sorted by descending score.
    Because the order is descending, `method` should return larger values for
    more similar users; a distance such as euclid would rank the most distant
    users first.
    """
    scored=[]
    for other in dict:
        if other==person:
            continue
        value=method(dict,person,other)
        # A metric may return None when it is undefined for a pair (for example
        # no movies in common); count that as "no similarity" instead of
        # crashing in float(None).
        scored.append((other,0.0 if value is None else float(value)))

    scored.sort(key=lambda tup:tup[1],reverse=True)
    top=scored[:n]
    # print(...) with a single argument behaves the same on Python 2 and Python 3
    print(top)
    return top
##################################################    
#CALCULATE EUCLIDEAN DISTANCE BETWEEN TWO DATA SETS
#dict is a 2d dictionary 
#person1 and person2 are user ids in the form of 'id'
def euclid(dict,person1,person2):
    """Euclidean distance between two users over the movies both have rated.

    dict             -- 2d dictionary, user id -> {movie name -> rating}
    person1, person2 -- user ids in the form of 'id'

    Returns the norm of the difference between the two rating vectors
    restricted to the shared movies. With no shared movies the vectors are
    empty and the result is 0.0.
    """
    # Local import: no cell visible in this notebook binds the name `np`.
    import numpy as np
    ratings1=dict[person1]
    ratings2=dict[person2]
    # dictionary lookup replaces the nested loop over both users' movies
    shared=[movie for movie in ratings1 if movie in ratings2]
    a=np.array([ratings1[movie] for movie in shared])
    b=np.array([ratings2[movie] for movie in shared])
    return np.linalg.norm(a-b)
##############################################    
#CALCULATE PEARSON CORRELATION BETWEEN TWO DATA SETS
#dict is a 2d dictionary
#person1 and person2 are user ids in the form of 'id'
import math
def pearson(dict,person1,person2):
    """Pearson correlation between two users over the movies both have rated.

    dict             -- 2d dictionary, user id -> {movie name -> rating}
    person1, person2 -- user ids in the form of 'id'

    Returns a float. The result is 0.0 when the users share no movie or when
    either user's shared ratings have zero variance, since no correlation can
    be measured in those cases.
    """
    # Local import so the function does not depend on a later cell having imported numpy.
    import numpy
    ratings1=dict[person1]
    ratings2=dict[person2]
    shared=[movie for movie in ratings1 if movie in ratings2]
    if not shared:
        # previously fell off the end and returned None, which broke callers
        # that compare or convert the result
        return 0.0
    c=[ratings1[movie] for movie in shared]
    d=[ratings2[movie] for movie in shared]
    # float() keeps the means exact even if ratings are ints under Python 2
    cmean=sum(c)/float(len(c))
    dmean=sum(d)/float(len(d))
    x=numpy.array(c)-cmean
    y=numpy.array(d)-dmean
    # the builtin sum is kept so the accumulation order of the original is unchanged
    denom=math.sqrt(sum(x**2)*sum(y**2))
    #if one of the variances is zero, then there is no correlation
    if denom==0:
        return 0.0
    return float(sum(x*y)/denom)

In [8]:
# predict user rating of movies he/she has not rated yet and include the predicted value in the dictionary
# depend on dictionaries "movies" and "mvrate"

def mean(dict,key):
    """Return the arithmetic mean (numpy.mean) of the values stored in dict[key].

    dict -- 2d dictionary; dict[key] must itself be a dictionary of numbers
    key  -- outer key whose values are averaged
    """
    import numpy
    # .values() exists on both Python 2 and Python 3 dictionaries, unlike .itervalues()
    return numpy.mean(list(dict[key].values()))
    
def std(dict,key):
    """Return the standard deviation (numpy.std with its default arguments) of the values in dict[key].

    dict -- 2d dictionary; dict[key] must itself be a dictionary of numbers
    key  -- outer key whose values are used
    """
    import numpy
    # .values() exists on both Python 2 and Python 3 dictionaries, unlike .itervalues()
    return numpy.std(list(dict[key].values()))
   
def zscore(dict,movie,user):
    """Return the z-score of `user`'s rating of `movie` relative to that user's own ratings.

    dict  -- 2d dictionary, user id -> {movie name -> rating}
    movie -- movie name; must be a key of dict[user]
    user  -- user id

    Computed as (rating - mean(dict,user)) / std(dict,user). When the user's
    ratings have zero spread every rating equals the mean, so 0.0 is returned
    instead of the nan that 0/0 would produce.
    """
    spread=std(dict,user)
    if spread==0:
        return 0.0
    return (dict[user][movie]-mean(dict,user))/spread

def predict(dict,user,method):
    """Add a predicted rating to dict[user] for every movie the user has not rated.

    Depends on the module-level dictionaries `movies` (item id -> title) and
    `mvrate` (title -> {user id -> rating}).

    dict   -- 2d dictionary, user id -> {movie name -> rating}; dict[user] is updated in place
    user   -- id of the user to predict for
    method -- callable method(dict, user, other) returning a similarity weight (e.g. pearson)

    For each unrated movie the prediction is
        mean(user) + std(user) * sum(w_i * z_i) / sum(|w_i|)
    taken over the users i who rated the movie, where w_i = method(dict,user,i)
    and z_i = zscore(dict,movie,i). With non-negative weights sum(|w_i|) is the
    plain sum of the weights. The value is truncated with int(), as before.
    Movies for which no rater has a usable non-zero weight get no prediction.
    """
    import numpy
    # The user's statistics and the similarities are computed from the real
    # ratings only: predictions are collected separately and written back at
    # the end, so they cannot leak into the mean, std or similarity values.
    user_mean=mean(dict,user)
    user_std=std(dict,user)
    weights={}
    predictions={}
    #mv is the movie name
    for mv in movies.values():
        if mv in dict[user] or mv in predictions:
            continue
        # accumulators are reset for every movie
        num=0.0
        denom=0.0
        # users who have rated mv (none if the title never appears in mvrate)
        for i in mvrate.get(mv,{}):
            if i==user:
                continue
            if i not in weights:
                w=method(dict,user,i)
                # an undefined similarity (None) carries no information
                weights[i]=0.0 if w is None else float(w)
            w=weights[i]
            if w==0:
                continue
            z=zscore(dict,mv,i)
            if not numpy.isfinite(z):
                # rater with zero rating spread: the z-score is undefined
                continue
            num=num+z*w
            # same metric as the numerator, by magnitude
            denom=denom+abs(w)
        if denom==0:
            continue
        predictions[mv]=int(user_mean+user_std*(num/denom))
    dict[user].update(predictions)
         

## The final recommender

In [5]:
import numpy
def recommender(dict, user, distance_metric, n, threshold=0.2):
    """Recommend up to n movies the user has not rated yet.

    dict            -- 2d dictionary, user id -> {movie name -> rating}
    user            -- id of the user to recommend for
    distance_metric -- callable f(dict, user, other) returning a similarity
                       weight (larger means more similar), e.g. pearson
    n               -- maximum number of recommendations returned
    threshold       -- only neighbours whose weight is strictly greater than
                       this value contribute (default 0.2)

    Returns a list of (predicted rating, movie name) tuples sorted by
    descending predicted rating. A prediction is the weight-averaged rating of
    the contributing neighbours who rated that movie.
    """
    weighted_rating={}
    weights={}
    already_rated=dict[user]
    for neib in dict:
        if neib==user:
            continue
        dist=distance_metric(dict,user,neib)
        # The weight does not depend on the movie, so test it once per neighbour.
        # None (metric undefined for this pair) is skipped rather than compared.
        if dist is None or not dist>threshold:
            continue
        for mv in dict[neib]:
            if mv in already_rated:
                continue
            weighted_rating[mv]=weighted_rating.get(mv,0)+dist*dict[neib][mv]
            weights[mv]=weights.get(mv,0)+dist
    # a zero total weight is only possible with a negative threshold; such movies are dropped
    result=[(weighted_rating[mv]/weights[mv],mv) for mv in weights if weights[mv]]
    result.sort(key=lambda tup:tup[0],reverse=True)
    return result[:n]

## Using the recommender to suggest top 4 movies for user id '344'

In [6]:
# Top 4 recommendations for user '344', weighting neighbours by Pearson correlation
recommender(rate,'344',pearson,4)

[(5.000000000000001, 'Angel Baby (1995)'),
 (5.0, 'Aparajito (1956)'),
 (5.0, 'They Made Me a Criminal (1939)'),
 (5.0, "Someone Else's America (1995)")]

## Calculate root mean square error (RMSE) of the recommender for one user 

In [23]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(dict,user,distance_metric,threshold=0.2):
    """Print and return the RMSE of the recommender's predictions for one user.

    dict            -- 2d dictionary, user id -> {movie name -> rating}
    user            -- id of the user to evaluate
    distance_metric -- callable f(dict, user, other) returning a similarity
                       weight (larger means more similar), e.g. pearson
    threshold       -- only neighbours whose weight is strictly greater than
                       this value contribute (default 0.2)

    Every movie the user has rated, and that at least one contributing
    neighbour has rated too, is predicted as the weight-averaged neighbour
    rating and compared with the user's real rating. The ratings being
    predicted also take part in the similarity computation, so this is an
    in-sample error. Returns nan when no movie can be predicted.

    NOTE: a later cell of this notebook defines another function named rmse
    (all users, different signature); once that cell has run it replaces this one.
    """
    weighted_rating={}
    weights={}
    rated=dict[user]
    for neib in dict:
        if neib==user:
            continue
        dist=distance_metric(dict,user,neib)
        # The weight does not depend on the movie, so test it once per neighbour.
        # None (metric undefined for this pair) is skipped rather than compared.
        if dist is None or not dist>threshold:
            continue
        for mv in dict[neib]:
            if mv in rated:
                weighted_rating[mv]=weighted_rating.get(mv,0)+dist*dict[neib][mv]
                weights[mv]=weights.get(mv,0)+dist

    # a zero total weight is only possible with a negative threshold; such movies are dropped
    scored=[mv for mv in weights if weights[mv]]
    if scored:
        predicted=[weighted_rating[mv]/weights[mv] for mv in scored]
        real=[rated[mv] for mv in scored]
        rms=sqrt(mean_squared_error(real,predicted))
    else:
        # nothing to compare: report nan instead of failing on empty input
        rms=float('nan')
    print(rms)
    return rms

In [24]:
# RMSE for user '344', with Pearson correlation as the neighbour weight
rmse(rate,'344',pearson)

0.668173115811


## Calculate root mean square error (RMSE) of the recommender for all users

In [11]:
from collections import OrderedDict
import numpy
from math import sqrt
from sklearn.metrics import mean_squared_error

def rmse(dict,distance_metric,threshold=0.2):
    """Print and return the RMSE of the recommender's predictions over all users.

    dict            -- 2d dictionary, user id -> {movie name -> rating}
    distance_metric -- callable f(dict, user, other) returning a similarity
                       weight (larger means more similar), e.g. pearson
    threshold       -- only neighbours whose weight is strictly greater than
                       this value contribute (default 0.2)

    For every user, each movie the user has rated, and that at least one
    contributing neighbour has rated too, is predicted as the weight-averaged
    neighbour rating. All (real, predicted) pairs are pooled into one RMSE.
    Returns nan when no movie can be predicted.

    NOTE: this definition reuses the name of the single-user rmse(dict, user,
    distance_metric) defined in an earlier cell and replaces it once this cell runs.
    The metric is evaluated for every ordered pair of users, so the run time
    grows with the square of the number of users.
    """
    # The pairs are appended to two parallel lists in the same order, which
    # keeps real and predicted values aligned without per-user bookkeeping.
    predicted_list=[]
    real_list=[]
    for user in dict:
        weighted_rating={}
        weights={}
        rated=dict[user]
        for neib in dict:
            if neib==user:
                continue
            dist=distance_metric(dict,user,neib)
            # The weight does not depend on the movie, so test it once per neighbour.
            # None (metric undefined for this pair) is skipped rather than compared.
            if dist is None or not dist>threshold:
                continue
            for mv in dict[neib]:
                if mv in rated:
                    weighted_rating[mv]=weighted_rating.get(mv,0)+dist*dict[neib][mv]
                    weights[mv]=weights.get(mv,0)+dist

        for mv in weights:
            # a zero total weight is only possible with a negative threshold; such movies are dropped
            if weights[mv]:
                predicted_list.append(weighted_rating[mv]/weights[mv])
                real_list.append(rated[mv])

    if real_list:
        rms=sqrt(mean_squared_error(real_list,predicted_list))
    else:
        # nothing to compare: report nan instead of failing on empty input
        rms=float('nan')
    print(rms)
    return rms

In [12]:
# RMSE pooled over every user in rate, with Pearson correlation as the neighbour weight
rmse(rate,pearson)

0.884595068534


# Comparison with the RMSE of a KNN model that applies z-score normalization to each user's ratings

In [5]:
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import KNNWithZScore

In [6]:
# Load the 'ml-100k' dataset through Surprise's built-in dataset loader
# NOTE(review): Dataset.split(n_folds=...) appears to be deprecated/removed in newer Surprise
# releases in favour of surprise.model_selection -- confirm against the installed version
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)  # For 3-fold cross-validation

In [7]:
# Similarity configuration passed to the KNN algorithm
sim_options = {'name': 'cosine',  # use the cosine similarity measure
               'user_based': True  # compute similarities between users
               }
algo = KNNWithZScore(sim_options=sim_options)


In [8]:
# Evaluate the algorithm on the folds prepared above, reporting RMSE and MAE
# NOTE(review): evaluate/print_perf appear to be deprecated/removed in newer Surprise releases
# in favour of surprise.model_selection.cross_validate -- confirm against the installed version
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

# Print the per-fold and mean results as a table
print_perf(perf)

Evaluating RMSE, MAE of algorithm KNNWithZScore.

------------
Fold 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9660
MAE:  0.7580
------------
Fold 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9621
MAE:  0.7573
------------
Fold 3
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9562
MAE:  0.7509
------------
------------
Mean RMSE: 0.9614
Mean MAE : 0.7554
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.9660  0.9621  0.9562  0.9614  
MAE     0.7580  0.7573  0.7509  0.7554  
