In [87]:
from math import sqrt
from numpy import *
from numpy import linalg as la
import numpy as np
import math
import pandas as pd

In [88]:
cd '/Users/anikaghosh/Desktop/'

/Users/anikaghosh/Desktop


In [89]:
movieCols=['movie_id', 'Name', 'release_date', 'video_release_date',"IMDb_URL","unknown","Action","Adventure","Animation", 
              "Childrens","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery",
               "Romance","Sci-Fi","Thriller","War","Western"]

In [90]:
items=pd.read_csv(r"u.item", sep='|', names=movieCols, encoding='latin-1')

In [91]:
movie_names = items.Name

In [92]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [93]:
#Number unique users
num_users = df.user_id.unique().shape[0]
#number unique movies
num_movies = df.item_id.unique().shape[0]

In [94]:
#Creating an array of the users and the ratings
ratings = np.zeros((num_users, num_movies))
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
ratings

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [113]:
def getCorrelation(dat, a1, a2):
    t1 = np.nonzero(dat[a1])[0]
    t2 = np.nonzero(dat[a2])[0]

    # find overlap indices
    items = []
    for i in t1:
        if i in t2:
            items.append(i)
    if len(items) == 0: return 0

    # calculate correlations on items that are mutually rated
    v1 = np.array(dat[0][items])
    v2 = np.array(dat[1][items])

    correlation = np.corrcoef(v1, v2)[0][1] # returns only a number

    if math.isnan(correlation): 
        return 0
    elif correlation <= 0:
        correlation = 0

    return correlation

In [114]:
def FindSimUsers(dat, user, n = 5, similarity=getcorrelation):
    scores = []
    for other in range(len(dat)):
        if other != user-1:
            scores.append([similarity(dat, user-1, other), other])
    # sort the list
    scores.sort()
    scores.reverse()

    return scores[:n]


In [97]:
#Pearson similiarity
def pearsSim(inA,inB):
    if len(inA) < 3:
        return 1.0
    return .5 + .5 * np.corrcoef(inA,inB,rowvar =0)[0]

In [116]:
#function to get similarities between each movie using the Pearson Similarity measure
def GetSimUsers(data, simMeas = pearsSim):
    n = np.shape(data)[1] # number of items
    dat = data.T # movie-user format
    L = {}

    for item in range(n):
        temp = {}
        L[item] = {}
        for j in range(n):
            t1 = np.nonzero(dat[item])[0]
            t2 = np.nonzero(dat[j])[0]

            items = []
            for i in t1:
                if i in t2: 
                    items.append(i)

            v1 = np.array(dat[item][items])
            v2 = np.array(dat[j][items])

            if len(items) != 0:
                correlation = np.corrcoef(v1, v2)[0][1]
            else:
                correlation = 0

            if math.isnan(correlation): 
                correlation = 0

            temp[j] = correlation
                # returns r between item and j
            if item not in simL:
                L[item] = temp
            else:
                L[item].update(temp)

    return L

In [128]:
#estimating how the user will rate movies they haven't seen yet
def EstMovie (data, user, similarity, item):
    n = np.shape(data)[1] # number of items
    simTotal = 0.0
    ratSimTotal = 0.0

    for j in range(n):
        userRating = data[user, j]
        if userRating == 0: 
            continue

        sim = similarity[item][j]
        if sim <= 0: sim = 0

        simTotal += sim
        ratSimTotal += sim * userRating

    if simTotal == 0: 
        return 0
    else: 
        return ratSimTotal/simTotal

In [129]:
#function to recommend 3 movie id's to the given user
def recommend(data, user, simMeas, N=3, estMethod=EstMovie):
    unratedItems = nonzero(np.matrix(data[user,:]).A==0)[1] #find unrated items 
    
    if len(unratedItems) == 0: return 'you rated everything'
    itemScores = []
    
    for item in unratedItems:
        estimatedScore = estMethod(data, user, simMeas, item)
        itemScores.append((item, estimatedScore))
    
    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N]


In [130]:
#function to calculate the absolute errors for each user and the items they haven't rated
def cross_validate_user(data, user, test_ratio, simMeas, estMethod=EstMovie):
    number_of_items = np.shape(data)[1]
    rated_items_by_user = np.array([i for i in range(number_of_items) if data[user,i]>0])
    test_size = test_ratio * len(rated_items_by_user)
    test_indices = np.random.randint(0, len(rated_items_by_user), test_size)
    withheld_items = rated_items_by_user[test_indices]
    original_user_profile = np.copy(data[user])
    data[user, withheld_items] = 0 # So that the withheld test items is not used in the rating estimation below
    error_u = 0.0
    count_u = len(withheld_items)

    # Compute absolute error for user u over all test items
    for item in withheld_items:
        # Estimate rating on the withheld item
        estimatedScore = estMethod(data, user, simMeas, item)
        error_u = error_u + abs(estimatedScore - original_user_profile[item])

    # Now restore ratings of the withheld items to the user profile
    for item in withheld_items:
        data[user, item] = original_user_profile[item]

    # Return sum of absolute errors and the count of test cases for this user
    # Note that these will have to be accumulated for each user to compute MAE
    return error_u, count_u

In [131]:
#Function to give mean average error for all the prediction it has made in 
#the function uses cross_validate_user to come up with the MAE
def test(data, test_ratio, simMeas):
    error = 0
    count = 0
    for i in range(np.shape(data)[0]):
        er, ct = cross_validate_user(data, i, test_ratio, simMeas)
        error += er
        count += ct
    MAE = error/count
    print ("The MAE  is for user-based collaborative filtering is: %0.5f" % MAE)
    return MAE

In [103]:
#testing to see the correlation worked for 2 users
getCorrelation(ratings,0,1)

0.16084123285436974

In [115]:
#testing to see if the function finsimusers gave 5 similar users
FindSimUsers(ratings,1)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c *= 1. / np.float64(fact)
  c *= 1. / np.float64(fact)


[[1.0, 819], [1.0, 810], [1.0, 430], [1.0, 350], [1.0, 165]]

In [118]:
#defining the variable simMat to get all the movies in the data set
s1 = GetSimUsers(ratings)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c *= 1. / np.float64(fact)
  c *= 1. / np.float64(fact)


In [119]:
#Testing to see the similarity between movie_id 1 and all the other movie_id's
s1[1]

{0: 0.22178411018797145,
 1: 1.0,
 2: 0.23053558994446013,
 3: 0.24455570679626382,
 4: 0.21755935989534117,
 5: -0.158113883008419,
 6: 0.17544466993777041,
 7: 0.34052531432277777,
 8: -0.22759907267933821,
 9: 0.19908260831942443,
 10: 0.30991452931972147,
 11: 0.30288051695145152,
 12: 0.34881578763539894,
 13: -0.30803425146260754,
 14: 0.16229394952999107,
 15: 0.71443450831176036,
 16: -0.16948052018933252,
 17: 0,
 18: -0.34299717028501764,
 19: 0.090210979560879007,
 20: 0.36136295981259853,
 21: 0.096375436497989894,
 22: -0.28439822949726057,
 23: 0.25465502175810295,
 24: 0.2141896620284631,
 25: -0.002515136364329775,
 26: 0.29507650006014408,
 27: 0.26331207356061243,
 28: 0.24687688602488875,
 29: 0.35675303400633784,
 30: 0.39182610809011437,
 31: 0.25509611848429814,
 32: 0.20092682164515469,
 33: -0.86602540378443871,
 34: -0.15617376188860604,
 35: 0.27803533720054396,
 36: 0.26363525200414828,
 37: 0.39965694682675834,
 38: 0.20510441006980812,
 39: 0.34864452580526

In [132]:
#testing to see how the user 1 will rate movie_id 300
EstMovie(ratings, 1, s1, 300)

3.8541885669963092

In [110]:
#testing to see how the recommend function on user 1 and the 3 movies it spits plus the ratings of those movies
recommend(ratings, 1, s1)

[(441, 5.0), (599, 5.0), (776, 5.0)]

In [134]:
#testing again to see recommend function on user 10
recommend(ratings, 10, s1)

[(1476, 5.0), (1601, 5.0), (1651, 5.0)]

In [133]:
test(ratings,.2,s1)



The MAE  is for user-based collaborative filtering is: 0.72110


0.72110447696354762