# Using collaborative filtering to build a recommendation system

In [1]:
import pandas as pd 
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')
import random

In [2]:
# Load the ratings file.
# header=0 consumes the file's own header row and replaces it with our column
# names, so the header is never parsed as a data row that has to be dropped.
# Every column is read as a string: later cells look users and movies up by
# string labels (e.g. '100', '10'), and `rating` is cast to float in a later cell.
# `timestamp` is not used by the analysis, so it is not loaded at all.
df = pd.read_csv(
    'ratings_small.csv',
    sep=',',
    header=0,
    names=['UserID', 'movieID', 'rating', 'timestamp'],
    usecols=['UserID', 'movieID', 'rating'],
    dtype=str,
)
df.head()

Unnamed: 0,UserID,movieID,rating
1,1,31,2.5
2,1,1029,3.0
3,1,1061,3.0
4,1,1129,2.0
5,1,1172,4.0


In [3]:
# Sanity check: (number of ratings, number of columns) after loading.
df.shape

(100004, 3)

In [4]:
# Inspect column dtypes before any casting.
df.dtypes

UserID     object
movieID    object
rating     object
dtype: object

In [5]:
# Cast `rating` to float so it can be averaged; the ID columns are left as-is.
df = df.astype({'rating':'float'})

In [6]:
# Verify the cast: `rating` should now be float64.
df.dtypes

UserID      object
movieID     object
rating     float64
dtype: object

In [7]:
# Summary statistics of the numeric columns (here, just `rating`).
df.describe()

Unnamed: 0,rating
count,100004.0
mean,3.543608
std,1.058064
min,0.5
25%,3.0
50%,4.0
75%,4.0
max,5.0


In [8]:
# Global mean rating; the prediction function uses it as the baseline level.
mu = df['rating'].mean()
mu

3.543608255669773

In [9]:
# `Series.convert_objects` was deprecated and then removed from pandas;
# `pd.to_numeric(..., errors="coerce")` is the supported equivalent
# (values that cannot be parsed as numbers become NaN).
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
# Mean rating of each movie, as a one-column DataFrame indexed by movieID.
movie_ratings = pd.DataFrame(df.groupby('movieID')['rating'].mean())
movie_ratings.head()

Unnamed: 0_level_0,rating
movieID,Unnamed: 1_level_1
1,3.87247
10,3.45082
100,3.428571
100017,3.0
100032,2.0


In [10]:
# Mean rating given by each user, kept as a one-column frame indexed by UserID.
user_ratings = df.groupby('UserID')['rating'].mean().to_frame()
user_ratings.head()

Unnamed: 0_level_0,rating
UserID,Unnamed: 1_level_1
1,2.55
10,3.695652
100,3.4
101,3.9
102,3.974926


In [11]:
# Users x movies grid of ratings; a cell is NaN where the user has not rated the movie.
movie_matrix = pd.pivot_table(df, values='rating', index='UserID', columns='movieID')
movie_matrix.head()

movieID,1,10,100,100017,100032,100034,100083,100106,100159,100163,...,99795,998,99811,99813,99839,99846,999,99912,99917,99992
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
100,4.0,,,,,,,,,,...,,,,,,,,,,
101,,,,,,,,,,,...,,,,,,,,,,
102,,,,,,,,,,,...,,,,,,,,,,


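### Define a function that predicts a rating
#### The prediction function is based on the equation from class

$$\hat{r}_{ui} = b_{ui} + \frac{\sum_{j \in N(i;u)} s_{ij}\,(r_{uj} - b_{uj})}{\sum_{j \in N(i;u)} s_{ij}}, \qquad b_{ui} = \mu + (\bar{r}_u - \mu) + (\bar{r}_i - \mu)$$

Here $s_{ij}$ is the Pearson correlation between movies $i$ and $j$, $\bar{r}_u$ and $\bar{r}_i$ are the mean ratings of user $u$ and movie $i$, and $N(i;u)$ is the set of movies most similar to $i$ that user $u$ has rated.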

In [13]:
def rating_calculate(uid, movid, max_neighbors=26):
    """Predict the rating user `uid` would give movie `movid`.

    Item-item collaborative filtering around a baseline estimate::

        prediction = b(uid, movid)
                     + sum_j sim(movid, j) * (r(uid, j) - b(uid, j))
                       / sum_j sim(movid, j)

    where ``b(u, i) = mu + (user_mean(u) - mu) + (movie_mean(i) - mu)``,
    ``sim`` is the Pearson correlation between two movies' rating columns, and
    ``j`` runs over the movies most similar to `movid` that `uid` has rated
    (excluding `movid` itself).

    Reads the notebook-level globals ``movie_matrix`` (users x movies ratings),
    ``user_ratings`` / ``movie_ratings`` (mean rating per user / per movie) and
    ``mu`` (global mean rating).

    Parameters
    ----------
    uid
        Row label of the user in ``movie_matrix``.
    movid
        Column label of the movie in ``movie_matrix``.
    max_neighbors : int, default 26
        Maximum number of similar movies used for the correction term. The
        default reproduces the previous hard-coded ``count <= 25`` cutoff.

    Returns
    -------
    The predicted rating, in the same form as ``user_ratings.loc[uid]``: with
    the one-column DataFrames built in this notebook that is a one-element
    Series labelled ``rating``. If no usable neighbour exists, or the
    similarities cancel out to zero, the plain baseline ``b(uid, movid)`` is
    returned.

    Raises
    ------
    KeyError
        If `uid` or `movid` is not present in the lookup tables.
    """
    movie_vector = movie_matrix[movid]

    # Only movies this user has rated can act as neighbours. Selecting those
    # columns directly avoids transposing the whole matrix twice per call.
    rated_by_user = movie_matrix.loc[uid].notnull()
    movie_user_matrix = movie_matrix.loc[:, rated_by_user]

    movie_similarity = movie_user_matrix.corrwith(movie_vector, method='pearson')
    similar_movie = (
        movie_similarity.to_frame('similarity')
        .dropna()
        .sort_values(by='similarity', ascending=False)
    )

    user_base = user_ratings.loc[uid]
    movie_base = movie_ratings.loc[movid]

    num = 0
    den = 0
    count = 0
    for mid, sim in similar_movie['similarity'].items():
        if count >= max_neighbors:
            # Neighbour budget used up; the remaining candidates are less similar.
            break
        if mid == movid:
            # A movie must not be its own neighbour.
            continue
        sim = float(sim)
        item_base = movie_ratings.loc[mid]
        # Deviation of the user's actual rating from the baseline for (uid, mid).
        prd = movie_matrix.at[uid, mid] - mu - (item_base - mu) - (user_base - mu)
        if np.any(pd.isna(prd)):
            continue
        num = num + sim * prd
        den = den + sim
        count = count + 1

    # Similarities are signed, so they can cancel out; dividing by (near) zero
    # would yield inf/NaN, so fall back to the baseline in that case.
    if math.isclose(den, 0.0, abs_tol=1e-12):
        rat = 0
    else:
        rat = num / den
    return rat + mu + (user_base - mu) + (movie_base - mu)

In [14]:
# Example: predicted rating of user '100' for movie '10'.
# IDs are passed as strings because the ID columns have object dtype.
rating = rating_calculate('100','10')
rating

rating    3.277517
dtype: float64

### Evaluate the model's accuracy with mean absolute error (MAE)

In [None]:
# Mean absolute error over every observed (user, movie) rating.
# NOTE(review): the ratings being scored are the same ones the means and
# similarities are computed from, so this looks like an in-sample error;
# confirm whether a hold-out split is intended.
count = 0
skipped = 0
total_ab_error = 0.0
for UserID in movie_matrix.index:
    # Only observed ratings can be scored, so drop the NaN cells *before*
    # calling the expensive predictor instead of discarding its result afterwards.
    observed = movie_matrix.loc[UserID].dropna()
    for movieID, tru in observed.items():
        pred = rating_calculate(UserID, movieID)
        # The predictor may return a one-element Series; reduce it to a float.
        pred = float(np.ravel(pred)[0])
        if not math.isfinite(pred):
            # A single inf/NaN would poison the running total; count it instead.
            skipped += 1
            continue
        total_ab_error += abs(pred - tru)
        count += 1

if skipped:
    print('skipped', skipped, 'non-finite predictions')
print(total_ab_error / count if count else float('nan'))