In [1]:
# Problem Statement: Build Movie Recommendation Engine
# Input data: Movielens dataset
# Output result: Recommend movies based on User behaviour
# Machine learning algorithm: Turi Create
# Evaluation metric: RMSE
# Programming language: Python
# Author: Thirupathi Jadi
# Designation: Lead Data Scientist

In [2]:
import os
import pandas as pd
import numpy as np
import pandas_profiling as pp

In [3]:
cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('data1/ml-100k/ua.base', sep='\t', names=cols, encoding='latin-1')
ratings_test = pd.read_csv('data1/ml-100k/ua.test', sep='\t', names=cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [4]:
ratings_train.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [5]:
ratings_train.tail()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275
90569,943,1330,3,888692465


In [6]:
ratings_train.describe()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
count,90570.0,90570.0,90570.0,90570.0
mean,461.494038,428.104891,3.523827,883507300.0
std,266.004364,333.088029,1.126073,5341684.0
min,1.0,1.0,1.0,874724700.0
25%,256.0,174.0,3.0,879448400.0
50%,442.0,324.0,4.0,882814300.0
75%,682.0,636.0,4.0,888204900.0
max,943.0,1682.0,5.0,893286600.0


In [7]:
df = pp.ProfileReport(ratings_train)

In [8]:
df

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=18.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…






In [9]:
df.to_file('movedata_eda_report.html')

HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [10]:
n_users = ratings_train.user_id.unique().shape[0]
n_items = ratings_train.movie_id.unique().shape[0]

In [11]:
data_matrix = np.zeros((n_users, n_items))
# for line in ratings_train.itertuples():
#     data_matrix[line[1]-1, line[2]-1] = line[3]

In [12]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [13]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [14]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [15]:
# Building a simple popularity and collaborative filtering model using Turicreate
import turicreate
# import graphlab
train_data = turicreate.SFrame(ratings_train)

In [16]:
test_data = turicreate.SFrame(ratings_test)

In [17]:
popularity_model = turicreate.popularity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating')

In [18]:
popularity_recomm = popularity_model.predict(test_data)
# popularity_recomm.print_rows(num_rows=25)

In [19]:
popularity_recomm

dtype: float
Rows: 9430
[3.3442622950819674, 3.460674157303371, 3.857142857142857, 3.690625, 3.09375, 3.4262295081967213, 3.9298245614035086, 4.135593220338983, 3.755813953488372, 3.868292682926829, 3.451219512195122, 4.365656565656566, 4.2439024390243905, 3.1466666666666665, 3.130081300813008, 3.077777777777778, 3.5795454545454546, 3.8974358974358974, 3.1363636363636362, 1.0, 3.0964467005076144, 3.1155778894472363, 2.9010416666666665, 3.4327731092436973, 3.5681818181818183, 3.412280701754386, 3.480769230769231, 2.764705882352941, 3.1176470588235294, 3.0476190476190474, 4.365656565656566, 2.607142857142857, 2.7228915662650603, 3.4740932642487046, 3.1155778894472363, 3.6788990825688073, 3.433333333333333, 3.359550561797753, 4.292181069958848, 2.8333333333333335, 3.8596938775510203, 3.1983471074380163, 3.0941176470588236, 4.311428571428571, 2.5517241379310347, 2.94, 2.75, 1.2941176470588236, 1.0, 2.923076923076923, 3.9565217391304346, 4.114457831325301, 3.8493150684931505, 3.934306569343

In [20]:
test_data

user_id,movie_id,rating,unix_timestamp
1,20,4,887431883
1,33,4,878542699
1,61,4,878542420
1,117,3,874965739
1,155,2,878542201
1,160,4,875072547
1,171,5,889751711
1,189,3,888732928
1,202,5,875072442
1,265,4,878542441


In [21]:
train_data

user_id,movie_id,rating,unix_timestamp
1,1,5,874965758
1,2,3,876893171
1,3,4,878542960
1,4,3,876893119
1,5,3,889751712
1,6,5,887431973
1,7,4,875071561
1,8,1,875072484
1,9,5,878543541
1,10,3,875693118


In [22]:
popularity_recomm = popularity_model.recommend(users=[1,2,3,4,5],k=5)
popularity_recomm.print_rows(num_rows=25)

+---------+----------+-------+------+
| user_id | movie_id | score | rank |
+---------+----------+-------+------+
|    1    |   1656   |  5.0  |  1   |
|    1    |   1201   |  5.0  |  2   |
|    1    |   1189   |  5.0  |  3   |
|    1    |   1122   |  5.0  |  4   |
|    1    |   814    |  5.0  |  5   |
|    2    |   1656   |  5.0  |  1   |
|    2    |   1201   |  5.0  |  2   |
|    2    |   1189   |  5.0  |  3   |
|    2    |   1122   |  5.0  |  4   |
|    2    |   814    |  5.0  |  5   |
|    3    |   1656   |  5.0  |  1   |
|    3    |   1201   |  5.0  |  2   |
|    3    |   1189   |  5.0  |  3   |
|    3    |   1122   |  5.0  |  4   |
|    3    |   814    |  5.0  |  5   |
|    4    |   1656   |  5.0  |  1   |
|    4    |   1201   |  5.0  |  2   |
|    4    |   1189   |  5.0  |  3   |
|    4    |   1122   |  5.0  |  4   |
|    4    |   814    |  5.0  |  5   |
|    5    |   1656   |  5.0  |  1   |
|    5    |   1201   |  5.0  |  2   |
|    5    |   1189   |  5.0  |  3   |
|    5    | 

In [23]:
#Training the model
item_sim_model = turicreate.item_similarity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating', similarity_type='cosine')

In [24]:
#prediction the model
item_sim_model_pred = item_sim_model.predict(test_data)

In [25]:
item_sim_model_pred

dtype: float
Rows: 9430
[0.07667772210281314, 0.09289209123786168, 0.11223183811165904, 0.32998430182915606, 0.04633317876408118, 0.00751790882066916, 0.029499306933570453, 0.028445738645000312, 0.9342170230304921, 0.6399967674990646, 0.5005106306993045, 1.1256258487701416, 0.1397810337635187, 0.07519697569883786, 0.15787250720537627, 0.01990543305873871, 0.16270652069495276, 0.13475935046489423, 0.2664458350493358, 0.029005882831720207, 0.28679302063855255, 0.3661807829683477, 0.2324104756116867, 0.6032880300825293, 0.5355071858926252, 0.5316696112806146, 0.17572341046550058, 0.03258989615873857, 0.0, 0.2519236992705952, 1.1311477082116264, 0.38413800086293903, 0.1421016071523939, 1.0487151145935059, 0.8652071569647107, 0.1946025448186057, 0.07532134652137756, 0.0, 0.11448526382446289, 0.08254919733319964, 0.372647808898579, 0.2515438376051007, 0.04473305615511808, 0.6682871944976575, 0.030017099235997057, 0.02003733931165753, 0.009042515537955544, 0.0017141981558366256, 0.02793988206

In [26]:
#prediction the model
item_sim_model_acc = item_sim_model.evaluate(test_data, metric='rmse')


Overall RMSE: 3.3644480290611423

Per User RMSE (best)
+---------+-------------------+-------+
| user_id |        rmse       | count |
+---------+-------------------+-------+
|   774   | 1.628063346220653 |   10  |
+---------+-------------------+-------+
[1 rows x 3 columns]


Per User RMSE (worst)
+---------+-------------------+-------+
| user_id |        rmse       | count |
+---------+-------------------+-------+
|   200   | 4.601684290464379 |   10  |
+---------+-------------------+-------+
[1 rows x 3 columns]


Per Item RMSE (best)
+----------+--------------------+-------+
| movie_id |        rmse        | count |
+----------+--------------------+-------+
|   1255   | 0.8007337326804796 |   1   |
+----------+--------------------+-------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+----------+------+-------+
| movie_id | rmse | count |
+----------+------+-------+
|   1357   | 5.0  |   1   |
+----------+------+-------+
[1 rows x 3 columns]



In [27]:
#Making recommendations
item_sim_recomm = item_sim_model.recommend(users=[774,200,1255,1357],k=5)
item_sim_recomm.print_rows(num_rows=100)

+---------+----------+---------------------+------+
| user_id | movie_id |        score        | rank |
+---------+----------+---------------------+------+
|   774   |   195    |  0.7521793574930351 |  1   |
|   774   |    89    |  0.6493423447430691 |  2   |
|   774   |   173    |  0.5937344028013889 |  3   |
|   774   |   216    |  0.540165545505898  |  4   |
|   774   |   132    |  0.5073842102679137 |  5   |
|   200   |   181    |  1.1902387422844045 |  1   |
|   200   |   403    |  1.0913706181697476 |  2   |
|   200   |   168    |  1.0351332345055144 |  3   |
|   200   |    96    |  1.0032620158010317 |  4   |
|   200   |   216    |  0.9427129721757278 |  5   |
|   1255  |    50    | 0.40388604044914245 |  1   |
|   1255  |   174    | 0.39985553741455077 |  2   |
|   1255  |   181    |  0.388565993309021  |  3   |
|   1255  |   172    | 0.37732303380966187 |  4   |
|   1255  |   204    | 0.36805203557014465 |  5   |
|   1357  |    50    | 0.40388604044914245 |  1   |
|   1357  | 