# Recommender for Amazon Instant Video with Matrix Factorisation



## Load data into SQLite database


In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import datetime as dt

# SQLAlchemy engine for the SQLite file `ratings.db` (relative path); shared by
# load_data() and by every read_sql_query() call in this notebook.
disk_engine = create_engine('sqlite:///ratings.db')

def load_data(chunksize = 20000, csv_path = 'dist/ratings_Amazon_Instant_Video.csv', engine = None, table = 'data'):
    """Stream a header-less ratings CSV into a SQL table, one chunk at a time.

    The CSV is read with the column names UserId, ItemId, Rating, Time and
    every chunk is appended to `table`. The DataFrame index is written as
    well, numbered contiguously from 1 across all chunks.

    Parameters
    ----------
    chunksize : int
        Number of CSV rows read and written per iteration.
    csv_path : str
        Location of the CSV file to load.
    engine : connection or engine accepted by DataFrame.to_sql, optional
        Target database. When omitted, the notebook-level `disk_engine` is used.
    table : str
        Name of the table the rows are appended to.

    Returns
    -------
    None
        Progress is reported on stdout after each chunk has been written.
    """
    target = disk_engine if engine is None else engine
    data_columns = ['UserId', 'ItemId', 'Rating', 'Time']
    start = dt.datetime.now()
    rows_done = 0

    for df in pd.read_csv(csv_path, names=data_columns, chunksize=chunksize, encoding='utf-8'):
        # Assign the row ids explicitly so they stay contiguous (1, 2, 3, ...)
        # independent of whatever index the reader gave this chunk.
        df.index = pd.RangeIndex(rows_done + 1, rows_done + len(df) + 1)
        df.to_sql(table, target, if_exists='append')

        # Count real rows: the last chunk is usually shorter than `chunksize`.
        rows_done += len(df)
        # total_seconds() keeps counting past 24h, unlike timedelta.seconds.
        elapsed = int((dt.datetime.now() - start).total_seconds())
        print('{} seconds: completed {} rows'.format(elapsed, rows_done))

#load_data()

## Query data from database

In [6]:
# Fetch the ratings of every user who has more than 20 ratings.
# Each fragment ends with a space so the concatenated SQL keeps its keywords
# separated instead of relying on the backticks to delimit tokens.
ratings_query = (
    'SELECT `UserId`, `ItemId`, `Rating` FROM `data` '
    'WHERE `UserId` IN ('
    'SELECT `UserId` FROM `data` '
    'GROUP BY `UserId` '
    'HAVING COUNT(`Rating`) > 20)'
)
ratings_df = pd.read_sql_query(ratings_query, disk_engine)

# Reshape to one row per user and one column per item; a user/item pair
# without a rating becomes 0.
R_df = ratings_df.pivot(index = 'UserId', columns ='ItemId', values = 'Rating').fillna(0)
R_df.head()

ItemId,B000GIOPK2,B000H00VBQ,B000H0YRNY,B000H2DMME,B000H2DTWM,B000H4YNM0,B000HAB4NK,B000HKWE3O,B000HMPU0Q,B000HZEHL6,...,B00LEAOLBY,B00LERQAB6,B00LFE2SX2,B00LG0DKBO,B00LGIO8X0,B00LIRFK5E,B00LM493J2,B00LPWPMCS,B00LSWLQQQ,B00LTMJ29S
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A109ME7C09HM2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10L7F2V8368DO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A11A75FIE3396D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A11KYS8T5NLKZ1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A11Q7PYQ87R5Y2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Play with SVD

In [7]:
from scipy.sparse.linalg import svds

# Convert DataFrame into a matrix R (ratings matrix).
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
R = R_df.to_numpy()

# Normalise by each user's mean (de-mean). The mean is taken over the whole row,
# so the zeros filled in for unrated items are part of it.
user_ratings_mean = np.mean(R, axis = 1)
user_mean_column = user_ratings_mean.reshape(-1, 1)
R_demeaned = R - user_mean_column

# Decompose the ratings matrix R into three matrices; U, sigma and Vt,
# keeping 50 singular values.
U, sigma, Vt = svds(R_demeaned, k = 50)

# Create the weights matrix: svds returns the singular values as a 1-D array,
# np.diag turns it into the diagonal matrix needed for the product below.
sigma = np.diag(sigma)

# Make the item predictions for every user: rebuild the low-rank matrix and
# add each user's mean back.
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_mean_column

# Rows keep the order of R_df but get a default positional index (no UserId labels).
predictions_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)
predictions_df.head()

ItemId,B000GIOPK2,B000H00VBQ,B000H0YRNY,B000H2DMME,B000H2DTWM,B000H4YNM0,B000HAB4NK,B000HKWE3O,B000HMPU0Q,B000HZEHL6,...,B00LEAOLBY,B00LERQAB6,B00LFE2SX2,B00LG0DKBO,B00LGIO8X0,B00LIRFK5E,B00LM493J2,B00LPWPMCS,B00LSWLQQQ,B00LTMJ29S
0,-0.182849,0.097859,0.176759,0.01076,0.044868,-0.393424,-0.046331,0.011959,-0.013472,0.001993,...,-0.086395,-0.116069,0.278942,0.572357,-0.037268,0.26604,-0.139249,0.290469,-0.078081,-0.078081
1,0.030255,-0.036093,0.005157,0.013058,0.100626,0.062815,0.060889,0.016115,-0.007328,-0.068117,...,0.010282,-0.031029,0.043443,0.020496,-0.007276,0.211908,-0.010811,0.058353,0.003493,0.003493
2,0.014636,-0.055053,-0.021936,0.015925,0.087449,-0.041807,-0.140005,0.017264,-0.012891,0.239965,...,0.063238,0.061892,-0.038406,0.069892,0.015058,0.295229,-0.02861,0.076819,-0.007985,-0.007985
3,-0.029734,-0.113602,-0.030719,0.009671,-0.021162,-0.036356,-0.180246,0.022234,-0.026043,0.087187,...,0.130157,-0.041179,-0.061042,-0.002333,0.00851,0.107216,0.192227,0.149434,0.047383,0.047383
4,0.077772,0.002395,0.036813,0.048273,-0.000613,0.076149,-0.060275,0.055293,-0.011666,0.033975,...,0.000722,0.045453,0.024704,0.026968,0.019133,-0.002508,-0.050527,-0.065608,0.00437,0.00437


In [46]:
user_id = 'A11Q7PYQ87R5Y2'

# predictions_df carries no UserId labels, so look up the user's row position in R_df
user_row_index = R_df.index.get_loc(user_id)

# Take that user's row of predicted ratings, then order it from highest to lowest
user_predictions = predictions_df.iloc[user_row_index]
sorted_user_predictions = user_predictions.sort_values(ascending=False)
sorted_user_predictions

ItemId
B000OGTRC2    0.294273
B0013FJOG2    0.234751
B009C46X8I    0.231253
B00FPHSMHQ    0.231253
B00G31T4SY    0.209890
B00E1BYFP0    0.175654
B004UP3D8M    0.161001
B00B09HOGI    0.159671
B00B8PEM3C    0.159529
B00EJ630FS    0.152624
B00FCQPG2Y    0.148221
B008XFALEY    0.148221
B00870ZAPU    0.148221
B00C4QC2NQ    0.148221
B00I881186    0.148221
B008OKGZKM    0.148221
B00CW8CIB2    0.148221
B006LWHBDA    0.148221
B00FWLT7QA    0.148221
B00BF8AH0Y    0.148221
B00CDBTQCW    0.146120
B00IYSEPGU    0.145473
B003NRYJDQ    0.139888
B00IKT36S6    0.136929
B0086T6RH2    0.131998
B005DPJ6VQ    0.130932
B00E4H55BE    0.126858
B00BEBNO16    0.126858
B00GOQANS8    0.126643
B004VB5464    0.125040
                ...   
B00FIZIY3M   -0.107775
B004M1UERW   -0.109352
B00IAKL5S8   -0.113238
B0083IJGBU   -0.115629
B00F0SWPSC   -0.115763
B000VU2SW2   -0.116275
B00C7KXUOE   -0.116473
B00ETKG8AI   -0.116816
B008EQHT4M   -0.120720
B0099JKR6U   -0.121918
B00I3MPDP4   -0.122728
B00F2CESXG   -0.123624
B008

In [57]:
# Find the items that the selected user has rated.
# The user id is passed as a bound parameter (`?`) rather than formatted into the
# SQL text, so it needs no quoting and cannot alter the statement.
sql = 'SELECT `UserId`, `ItemId`, `Rating` FROM `data` WHERE `UserId` = ?'
user_ratings_df = pd.read_sql_query(sql, disk_engine, params=(user_id,))
user_ratings_df.shape

(24, 3)

In [79]:
# Show all of the selected user's ratings (the frame has 24 rows)
user_ratings_df.head(24)

Unnamed: 0,UserId,ItemId,Rating
0,A11Q7PYQ87R5Y2,B0011EQBQQ,4.0
1,A11Q7PYQ87R5Y2,B0018SOVIA,5.0
2,A11Q7PYQ87R5Y2,B0032UQR70,5.0
3,A11Q7PYQ87R5Y2,B003336P1Y,5.0
4,A11Q7PYQ87R5Y2,B004D1VQU0,5.0
5,A11Q7PYQ87R5Y2,B00568MMIY,5.0
6,A11Q7PYQ87R5Y2,B0063T7IMK,5.0
7,A11Q7PYQ87R5Y2,B008LRBCJW,5.0
8,A11Q7PYQ87R5Y2,B009651C9S,4.0
9,A11Q7PYQ87R5Y2,B009DB8108,4.0


In [55]:
# One row per rated item, so the row count is the number of items rated
rated_count = len(user_ratings_df)
print('User {0} has already rated {1} items'.format(user_id, rated_count))

User A11Q7PYQ87R5Y2 has already rated 24 items


In [58]:
# Number of items with a predicted rating for the selected user
sorted_user_predictions.shape

(4851,)

In [76]:
# Drop the predictions for items the user has already rated
already_rated = sorted_user_predictions.index.isin(user_ratings_df['ItemId'])
new_recommendations = sorted_user_predictions[~already_rated]
new_recommendations.shape

(4827,)

In [78]:
# Select the 20 items with the highest predicted rating that the user hasn't rated yet
# (new_recommendations is already sorted in descending order of predicted rating)
new_recommendations.head(20)

ItemId
B000OGTRC2    0.294273
B0013FJOG2    0.234751
B009C46X8I    0.231253
B00FPHSMHQ    0.231253
B00G31T4SY    0.209890
B00E1BYFP0    0.175654
B004UP3D8M    0.161001
B00B8PEM3C    0.159529
B00FCQPG2Y    0.148221
B008XFALEY    0.148221
B00870ZAPU    0.148221
B00C4QC2NQ    0.148221
B00I881186    0.148221
B008OKGZKM    0.148221
B00CW8CIB2    0.148221
B006LWHBDA    0.148221
B00FWLT7QA    0.148221
B00BF8AH0Y    0.148221
B00CDBTQCW    0.146120
B00IYSEPGU    0.145473
Name: 4, dtype: float64