In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.neighbors import NearestNeighbors
import warnings; warnings.simplefilter('ignore')
%matplotlib inline
plt.style.use('ggplot')

import json
import re

from time import time
import random

In [2]:
# One accumulator list per column of the review table. Each name is bound
# to its own, initially empty, list object.
(reviewerID, productID, liked_and_seen, reviewText,
 rating, summary, unixTime, date) = ([] for _ in range(8))


In [3]:
# Load the raw review records from data.json.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open('data.json', encoding='utf-8') as json_data:
    # NOTE(review): despite its name, `df` holds the parsed JSON object (it is
    # indexed by position in the following cell), not a pandas DataFrame.
    df = json.load(json_data)

In [4]:
# Walk over the loaded records once and copy each field into the list that
# accumulates the corresponding column.
for record in df:
    productID.append(record['asin'])
    reviewerID.append(record['reviewerID'])
    reviewText.append(record['reviewText'])
    rating.append(record['overall'])
    summary.append(record['summary'])
    unixTime.append(record['unixReviewTime'])
    liked_and_seen.append(record['helpful'])
    date.append(record['reviewTime'])

In [5]:
# Assemble the per-column lists into one review table (one row per record).
data = pd.DataFrame(
    {
        'reviewerID': reviewerID,
        'productID': productID,
        'liked_and_seen': liked_and_seen,
        'reviewText': reviewText,
        'summary': summary,
        'unixTime': unixTime,
        'date': date,
        'rating': rating,
    }
)
data.head()

Unnamed: 0,reviewerID,productID,liked_and_seen,reviewText,summary,unixTime,date,rating
0,A30TL5EWN6DFXT,120401325X,"[0, 0]",They look good and stick good! I just don't li...,Looks Good,1400630400,"05 21, 2014",4.0
1,ASY55RVNIL0UD,120401325X,"[0, 0]",These stickers work like the review says they ...,Really great product.,1389657600,"01 14, 2014",5.0
2,A2TMXE2AFO7ONB,120401325X,"[0, 0]",These are awesome and make my phone look so st...,LOVE LOVE LOVE,1403740800,"06 26, 2014",5.0
3,AWJ0WZQYMYFQ4,120401325X,"[4, 4]",Item arrived in great time and was in perfect ...,Cute!,1382313600,"10 21, 2013",4.0
4,ATX7CZYFXI1KW,120401325X,"[2, 3]","awesome! stays on, and looks great. can be use...",leopard home button sticker for iphone 4s,1359849600,"02 3, 2013",5.0


In [6]:
# Read the score table from scores.csv into df1.
df1 = pd.read_csv('scores.csv')

In [7]:

# Preview the first rows of the score table.
df1.head()

Unnamed: 0,0,score
0,0.009881,0.009881
1,0.259091,0.259091
2,0.006298,0.006298
3,0.012597,0.012597
4,0.00697,0.00697


In [8]:
# Summary statistics for the numeric columns of the score table.
df1.describe()

Unnamed: 0,0,score
count,6721.0,6721.0
mean,0.106861,0.106861
std,0.167541,0.167541
min,3.9e-05,3.9e-05
25%,0.007676,0.007676
50%,0.033156,0.033156
75%,0.126078,0.126078
max,0.988598,0.988598


In [9]:
# Split the review table into train and test parts. The fixed random_state
# makes the shuffle repeatable. `train_test_split` comes from the import
# cell at the top of the notebook.
TRAIN_SIZE = 0.8
df_train, df_test = train_test_split(
    data,
    test_size=1 - TRAIN_SIZE,
    random_state=42,
)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

TRAIN size: 26896
TEST size: 6724


In [10]:
# Rebind `data` to the held-out test split: from this cell on, `data` no
# longer refers to the full review table built earlier.
data = pd.DataFrame(df_test)
data.head()

Unnamed: 0,reviewerID,productID,liked_and_seen,reviewText,summary,unixTime,date,rating
30872,APW0OINC8LISZ,B004886C1G,"[0, 0]",I recently ordered a Samsung Epic 4G phone fro...,Unbelievable.......,1343088000,"07 24, 2012",5.0
3792,AFM7FK7NFEKFZ,B000S5VI8A,"[0, 0]",This is the 2nd bluetooth speakerphone that I'...,A solid performing bluetooth speakerphone,1265155200,"02 3, 2010",5.0
18558,AF82CHSZ0R05W,B003SNIR7Y,"[0, 0]",No complaints about the battery itself. It las...,good for the phone since stock battery is terr...,1294444800,"01 8, 2011",4.0
31828,AW4NQ4KTZMEZ7,B004AFVEOC,"[0, 0]",Its a very good phone and works perfect. I wis...,so-so but review can change,1355443200,"12 14, 2012",4.0
14937,ARPE117ZCZACS,B0037BVSCI,"[0, 0]",I bought this for my mom because her charger f...,Great price!,1341792000,"07 9, 2012",5.0


In [11]:
# Add a constant-valued helper column to both frames so that they share a
# key column to merge on.
df1['tmp'] = 1
data['tmp'] = 1

In [12]:
# Join on the constant `tmp` key. Every row on both sides carries the same
# key value, so each row of df1 is paired with each row of data (a cross join).
# NOTE(review): this attaches every score to every review rather than one
# score per review — confirm that this is the intended alignment.
data = pd.merge(df1, data, on=['tmp'])

In [13]:
# Preview the merged table.
data.head()

Unnamed: 0,0,score,tmp,reviewerID,productID,liked_and_seen,reviewText,summary,unixTime,date,rating
0,0.009881,0.009881,1,APW0OINC8LISZ,B004886C1G,"[0, 0]",I recently ordered a Samsung Epic 4G phone fro...,Unbelievable.......,1343088000,"07 24, 2012",5.0
1,0.009881,0.009881,1,AFM7FK7NFEKFZ,B000S5VI8A,"[0, 0]",This is the 2nd bluetooth speakerphone that I'...,A solid performing bluetooth speakerphone,1265155200,"02 3, 2010",5.0
2,0.009881,0.009881,1,AF82CHSZ0R05W,B003SNIR7Y,"[0, 0]",No complaints about the battery itself. It las...,good for the phone since stock battery is terr...,1294444800,"01 8, 2011",4.0
3,0.009881,0.009881,1,AW4NQ4KTZMEZ7,B004AFVEOC,"[0, 0]",Its a very good phone and works perfect. I wis...,so-so but review can change,1355443200,"12 14, 2012",4.0
4,0.009881,0.009881,1,ARPE117ZCZACS,B0037BVSCI,"[0, 0]",I bought this for my mom because her charger f...,Great price!,1341792000,"07 9, 2012",5.0


In [14]:
# Discard the helper merge key and the fields that are not used below.
data = data.drop(
    columns=[
        '0',
        'tmp',
        'liked_and_seen',
        'unixTime',
        'summary',
        'date',
        'rating',
    ],
)

- Analysis of the Dataset
Find duplicate ratings

In [36]:
# Summary statistics of the `score` column (describe() on a single column
# already yields a Series, so no transpose is needed).
data['score'].describe()

count    4.519200e+07
mean     1.068614e-01
std      1.675287e-01
min      3.894679e-05
25%      7.675969e-03
50%      3.315638e-02
75%      1.260776e-01
max      9.885982e-01
Name: score, dtype: float64

In [37]:
# Count the missing values in every column.
missing_per_column = data.isna().sum()
print('Number of missing values across columns: \n', missing_per_column)

Number of missing values across columns: 
 score         0
reviewerID    0
productID     0
reviewText    0
dtype: int64


### ratings analysis in dataset

In [15]:
# Keep only the rows whose reviewer occurs at least 15 times in `data`.
# NOTE(review): the counts are taken on the merged frame, i.e. they count
# merged rows per reviewer — confirm the threshold is meant to apply there.
counts = data['reviewerID'].value_counts()
frequent_reviewers = counts[counts >= 15].index
data_final = data[data['reviewerID'].isin(frequent_reviewers)]

In [16]:
# Display `data` with duplicate rows removed. The result is not assigned,
# so `data` itself is left unchanged by this cell.
data.drop_duplicates()

Unnamed: 0,score,reviewerID,productID,reviewText
0,0.009881,APW0OINC8LISZ,B004886C1G,I recently ordered a Samsung Epic 4G phone fro...
1,0.009881,AFM7FK7NFEKFZ,B000S5VI8A,This is the 2nd bluetooth speakerphone that I'...
2,0.009881,AF82CHSZ0R05W,B003SNIR7Y,No complaints about the battery itself. It las...
3,0.009881,AW4NQ4KTZMEZ7,B004AFVEOC,Its a very good phone and works perfect. I wis...
4,0.009881,ARPE117ZCZACS,B0037BVSCI,I bought this for my mom because her charger f...
...,...,...,...,...
45191999,0.456090,A186697K4XKXQL,B0042X8XJ6,One of the selling points of this headset is i...
45192000,0.456090,A1B2U6CRG99A1N,B0048YB5B2,This product is amazing! It came before it was...
45192001,0.456090,A3HHUZFWBWLWSK,B0042X8XJ6,I had actually purchased two headsets to compa...
45192002,0.456090,A1A4GB1UOBQC9G,B0044WTQVE,i dont recommend to buy screen protectors on A...


In [21]:
# Reviewer x product matrix of mean scores; reviewer/product pairs without
# any score are filled with 0.
final_score_matrix = (
    data_final
    .reset_index()
    .pivot_table(
        index='reviewerID',
        columns='productID',
        values='score',
        aggfunc='mean',
    )
    .fillna(0)
)

In [22]:
# Preview the first rows of the reviewer x product score matrix.
final_score_matrix.head()

productID,120401325X,3998899561,6073894996,7887421268,8288853439,8288855504,8288862993,8288878881,9658231950,9819958431,...,B004E3087E,B004E30BIA,B004E30BJO,B004E329L2,B004E5KYJE,B004E5ZVL0,B004E75R3K,B004E7EKV0,B004E9SZP0,B004E9TLVM
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01623621DS97QCLQANL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A05463141Q2NQ2L1TYQGE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A08161909WK3HU7UYTMW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A101VKDCZVFIAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Building Popularity Recommender model

In [23]:
# Density of the score matrix: percentage of cells holding a non-zero score.
given_num_of_score = np.count_nonzero(final_score_matrix)
print('given_num_of_score = ', given_num_of_score)

n_reviewers, n_products = final_score_matrix.shape
possible_num_of_score = n_reviewers * n_products
print('possible_num_of_score = ', possible_num_of_score)

density = given_num_of_score / possible_num_of_score * 100
print ('density: {:4.2f}%'.format(density))

given_num_of_score =  6724
possible_num_of_score =  9042186
density: 0.07%


In [25]:
# Popularity score of a product = number of non-null reviewerID entries in
# the rows of `data` that belong to that product.
df_grouped = (
    data
    .groupby('productID')
    .agg({'reviewerID': 'count'})
    .reset_index()
    .rename(columns={'reviewerID': 'score'})
)
df_grouped.head(10)

Unnamed: 0,productID,score
0,120401325X,13442
1,3998899561,13442
2,6073894996,47047
3,7887421268,6721
4,8288853439,6721
5,8288855504,13442
6,8288862993,33605
7,8288878881,26884
8,9658231950,6721
9,9819958431,13442


In [26]:
# Order the products by descending score; equal scores are ordered by
# ascending productID.
df_sort = df_grouped.sort_values(by=['score', 'productID'], ascending=[False, True])

# Rank 1 is the highest score; method='first' ranks tied scores in the
# order in which they appear.
df_sort['rank'] = df_sort['score'].rank(method='first', ascending=False)

# The five top-ranked products form the popularity recommendations.
popularity_recommendations = df_sort.head(5)
popularity_recommendations

Unnamed: 0,productID,score,rank
1321,B0042FV2SI,967824,1.0
191,B000S5Q9CA,873730,2.0
263,B0015RB39O,658658,3.0
469,B001XXUOQI,490633,4.0
1410,B0044WTQVE,470470,5.0


In [27]:
# Use popularity based recommender model to make predictions
def recommend(user_id, recommendations=None):
    """Return the popularity recommendations labelled with ``user_id``.

    Parameters
    ----------
    user_id
        Value written into every row of the ``userId`` column.
    recommendations : pandas.DataFrame, optional
        Table to label. When omitted, the notebook-level
        ``popularity_recommendations`` table is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``userId`` as its first column, followed by the
        columns of the input table. The input table is not modified.
    """
    if recommendations is None:
        recommendations = popularity_recommendations

    # Work on a copy: assigning the column on the shared table would leave a
    # `userId` column behind on it after every call.
    user_recommendations = recommendations.copy()

    # Tolerate a table that already carries a `userId` column (for example
    # one left behind by an earlier in-place assignment).
    if 'userId' in user_recommendations.columns:
        user_recommendations = user_recommendations.drop(columns='userId')

    # Put the user id in front of the recommendation columns.
    user_recommendations.insert(0, 'userId', user_id)
    return user_recommendations

In [28]:
# Example user ids to print recommendations for; the list is free to change.
find_recom = [10, 100, 150]
for user_id in find_recom:
    print("The list of recommendations for the userId: %d\n" % user_id)
    print(recommend(user_id))
    print("\n")

The list of recommendations for the userId: 10

      userId   productID   score  rank
1321      10  B0042FV2SI  967824   1.0
191       10  B000S5Q9CA  873730   2.0
263       10  B0015RB39O  658658   3.0
469       10  B001XXUOQI  490633   4.0
1410      10  B0044WTQVE  470470   5.0


The list of recommendations for the userId: 100

      userId   productID   score  rank
1321     100  B0042FV2SI  967824   1.0
191      100  B000S5Q9CA  873730   2.0
263      100  B0015RB39O  658658   3.0
469      100  B001XXUOQI  490633   4.0
1410     100  B0044WTQVE  470470   5.0


The list of recommendations for the userId: 150

      userId   productID   score  rank
1321     150  B0042FV2SI  967824   1.0
191      150  B000S5Q9CA  873730   2.0
263      150  B0015RB39O  658658   3.0
469      150  B001XXUOQI  490633   4.0
1410     150  B0044WTQVE  470470   5.0




# Building a Collaborative Filtering Recommender Model.

### - User Based Collaborative Filtering Model

In [29]:
# One row per reviewer and one column per product, holding the mean score;
# reviewer/product pairs without any score are filled with 0.
pivot_df = (
    data_final
    .reset_index()
    .pivot_table(
        index='reviewerID',
        columns='productID',
        values='score',
        aggfunc='mean',
    )
    .fillna(0)
)

pivot_df.head()

productID,120401325X,3998899561,6073894996,7887421268,8288853439,8288855504,8288862993,8288878881,9658231950,9819958431,...,B004E3087E,B004E30BIA,B004E30BJO,B004E329L2,B004E5KYJE,B004E5ZVL0,B004E75R3K,B004E7EKV0,B004E9SZP0,B004E9TLVM
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01623621DS97QCLQANL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A05463141Q2NQ2L1TYQGE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A08161909WK3HU7UYTMW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A101VKDCZVFIAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# (number of reviewers, number of products) in the pivot table.
pivot_df.shape

(5561, 1626)

In [31]:
# Number the reviewers 0 .. n-1 in row order, where n is the row count.
pivot_df['reviewerID_index'] = np.arange(len(pivot_df))
pivot_df.head()

productID,120401325X,3998899561,6073894996,7887421268,8288853439,8288855504,8288862993,8288878881,9658231950,9819958431,...,B004E30BIA,B004E30BJO,B004E329L2,B004E5KYJE,B004E5ZVL0,B004E75R3K,B004E7EKV0,B004E9SZP0,B004E9TLVM,reviewerID_index
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A01623621DS97QCLQANL3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
A05463141Q2NQ2L1TYQGE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
A08161909WK3HU7UYTMW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
A101VKDCZVFIAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [32]:
# Use the positional reviewer number as the row index of the score matrix,
# replacing the reviewerID labels.
pivot_df = pivot_df.set_index(['reviewerID_index'])
pivot_df.head()

productID,120401325X,3998899561,6073894996,7887421268,8288853439,8288855504,8288862993,8288878881,9658231950,9819958431,...,B004E3087E,B004E30BIA,B004E30BJO,B004E329L2,B004E5KYJE,B004E5ZVL0,B004E75R3K,B004E7EKV0,B004E9SZP0,B004E9TLVM
reviewerID_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


since this is a sparse matrix, we will use the singular value decomposition (SVD)

In [33]:
# Truncated SVD of the reviewer x product score matrix, keeping k = 9
# singular triplets.
# The frame is converted to a float ndarray first: svds is documented for
# ndarray / sparse matrix / LinearOperator input, so this avoids relying on
# implicit DataFrame coercion.
U, sigma, Vt = svds(pivot_df.to_numpy(dtype=float), k = 9)
print('Left singular matrix: \n', U)

Left singular matrix: 
 [[ 1.15210840e-08 -2.59443761e-08 -1.99833080e-08 ...  4.99883694e-13
   1.47792682e-13 -2.66563493e-12]
 [ 7.12353968e-09  1.01969142e-08 -2.01483640e-08 ... -6.43964072e-11
  -5.32326470e-13 -7.12833098e-10]
 [ 7.33257773e-04 -1.75716255e-03 -1.68999463e-03 ...  3.18169839e-07
   1.01857350e-07 -4.44974107e-07]
 ...
 [ 1.97981388e-20  3.26101584e-19  3.77128695e-19 ...  9.08838711e-20
  -1.33518230e-19 -6.11732634e-20]
 [ 1.33370998e-05  9.90832374e-05 -6.67263492e-05 ...  6.19526606e-07
   4.64538417e-09 -2.95801371e-07]
 [-8.47781261e-20  3.71428254e-20 -8.86604078e-21 ...  3.88989488e-21
  -7.37808034e-21  1.55978473e-20]]


In [34]:
# Show the singular values returned by svds.
print('Sigma: \n', sigma)

Sigma: 
 [0.81876525 0.82412103 0.85276372 0.89315496 0.89561226 0.91597875
 1.06056228 1.21975327 1.28451052]


now we have to convert sigma into a diagonal matrix

In [35]:
# Expand the 1-D vector of singular values into a diagonal matrix so that it
# can take part in the matrix product that reconstructs the scores.
# The guard keeps this cell idempotent: np.diag applied to a 2-D array returns
# its diagonal, so re-running an unguarded cell would undo the conversion.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)
print('Diagonal matrix: \n', sigma)

Diagonal matrix: 
 [[0.81876525 0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.82412103 0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.85276372 0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.89315496 0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.89561226 0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.91597875
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  1.06056228 0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         1.21975327 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.28451052]]


In [36]:
# Show the Vt factor returned by svds.
print('Right sigular matrix: \n', Vt)

Right sigular matrix: 
 [[ 2.76326575e-05  1.45706074e-04  1.55162184e-08 ...  2.07967106e-06
   3.55374721e-06  5.80285912e-18]
 [ 2.36861707e-04 -2.24609392e-05  1.60504209e-07 ... -4.66329698e-06
   3.91230159e-05  1.91560583e-17]
 [-1.76937656e-04  1.09808057e-04 -9.77904636e-08 ... -4.19766315e-06
  -2.25756079e-05  2.00047695e-17]
 ...
 [ 1.19198711e-06 -1.16888265e-08  2.35716497e-11 ...  1.29093138e-06
   6.12359966e-06 -4.63696511e-18]
 [ 1.32380871e-08  8.34707766e-05  4.12931499e-11 ...  4.79088883e-09
   7.39919162e-07  7.96881402e-18]
 [-6.16634937e-07 -5.63360285e-08 -4.05107561e-11 ... -1.25121407e-08
  -4.65804355e-08  2.68829860e-18]]


In [37]:
# Rebuild the score matrix from the truncated factors: (U . sigma) . Vt
predicted_score = (U @ sigma) @ Vt
# Wrap the predictions in a frame that carries the same product columns as
# the observed score matrix.
preds_df = pd.DataFrame(data=predicted_score, columns=pivot_df.columns)
preds_df.head()

productID,120401325X,3998899561,6073894996,7887421268,8288853439,8288855504,8288862993,8288878881,9658231950,9819958431,...,B004E3087E,B004E30BIA,B004E30BJO,B004E329L2,B004E5KYJE,B004E5ZVL0,B004E75R3K,B004E7EKV0,B004E9SZP0,B004E9TLVM
0,-1.752804e-12,4.086964e-13,-1.603561e-15,1.103775e-14,-1.372715e-13,-1.387556e-25,1.9878790000000001e-25,-5.858299999999999e-26,1.700114e-16,8.52685e-15,...,-5.729522999999999e-26,9.958283e-27,5.207126e-25,-2.349786e-15,1.8239810000000002e-25,3.030738e-13,-4.3477729999999995e-26,1.911476e-13,-4.020003e-13,-6.868807e-25
1,5.256293e-12,-4.041096e-13,3.147092e-15,-8.297138e-15,-2.022417e-13,-4.734641e-26,-1.626609e-25,2.062059e-26,-1.090052e-16,-6.792952e-15,...,-3.327784e-26,5.295985e-27,1.233947e-25,5.274936e-15,7.004478e-26,2.367598e-12,-1.115598e-26,4.61603e-14,7.444655e-13,-2.596171e-25
2,-6.730334e-08,1.06887e-08,-8.040447e-11,3.549256e-10,-1.062265e-08,-9.664614e-21,1.170967e-20,-3.481271e-21,5.954157e-12,2.641649e-10,...,-4.236622e-21,7.333722e-22,3.9201709999999995e-20,-1.109225e-10,1.372e-20,2.284658e-08,-3.306347e-21,1.408471e-08,-1.980707e-08,-5.206587e-20
3,2.65811e-10,-2.195815e-11,1.682595e-13,-5.171357e-13,-3.354322e-13,4.413484e-25,-1.015172e-23,1.756022e-24,-7.370718e-15,-4.119909e-13,...,-2.808989e-25,2.797891e-26,-3.405745e-24,2.763278e-13,-5.645521e-25,-7.726169e-12,2.3530640000000003e-25,-1.165512e-12,4.058746e-11,5.638754e-24
4,1.230218e-09,-9.090128e-12,-2.365498e-13,-5.746233e-14,-2.397253e-09,8.949978e-22,-3.9976580000000003e-22,-2.9193410000000003e-22,6.347456e-16,-7.344497e-14,...,2.3625520000000003e-22,-4.251226e-23,-6.0108269999999996e-21,1.139271e-13,-1.208687e-21,1.576764e-07,5.517988e-22,1.459589e-09,-1.301624e-08,-5.682206e-21


In [38]:
# recommend the item with the highest predicted ratings
def recommend_item (reviewerid, pivot_df, preds_df, num_recommendations):
    """Print the best-predicted items that the reviewer has a zero score for.

    Row ``reviewerid - 1`` of ``pivot_df`` (observed scores) and of
    ``preds_df`` (predicted scores) is used. Nothing is returned; the
    ``num_recommendations`` top rows are printed.

    NOTE(review): a later cell defines ``recommend_item`` again, so that
    later definition is the one in effect once the whole notebook has run.
    """
    row = reviewerid - 1
    # Observed and predicted scores of this reviewer, each ordered from
    # highest to lowest.
    observed = pivot_df.iloc[row].sort_values(ascending = False)
    predicted = preds_df.iloc[row].sort_values(ascending = False)
    # Put both series side by side, one row per item.
    temp = pd.concat([observed, predicted], axis = 1)
    temp.index.name = 'Recommended Items'
    temp.columns = ['reviewer_score', 'reviewer_prediction']
    # Keep the items whose observed score is exactly 0, best prediction first.
    unscored = temp.reviewer_score == 0
    temp = temp.loc[unscored].sort_values('reviewer_prediction', ascending = False)
    header = '\nBelow are the recommended items for reviewer(reviewerid = {}):\n'
    print(header. format(reviewerid))
    print(temp.head(num_recommendations))

In [39]:
# recommend the item with the highest predicted ratings
def recommend_item (reviewerid, pivot_df, preds_df, num_recommendations):
    """Print the best-predicted items that the reviewer has not rated.

    Parameters
    ----------
    reviewerid : int
        1-based reviewer number; row ``reviewerid - 1`` of both frames is used.
    pivot_df : pandas.DataFrame
        Observed values, one row per reviewer and one column per item.
    preds_df : pandas.DataFrame
        Predicted values with the same row order and item columns.
    num_recommendations : int
        Number of rows to print.

    Returns
    -------
    None
        The recommendations are printed, not returned.

    Raises
    ------
    IndexError
        If ``reviewerid`` is not between 1 and ``len(pivot_df)``.
    """
    # Reject out-of-range ids explicitly: without this check an id of 0 or
    # below would wrap around and silently select a row from the end.
    if not 1 <= reviewerid <= len(pivot_df):
        raise IndexError(
            'reviewerid must be between 1 and {}, got {}'.format(len(pivot_df), reviewerid)
        )
    # index starts at 0
    revieweridx = reviewerid-1
    # get and sort the reviewer's ratings
    sort_reviewer_rating = pivot_df.iloc[revieweridx].sort_values(ascending = False)
    # get and sort the reviewer's predictions
    sort_reviewer_prediction = preds_df.iloc[revieweridx].sort_values(ascending = False)
    # put both series side by side, one row per item
    temp = pd.concat([sort_reviewer_rating, sort_reviewer_prediction], axis = 1)
    temp.index.name = 'Recommended Items'
    temp.columns = ['reviewer_ratings', 'reviewer_prediction']
    # keep only the items whose observed value is exactly 0 (not rated)
    temp = temp.loc[temp.reviewer_ratings == 0]
    temp = temp.sort_values('reviewer_prediction', ascending = False)
    print('\nBelow are the recommended items for reviewer(reviewerid = {}):\n'. format(reviewerid))
    print(temp.head(num_recommendations))

In [40]:
# Print the 6 best predictions for reviewer number 8.
reviewerid, num_recommendations = 8, 6
recommend_item(
    reviewerid=reviewerid,
    pivot_df=pivot_df,
    preds_df=preds_df,
    num_recommendations=num_recommendations,
)


Below are the recommended items for reviewer(reviewerid = 8):

                   reviewer_ratings  reviewer_prediction
Recommended Items                                       
B0044WTQVE                      0.0         2.845988e-12
B0035R2QS4                      0.0         9.490274e-13
B003UC93WQ                      0.0         9.678599e-14
B00409E4JK                      0.0         8.073854e-14
B002OKLUGW                      0.0         6.988639e-14
B004889CKO                      0.0         6.421653e-14


In [41]:
# Print the 6 best predictions for reviewer number 10.
reviewerid, num_recommendations = 10, 6
recommend_item(
    reviewerid=reviewerid,
    pivot_df=pivot_df,
    preds_df=preds_df,
    num_recommendations=num_recommendations,
)


Below are the recommended items for reviewer(reviewerid = 10):

                   reviewer_ratings  reviewer_prediction
Recommended Items                                       
B0041ST5L2                      0.0             0.001983
B003TRCTB4                      0.0             0.000066
B0042TY68C                      0.0             0.000061
B001UO651S                      0.0             0.000055
B0043M668G                      0.0             0.000051
B003ELOOZO                      0.0             0.000051


Each reviewer is given different recommendations, based on that reviewer's past behaviour, because this is a collaborative recommender model.

### - Evaluation of Collaborative Recommender Model

In [42]:
# Column-wise mean of the observed score matrix, i.e. one average per product.
# Cells that were filled with 0 are included in the mean.
final_score_matrix.mean().head()

productID
120401325X    0.000038
3998899561    0.000038
6073894996    0.000135
7887421268    0.000019
8288853439    0.000019
dtype: float64

In [43]:
# Column-wise mean of the predicted score matrix, i.e. one average per product.
preds_df.mean().head()

productID
120401325X    3.253839e-07
3998899561    7.487647e-07
6073894996    1.700571e-10
7887421268    1.060105e-08
8288853439    1.958532e-05
dtype: float64

In [44]:
# Per-product averages of the observed and the predicted scores, side by side.
rmse_df = pd.concat(
    [final_score_matrix.mean(), preds_df.mean()],
    axis=1,
)
rmse_df.columns = ['Avg_actual_ratings', 'Avg_predicted_ratings']
# Positional number of each product, 0 .. n-1.
rmse_df['productID_index'] = np.arange(len(rmse_df))
rmse_df.head()

Unnamed: 0_level_0,Avg_actual_ratings,Avg_predicted_ratings,productID_index
productID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
120401325X,3.8e-05,3.253839e-07,0
3998899561,3.8e-05,7.487647e-07,1
6073894996,0.000135,1.700571e-10,2
7887421268,1.9e-05,1.060105e-08,3
8288853439,1.9e-05,1.958532e-05,4


In [45]:
# Root-mean-square difference between the per-product average observed score
# and the per-product average predicted score, rounded to 5 decimals.
# NOTE(review): this compares column averages, not individual matrix cells —
# confirm that this is the intended evaluation metric.
squared_error = (rmse_df['Avg_actual_ratings'] - rmse_df['Avg_predicted_ratings']) ** 2
RMSE = round(squared_error.mean() ** 0.5, 5)
print('\nRMSE SVD Model = {}\n'.format(RMSE))


RMSE SVD Model = 0.0001

