# Recommender System for E-commerce Use Case

In [1]:
!pip install h2o4gpu

Collecting h2o4gpu
  Downloading h2o4gpu-0.4.1-cp37-cp37m-manylinux1_x86_64.whl (346.3 MB)
[K     |████████████████████████████████| 346.3 MB 9.2 kB/s 
[?25hCollecting scikit-learn==0.21.2
  Downloading scikit_learn-0.21.2-cp37-cp37m-manylinux1_x86_64.whl (6.7 MB)
[K     |████████████████████████████████| 6.7 MB 59.2 MB/s 
Installing collected packages: scikit-learn, h2o4gpu
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.1
    Uninstalling scikit-learn-0.24.1:
      Successfully uninstalled scikit-learn-0.24.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
umap-learn 0.5.1 requires scikit-learn>=0.22, but you have scikit-learn 0.21.2 which is incompatible.
tsfresh 0.18.0 requires scikit-learn>=0.22.0, but you have scikit-learn 0.21.2 which is incompatible.
tpot 0.11.7 requires scikit-learn>=0.22.0,

In [2]:
#Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import h2o4gpu
import itertools
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from math import sqrt
from scipy import sparse

In [3]:
#Reading data
review_data=pd.read_csv('../input/amazon-reviews-dataset/review_data.csv')
#metadata=pd.read_csv('./Data/meta_data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#Dropping unwanted column
review_data=review_data.drop(['Unnamed: 0'],axis=1)
#metadata=metadata.drop(['Unnamed: 0'],axis=1)

We have two dataframes-review_data and metadata. review_data is information about the product reviews posted by each user. It has following columns:

    reviewerID - ID of the reviewer
    asin - ID of the product
    reviewerName - name of the reviewer
    vote - helpful votes of the review
    style - a dictionary of the product metadata
    reviewText - text of the review
    overall - rating of the product
    summary - summary of the review
    unixReviewTime - time of the review (unix time)
    reviewTime - time of the review (raw)
    image - images that users post after they have received the product

metadata includes information about the product:

    asin - ID of the product
    title - name of the product
    feature - bullet-point format features of the product
    description - description of the product
    price - price in US dollars (at time of crawl)
    image - url of the product image
    related - related products (also bought, also viewed, bought together, buy after viewing)
    salesRank - sales rank information
    brand - brand name
    categories - list of categories the product belongs to
    tech1 - the first technical detail table of the product
    tech2 - the second technical detail table of the product
    similar - similar product table
    
Citation: 
Justifying recommendations using distantly-labeled reviews and fined-grained aspects
Jianmo Ni, Jiacheng Li, Julian McAuley
Empirical Methods in Natural Language Processing (EMNLP), 2019
pdf

In [5]:
review_data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 20, 2014",A1D4G1SNUZWQOT,7106116521,Tracy,Exactly what I needed.,perfect replacements!!,1413763200,,,
1,2.0,True,"09 28, 2014",A3DDWDH9PX2YX2,7106116521,Sonja Lau,"I agree with the other review, the opening is ...","I agree with the other review, the opening is ...",1411862400,3.0,,
2,4.0,False,"08 25, 2014",A2MWC41EW7XL15,7106116521,Kathleen,Love these... I am going to order another pack...,My New 'Friends' !!,1408924800,,,
3,2.0,True,"08 24, 2014",A2UH2QQ275NV45,7106116521,Jodi Stoner,too tiny an opening,Two Stars,1408838400,,,
4,3.0,False,"07 27, 2014",A89F3LQADZBS5,7106116521,Alexander D.,Okay,Three Stars,1406419200,,,


In [6]:
#metadata.head()

Taking a subset of the large review dataset as per our requirement

In [7]:
sub_matrix = review_data[['reviewerID', 'asin', 'overall']].copy().drop_duplicates()
#sub_matrix.set_index('reviewerID')

In [8]:
sub_matrix.shape

(1466296, 3)

In [9]:
#Saving user data into pickle files
u=np.unique(sub_matrix.reviewerID)
user_dict = { k : [] for k in u }
for row in sub_matrix.iterrows():
    new_lst=(row[1]['asin'],row[1]['overall'])
    user_dict[row[1]['reviewerID']].append(new_lst)

## Pre-processing

In [10]:
#Taking a sub-set of data for training
train=sub_matrix[0:500000]

We have to filter the test set to have rows for only those products and users present in the training data.

In [11]:
#Preparing the test set
test=sub_matrix[500000:]
#test=remaining[sub_matrix['reviewerID'].isin(user_train) & sub_matrix['asin'].isin(item_train)]
#test=test[test.reviewerID.isin(train.reviewerID) & test.asin.isin(train.asin)]
test = test[test.reviewerID.isin(train.reviewerID)]
test = test[test.asin.isin(train.asin)]

In [12]:
user_le = LabelEncoder()
item_le = LabelEncoder()

train.reviewerID = user_le.fit_transform(train.reviewerID)
test.reviewerID = user_le.transform(test.reviewerID)

train.asin = item_le.fit_transform(train.asin)
test.asin = item_le.transform(test.asin)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [13]:
print(train.shape)
print(test.shape)

(500000, 3)
(10816, 3)


In [14]:
user_train, user_train_indices = np.unique(np.array(train['reviewerID'], dtype="int32"), return_inverse=True)
item_train, item_train_indices = np.unique(np.array(train['asin'], dtype="int32"), return_inverse=True)

In [15]:
#Now for test set
user_test,user_test_indices= np.unique(np.array(test['reviewerID'], dtype="int32"), return_inverse=True)
item_test,item_test_indices=np.unique(np.array(test['asin'], dtype="int32"),return_inverse=True)

In [16]:
test=test.reset_index(drop=True)

In [17]:
print(len(user_train))
print(len(item_train))
print(len(user_test))
print(len(item_test))

444449
47580
9820
5639


#### Creating and computing error values for simple baseline models

In [18]:
pred_global_avg = [train.overall.mean()] * test.shape[0]

In [19]:
product_avg_preds = train.groupby('asin')['overall'].mean().reset_index().rename(
    columns={'overall': 'pred_product_avg'}
)

In [20]:
user_avg_preds = train.groupby('reviewerID')['overall'].mean().reset_index().rename(
    columns={'overall': 'pred_user_avg'}
)

In [21]:
#Merging with test
test = test.merge(product_avg_preds, on='asin')
test = test.merge(user_avg_preds, on='reviewerID')
test['pred_global_avg'] = pred_global_avg

In [22]:
sparse_train=sparse.coo_matrix((np.array(train.overall.values, dtype='float32'),(user_train_indices,item_train_indices)),shape=(len(user_train), len(item_train)))
sparse_test=sparse.coo_matrix((np.ones(len(test), dtype="float32"),(user_test_indices,item_test_indices)),shape=(sparse_train.shape))

In [23]:
print(sparse_train.shape)
print(sparse_test.shape)

(444449, 47580)
(444449, 47580)


## Using h2o4GPU

#### Hyperparameter Tuning

In [24]:
components=[10, 20, 30, 50, 70]
lambda_values=[0.01, 0.05, 0.1, 0.5]
max_iterations=[50, 100, 200]

In [25]:
parameter_values=list(itertools.product(components,lambda_values,max_iterations))

In [26]:
#Factorization function
def execute_mf(parameters,X,X_test,model=False):
    factorization = h2o4gpu.solvers.FactorizationH2O(parameters[0], parameters[1], parameters[2])
    factorization.fit(X, X_BATCHES=10, THETA_BATCHES=10)
    predictions=factorization.predict(X_test).data
    if model:
        return predictions,factorization
    else:
        return predictions

In [27]:
results=dict()
c=0
for i in parameter_values:#parameter_values[0:math.floor(len(parameter_values)/2)]:
    c=c+1
    preds=execute_mf(i,sparse_train,sparse_test)
    if not np.isnan(preds).any():
        results[i]=np.sqrt(mean_squared_error(test.overall, preds))
    if c%5==0:
        print('Processing done. combinations left : ',len(parameter_values)-c)

Processing done. combinations left :  55
Processing done. combinations left :  50
Processing done. combinations left :  45
Processing done. combinations left :  40
Processing done. combinations left :  35
Processing done. combinations left :  30
Processing done. combinations left :  25
Processing done. combinations left :  20
Processing done. combinations left :  15
Processing done. combinations left :  10
Processing done. combinations left :  5
Processing done. combinations left :  0


In [28]:
#Fetching the parameter combination giving least RMSE value
print(min(results, key=results.get))

(70, 0.01, 200)


Thus, we infer that (70, 0.01, 200) is the best combination of parameter values. Hence we run our algorithm using that.

In [29]:
#pred_mf,model=execute_mf([70,0.01,200],sparse_train,sparse_test,True)

In [30]:
print(f'RMSE Product Avg: {np.sqrt(mean_squared_error(test.overall, test.pred_product_avg))}')
print(f'RMSE User Avg: {np.sqrt(mean_squared_error(test.overall, test.pred_user_avg))}')
print(f'RMSE Global Avg: {np.sqrt(mean_squared_error(test.overall, test.pred_global_avg))}')
print(f'RMSE H2O-MF: ',results[(70, 0.01, 200)])#np.sqrt(mean_squared_error(test.overall, test.pred_mf))}')

RMSE Product Avg: 1.4675429714630877
RMSE User Avg: 1.6234457217469518
RMSE Global Avg: 1.4346134925714396
RMSE H2O-MF:  2.0201818047054627


#### Saving our final model alongwith required data

In [31]:
user_encode = LabelEncoder()
item_encode = LabelEncoder()

sub_matrix.reviewerID = user_encode.fit_transform(sub_matrix.reviewerID)
sub_matrix.asin = item_encode.fit_transform(sub_matrix.asin)

user, user_indices = np.unique(np.array(sub_matrix['reviewerID'], dtype="int32"), return_inverse=True)
item, item_indices = np.unique(np.array(sub_matrix['asin'], dtype="int32"), return_inverse=True)

sparse_mat=sparse.coo_matrix((np.array(sub_matrix.overall.values, dtype='float32'),(user_indices,item_indices)),shape=(len(user), len(item)))

In [32]:
i=item_encode.inverse_transform(item)
#Preparing Item-Index mapping dictionary
items=dict(zip(i,item))

In [33]:
u=user_encode.inverse_transform(user)
#Preparing User-Index mapping dictionary
users=dict(zip(u,user))

In [34]:
print(len(users))
print(len(items))
print(sparse_mat.shape)

1239946
216441
(1239946, 216441)


In [35]:
rec = h2o4gpu.solvers.FactorizationH2O(70, 0.01, 200)

In [36]:
rec.fit(sparse_mat, X_BATCHES=10, THETA_BATCHES=10)

<h2o4gpu.solvers.factorization.FactorizationH2O at 0x7f6c2c506f50>

In [37]:
#Saving user data into pickle files
#items=item_encode.inverse_transform(item)
d_dict={'user':user_dict,'user_indices':users,'item_indices':items}
with open('review_data.pickle','wb') as f:
    pickle.dump(d_dict,f,pickle.HIGHEST_PROTOCOL)

In [38]:
#Extracting User's features and Item's features from model
#user_matrix=rec.XT
#item_matrix=rec.thetaT
#Getting users and items
#users=user_encode.inverse_transform(user)
#items=item_encode.inverse_transform(item)
#users=list(map(str,users))
#items=list(map(str,items))
#Serializing predictions and users and items
#recommend={'user_matrix':user_matrix,'item_matrix':item_matrix,'users':users,'items':items}
#with open('recommend_amazon.pickle', 'wb') as f:
#    # Pickle the 'data' dictionary using the highest protocol available.
#    pickle.dump(recommend, f, pickle.HIGHEST_PROTOCOL)

In [39]:
#Saving model file as pickle
with open('recommend_model.pickle','wb') as f:
    pickle.dump(rec,f,pickle.HIGHEST_PROTOCOL)

In [40]:
#Create dictionary of mini dataframes
#user_data=dict()
#user_data={row[1]['reviewerID']:sub_matrix[sub_matrix['reviewerID']==row[1]['reviewerID']] for row in sub_matrix.iterrows()}
#user_data={key['reviewerID']:pd.DataFrame.from_records([l for l in df_dict if l['reviewerID']==key['reviewerID']]) for key in df_dict}

#u=user_encode.inverse_transform(user)
#user_data={val:sub_matrix[sub_matrix['reviewerID']==val][['asin','overall']] for val in u}