

# Project - Product Recommendation Systems

### <center> <div style="text-align: center"> by Mohan Raju</div>  </center>

Domain - E-commerce
Context - Everyday a million products are being recommended to users based on
popularity and other metrics on e-commerce websites. The most popular e-commerce
website boosts average order value by 50%, increases revenues by 300%, and
improves conversion. In addition to being a powerful tool for increasing revenues,
product recommendations are so essential that customers now expect to see similar
features on all other eCommerce sites.

Objective - To make a recommendation system that recommends at least five(5)
new products based on the user's habits.

In [54]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

import numpy as np
from sklearn.impute import SimpleImputer
import statistics
from sklearn.tree import DecisionTreeClassifier

In [56]:
# Load Data

df = pd.read_csv("ratings_Electronics1.csv", names=["userId", "productId", "rating", "timestamp"])

In [57]:
df1 = df.drop(['timestamp'], axis=1)
df1.head(5)

Unnamed: 0,userId,productId,rating
0,AKM1MP6P0OYPR,132793040,5
1,A2CX7LUOHB2NDG,321732944,5
2,A2NWSAGRHCP8N5,439886341,1
3,A2WNBOD3WNDNKT,439886341,3
4,A1GI0U4ZRJA8WN,439886341,1


In [58]:
df1.shape

(627879, 3)

In [59]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 627879 entries, 0 to 627878
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   userId     627879 non-null  object
 1   productId  627879 non-null  object
 2   rating     627879 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 14.4+ MB


In [60]:
df1.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rating,627879.0,3.978,1.399,1.0,3.0,5.0,5.0,5.0


In [61]:
df1.isnull().sum()

userId       0
productId    0
rating       0
dtype: int64

In [62]:
# Total Number of missing value
df1.isnull().sum().sum()

0

In [63]:
df1.count()

userId       627879
productId    627879
rating       627879
dtype: int64

In [64]:
df1.head()

Unnamed: 0,userId,productId,rating
0,AKM1MP6P0OYPR,132793040,5
1,A2CX7LUOHB2NDG,321732944,5
2,A2NWSAGRHCP8N5,439886341,1
3,A2WNBOD3WNDNKT,439886341,3
4,A1GI0U4ZRJA8WN,439886341,1


In [65]:

product_ratingCount = (df1.
     groupby(by = ['productId'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'TotalRatingCount'})
     [['productId', 'TotalRatingCount']]
    )
product_ratingCount .head()

Unnamed: 0,productId,TotalRatingCount
0,059400232X,3
1,089933623X,2
2,094339676X,1
3,1034385789,5
4,1039869017,2


In [66]:
rating_with_totalRatingCount = df1.merge(product_ratingCount, left_on = 'productId', right_on = 'productId', how = 'left')
rating_with_totalRatingCount.head()


Unnamed: 0,userId,productId,rating,TotalRatingCount
0,AKM1MP6P0OYPR,132793040,5,1
1,A2CX7LUOHB2NDG,321732944,5,1
2,A2NWSAGRHCP8N5,439886341,1,3
3,A2WNBOD3WNDNKT,439886341,3,3
4,A1GI0U4ZRJA8WN,439886341,1,3


In [67]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(product_ratingCount['TotalRatingCount'].describe())

count   38143.000
mean       16.461
std        89.982
min         1.000
25%         1.000
50%         3.000
75%         9.000
max      9487.000
Name: TotalRatingCount, dtype: float64


In [68]:
# Taking subset of the dataset , users only who has given 50 or more number of ratings

In [69]:
popularity_threshold = 50
rating_popular_product= rating_with_totalRatingCount.query('TotalRatingCount >= @popularity_threshold')
rating_popular_product.head()

Unnamed: 0,userId,productId,rating,TotalRatingCount
183,A1BKC6B7YHYTVV,972683275,4,1051
184,AWVFSIB42LHKJ,972683275,4,1051
185,A36MQBNADRH8YY,972683275,5,1051
186,A3SRXMPLAEZ6DP,972683275,4,1051
187,A20XXTXWF2TCPY,972683275,5,1051


In [70]:
rating_popular_product.shape

(397206, 4)

In [71]:
rating_popular_product.count().sum()

1588824

In [72]:
data = rating_popular_product

In [73]:
data1 = data

# Popularity Based Recommender model

In [74]:
data.head()

Unnamed: 0,userId,productId,rating,TotalRatingCount
183,A1BKC6B7YHYTVV,972683275,4,1051
184,AWVFSIB42LHKJ,972683275,4,1051
185,A36MQBNADRH8YY,972683275,5,1051
186,A3SRXMPLAEZ6DP,972683275,4,1051
187,A20XXTXWF2TCPY,972683275,5,1051


In [75]:
user_colab_data = data

In [76]:
data.groupby('productId')['rating'].mean().head()

productId
1400501466   3.560
1400501520   4.244
1400501776   3.885
1400532620   3.684
1400532655   3.727
Name: rating, dtype: float64

In [77]:
data.groupby('productId')['rating'].mean().sort_values(ascending=False).head()

productId
B0000DYV9H   4.947
B000053HC5   4.946
B00009R96C   4.886
B00005LE76   4.879
B0002E52S4   4.861
Name: rating, dtype: float64

In [78]:
data.groupby('productId')['rating'].count().sort_values(ascending=False).head()

productId
B0002L5R78    9487
B0001FTVEK    5345
B00007E7JU    3523
B00007M1TZ    2608
B00004ZCJE    2547
Name: rating, dtype: int64

In [79]:
ratings_mean_count = pd.DataFrame(data.groupby('productId')['rating'].mean())

In [80]:
ratings_mean_count['rating_counts'] = pd.DataFrame(data.groupby('productId')['rating'].count())

In [81]:
ratings_mean_count.head().sum()

rating            19.100
rating_counts   1126.000
dtype: float64

Popularity based recomended products

In [82]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
import surprise
from surprise import Dataset, Reader
from sklearn.model_selection import train_test_split
from collections import defaultdict
from surprise import KNNWithMeans
from surprise import SVD, SVDpp
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

Split the data randomly into a train and test dataset

In [83]:
trainset, testset = train_test_split(ratings_mean_count, test_size = 0.30, random_state=0)

In [84]:
trainset.head()

Unnamed: 0_level_0,rating,rating_counts
productId,Unnamed: 1_level_1,Unnamed: 2_level_1
B000071A7U,3.975,79
B0002XQWCK,4.724,98
B00000J6WY,3.778,90
B00004VXNI,4.47,117
B0000667AP,4.067,60


In [85]:
trainset.shape

(1584, 2)

# Building Collaborative Filtering model

In [86]:
popularity_threshold = 2000
rating_colab_product= rating_with_totalRatingCount.query('TotalRatingCount >= @popularity_threshold')
rating_colab_product.head()

Unnamed: 0,userId,productId,rating,TotalRatingCount
22020,A2YDH4R73MEDIG,B00001P4ZH,5,2075
22021,A34MP4RZMM3JMO,B00001P4ZH,5,2075
22022,A239H0QWSL2825,B00001P4ZH,2,2075
22023,A3LVU7249E0VWJ,B00001P4ZH,5,2075
22024,A3BL7PP815B7F4,B00001P4ZH,5,2075


In [87]:
rating_colab_product.tail()

Unnamed: 0,userId,productId,rating,TotalRatingCount
600838,A2ZD0ME33S4RX,B0007MXZB2,5,2080
600839,A3V9H2X1IFVZDI,B0007MXZB2,4,2080
600840,A108XABRHAA9E7,B0007MXZB2,2,2080
600841,A1JEJEDR7NDZ7C,B0007MXZB2,1,2080
600842,A1HTJZQAMICHDK,B0007MXZB2,4,2080


In [88]:
rating_colab_product.head()

Unnamed: 0,userId,productId,rating,TotalRatingCount
22020,A2YDH4R73MEDIG,B00001P4ZH,5,2075
22021,A34MP4RZMM3JMO,B00001P4ZH,5,2075
22022,A239H0QWSL2825,B00001P4ZH,2,2075
22023,A3LVU7249E0VWJ,B00001P4ZH,5,2075
22024,A3BL7PP815B7F4,B00001P4ZH,5,2075


In [89]:
rating_colab_product.shape

(31957, 4)

# Nearest Neighbor item based

In [90]:
# creating a Pivot matrix

colab_filter_df=rating_colab_product.pivot_table(index='productId',columns='userId',values='rating').fillna(0)
colab_filter_df.head()



userId,A00328742CDZTXNDCB9XW,A00988692Q9ZDJUD8BQSM,A015639027WKW2102QO0S,A04652431DUA6LH3VBSRW,A04679053BE8ZYGJ7SKRM,A064793123POWGN11XQNA,A069069433DCAUXSSN1MY,A0695568PX4DBZOQDN8,A07650012EBB6027Q1NQQ,A0791524DCIZVYY8L45Y,...,AZYH32578YZBF,AZYJJA10OOIMR,AZYX5K95IJOO8,AZZ58AJ3HJCK8,AZZ77XYX8O2WE,AZZD6VSEQQE7O,AZZFCZRH7GP7H,AZZHG7U4YKOQH,AZZMDW27MUJR6,AZZVLOF3WKLFW
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00001P4ZH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00004ZCJE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
B00007E7JU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00007M1TZ,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
B0001FTVEK,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:

from scipy.sparse import csr_matrix

movie_features_df_matrix = csr_matrix(colab_filter_df.values)

from sklearn.neighbors import NearestNeighbors


model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(colab_filter_df)



NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [92]:
colab_filter_df.shape

(9, 31590)

In [93]:
query_index = np.random.choice(colab_filter_df.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(colab_filter_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)

6


In [94]:
colab_filter_df.head()

userId,A00328742CDZTXNDCB9XW,A00988692Q9ZDJUD8BQSM,A015639027WKW2102QO0S,A04652431DUA6LH3VBSRW,A04679053BE8ZYGJ7SKRM,A064793123POWGN11XQNA,A069069433DCAUXSSN1MY,A0695568PX4DBZOQDN8,A07650012EBB6027Q1NQQ,A0791524DCIZVYY8L45Y,...,AZYH32578YZBF,AZYJJA10OOIMR,AZYX5K95IJOO8,AZZ58AJ3HJCK8,AZZ77XYX8O2WE,AZZD6VSEQQE7O,AZZFCZRH7GP7H,AZZHG7U4YKOQH,AZZMDW27MUJR6,AZZVLOF3WKLFW
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00001P4ZH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00004ZCJE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
B00007E7JU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00007M1TZ,0.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
B0001FTVEK,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(colab_filter_df.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, colab_filter_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for B0002L5R78:

1: B0007MXZB2, with distance of 0.99420168638652:
2: B00004ZCJE, with distance of 0.9942097002205943:
3: B00007M1TZ, with distance of 0.9965208724638603:
4: B00007E7JU, with distance of 0.9970405162795923:
5: B00001P4ZH, with distance of 0.9974705192987161:


Above 5 products are recommended.