

# Project - Product Recommendation Systems

### <center> <div style="text-align: center"> by Mohan Raju</div>  </center>

Domain - E-commerce
Context - Every day, a million products are recommended to users on e-commerce
websites based on popularity and other metrics. On the most popular e-commerce
website, product recommendations boost average order value by 50%, increase
revenues by 300%, and improve conversion. Besides being a powerful tool for
increasing revenues, product recommendations have become so essential that
customers now expect to see similar features on all other e-commerce sites.

Objective - To build a recommendation system that recommends at least five (5)
new products based on the user's habits.

In [1]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics

import numpy as np
from sklearn.impute import SimpleImputer
import statistics
from sklearn.tree import DecisionTreeClassifier

In [86]:
# Load the raw ratings file. Because `names` is supplied without `header`,
# pandas treats the first line of the file as data, not as column labels.
df = pd.read_csv(
    "ratings_Electronics.csv",
    names=["userId", "productId", "rating", "timestamp"],
)

In [3]:
# Work on a copy of the ratings without the timestamp column; `df` itself is left untouched.
df1 = df.drop(columns='timestamp')
df1.head()

Unnamed: 0,userId,productId,rating
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0


In [4]:
# (rows, columns) of the ratings frame after dropping the timestamp.
df1.shape

(7824482, 3)

In [5]:
# Column dtypes and memory footprint of the ratings frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     object 
 1   productId  object 
 2   rating     float64
dtypes: float64(1), object(2)
memory usage: 179.1+ MB


In [6]:
# Summary statistics of the numeric column(s), laid out one row per column.
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rating,7824482.0,4.012337,1.38091,1.0,3.0,5.0,5.0,5.0


In [7]:
# Number of missing values in each column.
df1.isna().sum()

userId       0
productId    0
rating       0
dtype: int64

In [8]:
# Grand total of missing cells across the whole frame.
df1.isna().sum().sum()

0

In [9]:
# Non-null entries per column.
df1.count()

userId       7824482
productId    7824482
rating       7824482
dtype: int64

In [10]:
# Preview of the first rows of `df1` (same frame that was previewed right after dropping the timestamp).
df1.head()

Unnamed: 0,userId,productId,rating
0,AKM1MP6P0OYPR,132793040,5.0
1,A2CX7LUOHB2NDG,321732944,5.0
2,A2NWSAGRHCP8N5,439886341,1.0
3,A2WNBOD3WNDNKT,439886341,3.0
4,A1GI0U4ZRJA8WN,439886341,1.0


In [11]:

# Count how many ratings each product received and expose the result as a
# two-column frame: productId, TotalRatingCount.
product_ratingCount = (
    df1.groupby('productId')['rating']
    .count()
    .rename('TotalRatingCount')
    .reset_index()
)
product_ratingCount.head()

Unnamed: 0,productId,TotalRatingCount
0,132793040,1
1,321732944,1
2,439886341,3
3,511189877,6
4,528881469,27


In [12]:
# Attach each product's total rating count to every individual rating row.
# A left join keeps every row of `df1`.
rating_with_totalRatingCount = pd.merge(
    df1,
    product_ratingCount,
    on='productId',
    how='left',
)
rating_with_totalRatingCount.head()


Unnamed: 0,userId,productId,rating,TotalRatingCount
0,AKM1MP6P0OYPR,132793040,5.0,1
1,A2CX7LUOHB2NDG,321732944,5.0,1
2,A2NWSAGRHCP8N5,439886341,1.0,3
3,A2WNBOD3WNDNKT,439886341,3.0,3
4,A1GI0U4ZRJA8WN,439886341,1.0,3


In [13]:
# Render floats with three decimals (a global pandas display option that also
# affects later cells), then summarise how many ratings products receive.
pd.set_option('display.float_format', '{:.3f}'.format)
print(product_ratingCount['TotalRatingCount'].describe())

count   476002.000
mean        16.438
std        112.703
min          1.000
25%          1.000
50%          2.000
75%          7.000
max      18244.000
Name: TotalRatingCount, dtype: float64


In [14]:
# Take a subset of the dataset: keep only products that have received 50 or more ratings

In [15]:
popularity_threshold = 50
# Keep only the rating rows whose product has at least `popularity_threshold` ratings.
rating_popular_product = rating_with_totalRatingCount.loc[
    lambda frame: frame['TotalRatingCount'] >= popularity_threshold
]
rating_popular_product.head()

Unnamed: 0,userId,productId,rating,TotalRatingCount
183,A1BKC6B7YHYTVV,972683275,4.0,1051
184,AWVFSIB42LHKJ,972683275,4.0,1051
185,A36MQBNADRH8YY,972683275,5.0,1051
186,A3SRXMPLAEZ6DP,972683275,4.0,1051
187,A20XXTXWF2TCPY,972683275,5.0,1051


In [16]:
# (rows, columns) remaining after the popularity filter.
rating_popular_product.shape

(5374313, 4)

In [17]:
# Total number of non-null cells: per-column non-null counts added together across all columns.
rating_popular_product.count().sum()

21497252

In [18]:
# Alias, not a copy: `data` and `rating_popular_product` refer to the same DataFrame object.
data = rating_popular_product

In [19]:
# Second name for the same DataFrame object (plain assignment does not copy).
data1 = data

# Popularity Based Recommender model

In [20]:
# Preview the filtered ratings the popularity-based model works from.
data.head()

Unnamed: 0,userId,productId,rating,TotalRatingCount
183,A1BKC6B7YHYTVV,972683275,4.0,1051
184,AWVFSIB42LHKJ,972683275,4.0,1051
185,A36MQBNADRH8YY,972683275,5.0,1051
186,A3SRXMPLAEZ6DP,972683275,4.0,1051
187,A20XXTXWF2TCPY,972683275,5.0,1051


In [21]:
# Another reference to the same DataFrame as `data` (no copy is made).
user_colab_data = data

In [22]:
# Mean rating per product (first five products in sorted productId order).
data.groupby('productId')['rating'].mean().head()

productId
0972683275   4.471
1400501466   3.560
1400501520   4.244
1400501776   3.885
1400532620   3.684
Name: rating, dtype: float64

In [23]:
# Five products with the highest mean rating.
data.groupby('productId')['rating'].mean().sort_values(ascending=False).head()

productId
B002E6R7NG   4.980
B004I763AW   4.967
B003J9QQWU   4.964
B0043ZLFXE   4.956
B000TMFYBO   4.953
Name: rating, dtype: float64

In [24]:
# Five products with the most ratings.
data.groupby('productId')['rating'].count().sort_values(ascending=False).head()

productId
B0074BW614    18244
B00DR0PDNE    16454
B007WTAJTO    14172
B0019EHU8G    12285
B006GWO5WK    12226
Name: rating, dtype: int64

In [25]:
# Mean rating per product, kept as a one-column DataFrame indexed by productId.
ratings_mean_count = data.groupby('productId')[['rating']].mean()

In [26]:
# Add the number of ratings per product next to the mean rating (aligned on productId).
ratings_mean_count['rating_counts'] = data.groupby('productId')['rating'].count()

In [27]:
# Adds up the first five rows of each column (mean ratings and rating counts).
# NOTE(review): a sum of per-product mean ratings has no obvious meaning — possibly `.head()` alone was intended; confirm.
ratings_mean_count.head().sum()

rating            19.844
rating_counts   1693.000
dtype: float64

Popularity-based recommended products

In [28]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
import surprise
from surprise import Dataset, Reader
from sklearn.model_selection import train_test_split
from collections import defaultdict
from surprise import KNNWithMeans
from surprise import SVD, SVDpp
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

Split the data randomly into a train and test dataset

In [29]:
# Hold out 30% of the per-product summary rows (mean rating, rating count) with a fixed seed.
# Each row here is a product, not an individual rating.
# `train_test_split` is scikit-learn's: it is the only one imported in the cells shown in this notebook.
trainset, testset = train_test_split(ratings_mean_count, test_size = 0.30, random_state=0)

In [30]:
# Preview of the training portion of the per-product summary.
trainset.head()

Unnamed: 0_level_0,rating,rating_counts
productId,Unnamed: 1_level_1,Unnamed: 2_level_1
B0081YPX22,3.574,54
B00CE58ZYC,4.179,56
B000NK3H4S,4.145,602
B000SB9K5W,4.68,125
B00B3R4W62,4.136,59


In [31]:
# (products, columns) in the training portion.
trainset.shape

(18358, 2)

# Building Collaborative Filtering model

In [37]:
# Stricter cut-off for the collaborative-filtering subset. This rebinds
# `popularity_threshold`, which was 50 for the popularity-based subset.
popularity_threshold = 2000
rating_colab_product = rating_with_totalRatingCount.loc[
    lambda frame: frame['TotalRatingCount'] >= popularity_threshold
]
rating_colab_product.head()

Unnamed: 0,userId,productId,rating,TotalRatingCount
22020,A2YDH4R73MEDIG,B00001P4ZH,5.0,2075
22021,A34MP4RZMM3JMO,B00001P4ZH,5.0,2075
22022,A239H0QWSL2825,B00001P4ZH,2.0,2075
22023,A3LVU7249E0VWJ,B00001P4ZH,5.0,2075
22024,A3BL7PP815B7F4,B00001P4ZH,5.0,2075


In [38]:
# Last rows of the collaborative-filtering subset.
rating_colab_product.tail()

Unnamed: 0,userId,productId,rating,TotalRatingCount
7721034,AC3AJCO52WW1X,B00GTGETFG,5.0,2667
7721035,A308R0EERX4LWC,B00GTGETFG,5.0,2667
7721036,A2P0IZN1IEYE73,B00GTGETFG,5.0,2667
7721037,A2MDVFESDZCKKJ,B00GTGETFG,5.0,2667
7721038,A25QFY7BTXP68S,B00GTGETFG,4.0,2667


In [39]:
# First rows of the collaborative-filtering subset.
rating_colab_product.head()

Unnamed: 0,userId,productId,rating,TotalRatingCount
22020,A2YDH4R73MEDIG,B00001P4ZH,5.0,2075
22021,A34MP4RZMM3JMO,B00001P4ZH,5.0,2075
22022,A239H0QWSL2825,B00001P4ZH,2.0,2075
22023,A3LVU7249E0VWJ,B00001P4ZH,5.0,2075
22024,A3BL7PP815B7F4,B00001P4ZH,5.0,2075


In [40]:
# (rows, columns) of the collaborative-filtering subset.
rating_colab_product.shape

(714930, 4)

# Nearest Neighbor item based

In [41]:
# Build the item-user matrix: one row per product, one column per user.
# Each cell holds that user's rating of the product (pivot_table averages if a
# pair occurs more than once); pairs with no rating are filled with 0.
colab_filter_df = (
    rating_colab_product
    .pivot_table(values='rating', index='productId', columns='userId')
    .fillna(0)
)
colab_filter_df.head()



userId,A00045341JXVKNK93M6JE,A00062283LKXEZFY9NQ8B,A00067902UUFQLSW80IS2,A000715434M800HLCENK9,A0007626SITFHDTFIVMP,A000798037BDEA3BK3X2X,A0009060FA8P413511WS,A00090962SSLQRASQFP1O,A0009478CBXKUCALUC7U,A0009878M2RGMMHGJH39,...,AZZX63RW29X2S,AZZXC3Z8FMMPM,AZZXCBY0HV6VZ,AZZXMJB7Z92SD,AZZY4W8E5AX2K,AZZYFFDBSSBJC,AZZYFWIX86177,AZZYKX2KZ0Q82,AZZYW4YOE1B6E,AZZZRS1YZ8HVP
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00001P4ZH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00004ZCJE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00007E7JU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00007M1TZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0001FTVEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# The item-user matrix is almost entirely zeros, so store it in CSR form and
# fit on that instead of the dense DataFrame. Previously the sparse copy was
# built but never used. Row order matches `colab_filter_df.index`, so neighbour
# positions returned by the model still map back to product IDs.
movie_features_df_matrix = csr_matrix(colab_filter_df.values)

# Brute-force nearest-neighbour search over product rating vectors using
# cosine distance.
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movie_features_df_matrix)



NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [43]:
# (products, users) in the item-user matrix.
colab_filter_df.shape

(186, 612337)

In [44]:
# Pick the product to query with a seeded generator so that a fresh
# "Restart & Run All" selects the same product every time (the previous
# unseeded np.random.choice gave a different product on each run).
query_index = np.random.default_rng(42).integers(colab_filter_df.shape[0])
print(query_index)
# Ask for 6 neighbours: the reporting loop further down uses position 0 only
# for the header line, which leaves 5 recommendations to print.
distances, indices = model_knn.kneighbors(colab_filter_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)

97


In [45]:
# Preview of the item-user matrix (rows are product IDs, columns are user IDs).
colab_filter_df.head()

userId,A00045341JXVKNK93M6JE,A00062283LKXEZFY9NQ8B,A00067902UUFQLSW80IS2,A000715434M800HLCENK9,A0007626SITFHDTFIVMP,A000798037BDEA3BK3X2X,A0009060FA8P413511WS,A00090962SSLQRASQFP1O,A0009478CBXKUCALUC7U,A0009878M2RGMMHGJH39,...,AZZX63RW29X2S,AZZXC3Z8FMMPM,AZZXCBY0HV6VZ,AZZXMJB7Z92SD,AZZY4W8E5AX2K,AZZYFFDBSSBJC,AZZYFWIX86177,AZZYKX2KZ0Q82,AZZYW4YOE1B6E,AZZZRS1YZ8HVP
productId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00001P4ZH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00004ZCJE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00007E7JU,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00007M1TZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B0001FTVEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Walk the neighbour list in rank order. Position 0 only triggers the header
# line naming the query product; every later position is printed as a
# recommendation with its cosine distance.
for rank, (neighbor_pos, neighbor_dist) in enumerate(zip(indices.flatten(), distances.flatten())):
    if rank == 0:
        print('Recommendations for {0}:\n'.format(colab_filter_df.index[query_index]))
        continue
    print('{0}: {1}, with distance of {2}:'.format(rank, colab_filter_df.index[neighbor_pos], neighbor_dist))

Recommendations for B0057OCDQS:

1: B000LRMS66, with distance of 0.9437867667987533:
2: B007WTAJTO, with distance of 0.9952349391388757:
3: B002V88HFE, with distance of 0.9952462470741545:
4: B001V9KG0I, with distance of 0.9954085033360733:
5: B004T9RR6I, with distance of 0.9958947238689431:


The 5 products above are recommended for product Id: B0057OCDQS