In [1]:
#importing required libraries
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import TruncatedSVD

In [2]:
#importing the dataset and removing null values
amazon_ratings_df = pd.read_csv('amazon_ratings.csv')
amazon_ratings_df = amazon_ratings_df.dropna()
amazon_ratings_df.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5,1369699200
1,A3JM6GV9MNOF9X,558925278,3,1355443200
2,A1Z513UWSAAO0F,558925278,5,1404691200
3,A1WMRR494NWEWV,733001998,4,1382572800
4,A3IAAVS479H7M7,737104473,1,1274227200


In [3]:
#shape of the dataset
amazon_ratings_df.shape

(1048575, 4)

In [4]:
amazon_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   UserId     1048575 non-null  object
 1   ProductId  1048575 non-null  object
 2   Rating     1048575 non-null  int64 
 3   Timestamp  1048575 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 40.0+ MB


In [5]:
#checking the most popular products in the dataset
popular_prod = pd.DataFrame(amazon_ratings_df.groupby('ProductId')['Rating'].count())
most_popular = popular_prod.sort_values('Rating', ascending=False)
most_popular.head(10)

Unnamed: 0_level_0,Rating
ProductId,Unnamed: 1_level_1
B001MA0QY2,7533
B0009V1YR8,2869
B0000YUXI0,2143
B000ZMBSPE,2041
B003BQ6QXK,1918
B00121UVU0,1838
B000FS05VG,1589
B000142FVW,1558
B001JKTTVQ,1468
B000TKH6G2,1379


In [6]:
#subset of the full dataset
amazon_ratings_subset_df = amazon_ratings_df.head(10000)

In [7]:
#sparce matrix, filling null values with 0
ratings_matrix = amazon_ratings_subset_df.pivot_table(values='Rating', index='UserId', columns='ProductId', fill_value=0)
ratings_matrix.head()

ProductId,1304139212,1304139220,130414089X,130414643X,1304146537,130414674X,1304168522,1304174778,1304174867,1304174905,...,B000052YPE,B000052YPF,B000052YPG,B000052YPH,B000052YPM,B000052YPU,B000052YPV,B000052YPY,B000052YQ0,B000052YQ2
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00205921JHJK5X9LNP42,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A024581134CV80ZBLIZTZ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A03056581JJIOL5FSKJY7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A03099101ZRK4K607JVHH,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0505229A7NSH3FRXRR4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#shape of the matrix
ratings_matrix.shape

(9697, 886)

In [9]:
#transposing the matrix
transpose = ratings_matrix.T
transpose.head()

UserId,A00205921JHJK5X9LNP42,A024581134CV80ZBLIZTZ,A03056581JJIOL5FSKJY7,A03099101ZRK4K607JVHH,A0505229A7NSH3FRXRR4,A05492663T95KW63BR75K,A059547920Q3LZVFHLPI3,A07410232KYRFR25CIUGJ,A082796624UNM47DSAI6K,A0864963DOAY7LXGS5I6,...,AZW1HXXYAC15B,AZWRTJPN7NXT,AZWTXHXZXFAYP,AZYQEFB9Y5N22,AZZHB6U54UDYW,AZZHJZP4GQPPZ,AZZNK89PXD006,AZZOFVMQC0BJG,AZZQXL8VDCFTV,AZZTJQ7CQZUD8
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1304139212,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1304139220,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130414089X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130414643X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1304146537,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
transpose.shape

(886, 9697)

In [11]:
#fitting the Support Vector model 
SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(transpose)
decomposed_matrix.shape

(886, 10)

In [12]:
#correlation matrix
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(886, 886)

In [13]:
#assuming the customer buys a random product
transpose.index[94]

'6041134511'

In [14]:
#checking the product id of the chosen product, it should match
i = "6041134511"

product_names = list(transpose.index)
product_ID = product_names.index(i)
product_ID

94

In [15]:
#product id from correlation matrix
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

(886,)

In [16]:
#recommendation model of the product greater than 90%
Recommended = list(transpose.index[correlation_product_ID > 0.90])

Recommended[0:9]

['130414089X',
 '1412759676',
 '360211600X',
 '4057553908',
 '5357955786',
 '6041134473',
 '604113449X',
 '6041134511',
 '6175005589']