# Collaborative filtering - memory based using Jackard Similarity

## Importing libraries and initial data checks

In [1]:
# import required libraries
import pandas as pd
import numpy as np

### About the data

This is a dataset related to over 2 Million customer reviews and ratings of Beauty related products sold on Amazon's website.

It contains:
- the unique UserId (Customer Identification),
- the product ASIN (Amazon's unique product identification code for each product),
- Ratings (ranging from 1-5 based on customer satisfaction) and
- the Timestamp of the rating (in UNIX time)

In [2]:
# raed the dataset
df = pd.read_csv('/Users/admin/Desktop/CMPE255-TermProject/ratings_Beauty.csv')
df.shape

(2023070, 4)

In [3]:
# check the first 5 rows
df.head()


Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


Check if there are any duplicate values present

In [4]:
duplicates = df.duplicated(["UserId","ProductId", "Rating", "Timestamp"]).sum()
print(' Duplicate records: ',duplicates)

 Duplicate records:  0


See the number of unique values present

In [5]:
print('unique users:',len(df.UserId.unique()))
print('unique products:',len(df.ProductId.unique()))
print("total ratings: ",df.shape[0])

unique users: 1210271
unique products: 249274
total ratings:  2023070


Check for null values

In [6]:
df.isnull().any()

UserId       False
ProductId    False
Rating       False
Timestamp    False
dtype: bool

Number of rated products per user

In [7]:
products_user= df.groupby(by = "UserId")["Rating"].count().sort_values(ascending =False)
products_user.head()

UserId
A3KEZLJ59C1JVH    389
A281NPSIMI1C2R    336
A3M174IC0VXOS2    326
A2V5R832QCSOMX    278
A3LJLRIZL38GG3    276
Name: Rating, dtype: int64

Number of ratings per product

In [8]:
product_rated = df.groupby(by = "ProductId")["Rating"].count().sort_values(ascending = False)
product_rated.head()

ProductId
B001MA0QY2    7533
B0009V1YR8    2869
B0043OYFKU    2477
B0000YUXI0    2143
B003V265QW    2088
Name: Rating, dtype: int64

Number of products rated by each user

In [9]:
rated_users=df.groupby("UserId")["ProductId"].count().sort_values(ascending=False)
print(rated_users)

UserId
A3KEZLJ59C1JVH    389
A281NPSIMI1C2R    336
A3M174IC0VXOS2    326
A2V5R832QCSOMX    278
A3LJLRIZL38GG3    276
                 ... 
A2G8M8PDTN09UZ      1
A2G8MAFIIQSJ42      1
A2G8MTKRE6MV52      1
A2G8MWBXG6JIY6      1
AZZZU2TD7Q3ET       1
Name: ProductId, Length: 1210271, dtype: int64


In [10]:
rated_products=df.groupby("ProductId")["UserId"].count().sort_values(ascending=False)
print(rated_products)

ProductId
B001MA0QY2    7533
B0009V1YR8    2869
B0043OYFKU    2477
B0000YUXI0    2143
B003V265QW    2088
              ... 
B004U81OBC       1
B004U7R0EI       1
B004U7Q2O2       1
B004U7NKRE       1
B00LU0LTOU       1
Name: UserId, Length: 249274, dtype: int64


In [11]:
print('Number of products with minimum of 5 reviews/ratings:',rated_products[rated_products>5].count())
print('Number of products with minimum of 4 reviews/ratings:',rated_products[rated_products>4].count())
print('Number of products with minimum of 3 reviews/ratings:',rated_products[rated_products>3].count())
print('Number of products with minimum of 2 reviews/ratings:',rated_products[rated_products>2].count())
print('Number of products with minimum of 1 reviews/ratings:',rated_products[rated_products>1].count())

Number of products with minimum of 5 reviews/ratings: 57722
Number of products with minimum of 4 reviews/ratings: 67345
Number of products with minimum of 3 reviews/ratings: 81247
Number of products with minimum of 2 reviews/ratings: 103581
Number of products with minimum of 1 reviews/ratings: 145790


In [12]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

### To convert alphanumeric data to numeric

In [13]:
dataset = df
dataset['user'] = label_encoder.fit_transform(df['UserId'])
dataset['product'] = label_encoder.fit_transform(df['ProductId'])
dataset.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,user,product
0,A39HTATAQ9V7YF,205616461,5.0,1369699200,725046,0
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200,814606,1
2,A1Z513UWSAAO0F,558925278,5.0,1404691200,313101,1
3,A1WMRR494NWEWV,733001998,4.0,1382572800,291075,2
4,A3IAAVS479H7M7,737104473,1.0,1274227200,802842,3


In [14]:
# average rating given by each user
average_rating = dataset.groupby(by="user", as_index=False)['Rating'].mean()
print("Average rating given by users: \n",average_rating.head())
print("----------------------------------------------------------\n")


# let's merge it with the dataset as we will be using that later
dataset = pd.merge(dataset, average_rating, on="user")
print("Modified dataset: \n", dataset.head())
print("----------------------------------------------------------\n")

# renaming columns
dataset = dataset.rename(columns={"Rating_x": "real_rating", "Rating_y": "average_rating"})
print("Dataset: \n", dataset.head())
print("----------------------------------------------------------\n")

Average rating given by users: 
    user  Rating
0     0     5.0
1     1     5.0
2     2     3.0
3     3     5.0
4     4     5.0
----------------------------------------------------------

Modified dataset: 
            UserId   ProductId  Rating_x   Timestamp    user  product  Rating_y
0  A39HTATAQ9V7YF  0205616461       5.0  1369699200  725046        0      4.25
1  A39HTATAQ9V7YF  B002OVV7F0       3.0  1369699200  725046    81854      4.25
2  A39HTATAQ9V7YF  B0031IH5FQ       5.0  1369699200  725046    89013      4.25
3  A39HTATAQ9V7YF  B006GQPZ8E       4.0  1369699200  725046   154092      4.25
4  A3JM6GV9MNOF9X  0558925278       3.0  1355443200  814606        1      3.50
----------------------------------------------------------

Dataset: 
            UserId   ProductId  real_rating   Timestamp    user  product  \
0  A39HTATAQ9V7YF  0205616461          5.0  1369699200  725046        0   
1  A39HTATAQ9V7YF  B002OVV7F0          3.0  1369699200  725046    81854   
2  A39HTATAQ9V7YF  B0

Certain users tend to give higher ratings while others tend to gibve lower ratings. To negate this bias, we normalise the ratings given by the users.

In [15]:
dataset['normalized_rating'] = dataset['real_rating'] - dataset['average_rating']
print("Data with adjusted rating: \n", dataset.head())

Data with adjusted rating: 
            UserId   ProductId  real_rating   Timestamp    user  product  \
0  A39HTATAQ9V7YF  0205616461          5.0  1369699200  725046        0   
1  A39HTATAQ9V7YF  B002OVV7F0          3.0  1369699200  725046    81854   
2  A39HTATAQ9V7YF  B0031IH5FQ          5.0  1369699200  725046    89013   
3  A39HTATAQ9V7YF  B006GQPZ8E          4.0  1369699200  725046   154092   
4  A3JM6GV9MNOF9X  0558925278          3.0  1355443200  814606        1   

   average_rating  normalized_rating  
0            4.25               0.75  
1            4.25              -1.25  
2            4.25               0.75  
3            4.25              -0.25  
4            3.50              -0.50  


# Jackard Similarity

Comparing two sets of data and see how similar they are. The values ranges from 0 to 1. The closer to 1 the better.

## Filter based on number of ratings available

In [16]:
rating_of_product = dataset.groupby('product')['real_rating'].count() # apply groupby 
ratings_of_products_df = pd.DataFrame(rating_of_product)
print("Real ratings:\n",ratings_of_products_df.head()) # check for real rating for products

Real ratings:
          real_rating
product             
0                  1
1                  2
2                  1
3                  1
4                  1


In [17]:
filtered_ratings_per_product = ratings_of_products_df[ratings_of_products_df.real_rating >= 200]
print(filtered_ratings_per_product.head())
print(filtered_ratings_per_product.shape)

         real_rating
product             
704              558
719              377
754              288
834              412
843              313
(934, 1)


In [18]:
# build a list of products to keep
popular_products = filtered_ratings_per_product.index.tolist()
print("Popular product count which have ratings over average rating count: ",len(popular_products))
print("--------------------------------------------------------------------------------")

filtered_ratings_data = dataset[dataset["product"].isin(popular_products)]
print("Filtered rated product in the dataset: \n",filtered_ratings_data.head())
print("---------------------------------------------------------------------------------")

print("The size of dataset has changed from ", len(dataset), " to ", len(filtered_ratings_data))
print("---------------------------------------------------------------------------------")

Popular product count which have ratings over average rating count:  934
--------------------------------------------------------------------------------
Filtered rated product in the dataset: 
             UserId   ProductId  real_rating   Timestamp     user  product  \
1   A39HTATAQ9V7YF  B002OVV7F0          3.0  1369699200   725046    81854   
18   AKJHHD5VEH7VG  B0000UTUVU          5.0  1232323200  1073169     2237   
20   AKJHHD5VEH7VG  B000F8HWXU          5.0  1379721600  1073169    16510   
45   AKJHHD5VEH7VG  B001LF4I8I          4.0  1232841600  1073169    65074   
47   AKJHHD5VEH7VG  B001OMI93S          5.0  1236643200  1073169    67333   

    average_rating  normalized_rating  
1         4.250000          -1.250000  
18        4.222222           0.777778  
20        4.222222           0.777778  
45        4.222222          -0.222222  
47        4.222222           0.777778  
---------------------------------------------------------------------------------
The size of dataset 

## Creating the User-item matrix

In [19]:
jac_similarity = pd.pivot_table(filtered_ratings_data,values='real_rating',index='UserId',columns='product')
jac_similarity = jac_similarity.fillna(0)

In [20]:
print(jac_similarity.shape)
jac_similarity.head()

(306165, 934)


product,704,719,754,834,843,858,861,873,944,981,...,241604,242018,242048,243416,244376,244448,245600,247603,249109,249211
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0010876CNE3ILIM9HV0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0011102257KBXODKL24I,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00120381FL204MYH7G3B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00126503SUWI86KZBMIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A001573229XK5T8PI0OKA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
import operator

In [22]:
selecting_users = list(jac_similarity.index)
selecting_users = selecting_users[:10000]

In [23]:
selected_mat = jac_similarity.loc[selecting_users]
selected_mat.head()

product,704,719,754,834,843,858,861,873,944,981,...,241604,242018,242048,243416,244376,244448,245600,247603,249109,249211
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0010876CNE3ILIM9HV0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0011102257KBXODKL24I,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00120381FL204MYH7G3B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00126503SUWI86KZBMIN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A001573229XK5T8PI0OKA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
selecting_product = selected_mat.columns
mu_matrix = np.array(selected_mat.values, dtype=int)