# Item based collaborative filtering using pearson correlation similarity.

In [1]:
# import required libraries
import pandas as pd
import numpy as np
import operator

In [2]:
# reading the dataset
df = pd.read_csv('ratings_Beauty.csv')
df

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,0205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,0558925278,3.0,1355443200
2,A1Z513UWSAAO0F,0558925278,5.0,1404691200
3,A1WMRR494NWEWV,0733001998,4.0,1382572800
4,A3IAAVS479H7M7,0737104473,1.0,1274227200
...,...,...,...,...
2023065,A3DEHKPFANB8VA,B00LORWRJA,5.0,1405296000
2023066,A3DEHKPFANB8VA,B00LOS7MEE,5.0,1405296000
2023067,AG9TJLJUN5OM3,B00LP2YB8E,5.0,1405382400
2023068,AYBIB14QOI9PC,B00LPVG6V0,5.0,1405555200


In [3]:
df.drop(['Timestamp'], axis=1, inplace=True)

In [4]:
#Dropping products with ratings less than 900. This is done to make the model more robust and also to reduce the number of dimensions.
rating_of_product = df.groupby('ProductId')['Rating'].count() 
ratings_of_products_df = pd.DataFrame(rating_of_product)
filtered_ratings_per_product = ratings_of_products_df[ratings_of_products_df.Rating >= 900]
filtered_ratings_per_product

Unnamed: 0_level_0,Rating
ProductId,Unnamed: 1_level_1
B0000YUXI0,2143
B000142FVW,1558
B00021DVCQ,1051
B0002JKPA4,1046
B0009V1YR8,2869
B000FS05VG,1589
B000L596FE,992
B000TKH6G2,1379
B000UVZU1S,1105
B000VPPUEA,1074


In [5]:
# Filtering dataset with records with products which have atleast 900 records.
popular_products = filtered_ratings_per_product.index.tolist()
print("Popular product count which have ratings over average rating count: ",len(popular_products))
filtered_ratings_data = df[df["ProductId"].isin(popular_products)]
print("The size of dataset has changed from ", len(df), " to ", len(filtered_ratings_data))

Popular product count which have ratings over average rating count:  41
The size of dataset has changed from  2023070  to  62991


In [6]:
filtered_ratings_data.shape

(62991, 3)

In [7]:
#Generating the utility matrix
util_mat = filtered_ratings_data.pivot_table(index = 'UserId', columns = 'ProductId', values = 'Rating')


In [8]:
util_mat

ProductId,B0000YUXI0,B000142FVW,B00021DVCQ,B0002JKPA4,B0009V1YR8,B000FS05VG,B000L596FE,B000TKH6G2,B000UVZU1S,B000VPPUEA,...,B0058E3XJI,B005BF1M10,B00639DLV2,B0069FDR96,B006IBM21K,B006L1DNWY,B007BLN17K,B007Q0WW0S,B008U12YV4,B008U1Q4DI
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00328401T70RFN4P1IT6,,,,,,,,,,,...,,,,,,,,,,
A00454102SR84NOYTI0JS,,,,,,,,,,,...,,,,,,,,,,
A00491723IYKW5UI74AEX,,,,,,,,,,,...,,,,,,,,,,
A00612582Z6ZU2SDMRQ07,,,,,,,,,,,...,,,,,5.0,,,,,
A00773851NXKGCZRY43PG,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZZFV6A16356H,,,,,,,,,,,...,,,,,,,,,,
AZZS69U1WDPXV,,,,,,,,,,,...,,,,,,,,,,
AZZSAMMJPJKJ1,,,,,,,,,,,...,,,,,,,,,,
AZZVZL4QEHEHO,,,,,,,,,,,...,,,,,,,,,,


In [9]:
item_util_matrix = util_mat.copy()
# We will fill the row wise NaN's with the corresponding user's mean ratings, so that we can carry out Pearson correlation.
# Here we assume avg ratings for the movie that is not rated.
item_util_matrix = item_util_matrix.apply(lambda row: row.fillna(row.mean()), axis=1)
item_util_matrix.head(5)

ProductId,B0000YUXI0,B000142FVW,B00021DVCQ,B0002JKPA4,B0009V1YR8,B000FS05VG,B000L596FE,B000TKH6G2,B000UVZU1S,B000VPPUEA,...,B0058E3XJI,B005BF1M10,B00639DLV2,B0069FDR96,B006IBM21K,B006L1DNWY,B007BLN17K,B007Q0WW0S,B008U12YV4,B008U1Q4DI
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00328401T70RFN4P1IT6,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
A00454102SR84NOYTI0JS,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
A00491723IYKW5UI74AEX,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
A00612582Z6ZU2SDMRQ07,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
A00773851NXKGCZRY43PG,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [10]:
#This shows every row has atleast one non-null value. 
item_util_matrix.isna().sum().sum()

0

In [11]:
item_util_matrix.corr()

ProductId,B0000YUXI0,B000142FVW,B00021DVCQ,B0002JKPA4,B0009V1YR8,B000FS05VG,B000L596FE,B000TKH6G2,B000UVZU1S,B000VPPUEA,...,B0058E3XJI,B005BF1M10,B00639DLV2,B0069FDR96,B006IBM21K,B006L1DNWY,B007BLN17K,B007Q0WW0S,B008U12YV4,B008U1Q4DI
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B0000YUXI0,1.0,0.999439,0.999706,0.999682,0.99961,0.999567,0.999777,0.999651,0.999524,0.999628,...,0.999817,0.99913,0.999784,0.998562,0.999772,0.998917,0.998498,0.999565,0.99968,0.999145
B000142FVW,0.999439,1.0,0.99937,0.999349,0.999211,0.999235,0.999468,0.999321,0.999126,0.999292,...,0.999487,0.99881,0.99945,0.998189,0.999434,0.998524,0.998163,0.999229,0.99936,0.998831
B00021DVCQ,0.999706,0.99937,1.0,0.999574,0.999496,0.999459,0.999734,0.999552,0.999458,0.999551,...,0.999751,0.999072,0.999696,0.998474,0.999705,0.998851,0.998431,0.999498,0.999627,0.9991
B0002JKPA4,0.999682,0.999349,0.999574,1.0,0.999517,0.999477,0.999711,0.999564,0.999408,0.999538,...,0.99973,0.999048,0.999694,0.998472,0.999679,0.998827,0.998405,0.999475,0.999604,0.999076
B0009V1YR8,0.99961,0.999211,0.999496,0.999517,1.0,0.999353,0.999628,0.999498,0.999363,0.999457,...,0.999658,0.998976,0.999622,0.998389,0.99961,0.998755,0.998335,0.999403,0.999532,0.999004
B000FS05VG,0.999567,0.999235,0.999459,0.999477,0.999353,1.0,0.999598,0.999424,0.999317,0.99942,...,0.999591,0.998937,0.999579,0.998348,0.999567,0.998712,0.998293,0.999357,0.999486,0.998937
B000L596FE,0.999777,0.999468,0.999734,0.999711,0.999628,0.999598,1.0,0.999684,0.99954,0.999657,...,0.999848,0.999164,0.999812,0.998588,0.9998,0.998941,0.998526,0.999593,0.999717,0.999192
B000TKH6G2,0.999651,0.999321,0.999552,0.999564,0.999498,0.999424,0.999684,1.0,0.999386,0.99951,...,0.999701,0.998996,0.999665,0.998443,0.999646,0.998797,0.998379,0.999443,0.999556,0.999048
B000UVZU1S,0.999524,0.999126,0.999458,0.999408,0.999363,0.999317,0.99954,0.999386,1.0,0.99934,...,0.999572,0.99888,0.999536,0.998288,0.999524,0.998626,0.998255,0.999273,0.999446,0.998908
B000VPPUEA,0.999628,0.999292,0.999551,0.999538,0.999457,0.99942,0.999657,0.99951,0.99934,1.0,...,0.999649,0.998996,0.999637,0.998418,0.999628,0.998773,0.998354,0.99941,0.999539,0.999011


In [12]:
item_corr_matrix = item_util_matrix.corr()

In [22]:
def get_top_5_similar_items(product_id,corr_matrix):
    input_item_corr = corr_matrix[product_id]
    input_item_corr = input_item_corr.sort_values(ascending=False)
    input_item_corr.dropna(inplace=True)
    items_similar = pd.DataFrame(data=input_item_corr.values, columns=['Correlation'],index = input_item_corr.index)
    items_similar = items_similar[1:]
    top_5_similar_items = items_similar.sort_values(ascending=False, by=['Correlation']).head(5)
    return top_5_similar_items

In [23]:
get_top_5_similar_items('B0000YUXI0',item_corr_matrix)

Unnamed: 0_level_0,Correlation
ProductId,Unnamed: 1_level_1
B003BQ6QXK,0.99982
B0058E3XJI,0.999817
B00639DLV2,0.999784
B001AO0WCG,0.99978
B000L596FE,0.999777
