# Collaborative filtering - memory based using cosine distance and kNN

Recommender systems are an integral part of many online systems. From e-commerce to online streaming platforms.
Recommender systems employ the past purchase patters on it's user to predict which other products they may in interested in and likey to purchase. Recommending the right products gives a significat advantage to the business. A mojor portion of the revenue is generated through recommendations.


## Importing required libraries and EDA

In [1]:
import pandas as pd
import numpy as np

### Features of data
This is a dataset with around 2 Million customer reviews and ratings of Beauty related products sold on Amazon's website.

Dataset contains:
- the unique UserId (Customer Identification),
- the product ASIN (Amazon's unique product identification code for each product),
- Ratings (ranging from 1-5 based on customer reviews) and
- the Timestamp of the rating (in UNIX time)

In [2]:
# Read the dataset
df = pd.read_csv('Beauty_ratings.csv')
df.shape

(231619, 4)

In [3]:
df.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


In [4]:
# Duplicates examination
duplicates=df.duplicated(['UserId','ProductId','Rating','Timestamp']).sum()
print("The number of duplicates: ", duplicates)

The number of duplicates:  0


In [5]:
# Null values examination
df.isnull().any()

UserId       False
ProductId    False
Rating       False
Timestamp    False
dtype: bool

In [6]:
# Number of ratings of each product
rated_product=df.groupby(by = 'ProductId')['Rating'].count().sort_values()
rated_product

ProductId
0205616461       1
B0006LCRKY       1
B0006L5IP0       1
B0006L4ZTK       1
B0006L3BHW       1
              ... 
B0002JKPA4    1046
B00021DVCQ    1051
B000142FVW    1558
B0000YUXI0    2143
B0009V1YR8    2869
Name: Rating, Length: 15833, dtype: int64

In [7]:
# Number of ratings given by each user
rated_product=df.groupby(by = 'UserId')['Rating'].count().sort_values(ascending=False)
rated_product

UserId
A3M174IC0VXOS2    126
A281NPSIMI1C2R    121
A1RRMZKOMZ2M7J     82
A3GU3CXHONZJQ6     47
AKMEY1BSHSDG7      42
                 ... 
A2BNVCO9CFZQTP      1
A2BNW78Q8GAL9T      1
A2BNW8FC87QKUB      1
A2BNWLRJ99G8YP      1
AZZZLM1E5JJ8C       1
Name: Rating, Length: 202484, dtype: int64

In [8]:
print("Number of products with minimum of 5 ratings: ", rated_product[rated_product>5].count())
print('Number of products with minimum of 4 ratings:', rated_product[rated_product>4].count())
print('Number of products with minimum of 3 ratings:', rated_product[rated_product>3].count())
print('Number of products with minimum of 2 ratings:', rated_product[rated_product>2].count())
print('Number of products with minimum of 1 ratings:', rated_product[rated_product>1].count())

Number of products with minimum of 5 ratings:  527
Number of products with minimum of 4 ratings: 893
Number of products with minimum of 3 ratings: 1755
Number of products with minimum of 2 ratings: 4776
Number of products with minimum of 1 ratings: 19457


## Data visualization

In [10]:
# Plot data
import plotly.graph_objects as go
index = ['Total size of records', "Number of unique users","Number of unique products"]
values =[len(df),len(df['UserId'].unique()),len(df['ProductId'].unique())]

plot = go.Figure([go.Bar(x=index, y=values,textposition='auto')])
plot.update_layout(title_text='Number of Users and Products w.r.to Total size of Data',
                    xaxis_title="Records",
                    yaxis_title="Total number of Records")

plot.show()


### Ratings given by users

In [15]:
print("Rate of raings: ", df['Rating'].value_counts())
print(list(df['Rating'].value_counts()))

values = list(df['Rating'].value_counts())

plot = go.Figure([go.Bar(x = df['Rating'].value_counts().index, y = values,textposition='auto')])

plot.update_layout(title_text='Ratings given by user',
                    xaxis_title="Rating",
                    yaxis_title="Total number of Ratings")

plot.show()



Rate of raings:  5.0    149202
4.0     31446
1.0     22028
3.0     16935
2.0     12008
Name: Rating, dtype: int64
[149202, 31446, 22028, 16935, 12008]


### The most popular product 

In [19]:
print("Frequent rated product: ", df['ProductId'].value_counts().nlargest(5))

values=list(df['ProductId'].value_counts().nlargest(5))
plot=go.Figure([go.Bar(x= df['ProductId'].value_counts().nlargest(5).index, y=values)])
plot.update_layout(title_text='5 Most rated product',
                   xaxis_title='Ratings',
                   yaxis_title='Total number of ratings')
plot.show()

Frequent rated product:  B0009V1YR8    2869
B0000YUXI0    2143
B000142FVW    1558
B00021DVCQ    1051
B0002JKPA4    1046
Name: ProductId, dtype: int64


### Average ratings given by each user

In [20]:
ratings_per_user = df.groupby('UserId')['Rating'].count().sort_values(ascending=False)
print("Average rating given by each user: ",ratings_per_user.head())

plot = go.Figure(data=[go.Histogram(x=ratings_per_user)])
plot.show()


Average rating given by each user:  UserId
A3M174IC0VXOS2    126
A281NPSIMI1C2R    121
A1RRMZKOMZ2M7J     82
A3GU3CXHONZJQ6     47
AKMEY1BSHSDG7      42
Name: Rating, dtype: int64


In [23]:
ratings_per_product = df.groupby('ProductId')['Rating'].count().sort_values(ascending=False)
plot = go.Figure(data=[go.Histogram(x=ratings_per_product)])
plot.show(title_text='Number of ratings per product',
                    xaxis_title="Product",
                    yaxis_title="Number of ratings")

### Product with less ratings

In [28]:
rating_of_products=df.groupby('ProductId')['Rating'].count()
# Convert into data frame for analysis
number_of_ratings= pd.DataFrame(rating_of_products)
print("Products with ratings given by users: \n ", number_of_ratings.head())

less_than_ten = []
less_than_fifty_greater_than_ten = []
greater_than_fifty_less_than_hundred = []
greater_than_hundred = []
average_rating = []

for ratings in number_of_ratings['Rating']:
  if ratings<10:
    less_than_ten.append(ratings)
  if 10<ratings<50:
    less_than_fifty_greater_than_ten.append(ratings)
  if ratings > 50 and ratings <= 100:
      greater_than_fifty_less_than_hundred.append(ratings)
  if ratings > 100:
      greater_than_hundred.append(ratings)

average_rating.append(ratings)

print("Ratings less than 10: ", len(less_than_ten))
print("Ratings greater than 10 but less than 50: ", len(less_than_fifty_greater_than_ten))
print("Ratings greater than 50 but less than 100: ", len(greater_than_fifty_less_than_hundred))
print("Ratings greater than 100: ", len(greater_than_hundred))
print("Average number of products rated by users: ", np.mean(average_rating))



Products with ratings given by users: 
              Rating
ProductId         
0205616461       1
0558925278       2
0733001998       1
0737104473       1
0762451459       1
Ratings less than 10:  11996
Ratings greater than 10 but less than 50:  2623
Ratings greater than 50 but less than 100:  536
Ratings greater than 100:  417
Average number of products rated by users:  345.0


In [29]:
# Plot the result

x_values = ["Ratings_count_less_than_ten","Ratings_count_greater_than_ten_less_than_fifty",
           "Ratings_count_greater_than_fifty_less_than_hundred","Ratings_count_greater_than_hundred"]
y_values = [len(less_than_ten),len(less_than_fifty_greater_than_ten),len(greater_than_fifty_less_than_hundred),
            len(greater_than_hundred)]


plot = go.Figure([go.Bar(x = x_values, y = y_values, textposition='auto')])

plot.add_annotation(
        x=1,
        y=100000,
        xref="x",
        yref="y")

plot.update_layout(title_text='Ratings Count on Products',
                    xaxis_title="Ratings Range",
                    yaxis_title="Count of Rating")
plot.show()

### Convert string data into numeric (Encoder)

In [30]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

In [31]:
# Convert 'UserId' & 'ProductId'
dataset = df
dataset['user'] = label_encoder.fit_transform(df['UserId'])
dataset['product'] = label_encoder.fit_transform(df['ProductId'])
dataset.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp,user,product
0,A39HTATAQ9V7YF,205616461,5.0,1369699200,121458,0
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200,136609,1
2,A1Z513UWSAAO0F,558925278,5.0,1404691200,52359,1
3,A1WMRR494NWEWV,733001998,4.0,1382572800,48656,2
4,A3IAAVS479H7M7,737104473,1.0,1274227200,134621,3


In [32]:
# average rating given by each user
average_rating = dataset.groupby(by="user", as_index=False)['Rating'].mean()
print("Average rating given by users: \n",average_rating.head())
print("----------------------------------------------------------\n")


# Merge it with the dataset as it will be using later
dataset = pd.merge(dataset, average_rating, on="user")
print("Modified dataset: \n", dataset.head())
print("----------------------------------------------------------\n")

# Rename columns
dataset = dataset.rename(columns={"Rating_x": "real_rating", "Rating_y": "average_rating"})
print("Dataset: \n", dataset.head())
print("----------------------------------------------------------\n")


Average rating given by users: 
    user  Rating
0     0     4.0
1     1     4.0
2     2     5.0
3     3     5.0
4     4     5.0
----------------------------------------------------------

Modified dataset: 
            UserId   ProductId  Rating_x   Timestamp    user  product  Rating_y
0  A39HTATAQ9V7YF  0205616461       5.0  1369699200  121458        0       5.0
1  A3JM6GV9MNOF9X  0558925278       3.0  1355443200  136609        1       3.0
2  A1Z513UWSAAO0F  0558925278       5.0  1404691200   52359        1       5.0
3  A1WMRR494NWEWV  0733001998       4.0  1382572800   48656        2       4.0
4  A3IAAVS479H7M7  0737104473       1.0  1274227200  134621        3       1.0
----------------------------------------------------------

Dataset: 
            UserId   ProductId  real_rating   Timestamp    user  product  \
0  A39HTATAQ9V7YF  0205616461          5.0  1369699200  121458        0   
1  A3JM6GV9MNOF9X  0558925278          3.0  1355443200  136609        1   
2  A1Z513UWSAAO0F  05

Some of users may tend to give higher ratings while others tend to gibve lower ratings. To adjust this bias, standardize the ratings given by the users.

In [33]:
dataset['standardized_rating'] = dataset['real_rating'] - dataset['average_rating']
print("Data with adjusted rating: \n", dataset.head())


Data with adjusted rating: 
            UserId   ProductId  real_rating   Timestamp    user  product  \
0  A39HTATAQ9V7YF  0205616461          5.0  1369699200  121458        0   
1  A3JM6GV9MNOF9X  0558925278          3.0  1355443200  136609        1   
2  A1Z513UWSAAO0F  0558925278          5.0  1404691200   52359        1   
3  A1WMRR494NWEWV  0733001998          4.0  1382572800   48656        2   
4  A3IAAVS479H7M7  0737104473          1.0  1274227200  134621        3   

   average_rating  standardized_rating  
0             5.0                  0.0  
1             3.0                  0.0  
2             5.0                  0.0  
3             4.0                  0.0  
4             1.0                  0.0  


# Cosine Similarity

A distance based metric - cosine similarity to identify similar users. It is important to remove products that have very low number of ratings in the beginning.

## Filter some products with low amount of ratings

In [34]:
rating_of_product = dataset.groupby('product')['real_rating'].count() 
ratings_of_products_df = pd.DataFrame(rating_of_product)
 # check for real rating for products
print("Real ratings:\n",ratings_of_products_df.head())


Real ratings:
          real_rating
product             
0                  1
1                  2
2                  1
3                  1
4                  1


In [35]:
# Set 200 as threshold
filtered_ratings_per_product = ratings_of_products_df[ratings_of_products_df.real_rating >= 200]
print(filtered_ratings_per_product.head())
print(filtered_ratings_per_product.shape)

         real_rating
product             
704              558
719              377
754              288
834              412
843              313
(158, 1)


In [36]:
# Build a list of products to keep
popular_products = filtered_ratings_per_product.index.tolist()
print("Popular product count which have ratings more than 200: ",len(popular_products))
print("--------------------------------------------------------------------------------")

filtered_ratings_data = dataset[dataset["product"].isin(popular_products)]
print("Filtered rated product in the dataset: \n",filtered_ratings_data.head())
print("---------------------------------------------------------------------------------")

print("The size of dataset has changed from ", len(dataset), " to ", len(filtered_ratings_data))
print("---------------------------------------------------------------------------------")

Popular product count which have ratings more than 200:  158
--------------------------------------------------------------------------------
Filtered rated product in the dataset: 
              UserId   ProductId  real_rating   Timestamp    user  product  \
8     AKJHHD5VEH7VG  B0000UTUVU          5.0  1232323200  179764     2237   
58   A274NIJWOQWE30  B0009OAGXI          5.0  1404345600   64136    10081   
59   A274NIJWOQWE30  B000C1ZDTU          5.0  1389225600   64136    13736   
126  A1QWS6V26JZ5OX  B00027DMI8          1.0  1359072000   40156     5105   
138  A39QH03VD9U048  B000052YM7          5.0  1373155200  121842      834   

     average_rating  standardized_rating  
8          5.000000             0.000000  
58         5.000000             0.000000  
59         5.000000             0.000000  
126        3.000000            -2.000000  
138        3.666667             1.333333  
---------------------------------------------------------------------------------
The size of da

## Create user-item matrix

In [38]:
similarity = pd.pivot_table(filtered_ratings_data,values='standardized_rating',index='UserId',columns='product')
similarity = similarity.fillna(0)
print("Updated Dataset: \n",similarity.head())

Updated Dataset: 
 product                704    719    754    834    843    858    861    873    \
UserId                                                                          
A00222842T0ZYI86C9LHU    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
A00275441WYR3489IKNAB    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
A00700212KB3K0MVESPIY    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
A0072717335KA6520NEMI    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
A00995931BE16NG4F52QC    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   

product                944    981    ...  14274  14792  15234  15243  15316  \
UserId                               ...                                      
A00222842T0ZYI86C9LHU    0.0    0.0  ...    0.0    0.0    0.0    0.0    0.0   
A00275441WYR3489IKNAB    0.0    0.0  ...    0.0    0.0    0.0    0.0    0.0   
A00700212KB3K0MVESPIY    0.0    0.0  ...    0.0    0.0    0.0    0.0    0.0   
A0072717335KA6520N

Above result shows this is a sparse matrix

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
import operator

In [41]:
selecting_users = list(similarity.index)
selecting_users = selecting_users[:100]
print("Users can be selected from the below list:\n",selecting_users)

Users can be selected from the below list:
 ['A00222842T0ZYI86C9LHU', 'A00275441WYR3489IKNAB', 'A00700212KB3K0MVESPIY', 'A0072717335KA6520NEMI', 'A00995931BE16NG4F52QC', 'A01362343O2D2DRZLC42E', 'A013805820H8FMU1TKEK4', 'A0140494QSPWAFGBI083', 'A01456542S5QPYUEGJXR8', 'A02157553CY714JSIXQMJ', 'A02227212BVIA0DTOZZTZ', 'A02247533O5ZV8QY08RNV', 'A0235417OVQ79DHUZH39', 'A0265436JMR91F9LHBXT', 'A0266076X6KPZ6CCHGVS', 'A02705761H0YNK5IS0LUC', 'A03210862P03QLU6UW053', 'A03454732N8VEYJAMGTTH', 'A0350213217E5G6OX4DUH', 'A03666331WS5WUZMM0AUD', 'A03900532XT2E5T10WV0U', 'A04106502YY56MOA04DE2', 'A04115122DP2N2XP2RK9F', 'A042274212BJJVOBS4Q85', 'A044043016M3YS4XB0CQV', 'A0473259F6GQNBD88IYN', 'A05080451TVS7AIOS7X51', 'A05198112VVJ1HUQ3SWU5', 'A05263441BXXNFL1C9JUK', 'A05306962T0DL4FS2RA7L', 'A05434242PMPAAU2UN4JG', 'A05440783CO15ST0IL21B', 'A057523234N4Y49RPS4G2', 'A058255625HJL78EIFBA7', 'A0594505INUAD8R4XWQX', 'A0609045JCZE9D27DY91', 'A06263011FDVMGFVAALWO', 'A06508832VWTDFS1KLFKW', 'A06520532QB

In [43]:
def getting_top_5_similar_users(user_id, similarity_table, k=5):
    '''

    user_id: the user we want to recommend
    similarity_table: the user-item matrix
    Similar users to the user_id.
    '''

    # create a dataframe of the current user
    user = similarity_table[similarity_table.index == user_id]
    # Add another dataframe of all other users
    other_users = similarity_table[similarity_table.index != user_id]
    # Calculate cosine similarity between user and each other user
    similarities = cosine_similarity(user, other_users)[0].tolist()

    indices = other_users.index.tolist()
    index_similarity = dict(zip(indices, similarities))

    # Sort by similarity
    index_similarity_sorted = sorted(index_similarity.items(), key=operator.itemgetter(1))
    index_similarity_sorted.reverse()

    # Retrieve users id
    top_users_similarities = index_similarity_sorted[:k]
    users = []
    for user in top_users_similarities:
        users.append(user[0])

    return users


In [46]:
user_id = "A00275441WYR3489IKNAB"
similar_users = getting_top_5_similar_users(user_id, similarity)


In [47]:
print("Top 5 similar users for user_id:",user_id," are: ",similar_users)

Top 5 similar users for user_id: A00275441WYR3489IKNAB  are:  ['AZZZLM1E5JJ8C', 'AZZVCBG5G4EV8', 'AZZTJQ7CQZUD8', 'AZZSAMMJPJKJ1', 'AZZS7W015QH2L']


## Products that can be recommended based on these highly similar users

In [49]:
def getting_top_5_recommendations_based_on_users(user_id, similar_users, similarity_table, top_recommendations=5):
    '''

    user_id: user for whom we want to recommend
    similar_users: top 5 similar users
    similarity_table: the user-item matrix
    top_recommendations: no. of recommendations
    return: top_5_recommendations
    '''

    # Taking the data for similar users
    similar_user_products = dataset[dataset.UserId.isin(similar_users)]
#     print("Products used by other users: \n", similar_user_products.head())
#     print("---------------------------------------------------------------------------------")

    # Getting all similar users
    similar_users = similarity_table[similarity_table.index.isin(similar_users)]

    # Obtaining mean ratings given by users
    similar_users = similar_users.mean(axis=0)


    similar_users_df = pd.DataFrame(similar_users, columns=['mean'])

    # For the current user data
    user_df = similarity_table[similarity_table.index == user_id]


    # Transpose it so its easier to filter
    user_df_transposed = user_df.transpose()


    # Rename the column as 'rating'
    user_df_transposed.columns = ['rating']

    # rows with a 0 value.
    user_df_transposed = user_df_transposed[user_df_transposed['rating'] == 0]


    # Generate a list of products that the users have not used yet
    products_not_rated = user_df_transposed.index.tolist()
#     print("Products not used by target user: ", products_not_rated)
#     print("-------------------------------------------------------------------")

    # Filter avg ratings of similar users for only products the current user has not rated
    similar_users_df_filtered = similar_users_df[similar_users_df.index.isin(products_not_rated)]

    # Order the dataframe
    similar_users_df_ordered = similar_users_df_filtered.sort_values(by=['mean'], ascending=False)



    # Take the top products
    top_products = similar_users_df_ordered.head(top_recommendations)
    top_products_indices = top_products.index.tolist()


    return top_products_indices



In [50]:
print("Top 5 productID recommended are: ", getting_top_5_recommendations_based_on_users(user_id, similar_users, similarity))


Top 5 productID recommended are:  [1727, 704, 10516, 9504, 9531]


In [51]:
filtered_ratings_data.head()

Unnamed: 0,UserId,ProductId,real_rating,Timestamp,user,product,average_rating,standardized_rating
8,AKJHHD5VEH7VG,B0000UTUVU,5.0,1232323200,179764,2237,5.0,0.0
58,A274NIJWOQWE30,B0009OAGXI,5.0,1404345600,64136,10081,5.0,0.0
59,A274NIJWOQWE30,B000C1ZDTU,5.0,1389225600,64136,13736,5.0,0.0
126,A1QWS6V26JZ5OX,B00027DMI8,1.0,1359072000,40156,5105,3.0,-2.0
138,A39QH03VD9U048,B000052YM7,5.0,1373155200,121842,834,3.666667,1.333333


In [53]:
# Take out the data from any users
filtered_ratings_data[filtered_ratings_data['UserId']=="AKJHHD5VEH7VG"]

Unnamed: 0,UserId,ProductId,real_rating,Timestamp,user,product,average_rating,standardized_rating
8,AKJHHD5VEH7VG,B0000UTUVU,5.0,1232323200,179764,2237,5.0,0.0


# Model deployment

In [58]:
def recommend_products_for_user(userId, similarity_matrix):
    similar_users = getting_top_5_similar_users(user_id, similarity_matrix)
    print("Top 5 similar users for user_id:",user_id," are: ",similar_users)
    product_list = getting_top_5_recommendations_based_on_users(user_id, similar_users, similarity)
    print("Top 5 productID recommended are: ", product_list)

In [59]:
recommend_products_for_user("A1QWS6V26JZ5OX", similarity)

Top 5 similar users for user_id: A00275441WYR3489IKNAB  are:  ['AZZZLM1E5JJ8C', 'AZZVCBG5G4EV8', 'AZZTJQ7CQZUD8', 'AZZSAMMJPJKJ1', 'AZZS7W015QH2L']
Top 5 productID recommended are:  [1727, 704, 10516, 9504, 9531]
