# Recommendation engine using Amazon Beauty product ratings data to suggest items frequently bought together.

In [3]:
import pandas as pd
from google.colab import drive

# Mount your Google Drive
drive.mount('/content/drive')

# Specify the path to your CSV file
file_path = '/content/drive/My Drive/AmazonBeautyRatings/ratings_Beauty.csv'

df = pd.read_csv(file_path)

# Display basic info and first few rows
df.info(), df.head()

Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   UserId     object 
 1   ProductId  object 
 2   Rating     float64
 3   Timestamp  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 61.7+ MB


(None,
            UserId   ProductId  Rating   Timestamp
 0  A39HTATAQ9V7YF  0205616461     5.0  1369699200
 1  A3JM6GV9MNOF9X  0558925278     3.0  1355443200
 2  A1Z513UWSAAO0F  0558925278     5.0  1404691200
 3  A1WMRR494NWEWV  0733001998     4.0  1382572800
 4  A3IAAVS479H7M7  0737104473     1.0  1274227200)

**Data Cleaning**

Remove any duplicates of the user-product pairs

In [4]:

# Remove duplicate user-product pairs (keeping the latest rating if duplicates exist)
df_sorted = df.sort_values(by='Timestamp', ascending=False)
df_cleaned = df_sorted.drop_duplicates(subset=['UserId', 'ProductId'], keep='first')

# Check how many unique users and products are left
num_users = df_cleaned['UserId'].nunique()
num_products = df_cleaned['ProductId'].nunique()
df_cleaned.shape, num_users, num_products


((2023070, 4), 1210271, 249274)

**Build a co-occurrence matrix**


In [5]:
from collections import defaultdict
from itertools import combinations
import numpy as np
import pandas as pd

co_occurrence = defaultdict(lambda: defaultdict(int))

# Group by UserId and get the list of ProductIds per user
user_products = df_cleaned.groupby('UserId')['ProductId'].apply(list)

# Count co-occurrences
for products in user_products:
    unique_products = list(set(products))  # Remove duplicates in one user's basket
    for prod1, prod2 in combinations(unique_products, 2):
        co_occurrence[prod1][prod2] += 1
        co_occurrence[prod2][prod1] += 1  # symmetric

# Convert to a more usable structure: dictionary of top co-occurring products
recommendations = {}
for product, related_products in co_occurrence.items():
    sorted_related = sorted(related_products.items(), key=lambda x: x[1], reverse=True)
    recommendations[product] = sorted_related[:10]  # top 10 frequently co-bought products

# Display a sample product and its recommendations
sample_product = next(iter(recommendations))
sample_recommendations = recommendations[sample_product]
sample_product, sample_recommendations


('B0055MYJ0U',
 [('B006K9OQSC', 39),
  ('B004367X70', 17),
  ('B004WDV2XA', 16),
  ('B009T47YZ2', 8),
  ('B008FWTPL0', 8),
  ('B00912CL5K', 7),
  ('B008RVYJS8', 6),
  ('B009CS493U', 5),
  ('B0069FDR96', 5),
  ('B00CGKJ7QU', 5)])

**Recommendations**

In [6]:
# Define a function to get top-N related products for a given product ID
def get_recommendations(product_id, top_n=10):
    if product_id not in recommendations:
        return f"No recommendations found for Product ID: {product_id}"
    return recommendations[product_id][:top_n]



In [7]:

# Test the function with the earlier sample product
get_recommendations('B0047SBPSW')


[('B00A51LI1O', 2),
 ('B0030O3VRW', 2),
 ('B001C6H6F8', 2),
 ('B0032O52QS', 1),
 ('B00B406WDY', 1),
 ('B003JKA2CE', 1),
 ('B00EYSNWXG', 1),
 ('B008J2846I', 1),
 ('B00DHE53NK', 1),
 ('B007JT7AIK', 1)]

In [8]:

# Test the function with the earlier sample product
get_recommendations('A1WMRR494NWEWV')


'No recommendations found for Product ID: A1WMRR494NWEWV'