In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! ls drive/MyDrive/ShoppingPulse/datasets/

interactions_test_data.parquet	    interactions_validation_data.parquet  raw
interactions_training_data.parquet  processed


In [None]:
# reading training, validation and test data
train_df = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_training_data.parquet")

In [None]:
val_df = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_validation_data1.parquet")
test_df = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_test_data1.parquet")

In [None]:
train_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,date_time,category
0,AHITBJSS7KYUBVZPX7M2WJCOIVKQ,B0C2Z1WDGW,5.0,1560014006319,2019-06-08 17:13:26.319,Automotive
1,AHITBJSS7KYUBVZPX7M2WJCOIVKQ,B0719J5ZNY,5.0,1574097084236,2019-11-18 17:11:24.236,Automotive
2,AERGOXGAFZ2J3ZMHXVLZQGTW36RA,B07XF3R82L,1.0,1618857584664,2021-04-19 18:39:44.664,Automotive
3,AFVNEEPDEIH5SPUN5BWC6NKL3WNQ,B00VMVWAW2,5.0,1580484465419,2020-01-31 15:27:45.419,Automotive
4,AFEIN7QWSQ6EFW45LKD7ZOCSMLGQ,B00CNZF3X4,5.0,1560371039064,2019-06-12 20:23:59.064,Automotive


In [None]:
'''
Recommends the top-rated items by average rating while discarding the bottom 25 percentile by count.
1. Calculate the average rating for each item.
2. Discard items in the bottom 25th percentile by count for each category.
3. Recommend the top-rated items from the remaining items.
'''

In [None]:
K = 2000

In [None]:
def get_top_rated_items(train_df, top_k=K):

  top_k_by_category = top_k//train_df.category.nunique()
  top_rated_items = []
  for category in train_df.category.unique():
    train_df_category = train_df[train_df.category == category]
    item_counts = train_df_category['parent_asin'].value_counts()
    item_ratings = train_df_category.groupby('parent_asin')['rating'].mean()

    # Discard items in the bottom 25th percentile by count for each category.
    cutoff = np.percentile(item_counts, 25)
    filtered_items = item_counts[item_counts > cutoff].index

    top_rated_items.extend(item_ratings[filtered_items].sort_values(ascending=False).head(top_k_by_category).index.tolist())
  return top_rated_items

In [None]:
test_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,date_time,user_in_train,parent_asin_in_train,category
0,AFE337D2J37YRU5U6MVTVKNDKWDA,B0B2WGS5ND,5.0,1659124303053,2022-07-29 19:51:43.053,False,False,Automotive
1,AHWWLSPCJMALVHDDVSUGICL6RUCA,B092QX3F49,5.0,1664746954617,2022-10-02 21:42:34.617,False,False,Automotive
2,AHWWLSPCJMALVHDDVSUGICL6RUCA,B00LD1F410,5.0,1664747094922,2022-10-02 21:44:54.922,False,True,Automotive
3,AHWWLSPCJMALVHDDVSUGICL6RUCA,B00LD1F3UW,5.0,1664747107780,2022-10-02 21:45:07.780,False,True,Automotive
4,AHREXOGQPZDA6354MHH4ETSF3MCQ,B097BJDMSC,2.0,1678370208958,2023-03-09 13:56:48.958,False,True,Automotive


In [None]:
val_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,date_time,user_in_train,parent_asin_in_train,category
0,AHITBJSS7KYUBVZPX7M2WJCOIVKQ,B08C27WWVG,2.0,1651801619265,2022-05-06 01:46:59.265,True,True,Automotive
1,AF4BOHAQZZBMDYP7B6IWIKQNHTCA,B008Y84LF0,5.0,1643769472122,2022-02-02 02:37:52.122,False,True,Automotive
2,AHCPZDDPHJE3G7M6ST5WGRPLXHOA,B097SWTNR5,5.0,1646224097585,2022-03-02 12:28:17.585,False,True,Automotive
3,AHDZ3PFHIGTE7EOQBUMVRBFRMWJQ,B00Z7N0DHA,5.0,1647205966235,2022-03-13 21:12:46.235,False,True,Automotive
4,AEQAYV7RXZEBXMQIQPL6KCT2CFWQ,B00I8IP7FO,5.0,1630752881248,2021-09-04 10:54:41.248,True,True,Automotive


In [None]:
# Convert ratings to numeric and ignore None values
train_df['rating'] = pd.to_numeric(train_df['rating'], errors='coerce')
val_df['rating'] = pd.to_numeric(val_df['rating'], errors='coerce')
test_df['rating'] = pd.to_numeric(test_df['rating'], errors='coerce')

train_df.dropna(subset=['rating'], inplace=True)
val_df.dropna(subset=['rating'], inplace=True)
test_df.dropna(subset=['rating'], inplace=True)

In [None]:
# Generate recommendations
popular_items = get_top_rated_items(train_df)

In [None]:
print(len(popular_items))
print(popular_items[:10])

1000
['B01C9A3RRG', 'B0060VIX1Q', 'B07G5Y5FY2', 'B0932VLY4C', 'B009G09OQQ', 'B00HVMQH8Y', 'B074NCVJHL', 'B075BSWYNF', 'B01LW2HXU5', 'B07GZPXBDB']


In [None]:
# Calculate recall@K and precision@K
def recall_precision_at_k(recommendations_lst, ground_truth, k=K):
    recall = []
    precision = []
    for user_id in ground_truth['user_id'].unique():
        actual_items = set(ground_truth[ground_truth['user_id'] == user_id]['parent_asin'])
        recommended_items = set(recommendations_lst) #set(recommendations[user_id][:k])

        true_positives = len(actual_items & recommended_items)
        recall.append(true_positives / len(actual_items))
        precision.append(true_positives / k)

    return np.mean(recall), np.mean(precision)

In [None]:
# Evaluate on validation data

valid_recall, valid_precision = recall_precision_at_k(popular_items, val_df)

print(f"Validation Recall@K: {valid_recall:.6f}")
print(f"Validation Precision@K: {valid_precision:.6f}")

Validation Recall@K: 0.000101
Validation Precision@K: 0.000000


In [None]:
# Evaluate on test data

test_recall, test_precision = recall_precision_at_k(popular_items, test_df)

print(f"Test Recall@K: {test_recall:.6f}")
print(f"Test Precision@K: {test_precision:.6f}")


Test Recall@K: 0.000101
Test Precision@K: 0.000000


In [2]:
for k in [200, 1000, 2000]:
  popular_items = get_top_rated_items(train_df, top_k=k)
  #valid_recall, valid_precision = recall_precision_at_k(popular_items, val_df)
  #print(f"Validation Recall@{k}: {valid_recall:.6f}")
  #print(f"Validation Precision@{k}: {valid_precision:.6f}")
  print("\n")
  test_recall, test_precision = recall_precision_at_k(popular_items, test_df)
  print(f"Test Recall@{k}: {test_recall:.6f}")
  print(f"Test Precision@{k}: {test_precision:.6f}")
  print("\n")


Test Recall@200: 0.000000
Test Precision@200: 0.000000

Test Recall@1000: 0.000101
Test Precision@1000: 0.000000

Test Recall@2000: 0.000101
Test Precision@2000: 0.000000

