In [3]:
! pip install pandas numpy scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357238 sha256=0f2ced12b94e502230906a5ec68eadde529f6c229fbbc9831d9b9cd06982bfcb
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

In [4]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, train_test_split
from collections import defaultdict

import pickle

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
! ls drive/MyDrive/ShoppingPulse/datasets/

content_based_recommended_items_dict.pkl  processed
content_based_train_item_metadata.pkl	  raw
interactions_test_data1.parquet		  svd_trainset.pkl
interactions_test_data.parquet		  test_metadata.parquet
interactions_training_data1.parquet	  train_metadata.parquet
interactions_training_data.parquet	  train_reviews.parquet
interactions_validation_data1.parquet	  valid_metadata.parquet
interactions_validation_data.parquet


In [7]:
# reading training, validation and test data
train_df = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_training_data1.parquet")


In [19]:
valid_df = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_validation_data1.parquet")


In [50]:
test_df = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_test_data1.parquet")


In [8]:
train_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,date_time,category
0,AFZUK3MTBIBEDQOPAK3OATUOUKLA,B09BF693L6,5.0,1547316703224,2019-01-12 18:11:43.224,Automotive
1,AFZUK3MTBIBEDQOPAK3OATUOUKLA,B01KP4X3UO,5.0,1575402993972,2019-12-03 19:56:33.972,Automotive
2,AFZUK3MTBIBEDQOPAK3OATUOUKLA,B07Q3HTGC7,4.0,1589395842176,2020-05-13 18:50:42.176,Automotive
3,AFZUK3MTBIBEDQOPAK3OATUOUKLA,B08X1F42NG,4.0,1590212444932,2020-05-23 05:40:44.932,Automotive
4,AFZUK3MTBIBEDQOPAK3OATUOUKLA,B0BZJ6PFBB,3.0,1590215175314,2020-05-23 06:26:15.314,Automotive


In [9]:
# Convert ratings to numeric and ignore None values
train_df['rating'] = pd.to_numeric(train_df['rating'], errors='coerce')
train_df.dropna(subset=['rating'], inplace=True)


In [20]:
valid_df['rating'] = pd.to_numeric(valid_df['rating'], errors='coerce')
valid_df.dropna(subset=['rating'], inplace=True)


In [51]:
test_df['rating'] = pd.to_numeric(test_df['rating'], errors='coerce')
test_df.dropna(subset=['rating'], inplace=True)


In [10]:
# Prepare data for Surprise library
reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
data = Dataset.load_from_df(train_df[['user_id', 'parent_asin', 'rating']], reader)

In [11]:
# Split the data into training and test sets
#trainset, _ = train_test_split(data, test_size=0.1)  # Using 10% of the data for test set
trainset = data.build_full_trainset()

In [12]:
# Save trainset to drive
with open('drive/MyDrive/ShoppingPulse/datasets/svd_trainset.pkl', 'wb') as file:
    pickle.dump(trainset, file)

print("trainset saved to svd_trainset.pkl")

trainset saved to svd_trainset.pkl


In [13]:
trainset = pickle.load(open('drive/MyDrive/ShoppingPulse/datasets/svd_trainset.pkl', 'rb'))

In [14]:
# Build the SVD model
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7b461b2940d0>

In [15]:
# Save the model to drive
with open('drive/MyDrive/ShoppingPulse/models/svd_model.pkl', 'wb') as file:
    pickle.dump(algo, file)

print("Model saved to svd_model.pkl")

Model saved to svd_model.pkl


In [5]:
algo = pickle.load(open('drive/MyDrive/ShoppingPulse/models/svd_model.pkl', 'rb'))

In [16]:
K = 2000

In [21]:
# Function to generate recommendations for each user
def generate_recommendations(algo, trainset, user_ids, k=K):
    recommendations = defaultdict(list)
    for user_id in user_ids:
        # Get all items that the user has not rated yet
        user_inner_id = trainset.to_inner_uid(user_id)
        user_rated_items = set([j for (j, _) in trainset.ur[user_inner_id]])
        all_items = set(range(trainset.n_items))
        items_to_predict = all_items - user_rated_items

        # Predict ratings for all items
        predictions = [(iid, algo.predict(user_id, trainset.to_raw_iid(iid)).est) for iid in items_to_predict]
        top_k_items = [iid for (iid, _) in sorted(predictions, key=lambda x: x[1], reverse=True)[:k]]
        recommendations[user_id] = [trainset.to_raw_iid(iid) for iid in top_k_items]

    return recommendations

In [31]:
# Generate recommendations for validation and test sets
valid_df = valid_df[valid_df.user_in_train == True].sample(500)
valid_df.reset_index(drop=True, inplace=True)
user_ids_valid = valid_df['user_id'].unique()

In [32]:
%time
recommendations_valid = generate_recommendations(algo, trainset, user_ids_valid, K)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [33]:
# Save trainset to drive
with open('drive/MyDrive/ShoppingPulse/datasets/svd_recommendations_valid.pkl', 'wb') as file:
    pickle.dump(recommendations_valid, file)

print("recommendations saved to recommendations_valid.pkl")

recommendations saved to recommendations_valid.pkl


In [34]:
print(len(recommendations_valid))

499


In [28]:
# Evaluation metrics functions
def recall_precision_at_k(recommendations, ground_truth, k=K):
    recall = []
    precision = []
    for user_id in ground_truth['user_id'].unique():
        actual_items = set(ground_truth[ground_truth['user_id'] == user_id]['parent_asin'])
        recommended_items = set(recommendations[user_id][:k])

        true_positives = len(actual_items & recommended_items)
        recall.append(true_positives / len(actual_items))
        precision.append(true_positives / k)

    return np.mean(recall), np.mean(precision)

In [37]:
K

2000

In [38]:
recommendations_valid = generate_recommendations(algo, trainset, user_ids_valid, K)

In [39]:
# Evaluate on validation and test data
for k in [100, 200, 500, 1000, 2000]:
  print(f"K: {k}")
  valid_recall, valid_precision = recall_precision_at_k(recommendations_valid, valid_df, k = k)
  print(f"Validation Recall@K: {valid_recall:.6f}")
  print(f"Validation Precision@K: {valid_precision:.6f}")
  print("-------------------------")

K: 100
Validation Recall@K: 0.002004
Validation Precision@K: 0.000020
-------------------------
K: 200
Validation Recall@K: 0.004008
Validation Precision@K: 0.000020
-------------------------
K: 500
Validation Recall@K: 0.008016
Validation Precision@K: 0.000016
-------------------------
K: 1000
Validation Recall@K: 0.012024
Validation Precision@K: 0.000012
-------------------------
K: 2000
Validation Recall@K: 0.024048
Validation Precision@K: 0.000012
-------------------------


In [45]:
K

2000

In [52]:
test_df = test_df[test_df.user_in_train == True].sample(1500)
test_df.reset_index(drop=True, inplace=True)

In [53]:
user_ids_test = test_df['user_id'].unique()
recommendations_test = generate_recommendations(algo, trainset, user_ids_test, K)

In [54]:
# Save trainset to drive
with open('drive/MyDrive/ShoppingPulse/datasets/svd_recommendations_test.pkl', 'wb') as file:
    pickle.dump(recommendations_test, file)

print("recommendations saved to recommendations_test.pkl")

recommendations saved to recommendations_test.pkl


In [55]:
# Evaluate on validation and test data
for k in [100, 200, 500, 1000, 2000]:
  print(f"K: {k}")
  valid_recall, valid_precision = recall_precision_at_k(recommendations_test, valid_df, k = k)
  print(f"Validation Recall@K: {valid_recall:.6f}")
  print(f"Validation Precision@K: {valid_precision:.6f}")
  print("-------------------------")

K: 100
Validation Recall@K: 0.000000
Validation Precision@K: 0.000000
-------------------------
K: 200
Validation Recall@K: 0.000000
Validation Precision@K: 0.000000
-------------------------
K: 500
Validation Recall@K: 0.000000
Validation Precision@K: 0.000000
-------------------------
K: 1000
Validation Recall@K: 0.000000
Validation Precision@K: 0.000000
-------------------------
K: 2000
Validation Recall@K: 0.000000
Validation Precision@K: 0.000000
-------------------------
