In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from collections import defaultdict
import pickle

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! ls drive/MyDrive/ShoppingPulse/datasets

content_based_recommended_items_dict.pkl       interactions_validation_data.parquet
content_based_recommended_items_dict_test.pkl  processed
content_based_train_item_metadata.pkl	       raw
interactions_test_data1.parquet		       svd_recommendations_test.pkl
interactions_test_data2.parquet		       svd_recommendations_valid.pkl
interactions_test_data.parquet		       svd_trainset.pkl
interactions_training_data1.parquet	       test_metadata.parquet
interactions_training_data2.parquet	       train_metadata2.parquet
interactions_training_data.parquet	       train_metadata.parquet
interactions_validation_data1.parquet	       train_reviews.parquet
interactions_validation_data2.parquet	       valid_metadata.parquet


In [None]:
train_df = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_training_data2.parquet")

In [None]:
valid_df = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_validation_data2.parquet")

In [None]:
test_df = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_test_data2.parquet")

In [None]:
# Convert ratings to numeric and ignore None values
train_df['rating'] = pd.to_numeric(train_df['rating'], errors='coerce')
train_df.dropna(subset=['rating'], inplace=True)

In [None]:
train_df['user_id'] = train_df['user_id'].astype(str)
train_df['parent_asin'] = train_df['parent_asin'].astype(str)
train_df['rating'] = train_df['rating'].astype(float)
#train_df['timestamp'] = train_df['timestamp'].apply(lambda x: pd.Timestamp(int(x), unit='ms'))
train_df.reset_index(drop = True, inplace = True)


In [None]:
sparse_matrix = train_df.pivot(index='user_id', columns='parent_asin', values='rating').fillna(0)
R = sparse_matrix.values

In [None]:
input_dim = R.shape[1]
encoding_dim = 50

In [None]:
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [None]:
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

autoencoder.compile(optimizer=Adam(), loss='mean_squared_error')

In [None]:
batch_size = 256
autoencoder.fit(R, R, epochs=50, batch_size=batch_size, shuffle=True, validation_split=0.1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7ba49b13ea40>

In [None]:
K = 2000

In [None]:
def generate_recommendations(autoencoder, sparse_matrix, user_ids, k=K):
    recommendations = defaultdict(list)
    all_items = np.arange(sparse_matrix.shape[1])

    for user_id in user_ids:
        user_index = sparse_matrix.index.get_loc(user_id)
        user_rated_items = sparse_matrix.iloc[user_index].nonzero()[0]
        items_to_predict = np.setdiff1d(all_items, user_rated_items)

        # Predict ratings for all items the user hasn't rated yet
        user_input = sparse_matrix.iloc[user_index].values.reshape(1, -1)
        user_predictions = autoencoder.predict(user_input).flatten()

        # Get top K items
        top_k_items = items_to_predict[np.argsort(-user_predictions[items_to_predict])[:k]]
        recommendations[user_id] = sparse_matrix.columns[top_k_items].tolist()

    return recommendations

def generate_recommendations(autoencoder, sparse_matrix, user_ids, k=K):
    recommendations = defaultdict(list)
    all_items = np.arange(sparse_matrix.shape[1])

    for user_id in user_ids:
        user_index = sparse_matrix.index.get_loc(user_id)
        # Use .to_numpy() to convert the Series to a NumPy array before calling .nonzero()
        user_rated_items = sparse_matrix.iloc[user_index].to_numpy().nonzero()[0]
        items_to_predict = np.setdiff1d(all_items, user_rated_items)

        # Predict ratings for all items the user hasn't rated yet
        user_input = sparse_matrix.iloc[user_index].values.reshape(1, -1)
        user_predictions = autoencoder.predict(user_input).flatten()

        # Get top K items
        top_k_items = items_to_predict[np.argsort(-user_predictions[items_to_predict])[:k]]
        recommendations[user_id] = sparse_matrix.columns[top_k_items].tolist()

    return recommendations

In [None]:
def recall_precision_at_k(recommendations, ground_truth, k=K):
    recall = []
    precision = []
    for user_id in ground_truth['user_id'].unique():
        actual_items = set(ground_truth[ground_truth['user_id'] == user_id]['parent_asin'])
        recommended_items = set(recommendations[user_id][:k])

        true_positives = len(actual_items & recommended_items)
        recall.append(true_positives / len(actual_items))
        precision.append(true_positives / k)

    return np.mean(recall), np.mean(precision)

In [None]:
user_ids_valid = valid_df[valid_df.user_in_train == True]['user_id'].unique()

In [None]:
recommendations_valid = generate_recommendations(autoencoder, sparse_matrix, user_ids_valid)



In [None]:
sample_user = user_ids_valid[0]
print(f"Recommendations for user {sample_user}: {recommendations_valid[sample_user]}")

Recommendations for user AFE2EVN2R2UZ72E6WNYGL5ZZ262Q: ['B0C3D2BLKT', 'B09KPK26YX', 'B00U9TWCXU', 'B073WVCH57', 'B0BXM745HW', 'B0C4VTPT1Y', 'B09J7PX5YM', 'B0C3M2NBS7', 'B01N22CM3F', 'B0C8V52BLR', 'B0BMYFPXYR', 'B01E7MBSL6', 'B0079OYIFS', 'B09KC69RSH', 'B0C5NYLZ6Q', 'B075LYLKH7', 'B0C1RRLY4Y', 'B0C37PFCWW', 'B0182VBOJE', 'B0BS71PXPX', 'B0C3KRT7XR', 'B0C5QZXQFD', 'B0C43Z7CBY', 'B078K93HFD', 'B0B6QVGZ4X', 'B01NCOUY05', 'B0C5Y2WBRB', 'B0BL2GJJ13', 'B095J4YL2H', 'B0BS1VWRKN', 'B08G1D6PKK', 'B0BWK813BD', 'B07Z5MN69H', 'B0813SV1KJ', 'B081S74N5X', 'B0BXQRCB55', 'B0BC1TZSTQ', 'B0B6Z2BBKV', 'B0BV5SS7W6', 'B08G1M1FX7', 'B0BL4PM4G7', 'B01G5EA74I', 'B0BVGZ3J4V', 'B0BC3NB2VR', 'B007L0DPE0', 'B09XGRWCFQ', 'B07G31SQZ7', 'B09FD8STXW', 'B00RJKB2FQ', 'B009YTONJW', 'B07LFJF6TR', 'B09YRZ21PL', 'B07GF149M6', 'B09Q8YC68J', 'B0BJ5W3R8H', 'B0BZ7NQKBV', 'B0BP72BQZV', 'B0C42774KB', 'B07DM3MBCS', 'B08DDDL6W7', 'B0B6QWBPV4', 'B08X2386G3', 'B07RWRJ4XW', 'B07Q24FTY8', 'B0C69M1238', 'B07BK7TQP3', 'B0C4K2HYLZ', 'B09NZ

In [None]:
valid_recall, valid_precision = recall_precision_at_k(recommendations_valid, valid_df)

In [None]:
print(f"Validation Recall@K: {valid_recall:.4f}")
print(f"Validation Precision@K: {valid_precision:.4f}")

Validation Recall@K: 0.3900
Validation Precision@K: 0.0002


In [None]:
user_ids_test = valid_df[valid_df.user_in_train == True]['user_id'].unique()

In [None]:
recommendations_test = generate_recommendations(autoencoder, sparse_matrix, user_ids_test)



In [None]:
test_recall, test_precision = recall_precision_at_k(recommendations_test, test_df)

In [2]:
for k in [100, 200, 500, 1000, 2000]:
    print(f'k: {k}')
    test_recall, test_precision = recall_precision_at_k(recommendations_test, test_df, k=k)
    print(f"test Recall@K: {test_recall:.6f}")
    print(f"test Precision@K: {test_precision:.6f}")
    print("\n")

k: 100
test Recall@K: 0.112000
test Precision@K: 0.001120


k: 200
test Recall@K: 0.206000
test Precision@K: 0.001030


k: 500
test Recall@K: 0.288000
test Precision@K: 0.000576


k: 1000
test Recall@K: 0.336000
test Precision@K: 0.000336


k: 2000
test Recall@K: 0.390000
test Precision@K: 0.000195


