In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle

In [2]:
class AmazonDataset(Dataset):
    def __init__(self, user_data, item_data, interactions):
        self.user_data = user_data
        self.item_data = item_data
        self.interactions = interactions

        # Fill missing values with mean
        self.item_data['average_rating'].fillna(self.item_data['average_rating'].mean(), inplace=True)
        self.item_data['rating_number'].fillna(self.item_data['rating_number'].mean(), inplace=True)
        self.item_data['price'].fillna(self.item_data['price'].mean(), inplace=True)

        self.user_encoder = LabelEncoder()
        self.item_encoder = LabelEncoder()

        self.user_data['user_id'] = self.user_encoder.fit_transform(self.user_data['user_id'])
        self.item_data['parent_asin'] = self.item_encoder.fit_transform(self.item_data['parent_asin'])

        self.interactions['user_id'] = self.user_encoder.transform(self.interactions['user_id'])
        self.interactions['parent_asin'] = self.item_encoder.transform(self.interactions['parent_asin'])

        # Normalize numerical features
        self.scaler = StandardScaler()
        self.item_data[['average_rating', 'rating_number', 'price']] = self.scaler.fit_transform(self.item_data[['average_rating', 'rating_number', 'price']])

        # Encode categorical features
        self.category_encoder = LabelEncoder()
        self.item_data['main_category'] = self.category_encoder.fit_transform(self.item_data['main_category'])

    def __len__(self):
        return len(self.interactions)

    def __getitem__(self, idx):
        user_id = self.interactions.iloc[idx]['user_id']
        item_id = self.interactions.iloc[idx]['parent_asin']
        rating = self.interactions.iloc[idx]['rating']

        user_features = torch.tensor(self.user_data[self.user_data['user_id'] == user_id].drop(columns=['user_id']).values, dtype=torch.float32).squeeze()
        item_features = torch.tensor(self.item_data[self.item_data['parent_asin'] == item_id].drop(columns=['parent_asin']).values, dtype=torch.float32).squeeze()

        return user_id, item_id, user_features, item_features, rating

In [19]:
class TwoTowerModel(nn.Module):
    def __init__(self, user_input_dim, item_input_dim, embed_dim):
        super(TwoTowerModel, self).__init__()
        self.user_tower = nn.Sequential(
            nn.Linear(user_input_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim)
        )
        self.item_tower = nn.Sequential(
            nn.Linear(item_input_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim)
        )

    def forward(self, user_features, item_features):
        user_embedding = self.user_tower(user_features)
        item_embedding = self.item_tower(item_features)
        return user_embedding, item_embedding

    def predict(self, user_features, item_features):
        user_embedding, item_embedding = self.forward(user_features, item_features)
        return torch.sum(user_embedding * item_embedding, dim=1)

def train(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for _, _, user_features, item_features, rating in dataloader:
        user_features, item_features, rating = user_features.to(device), item_features.to(device), rating.to(device)

        optimizer.zero_grad()
        outputs = model.predict(user_features, item_features)

        loss = criterion(outputs, rating.float())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, _, user_features, item_features, rating in dataloader:
            user_features, item_features, rating = user_features.to(device), item_features.to(device), rating.to(device)
            outputs = model.predict(user_features, item_features)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(rating.cpu().numpy())
    return predictions, actuals


In [None]:
! ls drive/MyDrive/ShoppingPulse/datasets/

content_based_recommended_items_dict.pkl       interactions_validation_data.parquet
content_based_recommended_items_dict_test.pkl  processed
content_based_train_item_metadata.pkl	       raw
interactions_test_data1.parquet		       svd_recommendations_test.pkl
interactions_test_data2.parquet		       svd_recommendations_valid.pkl
interactions_test_data.parquet		       svd_trainset.pkl
interactions_training_data1.parquet	       test_metadata.parquet
interactions_training_data2.parquet	       train_metadata2.parquet
interactions_training_data.parquet	       train_metadata.parquet
interactions_validation_data1.parquet	       train_reviews.parquet
interactions_validation_data2.parquet	       valid_metadata.parquet


In [29]:
train_interaction_data = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_training_data2.parquet")

In [3]:
valid_interaction_data = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_validation_data2.parquet")

In [None]:
test_interaction_data = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_test_data2.parquet")

In [None]:
train_interaction_data.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,date_time,category
0,AF2BLE54TEMGZ546U763ZHZRXC4A,B076681J7L,5.0,1560036743365,2019-06-08 23:32:23.365,Automotive
1,AF2BLE54TEMGZ546U763ZHZRXC4A,B000IDYKQO,5.0,1561946120701,2019-07-01 01:55:20.701,Automotive
2,AF2BLE54TEMGZ546U763ZHZRXC4A,B07Z4G64FB,4.0,1581276260800,2020-02-09 19:24:20.800,Automotive
3,AF2BLE54TEMGZ546U763ZHZRXC4A,B08KFDLLFL,4.0,1605798991513,2020-11-19 15:16:31.513,Automotive
4,AF2BLE54TEMGZ546U763ZHZRXC4A,B08GTVPVWD,4.0,1608820109721,2020-12-24 14:28:29.721,Automotive


In [None]:
valid_interaction_data2 = valid_interaction_data[(valid_interaction_data.user_in_train == True) &	\
        (valid_interaction_data.parent_asin_in_train == True)]

In [None]:
valid_interaction_data2 = valid_interaction_data2.sample(500).reset_index(drop = True)

In [None]:
test_interaction_data2 = test_interaction_data[(test_interaction_data.user_in_train == True) &	\
        (test_interaction_data.parent_asin_in_train == True)]

In [None]:
test_interaction_data2 = test_interaction_data2.sample(500).reset_index(drop = True)

In [None]:
valid_interaction_data2.to_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_validation_data2.parquet")
test_interaction_data2.to_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_test_data2.parquet")

In [None]:
users = set(valid_interaction_data2['user_id'].unique()) | set(test_interaction_data2['user_id'].unique())

In [None]:
items = set(valid_interaction_data2['parent_asin'].unique()) | set(test_interaction_data2['parent_asin'].unique())

In [None]:
train_interaction_data2 = train_interaction_data[(train_interaction_data.user_id.isin(users)) | (train_interaction_data.parent_asin.isin(items))]

In [None]:
train_interaction_data.shape

(36413, 6)

In [None]:
train_interaction_data2 = train_interaction_data.reset_index(drop = True)

In [None]:
train_interaction_data2.to_parquet("drive/MyDrive/ShoppingPulse/datasets/interactions_training_data2.parquet")

In [32]:
# Convert ratings to numeric and ignore None values
train_interaction_data['rating'] = pd.to_numeric(train_interaction_data['rating'], errors='coerce')
train_interaction_data.dropna(subset=['rating'], inplace=True)
train_interaction_data.reset_index(drop = True, inplace = True)

In [33]:
user_data = train_interaction_data[['user_id']].drop_duplicates()

In [34]:
interactions = train_interaction_data[['user_id', 'parent_asin', 'rating']]

In [35]:
train_item_metadata = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/train_metadata2.parquet")

In [None]:
train_item_metadata2 = train_item_metadata[train_item_metadata['parent_asin'].isin(\
                      train_interaction_data['parent_asin'].unique())].reset_index(drop=True)

In [None]:
train_item_metadata2.to_parquet("drive/MyDrive/ShoppingPulse/datasets/train_metadata2.parquet")

In [None]:
train_item_metadata.shape, train_interaction_data.shape

((18988, 16), (36413, 6))

In [36]:
train_item_metadata['price'] = pd.to_numeric(train_item_metadata['price'], errors='coerce')

In [37]:
item_data = train_item_metadata[['parent_asin', 'main_category', 'average_rating', 'rating_number', 'price']]

In [38]:
train_data = AmazonDataset(user_data, item_data, interactions)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.item_data['average_rating'].fillna(self.item_data['average_rating'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.item_data['rating_number'].fillna(self.item_data['rating_number'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.item_data['price'].fillna(self.item_data['price'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = va

In [None]:
train_data.user_encoder

In [None]:
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

In [None]:
# Initialize model, criterion, and optimizer
user_input_dim = train_data[0][2].shape[0]
item_input_dim = train_data[0][3].shape[0]
embed_dim = 64

model = TwoTowerModel(user_input_dim, item_input_dim, embed_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model
for epoch in range(15):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}, Loss: {train_loss}')




Epoch 1, Loss: 1.6049404283347155
Epoch 2, Loss: 1.403489111132121
Epoch 3, Loss: 1.3769214349702288
Epoch 4, Loss: 1.358137896955537
Epoch 5, Loss: 1.35869835723054
Epoch 6, Loss: 1.3458745726277832
Epoch 7, Loss: 1.353411356952886
Epoch 8, Loss: 1.3529148381759915
Epoch 9, Loss: 1.3537712926814343
Epoch 10, Loss: 1.3552916829583934
Epoch 11, Loss: 1.3519283504551032
Epoch 12, Loss: 1.3536675600547783
Epoch 13, Loss: 1.3505545240713968
Epoch 14, Loss: 1.3481008838633033
Epoch 15, Loss: 1.3474484291353208


In [None]:

# Save to drive
with open('drive/MyDrive/ShoppingPulse/datasets/two_tower_model_obj.pkl', 'wb') as file:
    pickle.dump(model, file)

print("saved to two_tower_model_obj.pkl")

saved to two_tower_model_obj.pkl


In [20]:
pkl_file_path = 'drive/MyDrive/ShoppingPulse/datasets/two_tower_model_obj.pkl'


with open(pkl_file_path, 'rb') as file:
    model = pickle.load(file)

print("Model loaded successfully!")

Model loaded successfully!


In [49]:
def get_predictions(model, dataloader, device, k):
    model.eval()
    user_item_predictions = {}
    with torch.no_grad():
        for user_id, item_id, user_features, item_features, rating in dataloader:
            user_features, item_features = user_features.to(device), item_features.to(device)
            outputs = model.predict(user_features, item_features)
            for u, i, p in zip(user_id, item_id, outputs):
                if u.item() not in user_item_predictions:
                    user_item_predictions[u.item()] = []
                user_item_predictions[u.item()].append((i.item(), p.item()))

    return user_item_predictions


In [None]:
def evaluate(model, dataloader, device, k):
    model.eval()
    user_item_predictions = {}
    with torch.no_grad():
        for user_id, item_id, user_features, item_features, rating in dataloader:
            user_features, item_features = user_features.to(device), item_features.to(device)
            outputs = model.predict(user_features, item_features)
            for u, i, p in zip(user_id, item_id, outputs):
                if u.item() not in user_item_predictions:
                    user_item_predictions[u.item()] = []
                user_item_predictions[u.item()].append((i.item(), p.item()))

    precision_at_k = []
    recall_at_k = []
    for user_id, predictions in user_item_predictions.items():
        predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:k]
        recommended_items = [x[0] for x in predictions]

        actual_items = dataloader.dataset.interactions[dataloader.dataset.interactions['user_id'] == user_id]['parent_asin'].values
        actual_set = set(actual_items)
        recommended_set = set(recommended_items)

        num_relevant_and_recommended = len(actual_set & recommended_set)
        precision = num_relevant_and_recommended / k
        recall = num_relevant_and_recommended / len(actual_set)

        precision_at_k.append(precision)
        recall_at_k.append(recall)

    avg_precision_at_k = np.mean(precision_at_k)
    avg_recall_at_k = np.mean(recall_at_k)

    return avg_precision_at_k, avg_recall_at_k


In [5]:
valid_interaction_data.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,date_time,user_in_train,parent_asin_in_train,category
0,AFE2EVN2R2UZ72E6WNYGL5ZZ262Q,B09C62FVNL,1.0,1633470584612,2021-10-05 21:49:44.612,True,True,Health_and_Household
1,AF5K4VTGH4QHL5Y2WI2CDI7756NA,B0B82LTDL9,1.0,1628703952276,2021-08-11 17:45:52.276,True,True,Beauty_and_Personal_Care
2,AEI62GY3HKFNAFSLW26HEVNM2KKQ,B07W3QSMF9,5.0,1640032645721,2021-12-20 20:37:25.721,True,True,Automotive
3,AEABHU6O2K3IKLRBKKNAXCB4ZJWQ,B092C59YPV,1.0,1640814022471,2021-12-29 21:40:22.471,True,True,Health_and_Household
4,AFHKCL3TX6FSV3XQXGCZPYLSCJCA,B07JZ67778,5.0,1648524138226,2022-03-29 03:22:18.226,True,True,Kindle_Store


In [6]:
! ls drive/MyDrive/ShoppingPulse/datasets/

content_based_recommended_items_dict.pkl       processed
content_based_recommended_items_dict_test.pkl  raw
content_based_train_item_metadata.pkl	       svd_recommendations_test.pkl
interactions_test_data1.parquet		       svd_recommendations_valid.pkl
interactions_test_data2.parquet		       svd_trainset.pkl
interactions_test_data.parquet		       test_metadata.parquet
interactions_training_data1.parquet	       train_metadata2.parquet
interactions_training_data2.parquet	       train_metadata.parquet
interactions_training_data.parquet	       train_reviews.parquet
interactions_validation_data1.parquet	       two_tower_model_obj.pkl
interactions_validation_data2.parquet	       valid_metadata2.parquet
interactions_validation_data.parquet	       valid_metadata.parquet


In [7]:
valid_item_metadata = pd.read_parquet("drive/MyDrive/ShoppingPulse/datasets/valid_metadata2.parquet")

In [None]:
#valid_item_metadata = valid_item_metadata[valid_item_metadata['parent_asin'].isin(\
#                      valid_interaction_data['parent_asin'].unique())].reset_index(drop=True)

In [None]:
#valid_item_metadata.to_parquet("drive/MyDrive/ShoppingPulse/datasets/valid_metadata2.parquet")

In [8]:
valid_item_metadata['price'] = pd.to_numeric(valid_item_metadata['price'], errors='coerce')

In [10]:
# Convert ratings to numeric and ignore None values
valid_interaction_data['rating'] = pd.to_numeric(valid_interaction_data['rating'], errors='coerce')
valid_interaction_data.dropna(subset=['rating'], inplace=True)
valid_interaction_data.reset_index(drop = True, inplace = True)

In [11]:
valid_user_data = valid_interaction_data[['user_id']].drop_duplicates()
valid_item_data = valid_item_metadata[['parent_asin', 'main_category', 'average_rating', 'rating_number', 'price']]
valid_interactions = valid_interaction_data[['user_id', 'parent_asin', 'rating']]


In [12]:
valid_dataset = AmazonDataset(valid_user_data, valid_item_data, valid_interactions)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.item_data['average_rating'].fillna(self.item_data['average_rating'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.item_data['rating_number'].fillna(self.item_data['rating_number'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.item_data['price'].fillna(self.item_data['price'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = va

In [13]:
val_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False,)

In [50]:
recomendations = get_predictions(model, val_loader, device, k=200)

In [58]:
recomendations[178.0][0][0]

136.0

In [69]:
for k, v in recomendations.items():
    print(k, v)
    break

156.0 [(321.0, 4.0243682861328125)]


In [81]:
train_item_metadata[train_item_metadata.parent_asin == train_data.item_encoder.inverse_transform([int(156.0)]).item()]

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
7570,Grocery,"HORMEL Premium Turkey Breast in Water, 5 Ounce...",4.6,216,"[100% real meat; fully cooked, ready to eat ca...",[Whether it’s piping hot turkey chili or zesty...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Hormel,"[Grocery & Gourmet Food, Pantry Staples, Canne...","{""Package Dimensions"": ""13.74 x 10.2 x 1.73 in...",B000FIZVYW,,,


In [82]:
reco_items = []
for k, v in recomendations.items():
    product, product_category, reco_product, reco_product_category ,score = \
train_item_metadata[train_item_metadata.parent_asin == train_data.item_encoder.inverse_transform([int(k)]).item()].title.values[0], \
train_item_metadata[train_item_metadata.parent_asin == train_data.item_encoder.inverse_transform([int(k)]).item()].main_category.values[0], \
train_item_metadata[train_item_metadata.parent_asin == train_data.item_encoder.inverse_transform([int(v[0][0])]).item()].title.values[0], \
train_item_metadata[train_item_metadata.parent_asin == train_data.item_encoder.inverse_transform([int(v[0][0])]).item()].main_category.values[0], \
      v[0][1]

    reco_items.append((product, product_category, reco_product, reco_product_category, score))

In [83]:
reco_items_df = pd.DataFrame(reco_items)

In [84]:
reco_items_df.columns = ["product", "product_category", "reco_product", "reco_product_category", "score"]

In [85]:
reco_items_df.head()

Unnamed: 0,product,product_category,reco_product,reco_product_category,score
0,"HORMEL Premium Turkey Breast in Water, 5 Ounce...",Grocery,FRAM Fresh Breeze Cabin Air Filter with Arm & ...,Appliances,4.024368
1,"K&N Motorcycle Oil Filter: High Performance, P...",Automotive,Revlon 1875W Compact Folding Handle Hair Dryer...,All Beauty,3.585972
2,Kraft Minute Tapioca 8oz,Grocery,Motorcraft YL186 A/C Receiver Drier,Automotive,4.299519
3,"Pacific Resources International Manuka Honey, ...",Grocery,Tweezerman Rockhard Stainless Steel Cuticle Ni...,All Beauty,3.42772
4,"Revlon Super Lustrous Lipstick, Blushed [420] ...",All Beauty,12 Pack Cincinnati Chili Mix packets,Grocery,3.960743


In [86]:
reco_items_df.to_csv("reco_items_df.csv", header = True, index = False)

In [27]:
#avg_precision_at_k, avg_recall_at_k, all_recommended_items, all_actual_items = get_predictions(model, val_loader, device, k=200)

In [39]:
all_actual_items[:10]

[[321], [375], [185], [301], [136], [276], [168], [194], [88], [103]]

In [41]:
train_data.item_encoder.inverse_transform([321])

array(['B0010E3JW6'], dtype=object)

In [None]:
"B0010E3JW6"

In [42]:
train_item_metadata[train_item_metadata.parent_asin == 'B0010E3JW6']

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
1594,Appliances,FRAM Fresh Breeze Cabin Air Filter with Arm & ...,4.7,602,[VEHICLE APPLICATIONS: 2001-2005 Lexus IS300; ...,[Cabin air filters have proven to be essential...,13.96,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Fram,"[Automotive, Replacement Parts, Filters, Air F...","{""Brand Name"": ""Fram"", ""Model Info"": ""CF10138""...",B0010E3JW6,,,


In [None]:
for k in [100, 200, 500, 1000, 2000]:
    val_precision_at_k, val_recall_at_k = evaluate(model, val_loader, device, k)
    print(f'Validation Set Performance: Precision@{k}: {val_precision_at_k}, Recall@{k}: {val_recall_at_k}')

Validation Set Performance: Precision@100: 0.01, Recall@100: 1.0
Validation Set Performance: Precision@200: 0.005, Recall@200: 1.0
Validation Set Performance: Precision@500: 0.002000000000000001, Recall@500: 1.0
Validation Set Performance: Precision@1000: 0.0010000000000000005, Recall@1000: 1.0
Validation Set Performance: Precision@2000: 0.0005000000000000002, Recall@2000: 1.0
