In [1]:
!pip install nbimporter

[0m

In [2]:
!pip install ipynb

[0m

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from pathlib import Path
import pickle

In [11]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [12]:
import nbimporter

In [13]:
# Import architectures
from recommenders_architecture import *


### Load data

In [14]:
# ======= Load Pairwise Training / Validation Data =======
# All paths are resolved relative to the parent of the current working directory.
current_dir = Path.cwd()

pairwise_data_train_path = current_dir.parent.joinpath("data", "pairwise", "pairwise_train.csv")
pairwise_data_val_path = current_dir.parent.joinpath("data", "pairwise", "pairwise_val.csv")

train = pd.read_csv(pairwise_data_train_path)
val = pd.read_csv(pairwise_data_val_path)

# ======= Encoded item metadata directory =======
# Only the path is defined here; nothing is read from it in this cell.
encoded_dir = current_dir.parent.joinpath("data", "encoded")


In [15]:
model_path = current_dir.parent / "models" / "Yahlly_10_3_NCF_with_Metadata_biases_2_0.9221274085422783.pth"

# The checkpoint holds the entire pickled model object (not a state_dict), so
# full unpickling has to be requested explicitly: PyTorch >= 2.6 defaults to
# weights_only=True, which refuses to rebuild arbitrary model classes.
# NOTE(review): full unpickling can execute arbitrary code -- only load
# checkpoints that come from a trusted source.
model = torch.load(model_path, map_location=device, weights_only=False)

# Switch to evaluation mode; as the last expression this also displays the architecture.
model.eval()

NCFWithMetadata(
  (user_embedding_gmf): Embedding(1096901, 24)
  (item_embedding_gmf): Embedding(198771, 24)
  (user_embedding_mlp): Embedding(1096901, 24)
  (item_embedding_mlp): Embedding(198771, 24)
  (item_proj_gmf): Linear(in_features=280, out_features=24, bias=True)
  (item_proj_mlp): Linear(in_features=280, out_features=24, bias=True)
  (gmf_layer): Linear(in_features=24, out_features=1, bias=True)
  (mlp): Sequential(
    (0): Linear(in_features=48, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2549127797233314, inplace=False)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2549127797233314, inplace=False)
  )
  (final_layer): Linear(in_features=129, out_features=1, bias=True)
)

In [30]:
# NOTE(review): `item_metadata` is not defined in any cell visible in this notebook;
# this cell presumably relied on leftover kernel state -- confirm before re-running.
item_metadata[0].shape

torch.Size([256])

In [18]:

# ======= Configurations =======
# Earlier settings, kept for reference only:
#   EMBEDDING_DIM = 128, ITEM_FEATURE_DIM = item_metadata[0].shape, LR = 0.0001
#   ITEM_FEATURE_DIM = 3075 (length of the text+image item metadata vector)
EMBEDDING_DIM = 24        # User embedding size
ITEM_FEATURE_DIM = 256    # After autoencoder
BATCH_SIZE = 512
EPOCHS = 10
LR = 1e-5                 # Learning rate
VAL_SPLIT = 0.1

In [19]:
# ======= Custom Dataset Class =======
class PairwiseDataset(Dataset):
    """Dataset of (user, item1, item2, label) rows backed by a DataFrame.

    The frame must provide the columns ``user_id``, ``item1_id``,
    ``item2_id`` and ``label``; each one is kept as the array returned by
    ``.values``.
    """

    def __init__(self, dataframe):
        """Extract the four required columns from ``dataframe``."""
        self.users, self.item1, self.item2, self.labels = (
            dataframe[column].values
            for column in ("user_id", "item1_id", "item2_id", "label")
        )

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item1, item2, label)`` tuple stored at ``idx``."""
        return tuple(
            column[idx]
            for column in (self.users, self.item1, self.item2, self.labels)
        )


In [39]:
# Display the pairwise training frame read from pairwise_train.csv.
train

Unnamed: 0,user_id,item1_id,item2_id,label,timestamp,rating
0,0,0,13349,0,1349041740000,5.0
1,0,22959,1,1,1370958618000,1.0
2,0,97562,2,2,1440038761000,5.0
3,0,23003,3,3,1483320893000,3.0
4,1,179127,5,5,1600753653091,5.0
...,...,...,...,...,...,...
8030471,1096899,26803,32852,32852,1692552324736,5.0
8030472,1096899,177842,10643,10643,1692552357767,5.0
8030473,1096900,183765,86867,183765,1600792118191,1.0
8030474,1096900,155119,99585,155119,1615811081145,1.0


In [41]:
# Display the pairwise validation frame read from pairwise_val.csv.
val

Unnamed: 0,user_id,item1_id,item2_id,label,timestamp,rating
0,0,16177,4,4,1490800837000,5.0
1,1,10,174536,10,1676601720832,2.0
2,2,42860,16,16,1588626339041,5.0
3,3,20877,29,29,1605455790941,5.0
4,4,17870,41,41,1638039645551,5.0
...,...,...,...,...,...,...
1096896,1096896,197,11404,11404,1693892929945,5.0
1096897,1096897,32215,161020,161020,1617640776113,5.0
1096898,1096898,9974,33337,9974,1691348903005,5.0
1096899,1096899,45300,92761,92761,1692552496934,5.0


In [20]:
# Wrap both frames in PairwiseDataset; only the training batches are shuffled.
train_dataset = PairwiseDataset(train)
val_dataset = PairwiseDataset(val)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [21]:
# Number of distinct users / items (the same sizes as the embedding tables of the loaded model).
num_users = 1_096_901
num_items = 198_771


In [None]:
# `sp` is otherwise only imported in a later cell of this notebook; importing it
# here keeps the cell runnable top-to-bottom on a fresh kernel.
import scipy.sparse as sp

# Define directories (all relative to the parent of the working directory)
current_dir = Path.cwd()
encoded_dir = current_dir.parent / "data" / "encoded"
user_item_dir = current_dir.parent / "data" / "pre_process"
pairwise_output_dir = current_dir.parent / "data" / "pairwise"

# Load user-item interaction matrix
user_item_matrix = sp.load_npz(user_item_dir / 'user_item_matrix.npz')

In [7]:
from scipy.sparse import csr_matrix

In [25]:
# Display the pairwise training frame again (same frame as shown earlier in the notebook).
train

Unnamed: 0,user_id,item1_id,item2_id,label,timestamp,rating
0,0,0,13349,0,1349041740000,5.0
1,0,22959,1,1,1370958618000,1.0
2,0,97562,2,2,1440038761000,5.0
3,0,23003,3,3,1483320893000,3.0
4,1,179127,5,5,1600753653091,5.0
...,...,...,...,...,...,...
8030471,1096899,26803,32852,32852,1692552324736,5.0
8030472,1096899,177842,10643,10643,1692552357767,5.0
8030473,1096900,183765,86867,183765,1600792118191,1.0
8030474,1096900,155119,99585,155119,1615811081145,1.0


In [27]:
# Drop interactions that lack any field needed below. This is done in place,
# so the cleaned frame is what every other cell using `df2` sees.
df2.dropna(subset=['user_idx', 'item_idx', 'timestamp', 'rating'], inplace=True)

# Sparse user x item matrix of ratings (rows: user_idx, columns: item_idx)
user_item_matrix = csr_matrix((df2['rating'], (df2['user_idx'], df2['item_idx'])))

# Item popularity = number of non-zero entries in each item column
item_popularity = np.array((user_item_matrix != 0).sum(axis=0)).flatten()
popularity_prob = item_popularity / item_popularity.sum()

# item_popularity[i] is the count for matrix column i, i.e. for item_idx == i,
# so the lookup must be keyed by column index. Zipping against
# df2['item_idx'].unique() (order of first appearance) is only correct when the
# items happen to first appear in ascending index order.
item_popularity_dict = dict(enumerate(item_popularity))


In [31]:
# Average popularity count over every item in the lookup table.
popularity_counts = list(item_popularity_dict.values())
mean_value = np.mean(popularity_counts)

print(f"Mean of dictionary values: {mean_value}")

Mean of dictionary values: 45.91905760900735


In [28]:
import numpy as np
import torch
from tqdm import tqdm
from scipy.sparse import csr_matrix


# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Weight of the popularity penalty subtracted from each model score.
POPULARITY_PENALTY = 5

correct = 0
total = 0

# Validation loop. Gradients are never needed for evaluation, so run under
# no_grad to avoid building an autograd graph for every batch.
# (The previous `val_loss` bookkeeping was removed: it was never accumulated,
# so it always evaluated to 0.)
with torch.no_grad():
    for user_ids, item1_ids, item2_ids, labels in tqdm(val_loader):
        user_ids, item1_ids, item2_ids, labels = (
            user_ids.to(device),
            item1_ids.to(device),
            item2_ids.to(device),
            labels.to(device),
        )

        # Model scores for both candidates of each pair
        score1 = model(user_ids, item1_ids)
        score2 = model(user_ids, item2_ids)

        # Popularity of each candidate (0 for ids missing from the lookup).
        # .tolist() converts the whole batch of ids at once instead of calling
        # .item() on every element.
        pop_score1 = torch.tensor(
            [item_popularity_dict.get(item_id, 0) for item_id in item1_ids.tolist()],
            device=device,
        )
        pop_score2 = torch.tensor(
            [item_popularity_dict.get(item_id, 0) for item_id in item2_ids.tolist()],
            device=device,
        )

        # Combine model score and popularity score.
        # NOTE(review): assumes the model returns one score per row, shaped like
        # the popularity tensors -- confirm against the model's forward().
        final_score1 = score1 - POPULARITY_PENALTY * pop_score1
        final_score2 = score2 - POPULARITY_PENALTY * pop_score2

        # Rows whose label equals item1's id treat item1 as the positive item;
        # every other row treats item2 as the positive item.
        labels_binary = (labels == item1_ids).float()
        pos_scores = torch.where(labels_binary == 1, final_score1, final_score2)
        neg_scores = torch.where(labels_binary == 1, final_score2, final_score1)

        # A pair counts as correct only when the positive item scores strictly higher
        predictions = pos_scores > neg_scores
        correct += predictions.sum().item()
        total += predictions.shape[0]

# Compute final validation accuracy
val_accuracy = correct / total

print(f"Val Accuracy = {val_accuracy:.4f}")


  "        self.num_items = df2['item_idx'].max() + 1\n",
100%|██████████| 2143/2143 [03:39<00:00,  9.78it/s]

Val Accuracy = 0.4825





In [23]:
import numpy as np
import torch
from tqdm import tqdm
from scipy.sparse import csr_matrix

# Ensure required columns exist and drop rows with missing values (in place on `df2`)
df2.dropna(subset=['user_idx', 'item_idx', 'timestamp', 'rating'], inplace=True)

# Create a user-item matrix with actual ratings
user_item_matrix = csr_matrix((df2['rating'], (df2['user_idx'], df2['item_idx'])))

# Compute item popularity based on occurrence (number of ratings)
item_popularity = np.array((user_item_matrix != 0).sum(axis=0)).flatten()

# item_popularity[i] is the count for matrix column i, i.e. for item_idx == i,
# so the lookup must be keyed by column index. Zipping against
# df2['item_idx'].unique() (order of first appearance) is only correct when the
# items happen to first appear in ascending index order.
item_popularity_dict = dict(enumerate(item_popularity))

# Device setup (not needed but keeping for consistency)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Evaluation loop using only item popularity
correct = 0
total = 0

for user_ids, item1_ids, item2_ids, labels in tqdm(val_loader):
    item1_ids, item2_ids, labels = (
        item1_ids.cpu().numpy(),
        item2_ids.cpu().numpy(),
        labels.cpu().numpy(),
    )

    # Get popularity scores for the items (0 for ids missing from the lookup)
    pop_score1 = np.array([item_popularity_dict.get(item_id, 0) for item_id in item1_ids])
    pop_score2 = np.array([item_popularity_dict.get(item_id, 0) for item_id in item2_ids])

    # Predict the more popular item; on a tie the prediction falls to item2
    predictions = pop_score1 > pop_score2
    labels_binary = labels == item1_ids  # True if item1 is the correct answer

    # Compute accuracy
    correct += (predictions == labels_binary).sum()
    total += len(labels)

# Compute final validation accuracy
val_accuracy = correct / total

print(f"Val Accuracy = {val_accuracy:.4f}")


100%|██████████| 2143/2143 [00:03<00:00, 627.52it/s]

Val Accuracy = 0.5171





In [None]:

# # ======= Initialize Model, Loss, Optimizer =======
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.eval()
# correct = 0
# total = 0
# val_loss = 0
# for user_ids, item1_ids, item2_ids, labels in tqdm(train_loader):
#     user_ids, item1_ids, item2_ids, labels = (
#         user_ids.to(device),
#         item1_ids.to(device),
#         item2_ids.to(device),
#         labels.to(device),
#     )
#     score1 = model(user_ids, item1_ids)
#     score2 = model(user_ids, item2_ids)
    
#     # Determine the correct positive and negative scores based on labels
#     labels_binary = (labels == item1_ids).float()
#     #print(labels_binary)
#     pos_scores = torch.where(labels_binary == 1, score1, score2)
#     neg_scores = torch.where(labels_binary == 1, score2, score1)
#     #print(pos_scores)
#     # Check if the model correctly ranked the positive item higher
#     predictions = pos_scores > neg_scores
#     correct += predictions.sum().item()
#     total += predictions.shape[0]
   
# val_accuracy = correct / total
# val_loss=val_loss/len(val_loader)
# print(f"Val Accuracy = {val_accuracy:.4f}")


In [3]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import pickle
from scipy.sparse import load_npz

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import optuna
import wandb
import numpy as np
import pandas as pd
import scipy.sparse as sp
from pathlib import Path
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import pickle

current_dir = Path.cwd()

# Training interactions table (columns include user_id, parent_asin, rating, timestamp)
user_item_file_path = current_dir.parent.joinpath(
    "data", "data_and_test_files", "user_item_rating_table_train.csv"
)
df2 = pd.read_csv(user_item_file_path, index_col=0)

# Pickled lookup tables applied below to the raw user_id / parent_asin values
user_mapping_path = current_dir.parent.joinpath("data", "pre_process", "user_mapping.pkl")
item_mapping_path = current_dir.parent.joinpath("data", "pre_process", "item_mapping.pkl")

with open(user_mapping_path, 'rb') as f:
    user_mapping = pickle.load(f)
with open(item_mapping_path, 'rb') as f:
    item_mapping = pickle.load(f)

# Add the numeric index columns produced by the mappings
for index_column, source_column, mapping in (
    ('user_idx', 'user_id', user_mapping),
    ('item_idx', 'parent_asin', item_mapping),
):
    df2[index_column] = df2[source_column].map(mapping)

df2

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_idx,item_idx
0,AGGZ357AO26RQZVRLGU4D4N52DZQ,B009RTBRVG,5.0,1349041740000,0,0
1,AGGZ357AO26RQZVRLGU4D4N52DZQ,B003MZ01CM,1.0,1370958618000,0,1
2,AGGZ357AO26RQZVRLGU4D4N52DZQ,B07L6QT33F,5.0,1440038761000,0,2
3,AGGZ357AO26RQZVRLGU4D4N52DZQ,B07V6PKCCG,3.0,1483320893000,0,3
5,AGGZ357AO26RQZVRLGU4D4N52DZQ,B0BNP511CS,5.0,1490800837000,0,4
...,...,...,...,...,...,...
10563583,AEQHNMSCENA2TJAJEFK5SFI3ZKXA,B09G9THPC6,5.0,1692552496934,1096899,92761
10563584,AFGBVYKTFNQH5NIHXNB5ANVPANXQ,B088D217BQ,1.0,1600792118191,1096900,183765
10563585,AFGBVYKTFNQH5NIHXNB5ANVPANXQ,B07WV5H4DN,1.0,1615811081145,1096900,155119
10563587,AFGBVYKTFNQH5NIHXNB5ANVPANXQ,B0BL3PQHR4,4.0,1693494834857,1096900,25515


In [None]:
# Ensure 'user_idx', 'item_idx', 'timestamp', and 'rating' columns exist and drop rows with missing values
df2.dropna(subset=['user_idx', 'item_idx', 'timestamp', 'rating'], inplace=True)

# Create a user-item matrix with actual ratings
user_item_matrix = csr_matrix((df2['rating'], (df2['user_idx'], df2['item_idx'])))

# Per-item popularity: number of non-zero entries in each item column
item_popularity = np.array((user_item_matrix != 0).sum(axis=0)).flatten()
item_ids = df2['item_idx'].unique()

# Popularity normalised to sum to 1
popularity_prob = item_popularity / item_popularity.sum()


# ======= Model-only pairwise accuracy on the TRAINING pairs =======
# TODO (original note): reformat this to choose the more popular item as the prediction.
# This loop iterates `train_loader`, so the metric is reported as train accuracy;
# it was previously stored in `val_accuracy` and printed as "Val Accuracy",
# which would silently overwrite the real validation result.
# NOTE(review): if validation was intended, iterate `val_loader` instead.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
correct = 0
total = 0

# No gradients are needed for evaluation; no_grad avoids building autograd graphs.
# (The unused `val_loss` bookkeeping was removed: it was never accumulated.)
with torch.no_grad():
    for user_ids, item1_ids, item2_ids, labels in tqdm(train_loader):
        user_ids, item1_ids, item2_ids, labels = (
            user_ids.to(device),
            item1_ids.to(device),
            item2_ids.to(device),
            labels.to(device),
        )
        score1 = model(user_ids, item1_ids)
        score2 = model(user_ids, item2_ids)

        # Rows whose label equals item1's id treat item1 as the positive item;
        # every other row treats item2 as the positive item.
        labels_binary = (labels == item1_ids).float()
        pos_scores = torch.where(labels_binary == 1, score1, score2)
        neg_scores = torch.where(labels_binary == 1, score2, score1)

        # A pair counts as correct only when the positive item scores strictly higher
        predictions = pos_scores > neg_scores
        correct += predictions.sum().item()
        total += predictions.shape[0]

train_accuracy = correct / total
print(f"Train Accuracy = {train_accuracy:.4f}")


## Predict on warm items

In [32]:
# Warm-item test pairs (user_id, item_0, item_1, class); `class` is overwritten with predictions further down.
pairwise_data_test_path = current_dir.parent.joinpath(
    "data", "data_and_test_files", "warm_items_classification_test_format.csv"
)
test = pd.read_csv(pairwise_data_test_path)
test

Unnamed: 0,user_id,item_0,item_1,class
0,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,B0BGJ85RYM,B07DRMVW3W,-1
1,AGKHLEW2SOWHNMFQIJGBECAF7INQ,B09Q7G1MMW,B00W3DKZAW,-1
2,AGBFYI2DDIKXC5Y4FARTYDTQBMFQ,B07C2SNKLV,B087P7MTPH,-1
3,AFVNEEPDEIH5SPUN5BWC6NKL3WNQ,B096TL6GP9,B0BD8S9X3D,-1
4,AHBHMMUTJ5W3SZUHVKUP35ZYKUHA,B07M9LPLM1,B07JZ5Z926,-1
...,...,...,...,...
377495,AEHZGPPZYJN3TZJCT3VRWWNDTFMA,B08W4VQ5SK,B08CXHZ568,-1
377496,AGMYCLCYBCQOSQU4B3HWPGC75NDA,B0BQ5TWJTB,B0B777HLJX,-1
377497,AHFOHO6VR5STQII7CB7XS3ZNHSRQ,B00I5IGWQ0,B07XCFNQK9,-1
377498,AFL75NBCXVMDRIUFOZWSSQBL242A,B07942VRLV,B07PP53522,-1


In [36]:
import pickle

# Pickled lookup tables applied below to the raw user / item ids
user_mapping_path = current_dir.parent.joinpath("data", "pre_process", "user_mapping.pkl")
item_mapping_path = current_dir.parent.joinpath("data", "pre_process", "item_mapping.pkl")

with open(user_mapping_path, 'rb') as f:
    user_mapping = pickle.load(f)
with open(item_mapping_path, 'rb') as f:
    item_mapping = pickle.load(f)

# Add numeric index columns to the test frame; ids missing from a mapping become NaN
for target_column, source_column, mapping in (
    ('user_idx', 'user_id', user_mapping),
    ('item1_id', 'item_0', item_mapping),
    ('item2_id', 'item_1', item_mapping),
):
    test[target_column] = test[source_column].map(mapping)

# Constant placeholder label for every test row
test['label'] = -1

In [38]:
# PairwiseDataset reads the `user_id` column, which in `test` still holds the raw
# string ids. Hand it a copy whose `user_id` is the numeric `user_idx`, so test
# batches have the same layout as the train/val batches. `test` itself is left
# untouched and keeps the raw ids needed for the output file.
test_loader = DataLoader(
    PairwiseDataset(test.assign(user_id=test['user_idx'])),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [42]:
# Popularity-only prediction for every test pair

tot_pred = []
for user_ids, item1_ids, item2_ids, labels in tqdm(test_loader):
    # Work on NumPy copies of the batch
    item1_ids = item1_ids.cpu().numpy()
    item2_ids = item2_ids.cpu().numpy()
    labels = labels.cpu().numpy()

    # Popularity of both candidates (0 for ids missing from the lookup)
    pop_score1 = np.array([item_popularity_dict.get(item_id, 0) for item_id in item1_ids])
    pop_score2 = np.array([item_popularity_dict.get(item_id, 0) for item_id in item2_ids])

    # Class 1 when the second item is strictly more popular, otherwise class 0 (ties included)
    predictions = (pop_score2 > pop_score1).astype(int)
    tot_pred.append(predictions)

# Batches are in file order (the loader is not shuffled), so the concatenation lines up with the rows of `test`
test['class'] = np.concatenate(tot_pred)

100%|██████████| 738/738 [00:01<00:00, 713.56it/s]


In [45]:
# Count how many test pairs were assigned to each predicted class.
test['class'].value_counts()

class
0    189591
1    187909
Name: count, dtype: int64

In [50]:
# Save the predictions without the index; the relative path resolves against the current working directory.
output_columns = ['user_id', 'item_0', 'item_1', 'class']
test[output_columns].to_csv("warm_items_classification_test_format.csv", index=False)