In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def train_file_path(category):
  return f'/content/drive/MyDrive/ShoppingPulse/datasets/raw/interactions_train/flattened_data_{category}.parquet'

In [6]:
class RatingsDataset(Dataset):
    def __init__(self, df):
        self.user_ids = df['user_id'].astype('category').cat.codes.values
        self.item_ids = df['parent_asin'].astype('category').cat.codes.values
        self.ratings = df['rating'].values

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.item_ids[idx], self.ratings[idx]

class GMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim):
        super(GMF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

    def forward(self, user_ids, item_ids):
        user_embeds = self.user_embedding(user_ids)
        item_embeds = self.item_embedding(item_ids)
        return user_embeds * item_embeds

class MLP(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, hidden_dim):
        super(MLP, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.ReLU()
        )

    def forward(self, user_ids, item_ids):
        user_embeds = self.user_embedding(user_ids)
        item_embeds = self.item_embedding(item_ids)
        x = torch.cat([user_embeds, item_embeds], dim=-1)
        return self.fc_layers(x)

class NeuMF(nn.Module):
    def __init__(self, num_users, num_items, gmf_dim, mlp_dim, hidden_dim):
        super(NeuMF, self).__init__()
        self.gmf = GMF(num_users, num_items, gmf_dim)
        self.mlp = MLP(num_users, num_items, mlp_dim, hidden_dim)
        self.final_layer = nn.Linear(gmf_dim + hidden_dim // 4, 1)

    def forward(self, user_ids, item_ids):
        gmf_out = self.gmf(user_ids, item_ids)
        mlp_out = self.mlp(user_ids, item_ids)
        concat_out = torch.cat([gmf_out, mlp_out], dim=-1)
        return self.final_layer(concat_out).squeeze()

def collaborative_filtering_neumf(category, gmf_dim=32, mlp_dim=32, hidden_dim=64, epochs=10, batch_size=64, lr=0.001):
    dataset_path = train_file_path(category)
    df = pd.read_parquet(dataset_path)

    df['user_id'] = df['user_id'].astype(str)
    df['parent_asin'] = df['parent_asin'].astype(str)
    df['rating'] = df['rating'].astype(float)

    dataset = RatingsDataset(df)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    num_users = len(df['user_id'].unique())
    num_items = len(df['parent_asin'].unique())

    model = NeuMF(num_users, num_items, gmf_dim, mlp_dim, hidden_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for user_ids, item_ids, ratings in dataloader:
            user_ids = torch.tensor(user_ids, dtype=torch.long)
            item_ids = torch.tensor(item_ids, dtype=torch.long)
            ratings = torch.tensor(ratings, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(user_ids, item_ids)
            loss = criterion(outputs, ratings)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss/len(dataloader)}')

    model.eval()
    with torch.no_grad():
        user_ids = torch.tensor(dataset.user_ids, dtype=torch.long)
        item_ids = torch.tensor(dataset.item_ids, dtype=torch.long)
        predicted_ratings = model(user_ids, item_ids).numpy()

    predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=['predicted_rating'])
    result_df = df.copy()
    result_df['predicted_rating'] = predicted_ratings_df['predicted_rating']

    return result_df.pivot(index='user_id', columns='parent_asin', values='predicted_rating')

category = "Gift_Cards"
predicted_ratings_df = collaborative_filtering_neumf(category)
print(predicted_ratings_df)

  user_ids = torch.tensor(user_ids, dtype=torch.long)
  item_ids = torch.tensor(item_ids, dtype=torch.long)
  ratings = torch.tensor(ratings, dtype=torch.float)


Epoch 1/10, Loss: 1.6909517706104167
Epoch 2/10, Loss: 1.1267571861203134
Epoch 3/10, Loss: 1.0812575206168262
Epoch 4/10, Loss: 1.0180320267940297
Epoch 5/10, Loss: 0.9190749168735536
Epoch 6/10, Loss: 0.7931490699007745
Epoch 7/10, Loss: 0.6373895726198979
Epoch 8/10, Loss: 0.47059869331007237
Epoch 9/10, Loss: 0.3284822066079638
Epoch 10/10, Loss: 0.2246081908650654
parent_asin                   1619923009  B00067G16E  B000VFA4BY  B001GXRQW0  \
user_id                                                                        
AE225J6Y3OFUCKVVCRDHXLZKDULA         NaN         NaN         NaN         NaN   
AE22EOOOTMJOODRKZEOCVDODY4GA         NaN         NaN         NaN         NaN   
AE22HVB2CJ27FBMLD6PBNZ3KLHUQ         NaN         NaN         NaN         NaN   
AE22LNDYO5VO3ZPRC4SO4HTXLQHQ         NaN         NaN         NaN         NaN   
AE22LYKDYTYNPV7M4SGSDIGUQY7Q         NaN         NaN         NaN         NaN   
...                                  ...         ...         ...    

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np

class RatingsDataset(Dataset):
    def __init__(self, df):
        self.user_ids = df['user_id'].astype('category').cat.codes.values
        self.item_ids = df['parent_asin'].astype('category').cat.codes.values
        self.ratings = df['rating'].values

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.item_ids[idx], self.ratings[idx]

class GMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim):
        super(GMF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)

    def forward(self, user_ids, item_ids):
        user_embeds = self.user_embedding(user_ids)
        item_embeds = self.item_embedding(item_ids)
        return user_embeds * item_embeds

class MLP(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, hidden_dim):
        super(MLP, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.ReLU()
        )

        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)

    def forward(self, user_ids, item_ids):
        user_embeds = self.user_embedding(user_ids)
        item_embeds = self.item_embedding(item_ids)
        x = torch.cat([user_embeds, item_embeds], dim=-1)
        return self.fc_layers(x)

class NeuMF(nn.Module):
    def __init__(self, num_users, num_items, gmf_dim, mlp_dim, hidden_dim):
        super(NeuMF, self).__init__()
        self.gmf = GMF(num_users, num_items, gmf_dim)
        self.mlp = MLP(num_users, num_items, mlp_dim, hidden_dim)
        self.final_layer = nn.Linear(gmf_dim + hidden_dim // 4, 1)

        nn.init.kaiming_normal_(self.final_layer.weight, nonlinearity='relu')
        if self.final_layer.bias is not None:
            nn.init.constant_(self.final_layer.bias, 0)

    def forward(self, user_ids, item_ids):
        gmf_out = self.gmf(user_ids, item_ids)
        mlp_out = self.mlp(user_ids, item_ids)
        concat_out = torch.cat([gmf_out, mlp_out], dim=-1)
        return self.final_layer(concat_out).squeeze()

def collaborative_filtering_neumf(category, gmf_dim=32, mlp_dim=32, hidden_dim=64, epochs=1, batch_size=64, lr=0.001):
    dataset_path = train_file_path(category)
    df = pd.read_parquet(dataset_path)

    df['user_id'] = df['user_id'].astype(str)
    df['parent_asin'] = df['parent_asin'].astype(str)
    df['rating'] = df['rating'].astype(float)

    # Normalize ratings
    df['rating'] = df['rating'] / df['rating'].max()

    dataset = RatingsDataset(df)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    num_users = len(df['user_id'].unique())
    num_items = len(df['parent_asin'].unique())

    model = NeuMF(num_users, num_items, gmf_dim, mlp_dim, hidden_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    # Move model to device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for user_ids, item_ids, ratings in dataloader:
            user_ids = torch.tensor(user_ids, dtype=torch.long).clone().detach()
            item_ids = torch.tensor(item_ids, dtype=torch.long).clone().detach()
            ratings = torch.tensor(ratings, dtype=torch.float).clone().detach()

            optimizer.zero_grad()
            outputs = model(user_ids, item_ids)
            loss = criterion(outputs, ratings)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss/len(dataloader)}')

    model.eval()
    with torch.no_grad():
        user_ids = torch.tensor(dataset.user_ids, dtype=torch.long).clone().detach()
        item_ids = torch.tensor(dataset.item_ids, dtype=torch.long).clone().detach()
        predicted_ratings = model(user_ids, item_ids).cpu().numpy()

    predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=['predicted_rating'])
    result_df = df.copy()
    result_df['predicted_rating'] = predicted_ratings_df['predicted_rating']

    return result_df.pivot(index='user_id', columns='parent_asin', values='predicted_rating')

# Example usage
category = 'Gift_Cards'
predicted_ratings_df = collaborative_filtering_neumf(category)
print(predicted_ratings_df)


  user_ids = torch.tensor(user_ids, dtype=torch.long).clone().detach()
  item_ids = torch.tensor(item_ids, dtype=torch.long).clone().detach()
  ratings = torch.tensor(ratings, dtype=torch.float).clone().detach()


Epoch 1/1, Loss: 0.0716021574193497
parent_asin                   1619923009  B00067G16E  B000VFA4BY  B001GXRQW0  \
user_id                                                                        
AE225J6Y3OFUCKVVCRDHXLZKDULA         NaN         NaN         NaN         NaN   
AE22EOOOTMJOODRKZEOCVDODY4GA         NaN         NaN         NaN         NaN   
AE22HVB2CJ27FBMLD6PBNZ3KLHUQ         NaN         NaN         NaN         NaN   
AE22LNDYO5VO3ZPRC4SO4HTXLQHQ         NaN         NaN         NaN         NaN   
AE22LYKDYTYNPV7M4SGSDIGUQY7Q         NaN         NaN         NaN         NaN   
...                                  ...         ...         ...         ...   
AHZZWDFRRAR5P43PBUMDDCXO2DTQ         NaN         NaN         NaN         NaN   
AHZZX33LWQAIOABECYVVD5VI5MXA         NaN         NaN         NaN         NaN   
AHZZXXX7OJX266FSIET6OU4BEBSA         NaN         NaN         NaN         NaN   
AHZZYQXZBHX3G3BQEENWOMJYYXDQ         NaN         NaN         NaN         NaN   
AHZZ