We want to start by preprocessing data and get it ready for the NN

In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [4]:
# Load the datasets
events = pd.read_csv('../data/events.csv')
prop1 = pd.read_csv('../data/item_properties_part1.csv')
prop2 = pd.read_csv('../data/item_properties_part2.csv')

properties = pd.concat([prop1, prop2], ignore_index=True)

In [5]:
properties.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [6]:
#events.sort_values(by=['visitorid', 'itemid'], inplace=True)
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


Properties is not going to be too useful to us, but we can incorporate it later in a content-based filtering system

## Preprocessing

Now we are trying to make a data matrix that has view counts, binary added-to-cart, and binary transaction.

In [7]:
views = events[events['event'] == 'view'][['visitorid', 'itemid']].copy()
views = views.groupby(['visitorid', 'itemid']).size().reset_index(name='view')

carts = events[events['event'] == 'addtocart'][['visitorid', 'itemid']].copy()
carts['cart'] = 1

transactions = events[events['event'] == 'transaction'][['visitorid', 'itemid']].copy()
transactions['transaction'] = 1

merged = pd.merge(views, carts, on=['visitorid', 'itemid'], how='outer')
merged = pd.merge(merged, transactions, on=['visitorid', 'itemid'], how='outer')

merged.fillna(0, inplace=True)
merged.head()

Unnamed: 0,visitorid,itemid,view,cart,transaction
0,0,67045,1.0,0.0,0.0
1,0,285930,1.0,0.0,0.0
2,0,357564,1.0,0.0,0.0
3,1,72028,1.0,0.0,0.0
4,2,216305,2.0,0.0,0.0


In [8]:
data = merged.copy()
data['score'] = np.minimum(np.log(1 + data['view']), 5) + 5*data['cart'] + 10*data['transaction']

data.head()

Unnamed: 0,visitorid,itemid,view,cart,transaction,score
0,0,67045,1.0,0.0,0.0,0.693147
1,0,285930,1.0,0.0,0.0,0.693147
2,0,357564,1.0,0.0,0.0,0.693147
3,1,72028,1.0,0.0,0.0,0.693147
4,2,216305,2.0,0.0,0.0,1.098612


Boom. I think we need to map everything to unique IDs now for the NN.

In [9]:
num_users = data['visitorid'].nunique()
num_items = data['itemid'].nunique()

print(f"Unique users: {num_users}")
print(f"Unique items: {num_items}")

Unique users: 1407580
Unique items: 235061


In [10]:
unique_users = data.visitorid.unique()
user_to_index = {old: new for new,old in enumerate(unique_users)}

unique_products = data.itemid.unique()
product_to_index = {old: new for new,old in enumerate(unique_products)}

data['usermap'] = data['visitorid'].map(user_to_index)
data['itemmap'] = data['itemid'].map(product_to_index)

data.head()

Unnamed: 0,visitorid,itemid,view,cart,transaction,score,usermap,itemmap
0,0,67045,1.0,0.0,0.0,0.693147,0,0
1,0,285930,1.0,0.0,0.0,0.693147,0,1
2,0,357564,1.0,0.0,0.0,0.693147,0,2
3,1,72028,1.0,0.0,0.0,0.693147,1,3
4,2,216305,2.0,0.0,0.0,1.098612,2,4


In [11]:
class NCFDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['usermap'].values, dtype=torch.long)
        self.items = torch.tensor(df['itemmap'].values, dtype=torch.long)
        self.scores = torch.tensor(df['score'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.scores[idx]

dataset = NCFDataset(data)
dataloader = DataLoader(dataset, batch_size=512, shuffle=True)

In [14]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=32):
        super(NCF, self).__init__()
        self.user_embed = nn.Embedding(num_users, embedding_dim)
        self.item_embed = nn.Embedding(num_items, embedding_dim)
        
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, user_ids, item_ids):
        user_vecs = self.user_embed(user_ids)
        item_vecs = self.item_embed(item_ids)
        x = torch.cat([user_vecs, item_vecs], dim=-1)
        out = self.fc_layers(x).squeeze()
        return out

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = NCF(num_users=len(user_to_index), num_items=len(product_to_index)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

model.train()
for epoch in range(30):  # increase epochs later
    total_loss = 0
    for user, item, score in dataloader:
        user, item, score = user.to(device), item.to(device), score.to(device)
        optimizer.zero_grad()
        preds = model(user, item)
        loss = criterion(preds, score)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")

Epoch 1 Loss: 13669.5442
Epoch 2 Loss: 12649.2259
Epoch 3 Loss: 11523.4586
Epoch 4 Loss: 10422.7467
Epoch 5 Loss: 9321.5823
Epoch 6 Loss: 8263.9447
Epoch 7 Loss: 7272.9897
Epoch 8 Loss: 6351.6711
Epoch 9 Loss: 5511.7323
Epoch 10 Loss: 4769.6020
Epoch 11 Loss: 4126.7672
Epoch 12 Loss: 3563.9383
Epoch 13 Loss: 3098.5544
Epoch 14 Loss: 2709.6837
Epoch 15 Loss: 2385.3076
Epoch 16 Loss: 2119.8204
Epoch 17 Loss: 1896.7484
Epoch 18 Loss: 1708.1860
Epoch 19 Loss: 1553.5207
Epoch 20 Loss: 1423.0626
Epoch 21 Loss: 1307.6498
Epoch 22 Loss: 1212.3939
Epoch 23 Loss: 1127.0281
Epoch 24 Loss: 1056.2507
Epoch 25 Loss: 993.1530
Epoch 26 Loss: 935.3659
Epoch 27 Loss: 885.6914
Epoch 28 Loss: 836.5214
Epoch 29 Loss: 795.4093
Epoch 30 Loss: 761.3263
