In [2]:
# Import only the name this cell uses; a star import hides where names come
# from and can silently shadow other notebook variables.
from preprocess import preprocess

# Raw Epinions dump consumed by the preprocessing step.
data_path = "data/epinions.txt"

# Run preprocessing on the raw file. The meaning of the second argument (10)
# is defined in preprocess.py -- TODO confirm and give it a named constant.
preprocess(data_path, 10)

Processed 10000 lines
Processed 20000 lines
Processed 30000 lines
Processed 40000 lines
Processed 50000 lines
Processed 60000 lines
Processed 70000 lines
Processed 80000 lines
Processed 90000 lines
Processed 100000 lines
Processed 110000 lines
Processed 120000 lines
Processed 130000 lines
Processed 140000 lines
Number of users: 1178
Number of items: 15094
Number of interactions: 20084
Sparsity: 0.9988704627008607
Test set size: 1178
    index          user                                               item  \
0      48        200067  Samsung_SGH_N100_Voice_Activated_WAP_Phone_Cel...   
1      66         300ex                      auto_Make-2002_Subaru_Impreza   
2      83      77chelle      Leappad_Microphone_With_Two_Interactive_Books   
3      95   ASourdough4       Hewlett_Packard_LASERJET_PRO_P1606DN_PRINTER   
4     151      AVaddict           pr-Bose_301_Series_V_Main_Stereo_Speaker   
5     161    AgnesVarda         pr-Pentax_UC-1_35mm_Point_and_Shoot_Camera   
6     205       A

In [1]:
from epinions_dataset import EpinionsDataset
import pandas as pd
import pickle
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get the data.
# The frames are deliberately NOT called `train` / `test`: those names are
# bound to the train()/test() functions defined further down the notebook, and
# re-running this cell would otherwise overwrite the functions with DataFrames.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Get user to items dictionary from pickle file.
# NOTE(security): pickle.load executes arbitrary code embedded in the file;
# only load pickles produced by this project's own preprocessing step.
with open('data/user_to_item.pkl', 'rb') as f:
    user_to_items = pickle.load(f)

# Get items set from pickle file (same trust caveat as above).
with open('data/items.pkl', 'rb') as f:
    items_set = pickle.load(f)

# Create dataset
trainset = EpinionsDataset(train_df, user_to_items, items_set)
# Quick sanity check: number of samples and the shapes of the first sample's
# first two fields.
print(len(trainset))
print(trainset[0][0].shape)
print(trainset[0][1].shape)

# Create test dataset
testset = EpinionsDataset(test_df, user_to_items, items_set)

94530
torch.Size([1178])
torch.Size([15094])


In [2]:
from ncf import NCF
from torch.utils.data import DataLoader

# Data loaders: shuffled mini-batches of 64 for training, one sample per step
# (in dataset order) for evaluation.
train_loader = DataLoader(dataset=trainset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=testset, batch_size=1, shuffle=False)

# Model hyperparameters. Input sizes are taken from the number of entries in
# the user -> items mapping and in the item collection.
embedding_dim = 1024
hidden_layers = [64, 32]
user_input_dim = len(user_to_items)
item_input_dim = len(items_set)

# Build the NCF model (the two hidden sizes are passed positionally) and move
# it to the selected device.
model = NCF(user_input_dim, item_input_dim, embedding_dim, *hidden_layers)
model = model.to(device)

# Adam optimizer over all model parameters, trained against binary
# cross-entropy loss.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCELoss()

In [5]:
from tqdm import tqdm
import random

def test(model, test_loader, topk=10, num_negatives=100, seed=None):
    """Compute the hit rate of `model` over `test_loader`.

    For every batch yielded by `test_loader`, the first (user, item) row is
    treated as the positive sample. `num_negatives` distinct items that are
    not in `user_to_items[user]` are drawn at random from `items_set`, the
    positive and the negatives are scored together by the model, and the batch
    counts as a hit when the positive (row 0) is among the `topk` highest
    scores.

    Relies on the notebook globals `items_set`, `user_to_items`, `trainset`
    and `device`.

    Args:
        model: Model called as `model(user, item)`; its output is flattened to
            one score per row.
        test_loader: Iterable of `(user, item, label)` batches. The notebook
            builds it with `batch_size=1`; with larger batches only the first
            user of each batch drives the negative sampling.
        topk: Number of top-scored rows in which the positive must appear.
            Clamped to the number of scored rows.
        num_negatives: Number of negative items sampled per batch. Capped at
            the number of items the user has not interacted with, so the
            sampling loop always terminates.
        seed: Optional seed for a private `random.Random`, making the sampled
            negatives reproducible. When None the global `random` state is
            used.

    Returns:
        Number of hits divided by the number of batches, or 0.0 when the
        loader is empty.
    """
    rng = random.Random(seed) if seed is not None else random
    items_list = list(items_set)
    item_pool = set(items_list)

    num_batches = len(test_loader)
    if num_batches == 0:
        return 0.0

    # Remember the current mode so evaluation does not leave a model that is
    # in the middle of training stuck in eval mode.
    was_training = model.training
    model.eval()
    num_hits = 0
    try:
        with torch.no_grad():
            for user, item, _label in tqdm(test_loader, desc="Testing", total=num_batches):
                user, item = user.to(device), item.to(device)

                # The user lookup does not depend on the sampled item, so do
                # it once per batch instead of once per sampling attempt.
                user_plaintext = trainset.one_hot_to_user(user[0])
                interacted = user_to_items[user_plaintext]

                # Never ask for more negatives than exist for this user,
                # otherwise rejection sampling would loop forever.
                # Assumes `interacted` is an iterable container of item ids
                # -- TODO confirm against preprocess.py.
                available = len(item_pool) - len(item_pool.intersection(interacted))
                wanted = max(0, min(num_negatives, available))

                negative_samples = set()
                while len(negative_samples) < wanted:
                    random_item = rng.choice(items_list)
                    if random_item not in interacted:
                        negative_samples.add(random_item)

                if negative_samples:
                    # Build the negative rows in one go (instead of growing
                    # the tensors with one torch.cat per sample) and place
                    # them on the same device as the positive item.
                    negative_items = torch.stack(
                        [trainset.item_to_one_hot(sample) for sample in negative_samples]
                    ).to(item.device)
                    repeated_user = user[:1].expand(len(negative_samples), *user.shape[1:])
                    user = torch.cat((user, repeated_user), 0)
                    item = torch.cat((item, negative_items), 0)

                # Flatten output to be 1D: one score per candidate row.
                output = model(user, item).reshape(-1)

                # Check if the positive sample (row 0) is in the top k.
                k = min(topk, output.numel())
                _, indices = torch.topk(output, k)
                if (indices == 0).any().item():
                    num_hits += 1
    finally:
        model.train(was_training)

    return num_hits / num_batches

def train(model, train_loader, test_loader, optimizer, criterion, epochs=10):
    """Train `model` and evaluate it with `test()` after every epoch.

    Relies on the notebook global `device` and on the `test` function defined
    in the same cell.

    Args:
        model: Model called as `model(user, item)`.
        train_loader: Iterable of `(user, item, label)` training batches.
        test_loader: Loader passed unchanged to `test()` after each epoch.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss called as `criterion(output, label)`, where `label`
            has been reshaped with `unsqueeze(1)` and cast to float.
        epochs: Number of passes over `train_loader`.

    Returns:
        Tuple `(losses, hit_rates)`: the mean batch loss and the hit rate
        returned by `test()` for each epoch, in order.
    """
    losses = []
    hit_rates = []
    for epoch in range(epochs):
        # test() puts the model in eval mode, so training mode has to be
        # re-enabled at the start of every epoch, not just once up front.
        model.train()

        batch_losses = []
        for user, item, label in tqdm(train_loader, desc=f"Epoch {epoch+1}", total=len(train_loader)):
            user, item, label = user.to(device), item.to(device), label.to(device)
            optimizer.zero_grad()
            output = model(user, item)
            # Add a trailing dimension and cast to float before computing the loss.
            label = label.unsqueeze(1).float()
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())

        # Mean loss over the epoch; NaN (rather than a ZeroDivisionError)
        # when the loader produced no batches.
        epoch_loss = sum(batch_losses) / len(batch_losses) if batch_losses else float("nan")

        hit_rate = test(model, test_loader)
        hit_rates.append(hit_rate)

        print(f"Epoch {epoch+1} loss: {epoch_loss} hit rate: {hit_rate}")
        losses.append(epoch_loss)

    return losses, hit_rates

In [4]:
# Kick off training for 10 epochs; collects the per-epoch mean loss and the
# per-epoch hit rate returned by train().
losses, hit_rates = train(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=10,
)

Epoch 1: 100%|██████████| 1478/1478 [01:47<00:00, 13.75it/s]
Testing: 100%|██████████| 1178/1178 [01:50<00:00, 10.68it/s]


Epoch 0 loss: 0.511633781290264 hit rate: 0.08234295415959253


Epoch 2: 100%|██████████| 1478/1478 [01:47<00:00, 13.75it/s]
Testing:  14%|█▍        | 164/1178 [00:15<01:36, 10.56it/s]
[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:110] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 