In [55]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

In [77]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, mf_dim, layers, num_anime_features):
        super(NCF, self).__init__()

        # MF layers
        self.user_embed_MF = nn.Embedding(num_users, mf_dim)
        self.item_embed_MF = nn.Embedding(num_items, mf_dim)

        # MLP layers
        self.user_embed_MLP = nn.Embedding(num_users, layers[0] // 2)
        self.item_embed_MLP = nn.Embedding(num_items, layers[0] // 2)
        
        # Anime features embedding layer
        self.anime_feature_embed = nn.Linear(num_anime_features, layers[0] // 2)
        
        # self.mlp = nn.Sequential([nn.Linear(layers[i], layers[i+1]) for i in range(len(layers) - 1)])
        self.mlp = nn.Sequential(
            nn.Linear(3 * (layers[0] // 2), 256),
            nn.Linear(256, 128),
            nn.Linear(128, 64),
            nn.Linear(64, 32)
        )

        # final prediction layer
        # self.prediction = nn.Linear(mf_dim + layers[-1], 1)
        self.prediction = nn.Linear(40, 1)

    def forward(self, user_indices, item_indices, anime_features):
        # MF part
        user_embed_MF = self.user_embed_MF(user_indices)
        item_embed_MF = self.item_embed_MF(item_indices)
        mf_vector = user_embed_MF * item_embed_MF

        # MLP part
        user_embed_MLP = self.user_embed_MLP(user_indices)
        item_embed_MLP = self.item_embed_MLP(item_indices)
        
        # Anime features part
        anime_feature_embed = self.anime_feature_embed(anime_features)
        
        # user_embed_MLP (layers[0] // 2)
        # item_embed_MLP (layers[0] // 2)
        # anime_feature_embed (layers[0] // 2)

        mlp_vector = torch.cat([user_embed_MLP, item_embed_MLP, anime_feature_embed], dim=-1)
        mlp_vector = self.mlp(mlp_vector)

        # concatenate MF and MLP parts
        vector = torch.cat([mf_vector, mlp_vector], dim=-1)

        # final prediction
        pred = self.prediction(vector)
        return torch.sigmoid(pred)


In [57]:
class RatingDataset(Dataset):
    """Rating Dataset for DataLoader"""

    def __init__(self, user_tensor, item_tensor, feature_tensor, target_tensor=None):
        """
        Args:
            user_tensor (torch.Tensor): User ID tensor. Size [n_samples]
            item_tensor (torch.Tensor): Item ID tensor. Size [n_samples]
            feature_tensor (torch.Tensor): Feature tensor. Size [n_samples, n_features]
            target_tensor (torch.Tensor, optional): Target tensor. Size [n_samples]
        """
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.feature_tensor = feature_tensor
        self.target_tensor = target_tensor

    def __getitem__(self, index):
        if self.target_tensor is None:
            return self.user_tensor[index], self.item_tensor[index], self.feature_tensor[index]
        else:
            return self.user_tensor[index], self.item_tensor[index], self.feature_tensor[index], self.target_tensor[index]

    def __len__(self):
        return self.user_tensor.size(0)

In [58]:
# Reload the data
train_data = pd.read_csv('../../data/train.csv')
anime_data = pd.read_csv('../../data/anime.csv')
test_data = pd.read_csv('../../data/test.csv')

In [76]:
print(train_data.shape)
print(anime_data.shape)
print(test_data.shape)

(136401, 10)
(2000, 18)
(117676, 9)


In [59]:
# Create anime_features
anime_features = anime_data[['anime_id', 'episodes', 'members', 'watching', 'completed', 'on_hold', 'dropped', 'plan_to_watch']]

# Merge train data and anime features
train_data = train_data.merge(anime_features, on='anime_id', how='left')

# Merge test data and anime features
test_data = test_data.merge(anime_features, on='anime_id', how='left')

# Get unique user and item IDs
unique_user_ids = pd.unique(pd.concat([train_data['user_id'], test_data['user_id']]))
unique_item_ids = pd.unique(pd.concat([train_data['anime_id'], test_data['anime_id']]))

# Create user and item ID to index mapping
user_id_to_index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
item_id_to_index = {item_id: index for index, item_id in enumerate(unique_item_ids)}

# Apply mapping to train and test data
train_data['user_id'] = train_data['user_id'].map(user_id_to_index)
train_data['anime_id'] = train_data['anime_id'].map(item_id_to_index)
test_data['user_id'] = test_data['user_id'].map(user_id_to_index)
test_data['anime_id'] = test_data['anime_id'].map(item_id_to_index)

In [60]:
# # Merge train data and anime features
# train_data = pd.read_csv('/mnt/data/train.csv')
# train_data = train_data.merge(anime_features, on='anime_id', how='left')

# # Merge test data and anime features
# test_data = pd.read_csv('/mnt/data/test.csv')
# test_data = test_data.merge(anime_features, on='anime_id', how='left')

# # Get unique user and item IDs
# unique_user_ids = pd.unique(pd.concat([train_data['user_id'], test_data['user_id']]))
# unique_item_ids = pd.unique(pd.concat([train_data['anime_id'], test_data['anime_id']]))

# # Create user and item ID to index mapping
# user_id_to_index = {user_id: index for index, user_id in enumerate(unique_user_ids)}
# item_id_to_index = {item_id: index for index, item_id in enumerate(unique_item_ids)}

# # Apply mapping to train and test data
# train_data['user_id'] = train_data['user_id'].map(user_id_to_index)
# train_data['anime_id'] = train_data['anime_id'].map(item_id_to_index)
# test_data['user_id'] = test_data['user_id'].map(user_id_to_index)
# test_data['anime_id'] = test_data['anime_id'].map(item_id_to_index)


In [84]:
train_user_tensor.unique()

tensor([   0,    1,    2,  ..., 1791, 1792, 1793])

In [85]:
test_user_tensor.unique()

tensor([   0,    1,    2,  ..., 1995, 1996, 1997])

In [61]:
# Replace 'Unknown' with NaN in 'episodes' column
train_data['episodes'] = train_data['episodes'].replace('Unknown', np.nan)
test_data['episodes'] = test_data['episodes'].replace('Unknown', np.nan)

# Convert 'episodes' column to float type
train_data['episodes'] = train_data['episodes'].astype(float)
test_data['episodes'] = test_data['episodes'].astype(float)

# Fill NaN with the median of the column
train_data['episodes'].fillna(train_data['episodes'].median(), inplace=True)
test_data['episodes'].fillna(test_data['episodes'].median(), inplace=True)

# Prepare training data
train_user_tensor = torch.from_numpy(train_data['user_id'].values.astype(np.int64))
train_item_tensor = torch.from_numpy(train_data['anime_id'].values.astype(np.int64))
train_feature_tensor = torch.from_numpy(train_data[anime_features.columns].values.astype(np.float32))
train_target_tensor = torch.from_numpy(train_data['score'].values.astype(np.float32))
train_dataset = RatingDataset(train_user_tensor, train_item_tensor, train_feature_tensor, train_target_tensor)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

# Prepare test data
test_user_tensor = torch.from_numpy(test_data['user_id'].values.astype(np.int64))
test_item_tensor = torch.from_numpy(test_data['anime_id'].values.astype(np.int64))
test_feature_tensor = torch.from_numpy(test_data[anime_features.columns].values.astype(np.float32))
test_dataset = RatingDataset(test_user_tensor, test_item_tensor, test_feature_tensor, target_tensor=None)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

# Initialize model
num_users = len(train_data['user_id'].unique())
num_items = len(train_data['anime_id'].unique())
num_anime_features = len(anime_features.columns)
mf_dim = 8  # dimension of MF
layers = [384, 128, 32, 8]  # layer size of MLP
model = NCF(num_users, num_items, mf_dim, layers, num_anime_features)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train model
for epoch in tqdm(range(10)):  # run for 10 epochs
    for user, item, feature, rating in train_loader:
        # Forward pass
        outputs = model(user, item, feature)
        loss = criterion(outputs.squeeze(), rating)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{10}, Loss: {loss.item()}')

 10%|█         | 1/10 [00:03<00:31,  3.46s/it]

Epoch 1/10, Loss: 62.727272033691406


 20%|██        | 2/10 [00:06<00:26,  3.32s/it]

Epoch 2/10, Loss: 58.464115142822266


 30%|███       | 3/10 [00:10<00:23,  3.32s/it]

Epoch 3/10, Loss: 63.7559814453125


 40%|████      | 4/10 [00:13<00:19,  3.30s/it]

Epoch 4/10, Loss: 62.90909194946289


 50%|█████     | 5/10 [00:16<00:16,  3.30s/it]

Epoch 5/10, Loss: 64.18659973144531


 60%|██████    | 6/10 [00:19<00:13,  3.31s/it]

Epoch 6/10, Loss: 62.483253479003906


 70%|███████   | 7/10 [00:23<00:09,  3.32s/it]

Epoch 7/10, Loss: 61.976078033447266


 80%|████████  | 8/10 [00:26<00:06,  3.34s/it]

Epoch 8/10, Loss: 61.784690856933594


 90%|█████████ | 9/10 [00:29<00:03,  3.33s/it]

Epoch 9/10, Loss: 62.349281311035156


100%|██████████| 10/10 [00:33<00:00,  3.32s/it]

Epoch 10/10, Loss: 63.47846984863281





IndexError: index out of range in self

In [78]:
# Make predictions on test set
model.eval()  # switch to evaluation mode
with torch.no_grad():
    predictions = []
    for user, item, feature in test_loader:
        outputs = model(user, item, feature)
        predictions.extend(outputs.squeeze().tolist())

# Print first 10 predictions
print('First 10 predictions:', predictions[:10])

IndexError: index out of range in self

In [None]:
384 // 2

In [None]:
layers = [384, 192, 32, 8]
[nn.Linear(layers[i], layers[i+1]) for i in range(len(layers) - 1)]

In [None]:
# mlp_vector = torch.cat([user_embed_MLP, item_embed_MLP, anime_feature_embed], dim=-1)
# ここの次元数が合わない