In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the October mission log and prepare it in a single chain:
#   1. build a `mission` label from `type` and `target` (e.g. "quiz_2"),
#   2. keep only the columns listed below,
#   3. parse `createdAtT` as epoch milliseconds and replace `user` with
#      integer category codes,
#   4. store `mission` and `type` as categoricals,
#   5. sort by creation time, then user code, with a fresh 0..n-1 index.
df = (
    pd.read_csv('./October_missions_full.csv')
    .assign(mission=lambda raw: raw['type'] + '_' + raw['target'].astype(str))
    [['user', 'mission', 'createdAtT', 'type', 'target', 'completed', 'performance']]
    .assign(
        createdAtT=lambda frame: pd.to_datetime(frame['createdAtT'], unit='ms'),
        user=lambda frame: frame['user'].astype('category').cat.codes,
    )
    .astype({'mission': 'category', 'type': 'category'})
    .sort_values(by=['createdAtT', 'user'], ignore_index=True)
)
df

Unnamed: 0,user,mission,createdAtT,type,target,completed,performance
0,4299,quiz_2,2024-10-01 00:00:07.698,quiz,2,False,0.000000
1,4299,mobility_1,2024-10-01 00:00:07.698,mobility,1,True,1.000000
2,4299,activity_6,2024-10-01 00:00:07.698,activity,6,False,0.666667
3,2566,mobility_1,2024-10-01 00:00:17.857,mobility,1,False,0.000000
4,2566,episode_3,2024-10-01 00:00:17.857,episode,3,False,0.666667
...,...,...,...,...,...,...,...
91353,4978,mobility_2,2024-10-31 23:55:25.309,mobility,2,False,0.000000
91354,4978,quiz_5,2024-10-31 23:55:25.309,quiz,5,False,0.000000
91355,1593,episode_2,2024-10-31 23:59:41.228,episode,2,False,0.000000
91356,1593,activity_3,2024-10-31 23:59:41.228,activity,3,False,0.000000


In [None]:
# Collapse repeated (user, mission) pairs, keeping the last occurrence in the
# frame's current row order.
df = df.drop_duplicates(subset=['user', 'mission'], keep='last')

# Modelling table: the user code, the integer code of the categorical
# `mission` column, and `performance` exposed as the target column `y`.
dataset = df[['user']].assign(
    mission=df['mission'].cat.codes,
    y=df['performance'],
)

dataset

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class MissionDataset(Dataset):
    """Map-style dataset of (user, mission, y) triples stored as tensors.

    All three columns are converted to tensors on ``device`` once, at
    construction time, so indexing only slices already-materialised tensors.

    Args:
        user_col: Column of user indices; must expose ``.values`` and ``len()``
            (e.g. a pandas Series). Stored as ``torch.long``.
        mission_col: Column of mission indices, same requirements. Stored as
            ``torch.long``.
        y_col: Column of target values, same requirements. Stored as
            ``torch.float32``.
        device: Device the tensors are created on. Defaults to ``DEVICE``.

    Raises:
        ValueError: If the three columns do not have the same length.
    """

    def __init__(self, user_col, mission_col, y_col, device=DEVICE):
        # Validate once, up front, with a real exception: an ``assert`` inside
        # ``__len__`` would be stripped under ``python -O`` and would otherwise
        # be re-evaluated on every ``len()`` call.
        n_user, n_mission, n_y = len(user_col), len(mission_col), len(y_col)
        if not (n_user == n_mission == n_y):
            raise ValueError(
                'user_col, mission_col and y_col must have the same length, '
                f'got {n_user}, {n_mission} and {n_y}'
            )

        self.user = torch.tensor(user_col.values, dtype=torch.long, device=device)
        self.mission = torch.tensor(mission_col.values, dtype=torch.long, device=device)
        self.y = torch.tensor(y_col.values, dtype=torch.float32, device=device)
        self.device = device

    def __len__(self):
        """Return the number of samples."""
        return len(self.user)

    def __getitem__(self, idx):
        """Return the ``(user, mission, y)`` tensors at position ``idx``."""
        return self.user[idx], self.mission[idx], self.y[idx]

In [None]:
import torch.nn as nn

class GMF(nn.Module):
    """Generalized matrix factorization scorer for (user, mission) pairs.

    Each user and each mission gets an ``n_factors``-dimensional embedding.
    The two embeddings are multiplied element-wise and a linear layer maps
    the product to a single score.

    Args:
        n_users: Number of rows in the user embedding table.
        n_missions: Number of rows in the mission embedding table.
        n_factors: Embedding dimension shared by users and missions.
    """

    def __init__(self, n_users, n_missions, n_factors=16):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, n_factors)
        self.mission_emb = nn.Embedding(n_missions, n_factors)
        self.fc = nn.Linear(n_factors, 1)

    def forward(self, user, mission):
        """Score each (user, mission) pair.

        Args:
            user: Long tensor of user indices.
            mission: Long tensor of mission indices, same shape as ``user``.

        Returns:
            Tensor of scores with the same shape as ``user``.
        """
        user_emb = self.user_emb(user)
        mission_emb = self.mission_emb(mission)
        # Drop only the trailing size-1 output dimension of the linear layer.
        # A bare ``squeeze()`` would also remove the batch dimension when the
        # batch holds a single sample, yielding a 0-d tensor.
        return self.fc(user_emb * mission_emb).squeeze(-1)

In [None]:
def train(model: nn.Module, dataset: Dataset, device=DEVICE, n_epochs=10, lr=1e-2, wd=0.0, batch_size=64):
    """Fit ``model`` on ``dataset`` with Adam and mean-squared-error loss.

    Args:
        model: Module called as ``model(user, mission)``.
        dataset: Dataset yielding ``(user, mission, y)`` triples.
        device: Device the model and every batch are moved to.
        n_epochs: Number of passes over ``dataset``.
        lr: Adam learning rate.
        wd: Adam weight decay.
        batch_size: Mini-batch size used by the shuffling data loader.

    Returns:
        The same ``model`` instance, trained in place.

    After every epoch one line is printed with the sample-weighted mean of
    the mini-batch losses for that epoch (rather than the loss of whichever
    batch happened to come last).
    """
    model.to(device)
    model.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    criterion = nn.MSELoss()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(n_epochs):
        # Accumulate on-device so the loop only synchronises once per epoch.
        loss_sum = torch.zeros((), device=device)
        n_seen = 0

        for user, mission, y in dataloader:
            # No-op when the dataset already lives on ``device``.
            user, mission, y = user.to(device), mission.to(device), y.to(device)

            optimizer.zero_grad()
            # Flatten so a model that returns a 0-d tensor for a one-sample
            # batch still matches the 1-D target instead of broadcasting.
            y_pred = model(user, mission).reshape(-1)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            # MSELoss averages over the batch; weight by batch size so a
            # short final batch does not skew the epoch mean.
            loss_sum += loss.detach() * y.numel()
            n_seen += y.numel()

        epoch_loss = loss_sum.item() / n_seen if n_seen else float('nan')
        print(f'Epoch {epoch+1}/{n_epochs} Loss: {epoch_loss:.4f}')

    return model

@torch.no_grad()
def eval(model: nn.Module, dataset: Dataset, device=DEVICE, batch_size=64):
    """Run ``model`` over ``dataset`` and collect targets and predictions.

    Note: this function shadows Python's built-in ``eval`` within the
    notebook namespace; the name is kept because later cells call it.

    Args:
        model: Module called as ``model(user, mission)``.
        dataset: Dataset yielding ``(user, mission, y)`` triples.
        device: Device the model and every batch are moved to.
        batch_size: Mini-batch size used while iterating in dataset order.

    Returns:
        Tuple ``(y_true, y_pred)`` of 1-D NumPy arrays in dataset order.
    """
    model.to(device)
    model.eval()

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    y_true = []
    y_pred = []

    for user, mission, y in dataloader:
        # No-op when the dataset already lives on ``device``.
        user, mission = user.to(device), mission.to(device)
        # ``reshape(-1)`` guarantees a 1-D tensor, so ``tolist()`` always
        # returns a list. Without it, a model that yields a 0-d tensor for a
        # one-sample final batch makes ``tolist()`` return a bare float and
        # ``extend`` raise ``TypeError``.
        y_true.extend(y.reshape(-1).tolist())
        y_pred.extend(model(user, mission).reshape(-1).tolist())

    return np.array(y_true), np.array(y_pred)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Wrap each partition's columns as tensors.
train_dataset, test_dataset = (
    MissionDataset(part['user'], part['mission'], part['y'])
    for part in (train_df, test_df)
)

# Embedding-table sizes: the number of distinct user / mission codes in the
# full table (both partitions together).
n_users, n_missions = (dataset[col].nunique() for col in ('user', 'mission'))

In [None]:
from sklearn.metrics import mean_squared_error
# Seed torch's global RNG before the model is built so that the embedding
# initialisation and the training shuffle order start from the same state on
# every run (42 matches the random_state used for the train/test split).
torch.manual_seed(42)

model = GMF(n_users, n_missions, n_factors=8)
model = train(model, train_dataset, n_epochs=10, lr=1e-2, wd=1e-4)

# `eval` here is the notebook's evaluation helper, not Python's built-in.
y_true, y_pred = eval(model, test_dataset)
# Mean squared error on the held-out rows; shown as the cell's output.
mean_squared_error(y_true, y_pred)