In [1]:
import pandas as pd

df = pd.concat((
    pd.read_csv('./data/October_missions_full.csv'),
    pd.read_csv('./data/November_1stW_missions_full.csv')
), ignore_index=True)

df['mission'] = df['type'] + '_' + df['target'].astype(str)

df = df[['user', 'mission', 'createdAtT', 'type', 'target', 'performance']]
df['createdAtT'] = pd.to_datetime(df['createdAtT'], unit='ms').dt.date
df = df.groupby('user').filter(lambda x: len(x['createdAtT'].unique()) > 2)

df['user'] = df['user'].astype('category').cat.codes
df['mission'] = df['mission'].astype('category')
df['missionID'] = df['mission'].cat.codes
df['type'] = df['type'].astype('category')

def reward(x):
    if x <= 1:
        return x
    return max(0, 2 - x**2)

df['reward'] = df['performance'].apply(reward)
df.rename(columns={'createdAtT': 'date'}, inplace=True)

df.sort_values(by=['date', 'user'], inplace=True, ignore_index=True)
display(df)

Unnamed: 0,user,mission,date,type,target,performance,missionID,reward
0,0,action_1,2024-10-01,action,1,1.000000,0,1.000000
1,0,quiz_5,2024-10-01,quiz,5,1.000000,28,1.000000
2,0,exp_100,2024-10-01,exp,100,1.200000,18,0.560000
3,2,episode_1,2024-10-01,episode,1,1.000000,12,1.000000
4,2,quiz_1,2024-10-01,quiz,1,1.000000,24,1.000000
...,...,...,...,...,...,...,...,...
87477,2043,activity_7,2024-11-07,activity,7,0.000000,9,0.000000
87478,2043,quiz_1,2024-11-07,quiz,1,0.000000,24,0.000000
87479,2049,mobility_3,2024-11-07,mobility,3,0.000000,22,0.000000
87480,2049,quiz_3,2024-11-07,quiz,3,0.333333,26,0.333333


In [2]:
import torch
import numpy as np
from src import models as m
from sklearn.metrics import mean_squared_error

df.drop_duplicates(subset=['user', 'mission'], keep='last', inplace=True, ignore_index=True)
DEVICE = 'cpu'

def fold(d):
    test_df: pd.DataFrame = df[df['date'] == d]
    train_df = df[df['date'] < d]

    # Remove users from test set that are not in the training set
    test_df = test_df[test_df['user'].isin(train_df['user'])]

    n_users = train_df['user'].max() + 1
    n_missions = train_df['missionID'].max() + 1

    mf = m.MF(n_users, n_missions, embedding_dim=16).fit(train_df)
    autorec = m.UserBasedAutoRec(n_users=n_users, n_missions=n_missions, hidden_dim=16, dropout=0.1).fit(train_df)
    mlp = m.MLP(n_users, n_missions, embedding_dim=16, hidden_dim=32, dropout=0.1).fit(train_df)

    y_hat_autorec = np.clip(autorec.predict(
        torch.tensor(test_df['user'].values, dtype=torch.long, device=DEVICE),
        torch.tensor(test_df['missionID'].values, dtype=torch.long, device=DEVICE)
    ).cpu().detach().numpy(), a_min=0, a_max=None)

    y_hat_mf = np.clip(mf(
        torch.tensor(test_df['user'].values, dtype=torch.long, device=DEVICE),
        torch.tensor(test_df['missionID'].values, dtype=torch.long, device=DEVICE)
    ).cpu().detach().numpy(), a_min=0, a_max=None)

    y_hat_mlp = np.clip(mlp(
        torch.tensor(test_df['user'].values, dtype=torch.long, device=DEVICE),
        torch.tensor(test_df['missionID'].values, dtype=torch.long, device=DEVICE)
    ).cpu().detach().numpy(), a_min=0, a_max=None)

    return pd.DataFrame(
        index=['AutoRec', 'MF', 'MLP'],
        columns=[str(d)],
        data=[
            mean_squared_error(test_df['performance'], y_hat_autorec),
            mean_squared_error(test_df['performance'], y_hat_mf),
            mean_squared_error(test_df['performance'], y_hat_mlp)
        ],
    )

In [3]:
from tqdm.auto import tqdm

torch.manual_seed(0)
np.random.seed(0)

results = pd.concat((
    fold(d.date()) for d in tqdm(pd.date_range('2024-11-01', '2024-11-02'))
), axis=1)

results.aggregate(['mean', 'std'], axis=1)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,mean,std
AutoRec,0.121604,0.000858
MF,0.143922,7.1e-05
MLP,0.124154,8.4e-05
