In [1]:
import pandas as pd

# Inclusive bounds on the raw `performance` value that are labelled as a
# success (1); anything outside the interval, or missing, becomes 0.
PERFORMANCE_MIN = 0.7
PERFORMANCE_MAX = 1.05

# Stack the October and first-week-of-November exports into a single frame
# with a fresh 0..n-1 index.
df = pd.concat((
    pd.read_csv('./data/October_missions_full.csv'),
    pd.read_csv('./data/November_1stW_missions_full.csv'),
), ignore_index=True)

# `mission` joins the mission type and its target (e.g. "quiz_4");
# `createdAtT` is parsed as epoch milliseconds and truncated to a calendar date.
df = df.assign(
    mission=df['type'] + '_' + df['target'].astype(str),
    createdAtT=pd.to_datetime(df['createdAtT'], unit='ms').dt.date,
)[['user', 'mission', 'createdAtT', 'type', 'target', 'performance']]

# Keep only users whose records fall on more than one distinct date.
df = df.groupby('user').filter(lambda records: len(records['createdAtT'].unique()) > 1)

# One row per (user, mission): the occurrence that comes last in the
# concatenated files is the one that survives.
# NOTE(review): this presumably relies on the CSV rows being in chronological
# order -- confirm, otherwise sort by timestamp before de-duplicating.
df = df.drop_duplicates(subset=['user', 'mission'], keep='last')

# Encode users and missions as integer category codes and binarise the
# performance value. `Series.between` is inclusive on both ends, matching
# `0.7 <= x <= 1.05`, and runs vectorised instead of one Python call per row.
df = df.assign(
    user=df['user'].astype('category').cat.codes,
    mission=df['mission'].astype('category'),
    type=df['type'].astype('category'),
    performance=df['performance'].between(PERFORMANCE_MIN, PERFORMANCE_MAX).astype(int),
    # Evaluated after `mission` has been replaced by its categorical version.
    missionID=lambda frame: frame['mission'].cat.codes,
)

# Chronological order, with ties broken by user code.
df = df.sort_values(by=['createdAtT', 'user'], ignore_index=True)
df

Unnamed: 0,user,mission,createdAtT,type,target,performance,missionID
0,0,episode_2,2024-10-01,episode,2,0,13
1,0,action_2,2024-10-01,action,2,0,1
2,0,episode_1,2024-10-01,episode,1,1,12
3,0,quiz_1,2024-10-01,quiz,1,1,24
4,0,quiz_4,2024-10-01,quiz,4,1,27
...,...,...,...,...,...,...,...
44117,2946,quiz_4,2024-11-07,quiz,4,1,27
44118,2960,activity_7,2024-11-07,activity,7,0,9
44119,2968,mobility_3,2024-11-07,mobility,3,0,22
44120,2968,quiz_3,2024-11-07,quiz,3,0,26


In [2]:
# Chronological split: rows dated before 1 November 2024 are used for fitting,
# rows on or after that date are held out as the test set.
cutoff_date = pd.Timestamp(2024, 11, 1).date()

pre_cutoff_df = df.loc[df['createdAtT'] < cutoff_date]
test_df = df.loc[df['createdAtT'] >= cutoff_date]

# Per user, move a reproducible 15% sample of the pre-cutoff rows into the
# validation set; the remaining pre-cutoff rows form the training set.
validation_df = pre_cutoff_df.groupby('user').sample(frac=0.15, random_state=42)
train_df = pre_cutoff_df.drop(validation_df.index)

tuple(split.shape for split in (train_df, validation_df, test_df))

((25857, 7), (4618, 7), (13647, 7))

In [3]:
from src.dataset import MissionDataset

def _as_mission_dataset(split_df):
    """Wrap one split's mission codes, user codes and performance labels in a MissionDataset."""
    return MissionDataset(
        missions=split_df['missionID'].values,
        users=split_df['user'].values,
        ratings=split_df['performance'].values,
    )


train_dataset = _as_mission_dataset(train_df)
validation_dataset = _as_mission_dataset(validation_df)

# Counts are taken over the full frame (train, validation and test rows).
users, missions = df['user'].nunique(), df['missionID'].nunique()

In [4]:
import torch
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve, auc
from src.models import MissionMatrixFactorization, train

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed torch's global RNG so the run is repeatable on a fresh kernel
# (best effort). NOTE(review): assumes model initialisation and any shuffling
# inside `train` draw from torch's global generator -- confirm in src.models.
torch.manual_seed(42)

matrix_factorization = MissionMatrixFactorization(users, missions, embedding_dim=8).to(DEVICE)
train(matrix_factorization, train_dataset, validation_set=validation_dataset, weight_decay=1e-4, lr=1e-3, epochs=20)

# Score the held-out rows. The raw model output is squashed with a sigmoid to
# get a score in (0, 1). `no_grad` skips autograd bookkeeping, so nothing has
# to be detached before converting to NumPy.
matrix_factorization.eval()
with torch.no_grad():
    preds = torch.sigmoid(matrix_factorization(
        torch.from_numpy(test_df['user'].values).to(DEVICE).long(),
        torch.from_numpy(test_df['missionID'].values).to(DEVICE).long(),
    ).flatten()).cpu().numpy()

# ROC AUC, then area under the precision-recall curve. Built-in `round` works
# whether the metric comes back as a NumPy scalar or a plain Python float
# (which has no `.round` method).
print(round(roc_auc_score(test_df['performance'], preds), 4))
precision, recall, _ = precision_recall_curve(test_df['performance'], preds)
print(round(auc(recall, precision), 4))


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

0.8506
0.6932


In [5]:
from src.models import MissionLinearRegression

# Seed torch's global RNG so the run is repeatable on a fresh kernel
# (best effort). NOTE(review): assumes model initialisation and any shuffling
# inside `train` draw from torch's global generator -- confirm in src.models.
torch.manual_seed(42)

linear_regression = MissionLinearRegression(users, missions).to(DEVICE)
train(linear_regression, train_dataset, validation_set=validation_dataset, weight_decay=1e-4, lr=1e-3, epochs=20)

# Score the held-out rows. The raw model output is squashed with a sigmoid to
# get a score in (0, 1). `no_grad` skips autograd bookkeeping, so nothing has
# to be detached before converting to NumPy.
linear_regression.eval()
with torch.no_grad():
    preds = torch.sigmoid(linear_regression(
        torch.from_numpy(test_df['user'].values).to(DEVICE).long(),
        torch.from_numpy(test_df['missionID'].values).to(DEVICE).long(),
    ).flatten()).cpu().numpy()

# ROC AUC, then area under the precision-recall curve. Built-in `round` works
# whether the metric comes back as a NumPy scalar or a plain Python float
# (which has no `.round` method).
print(round(roc_auc_score(test_df['performance'], preds), 4))
precision, recall, _ = precision_recall_curve(test_df['performance'], preds)
print(round(auc(recall, precision), 4))

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

  0%|          | 0/809 [00:00<?, ?it/s]

0.8094
0.6308
