In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the mission log and shape it for modelling in a single chain:
#   1. label every row with a "<type>_<target>" mission name,
#   2. keep only the columns used by the rest of the notebook,
#   3. parse createdAtT as epoch milliseconds, replace user by its integer
#      category code, and store mission/type as categoricals,
#   4. order rows by creation time, then user, with a fresh 0..n-1 index.
df = (
    pd.read_csv('./October_missions_full.csv')
    .assign(mission=lambda frame: frame['type'] + '_' + frame['target'].astype(str))
    .loc[:, ['user', 'mission', 'createdAtT', 'type', 'target', 'completed', 'performance']]
    .assign(
        createdAtT=lambda frame: pd.to_datetime(frame['createdAtT'], unit='ms'),
        user=lambda frame: frame['user'].astype('category').cat.codes,
        mission=lambda frame: frame['mission'].astype('category'),
        type=lambda frame: frame['type'].astype('category'),
    )
    .sort_values(by=['createdAtT', 'user'], ignore_index=True)
)
df

Unnamed: 0,user,mission,createdAtT,type,target,completed,performance
0,4299,quiz_2,2024-10-01 00:00:07.698,quiz,2,False,0.000000
1,4299,mobility_1,2024-10-01 00:00:07.698,mobility,1,True,1.000000
2,4299,activity_6,2024-10-01 00:00:07.698,activity,6,False,0.666667
3,2566,mobility_1,2024-10-01 00:00:17.857,mobility,1,False,0.000000
4,2566,episode_3,2024-10-01 00:00:17.857,episode,3,False,0.666667
...,...,...,...,...,...,...,...
91353,4978,mobility_2,2024-10-31 23:55:25.309,mobility,2,False,0.000000
91354,4978,quiz_5,2024-10-31 23:55:25.309,quiz,5,False,0.000000
91355,1593,episode_2,2024-10-31 23:59:41.228,episode,2,False,0.000000
91356,1593,activity_3,2024-10-31 23:59:41.228,activity,3,False,0.000000


In [2]:
# For every (user, mission) pair keep only the row that comes last in the
# current row order of `df`.
df = df.drop_duplicates(subset=['user', 'mission'], keep='last')

# Modelling table: user code, integer mission code and the performance target,
# sharing the (non-contiguous) index of the de-duplicated frame.
dataset = df[['user']].assign(
    mission=df['mission'].cat.codes,
    y=df['performance'],
)

dataset

Unnamed: 0,user,mission,y
0,4299,25,0.000000
1,4299,20,1.000000
2,4299,8,0.666667
8,4819,8,1.000000
9,4532,30,1.000000
...,...,...,...
91353,4978,21,0.000000
91354,4978,28,0.000000
91355,1593,13,0.000000
91356,1593,5,0.000000


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

class MissionDataset(Dataset):
    """Tensor-backed dataset of (user, mission, target) triples.

    The three columns are copied into tensors on ``device`` when the dataset
    is built; indexing returns the entries found at the requested position.

    Args:
        user_col: column of user indices (its ``.values`` are read).
        mission_col: column of mission indices (its ``.values`` are read).
        y_col: column of regression targets (its ``.values`` are read).
        device: device on which the tensors are created.
    """

    def __init__(self, user_col, mission_col, y_col, device=DEVICE):
        def to_tensor(column, dtype):
            # torch.tensor always copies, so the source column is never aliased.
            return torch.tensor(column.values, dtype=dtype, device=device)

        self.device = device
        self.user = to_tensor(user_col, torch.long)
        self.mission = to_tensor(mission_col, torch.long)
        self.y = to_tensor(y_col, torch.float32)

    def __len__(self):
        n_rows = len(self.user)
        # All three columns must describe the same rows.
        assert n_rows == len(self.mission) == len(self.y)
        return n_rows

    def __getitem__(self, idx):
        return tuple(column[idx] for column in (self.user, self.mission, self.y))

In [4]:
import torch.nn as nn

class NMF(nn.Module):
    """Matrix factorisation with non-negative user and mission factors.

    The score of a (user, mission) pair is the dot product of the two
    ``n_factors``-dimensional embedding rows. Non-negativity is enforced by
    projection: after every optimiser step in :meth:`fit` both embedding
    tables are clamped to ``>= 0``. The tables are not projected when the
    model is constructed, so factors can be negative before the first step.

    Args:
        n_users: number of rows in the user embedding table; must be larger
            than the largest user index that will be looked up.
        n_missions: number of rows in the mission embedding table; must be
            larger than the largest mission index that will be looked up.
        n_factors: dimensionality of every factor vector.
    """

    def __init__(self, n_users, n_missions, n_factors=10):
        super().__init__()

        self.user_factors = nn.Embedding(n_users, n_factors)
        self.mission_factors = nn.Embedding(n_missions, n_factors)

    def forward(self, user, mission):
        """Return the predicted score for each (user, mission) index pair.

        Args:
            user: integer tensor of user indices.
            mission: integer tensor of mission indices, same shape as ``user``.

        Returns:
            Tensor with the shape of the index tensors holding the dot product
            of the selected user and mission factor vectors.
        """
        user_factors = self.user_factors(user)
        mission_factors = self.mission_factors(mission)
        # Reduce over the factor axis (always the last one) so that scalar
        # indices work as well as 1-D batches.
        return (user_factors * mission_factors).sum(dim=-1)

    def _project_non_negative(self):
        """Clamp both factor tables to be >= 0, in place and outside autograd."""
        with torch.no_grad():
            self.user_factors.weight.clamp_(min=0)
            self.mission_factors.weight.clamp_(min=0)

    def fit(self, dataset: 'MissionDataset', epochs=10, lr=0.01, weight_decay=0.0, batch_size=32):
        """Train the factors with Adam on a mean-squared-error objective.

        The model is moved to ``dataset.device`` and put in training mode.
        After every optimiser step the factors are projected back onto the
        non-negative orthant. The mean of the per-batch losses is printed
        once per epoch.

        Args:
            dataset: map-style dataset yielding ``(user, mission, y)`` and
                exposing a ``device`` attribute.
            epochs: number of passes over ``dataset``.
            lr: Adam learning rate.
            weight_decay: Adam weight decay (L2 penalty).
            batch_size: number of samples per optimiser step.
        """
        self.to(dataset.device)
        self.train()

        optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.MSELoss()
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for epoch in range(epochs):
            running_loss = 0.0
            for user, mission, y in dataloader:
                optimizer.zero_grad()
                y_pred = self(user, mission)
                loss = criterion(y_pred, y)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

                # Projected gradient step: keep the factors non-negative.
                self._project_non_negative()

            print(f'Epoch {epoch+1}/{epochs} - Loss: {running_loss/len(dataloader):.4f}')

    @torch.no_grad()
    def evaluate(self, dataset: 'MissionDataset', batch_size=32):
        """Predict every sample of ``dataset`` without tracking gradients.

        The model is moved to ``dataset.device`` and left in evaluation mode.

        Args:
            dataset: map-style dataset yielding ``(user, mission, y)`` and
                exposing a ``device`` attribute.
            batch_size: number of samples scored per forward pass.

        Returns:
            ``(y_true, y_pred)`` as 1-D NumPy arrays in dataset order; two
            empty arrays when ``dataset`` has no samples.
        """
        self.to(dataset.device)
        self.eval()

        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
        y_true = []
        y_pred = []

        for user, mission, y in dataloader:
            y_true.append(y.cpu().numpy())
            y_pred.append(self(user, mission).cpu().numpy())

        if not y_true:
            # np.concatenate rejects an empty list; report "no samples" instead.
            return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.float32)

        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)

        return y_true, y_pred

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)
train_dataset = MissionDataset(train_df['user'], train_df['mission'], train_df['y'])
test_dataset = MissionDataset(test_df['user'], test_df['mission'], test_df['y'])

# An embedding table must be large enough for the biggest index it is asked
# for, which is not the same as the number of distinct indices when codes have
# gaps (e.g. after filtering rows). Size the user table from the largest code
# and the mission table from the full category list, which is also the index
# used when the mission embeddings are exported.
n_users = int(dataset['user'].max()) + 1
n_missions = len(df['mission'].cat.categories)

In [6]:
from sklearn.metrics import mean_squared_error
# Seed PyTorch's global generator so that the embedding initialisation and the
# DataLoader shuffling order, and therefore the losses printed below, can be
# repeated on the same setup.
torch.manual_seed(42)

model = NMF(n_users, n_missions, n_factors=8)
model.fit(train_dataset, epochs=15, lr=0.001, weight_decay=0.0)

# Score the held-out rows and report the mean squared error.
y_true, y_pred = model.evaluate(test_dataset)
print(f'\nTest MSE: {mean_squared_error(y_true, y_pred):.4f}')

Epoch 1/15 - Loss: 1.7138
Epoch 2/15 - Loss: 0.7149
Epoch 3/15 - Loss: 0.3677
Epoch 4/15 - Loss: 0.2378
Epoch 5/15 - Loss: 0.1821
Epoch 6/15 - Loss: 0.1538
Epoch 7/15 - Loss: 0.1364
Epoch 8/15 - Loss: 0.1248
Epoch 9/15 - Loss: 0.1166
Epoch 10/15 - Loss: 0.1105
Epoch 11/15 - Loss: 0.1058
Epoch 12/15 - Loss: 0.1021
Epoch 13/15 - Loss: 0.0991
Epoch 14/15 - Loss: 0.0967
Epoch 15/15 - Loss: 0.0945

Test MSE: 0.1236


In [7]:
# Export the learned factor tables as CSV. User rows keep the default
# positional index (row i holds the factors looked up for user code i);
# mission rows are labelled with the mission category names.
users = pd.DataFrame(model.user_factors.weight.detach().cpu().numpy())
users.to_csv('user_embeddings.csv')

missions = pd.DataFrame(
    model.mission_factors.weight.detach().cpu().numpy(),
    index=df['mission'].cat.categories,
)
missions.to_csv('mission_embeddings.csv')

In [8]:
from numpy.linalg import norm

def top_knn(embeddings, query, top_k=5):
    """Return the indices of the rows of ``embeddings`` most similar to ``query``.

    Similarity is the cosine between each row and ``query``. Rows whose norm
    is zero (or every row, when ``query`` itself has zero norm) have no
    defined cosine; they are scored ``-inf`` so that they rank last instead of
    producing NaNs that would end up at the front of the reversed sort.

    Args:
        embeddings: 2-D array-like, one candidate vector per row.
        query: 1-D array-like with as many entries as ``embeddings`` has columns.
        top_k: maximum number of indices to return.

    Returns:
        1-D integer array of at most ``top_k`` row indices, most similar first.
    """
    embeddings = np.asarray(embeddings)
    query = np.asarray(query)

    scale = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query)
    cosine = np.full(scale.shape, -np.inf)
    # Divide only where the denominator is non-zero; other entries keep -inf.
    np.divide(embeddings @ query, scale, out=cosine, where=scale > 0)

    return np.argsort(cosine)[::-1][:top_k]

# Draw one user code at random; its row of learned factors is the query.
user = np.random.randint(n_users)
query = users.iloc[user].to_numpy()

# Indices of the mission factor rows most similar to the query.
top_k = top_knn(missions.to_numpy(), query, top_k=5)

print(f'\nTop 5 missions for user {user}:')
for idx in top_k:
    print(f'{missions.index[idx]}')


Top 5 missions for user 2349:
episode_6
episode_3
activity_1
streak_1
activity_5


In [9]:
# Rows of `df` that belong to the user code held in `user`.
df.loc[df['user'] == user]

Unnamed: 0,user,mission,createdAtT,type,target,completed,performance
24575,2349,quiz_4,2024-10-09 05:35:32.045,quiz,4,False,0.0
24576,2349,activity_5,2024-10-09 05:35:32.045,activity,5,False,0.2
24577,2349,streak_1,2024-10-09 05:35:32.045,streak,1,True,1.0
