In [1]:
import json
import pandas as pd
import numpy as np

# Challenge export: one JSON document per line.
DATA_PATH = '../data/Ferrara2023-citizens-individualChallenges.json'


def _min_max_scale(values: pd.Series) -> pd.Series:
    """Rescale `values` to [0, 1]; a constant series maps to 0.0 instead of NaN (0/0)."""
    low = values.min()
    span = values.max() - low
    if span == 0:
        return values * 0.0
    return (values - low) / span


# Read inside a context manager so the file handle is closed; skip blank lines,
# which json.loads would otherwise reject.
with open(DATA_PATH, encoding='utf-8') as challenges_file:
    data = [json.loads(line) for line in challenges_file if line.strip()]

# Keep non-survey challenges whose state is COMPLETED or FAILED.
data = [d for d in data if d['concept']['modelName'] != 'survey' and d['concept']['state'] in ['COMPLETED', 'FAILED']]
filtered_data = [
    {
        'user_id': d['playerId'],
        'counter': d['concept']['fields']['counterName'],
        'target': int(d['concept']['fields']['target']),
        'periodTarget': d['concept']['fields']['periodTarget'],
        'start': d['concept']['start'],
        'completed': int(d['concept']['completed']),
    }
    for d in data
]

df = pd.DataFrame(filtered_data)
df['start'] = pd.to_datetime(df['start'])
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the unfiltered frame.
df = df[df['start'] < '2023-09-17'].copy()

# A missing periodTarget is treated as 1.
df['periodTarget'] = df['periodTarget'].fillna(1).astype(int)
df['user_id'] = df['user_id'].astype('category')
df['counter'] = df['counter'].astype('category')

# beta = target * periodTarget, min-max scaled separately within each counter.
df['beta'] = df['target'] * df['periodTarget']
df['beta'] = df.groupby('counter', observed=True)['beta'].transform(_min_max_scale)

df = df.sort_values('start', ascending=True, ignore_index=True)

df

Unnamed: 0,user_id,counter,target,periodTarget,start,completed,beta
0,u_0bea6988-bd00-4aa6-a456-4285744356ee,Walk_Km,1,1,2023-04-23 22:00:00,1,0.000000
1,u_f4b95ec9-5f53-48d8-8adf-fa61be00c967,green leaves,30,2,2023-04-23 22:00:00,1,0.032258
2,u_f2d8d95d-6cc9-4e8e-b97d-88d0c1b0ae27,green leaves,30,2,2023-04-23 22:00:00,1,0.032258
3,u_f2d8d95d-6cc9-4e8e-b97d-88d0c1b0ae27,Walk_Km,1,1,2023-04-23 22:00:00,1,0.000000
4,u_e4a7a7fcd26644e9961e3e05bf98454e,green leaves,30,2,2023-04-23 22:00:00,1,0.032258
...,...,...,...,...,...,...,...
5858,u_553c90102f4e48f88e4617393a5550a1,Walk_Km,1,1,2023-09-10 22:00:00,0,0.000000
5859,u_54061e0911df4785a154a10fdf6def06,green leaves,1,5,2023-09-10 22:00:00,0,0.001669
5860,u_54061e0911df4785a154a10fdf6def06,Walk_Km,1,1,2023-09-10 22:00:00,0,0.000000
5861,u_614d26e70185417d831235a03703fdb4,Walk_Km,1,1,2023-09-10 22:00:00,1,0.000000


In [None]:
import matplotlib.pyplot as plt

# One row, two panels: counts per `completed` value (left) and challenges per start date (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

# Left panel: bar chart of how often each `completed` value occurs.
completed_counts = df['completed'].value_counts()
completed_counts.plot.bar(ax=ax1)
ax1.set(
    title='Number of challenges completed',
    xlabel='Challenges completed',
    ylabel='Number of challenges',
)

# Right panel: line with the number of challenges per start date ...
challenges_per_start = df.groupby('start').size()
challenges_per_start.plot(ax=ax2)

# ... overlaid, on a twin x-axis, with bars split by `completed` value
# (colors are assigned to the unstacked columns in order: red, then green).
ax3 = ax2.twiny()
challenges_per_start_and_outcome = df.groupby(['start', 'completed']).size().unstack()
challenges_per_start_and_outcome.plot.bar(
    ax=ax3,
    legend=True,
    stacked=False,
    color=['red', 'green'],
)
# Hide the twin axis' own label and ticks; only ax2's x-axis stays annotated.
ax3.set_xlabel('')
ax3.set_xticks([])

ax2.set(
    title='Number of challenges per week',
    xlabel='Start date',
    ylabel='Number of challenges',
)

plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

# Seed shared by torch and numpy so that runs are repeatable.
SEED = 0

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

torch.manual_seed(SEED)
np.random.seed(SEED)

class FactorizationMachine(nn.Module):
    """Second-order factorization machine over a fixed list of fields.

    All fields share one embedding table and one per-feature bias table; field
    `i` owns the index range starting at `offsets[i]` and spanning
    `field_dims[i]` rows.

    Args:
        field_dims: number of distinct indices of each field.
        num_factors: size of the latent vector learned for every index.
        sigmoid: if True `forward` returns probabilities, otherwise raw logits.
    """

    def __init__(self, field_dims, num_factors, sigmoid=True):
        super(FactorizationMachine, self).__init__()
        num_inputs = sum(field_dims)
        # Start row of each field in the shared tables: 0, d0, d0 + d1, ...
        offsets = torch.tensor([0, *np.cumsum(field_dims)[:-1].tolist()], dtype=torch.long)
        # Stored as a frozen parameter so it follows the module across devices
        # and is saved in the state dict, but is never updated by the optimizer.
        self.offsets = nn.Parameter(offsets, requires_grad=False)

        self.embeddings = nn.Embedding(num_inputs, num_factors)
        self.biases = nn.Embedding(num_inputs, 1)
        self.bias_ = nn.Parameter(torch.tensor([0.0]), requires_grad=True)

        self.sigmoid = sigmoid

    def forward(self, features: torch.Tensor, features_values: torch.Tensor):
        """Score a batch.

        Args:
            features: per-field indices, local to each field; the reductions
                over dim=1 below expect shape (batch, num_fields).
            features_values: multiplier applied to each field's embedding and
                bias, same shape as `features`.

        Returns:
            1-D tensor with one probability (sigmoid=True) or logit per row.
        """
        features = features + self.offsets  # add feature offsets to get indices for embeddings

        embeddings: torch.Tensor = self.embeddings(features)
        features_values = features_values.unsqueeze(dim=-1)
        embeddings = embeddings * features_values

        # Pairwise interactions: 0.5 * ((sum of vectors)^2 - sum of squared vectors),
        # summed over the latent dimension.
        square_of_sum = torch.sum(embeddings, dim=1).pow(2)
        sum_of_square = torch.sum(embeddings.pow(2), dim=1)
        logits = 0.5 * (square_of_sum - sum_of_square).sum(dim=1, keepdim=True)

        # Linear term: value-weighted per-feature biases plus the global bias.
        feature_bias: torch.Tensor = self.biases(features)
        feature_bias = (feature_bias * features_values).sum(dim=1)
        logits = logits + feature_bias + self.bias_

        if self.sigmoid:    # return probability
            return torch.sigmoid(logits).view(-1)
        return logits.view(-1)  # return logits

    def fit(self, dataset: Dataset, epochs: int = 10, batch_size: int = 16,
            lr: float = 0.01, shuffle: bool = False):
        """Train with Adam on a binary cross-entropy objective.

        Args:
            dataset: yields `(features, features_values, target)` triples.
            epochs: number of passes over `dataset`.
            batch_size: mini-batch size.
            lr: Adam learning rate.
            shuffle: whether to reshuffle the data every epoch.
        """
        self.train()
        # Move batches to wherever the model's parameters live, so training
        # works whether or not the caller moved the model to another device.
        device = self.bias_.device
        optimizer = optim.Adam(self.parameters(), lr=lr)
        # BCELoss expects probabilities; a model that outputs logits needs the
        # logit-based variant of the loss.
        criterion = nn.BCELoss() if self.sigmoid else nn.BCEWithLogitsLoss()

        train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
        for _ in (bar := tqdm(range(epochs))):
            loss = None
            for features, features_values, target in tqdm(train_loader, leave=False):
                optimizer.zero_grad()
                output = self(features.to(device), features_values.to(device))
                loss = criterion(output, target.to(device))
                loss.backward()
                optimizer.step()
            # Report the loss of the epoch's last batch; nothing to report if
            # the loader produced no batches.
            if loss is not None:
                bar.set_postfix(loss=loss.item())

    def predict(self, dataset: Dataset, batch_size: int = 32):
        """Return the model output for every item of `dataset` as a NumPy array.

        Items may be `(features, features_values)` pairs or
        `(features, features_values, target)` triples; a target, if present,
        is ignored.
        """
        self.eval()
        device = self.bias_.device
        predictions = []
        loader = DataLoader(dataset, batch_size=batch_size)
        with torch.no_grad():
            for batch in loader:
                features, features_values = batch[0], batch[1]
                output = self(features.to(device), features_values.to(device))
                predictions.extend(output.cpu().numpy())
        return np.array(predictions)


class FMData(Dataset):
    """Dataset that turns DataFrame rows into factorization-machine inputs.

    Every column of `data` (except `y_col`) is one field:

    * a categorical column contributes its category code as the feature index
      and 1.0 as the feature value; its field size is the number of categories;
    * any other column contributes index 0 and its own value; its field size is 1.

    Args:
        data: source frame; it is not modified.
        y_col: name of the target column, or '' when there is no target.

    Attributes set here: `data` (the feature frame with categoricals replaced
    by their codes), `y` (target values or None) and `field_dims`.
    """

    def __init__(self, data: pd.DataFrame, y_col: str = ''):
        if y_col == '':
            self.y = None
            self.data = data.copy()
        else:
            target = data[y_col]
            # A categorical target is replaced by its integer codes, taken on
            # the whole Series (`.cat` is not available on individual elements).
            if isinstance(target.dtype, pd.CategoricalDtype):
                target = target.cat.codes
            self.y = target.values
            self.data = data.drop(columns=y_col)

        # Remember explicitly which fields are categorical: the field size
        # alone cannot tell a one-category column from a numeric one.
        is_categorical = [isinstance(dtype, pd.CategoricalDtype) for dtype in self.data.dtypes]
        self.field_dims = [
            len(dtype.categories) if categorical else 1
            for dtype, categorical in zip(self.data.dtypes, is_categorical)
        ]
        self.data = self.data.apply(
            lambda col: col.cat.codes if isinstance(col.dtype, pd.CategoricalDtype) else col
        )

        # Build the index/value tensors once instead of on every item access.
        # NOTE(review): a missing categorical value has code -1 and would yield
        # an invalid index; confirm the inputs contain no missing categories.
        matrix = self.data.to_numpy(dtype=np.float64)
        categorical_mask = np.array(is_categorical, dtype=bool)
        indices = np.where(categorical_mask, matrix, 0).astype(np.int64)
        values = np.where(categorical_mask, 1.0, matrix).astype(np.float32)
        self._features = torch.from_numpy(indices)
        self._features_values = torch.from_numpy(values)

    def __len__(self):
        """Number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(features, features_values)` for row `idx`, plus the target when there is one.

        `features` is a long tensor, `features_values` and the target are float32.
        """
        # clone() hands out independent tensors, so callers cannot alter the cached ones.
        features = self._features[idx].clone()
        features_values = self._features_values[idx].clone()

        if self.y is not None:
            target = torch.tensor(self.y[idx], dtype=torch.float32)
            return features, features_values, target
        return features, features_values