In [None]:
import os
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides numpy/pandas warnings raised by later cells
# (e.g. divide-by-zero in the percentage-error metric) — consider narrowing it.
warnings.filterwarnings('ignore')

# Create the output directory if it does not exist yet (no error when it already does).
os.makedirs('../out/', exist_ok=True)

# Campaigns to analyse. Both file mappings below are derived from this single list so
# they always share the same keys, in the same order (the order in which per-campaign
# frames are concatenated later on).
CAMPAIGNS = ('HSC_Lecco_2023', 'HSC_Lecco_2024', 'HSC_Ferrara_2023')

# Campaign id -> parquet file holding that campaign's "Players_Daily_Mobility_Stats".
STATS_FILES = {
    campaign: f'../data/{campaign}-Players_Daily_Mobility_Stats.parquet'
    for campaign in CAMPAIGNS
}

# Campaign id -> parquet file holding that campaign's "Players_Daily_Mobility_Scores".
SCORES_FILES = {
    campaign: f'../data/{campaign}-Players_Daily_Mobility_Scores.parquet'
    for campaign in CAMPAIGNS
}

In [None]:
import pandas as pd
import numpy as np

def load_stats(path: str, c: str) -> pd.DataFrame:
    """Read one campaign's daily stats parquet file into the notebook's long format.

    Args:
        path: location of the parquet file; it must provide the columns
            ``playerId``, ``modeType``, ``distance`` and ``stat_date``.
        c: campaign identifier written into the ``campaign`` column of every row.

    Returns:
        A frame with exactly the columns ``playerId``, ``campaign``, ``ts``,
        ``counter`` and ``score``, where ``counter`` is the former ``modeType``,
        ``ts`` the former ``stat_date`` and ``score`` the former ``distance``
        divided by 1000.
    """
    column_map = {'modeType': 'counter', 'distance': 'score', 'stat_date': 'ts'}

    stats = pd.read_parquet(path)
    stats = stats.assign(
        distance=stats['distance'] / 1000,  # convert to km
        campaign=c,
    )
    stats = stats.rename(columns=column_map)
    return stats[['playerId', 'campaign', 'ts', 'counter', 'score']]

def load_scores(path: str, c: str) -> pd.DataFrame:
    """Read one campaign's daily scores parquet file into the notebook's long format.

    Args:
        path: location of the parquet file; it must provide the columns
            ``player_id``, ``mobilityScore`` and ``day``.
        c: campaign identifier written into the ``campaign`` column of every row.

    Returns:
        A frame with exactly the columns ``playerId``, ``campaign``, ``ts``,
        ``counter`` and ``score``. ``counter`` is the constant string ``'score'``
        so these rows can be stacked with the per-mode rows of ``load_stats``.
    """
    scores = pd.read_parquet(path).rename(
        columns={'player_id': 'playerId', 'mobilityScore': 'score', 'day': 'ts'}
    )
    scores = scores.assign(counter='score', campaign=c)
    return scores[['playerId', 'campaign', 'ts', 'counter', 'score']]

# One long-format frame per source, stacking every configured campaign.
df_stats = pd.concat([
    load_stats(stats_path, campaign)
    for campaign, stats_path in STATS_FILES.items()
])
df_scores = pd.concat([
    load_scores(scores_path, campaign)
    for campaign, scores_path in SCORES_FILES.items()
])

# Combine both sources and parse `ts` so it can drive the time-based resampling below.
df = pd.concat([df_stats, df_scores])
df['ts'] = pd.to_datetime(df['ts'])

# Weekly ('W') totals of `score` for every (player, campaign, counter) series ...
df = (
    df.groupby(['playerId', 'campaign', 'counter'])
    .resample('W', on='ts')
    .sum(numeric_only=True)
    .reset_index()
)
# ... rounded up to the next whole number and stored as integers.
df['score'] = np.ceil(df['score']).astype(int)
df

In [None]:
def flatten(df: pd.DataFrame):
    """Collapse the rows of one group into a start timestamp and a score matrix.

    Args:
        df: rows of a single group, with a ``ts`` column and one column per
            counter (``walk``, ``bike``, ``train``, ``bus``, ``car``, ``score``).

    Returns:
        A Series with two entries:
        ``start``  - the smallest ``ts`` of the group;
        ``scores`` - a 2-D array with one row per input row and always six
        columns, ordered walk, bike, train, bus, car, score.
    """
    counters = ['walk', 'bike', 'train', 'bus', 'car', 'score']
    # reindex instead of plain column selection: a counter that never occurs in the
    # pivoted data would otherwise raise a KeyError. It is filled with zeros, the same
    # value the pivot step uses for missing observations, so the matrix width stays 6.
    scores = df.reindex(columns=counters, fill_value=0).values
    return pd.Series({
        'start': df['ts'].min(),
        'scores': scores,
    })

# Wide layout: one row per (player, campaign, week), one column per counter;
# combinations without an observation count as 0.
flattened_df = (
    df.pivot(index=['playerId', 'campaign', 'ts'], columns='counter', values='score')
    .fillna(0)
    .reset_index()
)

# One row per (player, campaign): earliest timestamp plus the matrix of weekly scores.
flattened_df = (
    flattened_df.groupby(['playerId', 'campaign'])
    .apply(flatten, include_groups=False)
    .reset_index()
)

# Keep only series with a non-zero total and more than two weekly rows.
flattened_df = flattened_df[
    flattened_df['scores'].apply(lambda weekly: np.sum(weekly) > 0 and len(weekly) > 2)
]
flattened_df.sort_values('start', inplace=True)
flattened_df

In [None]:
# Record the number of weekly rows of every series in `n`, then keep only the
# series that have more than 6 of them.
flattened_df = (
    flattened_df
    .assign(n=flattened_df['scores'].apply(lambda weekly: weekly.shape[0]))
    .loc[lambda frame: frame['n'] > 6]
)

# Distribution of the remaining series lengths.
flattened_df['n'].hist(bins=100, figsize=(10, 5))

# The ten shortest series that survived the filter.
flattened_df.sort_values('n').head(10)

## Time-series analysis

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Use the GPU when torch reports CUDA as available, otherwise the CPU; print the choice.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(DEVICE)


class PerformancePredictorRNN(nn.Module):
    """GRU followed by a linear layer that forecasts the next row of a performance matrix.

    The model is fitted on ONE series at a time: ``fit`` uses every row but the
    last as the input sequence and the last row as the regression target.
    ``predict`` then shifts that window forward by one row (dropping the oldest
    row, appending the former target) and returns the model output for it.

    Attributes:
        scaler_: optional object with ``fit_transform`` / ``inverse_transform``
            (set by ``fit``; ``None`` means the data is used unscaled).
        X_: input sequence tensor used for training (set by ``fit``).
        y_: target row tensor used for training (set by ``fit``).
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int, layers: int = 1, dropout: float = 0.0):
        """Build the GRU (``layers`` stacked layers) and the output projection."""
        super().__init__()
        assert layers >= 1
        self.hidden_size = hidden_size
        # nn.GRU applies dropout only between stacked layers, so it is forced to 0
        # for a single-layer network.
        dropout = 0.0 if layers == 1 else dropout

        self.rnn = nn.GRU(input_size, hidden_size, layers, bidirectional=False, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        # Defined up front so helper methods can test it even before fit() ran.
        self.scaler_ = None

    def forward(self, x):
        """Map one unbatched 2-D sequence (rows = time steps) to a single output vector."""
        assert len(x.shape) == 2
        out, _ = self.rnn(x)
        out = out[-1, :]  # keep only the GRU output of the last time step
        out = self.fc(out)
        return out

    def _prepare_data(self, data):
        """Scale (optionally) and tensorise ``data``; return (all rows but last, last row).

        Raises:
            ValueError: if ``data`` has fewer than two rows, since one row is
                needed as target and at least one as input.
        """
        # Start from the raw data so the no-scaler path is well defined; previously X
        # was only assigned inside the `if`, raising UnboundLocalError when scaler=None.
        X = data
        if self.scaler_ is not None:
            X = self.scaler_.fit_transform(data)
        X = torch.tensor(X, dtype=torch.float32).to(DEVICE)
        if X.shape[0] < 2:
            raise ValueError('performance matrix needs at least two rows (input sequence + target row)')
        return X[:-1], X[-1]

    def _train_loop(self, X, y, epochs, lr):
        """Run ``epochs`` Adam steps minimising the MSE between ``self(X)`` and ``y``."""
        self.to(DEVICE)
        self.train()
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)

        # Every step uses the same single (sequence, target) pair.
        for _ in range(epochs):
            optimizer.zero_grad()
            output = self(X)
            loss: torch.Tensor = criterion(output, y)
            loss.backward()
            optimizer.step()

    # Fit the model
    def fit(self, performance_matrix, epochs=100, lr=0.01, scaler=None):
        """Train on one performance matrix (rows = time steps, columns = features).

        Args:
            performance_matrix: 2-D array-like with at least two rows.
            epochs: number of optimisation steps.
            lr: Adam learning rate.
            scaler: optional object with ``fit_transform`` / ``inverse_transform``;
                it is (re)fitted on ``performance_matrix``.

        Returns:
            ``self``, so calls can be chained.
        """
        self.scaler_ = scaler
        self.X_, self.y_ = self._prepare_data(performance_matrix)
        self._train_loop(self.X_, self.y_, epochs, lr)
        return self

    # Predict next week performance array
    def predict(self):
        """Forecast the row following the fitted matrix.

        Returns:
            1-D numpy array in the original (unscaled) units, clipped at zero.
        """
        self.to(DEVICE)
        self.eval()

        # Slide the training window forward by one step: drop the oldest row and
        # append the row that served as the training target.
        X = torch.cat([
            self.X_[1:],
            self.y_.reshape(1, -1)
        ], dim=0)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            y = self(X).cpu().numpy()
        if self.scaler_ is not None:
            y = self.scaler_.inverse_transform(y.reshape(1, -1)).reshape(-1)
        y = np.maximum(y, 0) # ensure non-negative values

        return y

In [None]:
from sklearn.model_selection import TimeSeriesSplit

def mean_absolute_error(y_true, y_pred, ignore_zeros=False):
    """Column-wise mean absolute error, skipping NaN entries.

    Args:
        y_true: array of observed values (rows = samples).
        y_pred: array of predictions, broadcastable against ``y_true``.
        ignore_zeros: when True, samples whose observed value is exactly 0 are
            turned into NaN and therefore left out of the mean.

    Returns:
        The mean of ``|y_true - y_pred|`` along axis 0.
    """
    target = np.where(y_true == 0, np.nan, y_true) if ignore_zeros else y_true
    return np.nanmean(np.abs(target - y_pred), axis=0)

def root_mean_squared_error(y_true, y_pred, ignore_zeros=False):
    """Column-wise root mean squared error, skipping NaN entries.

    Args:
        y_true: array of observed values (rows = samples).
        y_pred: array of predictions, broadcastable against ``y_true``.
        ignore_zeros: when True, samples whose observed value is exactly 0 are
            turned into NaN and therefore left out of the mean.

    Returns:
        The square root of the mean of ``(y_true - y_pred) ** 2`` along axis 0.
    """
    target = np.where(y_true == 0, np.nan, y_true) if ignore_zeros else y_true
    mean_squared = np.nanmean(np.square(target - y_pred), axis=0)
    return np.sqrt(mean_squared)

def mean_absolute_percentage_error(y_true, y_pred, ignore_zeros=False):
    """Column-wise mean absolute percentage error (as a fraction), skipping NaN entries.

    Args:
        y_true: array of observed values (rows = samples).
        y_pred: array of predictions, broadcastable against ``y_true``.
        ignore_zeros: when True, samples whose observed value is exactly 0 are
            turned into NaN and left out of the mean. When False, such samples
            divide by zero: they contribute ``inf`` (or NaN when the prediction
            is 0 as well, which the NaN-aware mean then skips).

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` along axis 0.
    """
    if ignore_zeros:
        y_true = np.where(y_true == 0, np.nan, y_true)
    # Divide by the magnitude of the target: dividing by the signed value would turn
    # the "absolute" error negative for negative targets.
    errors = np.abs(y_true - y_pred) / np.abs(y_true)
    return np.nanmean(errors, axis=0)

def cross_validate(x: pd.Series, predict):
    """Evaluate a one-step-ahead predictor on a single series with time-series CV.

    The score matrix in ``x['scores']`` is split with
    ``TimeSeriesSplit(n_splits=5, test_size=1)``: for each of the five folds the
    predictor sees the training rows and its output is compared with the single
    held-out row.

    Args:
        x: one row of the flattened frame; only ``x['scores']`` is read.
        predict: callable taking the training rows and returning the forecast
            for the next row.

    Returns:
        A Series with the entries ``rmse``, ``mae`` and ``mape``, each holding
        the per-column metric over the five folds. Zeros are NOT ignored, so
        the ``mape`` entry divides by zero wherever the held-out value is 0.
    """
    history = x['scores']
    splitter = TimeSeriesSplit(n_splits=5, test_size=1)

    forecasts, observations = [], []
    for train_idx, test_idx in splitter.split(history):
        assert len(test_idx) == 1

        observed = history[test_idx].reshape(-1)
        forecast = predict(history[train_idx])

        forecasts.append(forecast)
        observations.append(observed)

    forecasts = np.array(forecasts)
    observations = np.array(observations)

    metrics = {
        'rmse': root_mean_squared_error,
        'mae': mean_absolute_error,
        'mape': mean_absolute_percentage_error,
    }
    return pd.Series({
        name: metric(observations, forecasts, ignore_zeros=False)
        for name, metric in metrics.items()
    })

def global_mean(x: pd.Series):
    """Average the entries of ``x`` element-wise, ignoring NaNs.

    Args:
        x: Series whose entries are scalars or equally shaped arrays.

    Returns:
        The NaN-aware mean over the entries of ``x`` (axis 0 of the stacked values).
    """
    entries = x.values.tolist()
    stacked = np.array(entries)
    return np.nanmean(stacked, axis=0)

In [None]:
# Current predictor: moving average of the last 4 weeks

from tqdm import tqdm
# Register tqdm with pandas; the evaluation below relies on it for `progress_apply`.
tqdm.pandas()

def predict(X, window: int = 4):
    """Moving-average baseline: column-wise mean of the most recent rows of ``X``.

    Args:
        X: 2-D array with one row per week (oldest first).
        window: number of most recent rows to average; defaults to 4. When ``X``
            has fewer rows than ``window``, all available rows are averaged.

    Returns:
        1-D array with the mean of each column over the selected rows.

    Raises:
        ValueError: if ``window`` is smaller than 1 (``X[-0:]`` would silently
            select the whole history instead of nothing).
    """
    if window < 1:
        raise ValueError('window must be at least 1')
    return X[-window:].mean(axis=0)  # moving average of the last `window` weeks
        
# Cross-validate every series with the `predict` defined in this cell, average each
# metric over all series (NaNs ignored) and format the numbers with two decimals.
Y_hat: pd.DataFrame = (
    flattened_df
    .progress_apply(cross_validate, axis=1, predict=predict)
    .apply(global_mean)
    .map('{:.2f}'.format)
)
# Label the rows with the counter each position of the score vectors stands for.
Y_hat.index = ['walk', 'bike', 'train', 'bus', 'car', 'score']
Y_hat

In [None]:
# NOTE(review): `scaler` is an alias for the StandardScaler CLASS, not an instance;
# `predict` below creates a fresh instance with `scaler()` on every call.
from sklearn.preprocessing import StandardScaler as scaler

from tqdm import tqdm
# Register tqdm with pandas; the evaluation below relies on it for `progress_apply`.
tqdm.pandas()

def predict(X):
    """GRU predictor: fit a fresh model on ``X`` and forecast the following row.

    A new single-layer ``PerformancePredictorRNN`` (6 inputs, 40 hidden units,
    6 outputs) and a new scaler instance are created on every call, so nothing
    is shared between calls.

    Args:
        X: 2-D array with one row per week and six columns.

    Returns:
        The model's forecast for the row following ``X``.
    """
    architecture = dict(input_size=6, hidden_size=40, output_size=6, layers=1)
    fitted = PerformancePredictorRNN(**architecture).fit(X, epochs=200, lr=1e-4, scaler=scaler())
    return fitted.predict()

# Cross-validate every series with the `predict` defined in this cell, average each
# metric over all series (NaNs ignored) and format the numbers with two decimals.
Y_hat: pd.DataFrame = (
    flattened_df
    .progress_apply(cross_validate, axis=1, predict=predict)
    .apply(global_mean)
    .map('{:.2f}'.format)
)
# Label the rows with the counter each position of the score vectors stands for.
Y_hat.index = ['walk', 'bike', 'train', 'bus', 'car', 'score']
Y_hat

In [None]:
# POP predictor: take the last week score

from tqdm import tqdm
# Register tqdm with pandas; the evaluation below relies on it for `progress_apply`.
tqdm.pandas()


def predict(X):
    """Last-value baseline: forecast the next row as a repeat of the final row of ``X``.

    Args:
        X: sequence of weekly rows (oldest first) with at least one entry.

    Returns:
        The last entry of ``X``.
    """
    last_week = X[-1]  # take the last week score
    return last_week

# Cross-validate every series with the `predict` defined in this cell, average each
# metric over all series (NaNs ignored) and format the numbers with two decimals.
Y_hat: pd.DataFrame = (
    flattened_df
    .progress_apply(cross_validate, axis=1, predict=predict)
    .apply(global_mean)
    .map('{:.2f}'.format)
)
# Label the rows with the counter each position of the score vectors stands for.
Y_hat.index = ['walk', 'bike', 'train', 'bus', 'car', 'score']
Y_hat