In [None]:
import pandas as pd
import numpy as np

def load_stats(path: str, c: str) -> pd.DataFrame:
    """Read one campaign's per-mode distance file into the common long format.

    Args:
        path: Location of the parquet file to read.
        c: Campaign label written into the ``campaign`` column of every row.

    Returns:
        A frame with the columns ``playerId``, ``campaign``, ``ts``,
        ``counter`` and ``score`` (in that order), where ``counter`` is the
        file's ``modeType``, ``ts`` its ``stat_date`` and ``score`` its
        ``distance`` divided by 1000.
    """
    stats = (
        pd.read_parquet(path)
        # convert to km (presumably the raw distance is in metres -- TODO confirm)
        .assign(distance=lambda frame: frame['distance'] / 1000, campaign=c)
        .rename(columns={'modeType': 'counter',
                         'distance': 'score', 'stat_date': 'ts'})
    )
    return stats[['playerId', 'campaign', 'ts', 'counter', 'score']]


def load_scores(path: str, c: str) -> pd.DataFrame:
    """Read one campaign's mobility-score file into the common long format.

    Args:
        path: Location of the parquet file to read.
        c: Campaign label written into the ``campaign`` column of every row.

    Returns:
        A frame with the columns ``playerId``, ``campaign``, ``ts``,
        ``counter`` and ``score`` (in that order). ``counter`` is the constant
        string ``'score'`` so these rows can be stacked with the per-mode rows.
    """
    scores = (
        pd.read_parquet(path)
        .rename(columns={'player_id': 'playerId',
                         'mobilityScore': 'score', 'day': 'ts'})
        .assign(counter='score', campaign=c)
    )
    return scores[['playerId', 'campaign', 'ts', 'counter', 'score']]


# Campaign name -> parquet file holding that campaign's daily per-mode stats.
STATS_FILES = {
    'HSC_Lecco_2023': '../data/HSC_Lecco_2023-Players_Daily_Mobility_Stats.parquet',
    'HSC_Lecco_2024': '../data/HSC_Lecco_2024-Players_Daily_Mobility_Stats.parquet',
    'HSC_Ferrara_2023': '../data/HSC_Ferrara_2023-Players_Daily_Mobility_Stats.parquet',
}

# Campaign name -> parquet file holding that campaign's daily mobility scores.
SCORES_FILES = {
    'HSC_Lecco_2023': '../data/HSC_Lecco_2023-Players_Daily_Mobility_Scores.parquet',
    'HSC_Lecco_2024': '../data/HSC_Lecco_2024-Players_Daily_Mobility_Scores.parquet',
    'HSC_Ferrara_2023': '../data/HSC_Ferrara_2023-Players_Daily_Mobility_Scores.parquet',
}

# Load every campaign with the matching loader and stack the results.
df_stats = pd.concat(
    [load_stats(path, campaign) for campaign, path in STATS_FILES.items()]
)
df_scores = pd.concat(
    [load_scores(path, campaign) for campaign, path in SCORES_FILES.items()]
)

# Combine both sources, parse the timestamps, then sum the score per
# player / campaign / counter within each weekly ('W') bin.
df = (
    pd.concat([df_stats, df_scores])
    .assign(ts=lambda frame: pd.to_datetime(frame['ts']))
    .groupby(['playerId', 'campaign', 'counter',
              pd.Grouper(key='ts', freq='W')])
    .sum(numeric_only=True)
    .reset_index()
)
df

In [None]:
# Count the rows each player has in `df`, then tabulate how many players
# share each row count (sorted by row count).
(
    df.groupby('playerId')
    .size()
    .value_counts()
    .sort_index()
)

In [None]:
# Candidate target values per counter: each (counter, value) pair becomes one item.
items = {
    'walk': list(range(0, 101, 1)),
    'bike': list(range(0, 201, 1)),
    'train': list(range(0, 1001, 10)),
    'bus': list(range(0, 501, 10)),
    'car': list(range(0, 1001, 10)),
    'score': list(range(0, 2001, 10)),
}
# Flatten the mapping into one row per (counter, value), values stored as floats.
items = pd.DataFrame(
    [(name, float(value)) for name, values in items.items() for value in values],
    columns=['counter', 'score'],
)
# A target of 0 is not a usable item, so only strictly positive values are kept.
items = items.loc[items['score'] > 0].reset_index(drop=True)

def get_difficulty(scores: pd.Series) -> pd.Series:
    """Min-max scale a series of scores to the range [0, 1].

    The minimum and maximum are computed once and the scaling is applied to
    the whole series at once, instead of re-scanning the series for every
    element.

    Args:
        scores: Numeric values to scale.

    Returns:
        A float series with the same index as ``scores``, where the smallest
        value maps to 0 and the largest to 1. When all values are equal the
        range is zero and the result is NaN.
    """
    lowest = scores.min()
    spread = scores.max() - lowest
    return (scores - lowest) / spread
# Scale each item's target value within its own counter, and expose the
# row index as an explicit item identifier.
items = items.assign(
    difficulty=items.groupby('counter')['score'].transform(get_difficulty),
    item_id=items.index,
)
items

In [None]:
# One row per distinct player, numbered consecutively from 0.
users = pd.DataFrame({'playerId': df['playerId'].unique()})
users['user_id'] = range(len(users))
users

In [None]:
# Map every weekly total to the item it reached: within the same counter,
# pick the item with the largest target value that is <= the observed score
# (direction='backward'). Both inputs are sorted by the `on` key ('score')
# before being handed to merge_asof. Rows with any missing value after the
# match are dropped.
observed = (
    pd.merge_asof(
        df.sort_values('score', ascending=True),
        items.assign(item_id=items.index).sort_values('score', ascending=True),
        by='counter',
        on='score',
        direction='backward',
    )
    .dropna()
    .merge(users, on='playerId')           # attach the integer user_id
    .astype({'item_id': int})
    .assign(month=lambda frame: frame['ts'].dt.month)
    .sort_values('ts', ascending=True, ignore_index=True)  # chronological order
)
observed

In [None]:
# Columns carried into the modelling step. 'playerId' has to be part of the
# selection: the per-player split below groups on it (without it the groupby
# raises KeyError), and later cells read X_train['playerId'] and
# X_test['playerId'].
dataset = observed[[
    'playerId',
    'user_id',
    'item_id',
    'counter',
    'month',
    'campaign',
    'difficulty'
]].copy()

# `observed` is sorted by timestamp, so row order within each player is
# chronological: hold out each player's 5 most recent rows for testing and
# train on the (up to) 15 rows before them.
X_test = dataset.groupby('playerId').nth[-5:]
X_train = dataset.groupby('playerId').nth[-20:-5]

dataset

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Seed both random number generators used in this notebook.
np.random.seed(42)
torch.manual_seed(42)


def one_hot_encode(n, i):
    """Return a zero tensor of size ``n`` with the entries at index ``i`` set to 1.

    Args:
        n: Size passed to ``torch.zeros``.
        i: Index (anything accepted by tensor item assignment) to switch on.
    """
    encoded = torch.zeros(n)
    encoded[i] = 1
    return encoded


class FactorizationMachine(nn.Module):
    """Second-order factorization machine.

    The score of an input row is a linear term (with bias) plus the sum of
    all pairwise feature interactions, where the interaction weight of two
    features is the dot product of their ``k``-dimensional rows in ``V``.

    Args:
        n: Number of input features.
        k: Number of latent factors per feature.
    """

    def __init__(self, n, k):
        super().__init__()
        self.n = n
        self.k = k
        self.linear = nn.Linear(n, 1, bias=True)
        self.V = nn.Parameter(torch.randn(n, k))

    def forward(self, x: torch.Tensor):
        """Score a batch ``x`` of shape (batch, n); returns shape (batch, 1)."""
        # Pairwise interactions via the usual identity:
        # 0.5 * sum_f [ (sum_i v_if x_i)^2 - sum_i v_if^2 x_i^2 ]
        projected = x @ self.V
        projected_squares = x.pow(2) @ self.V.pow(2)
        square_of_sum = projected.pow(2).sum(dim=1, keepdim=True)
        sum_of_square = projected_squares.sum(dim=1, keepdim=True)
        pairwise = 0.5 * (square_of_sum - sum_of_square)

        return self.linear(x) + pairwise


class FMTrainingDataset(Dataset):
    """Pairs every observed interaction with one randomly drawn negative.

    Each sample is a ``(positive, negative)`` pair of encoded feature
    vectors: the positive is the observed row, the negative is the same row
    with ``item_id`` replaced by an item the player has never been observed
    with.

    NOTE(review): relies on the notebook-level names ``observed`` (full
    interaction history, used to exclude known items from the negatives) and
    ``encoders`` (a fitted transformer; not defined in the visible cells --
    confirm it is created before this class is used).

    Args:
        observations: Rows to iterate over; must contain ``playerId`` and
            ``item_id``.
    """

    def __init__(self, observations: pd.DataFrame) -> None:
        super().__init__()
        self.obs = observations
        self.items = set(self.obs['item_id'])
        self.users = observed.groupby('playerId').apply(lambda x: set(x['item_id']), include_groups=False).to_dict()
        # playerId -> array of candidate negative items, filled on first use
        # so the set difference is not rebuilt for every sample.
        self._negative_pool = {}

    def _negatives_for(self, player_id) -> np.ndarray:
        """Return (and cache) the items ``player_id`` was never observed with."""
        pool = self._negative_pool.get(player_id)
        if pool is None:
            pool = np.array(list(self.items - self.users[player_id]))
            if pool.size == 0:
                raise ValueError(
                    f'No negative item available for player {player_id!r}: '
                    'the player has been observed with every item in the dataset.'
                )
            self._negative_pool[player_id] = pool
        return pool

    def __len__(self) -> int:
        return len(self.obs)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the flattened ``(positive, negative)`` float32 vectors for row ``idx``."""
        row = self.obs.iloc[idx].to_frame().T

        negative = row.copy()
        negative['item_id'] = np.random.choice(self._negatives_for(row['playerId'].values[0]))

        positive = encoders.transform(row).astype(float)
        negative = encoders.transform(negative).astype(float)

        return torch.tensor(positive, dtype=torch.float32).reshape(-1), torch.tensor(negative, dtype=torch.float32).reshape(-1)

class FMTRecDataset(Dataset):
    """Encoded query rows to be scored by the model.

    All queries are encoded once at construction time with the notebook-level
    ``encoders`` transformer (not defined in the visible cells -- confirm it
    exists and returns an indexable array).

    Args:
        queries: Rows to encode and score.
    """

    def __init__(self, queries: pd.DataFrame) -> None:
        super().__init__()
        encoded = encoders.transform(queries)
        self.queries = encoded.astype(float)

    def __len__(self) -> int:
        return len(self.queries)

    def __getitem__(self, idx: int) -> torch.Tensor:
        """Return encoded row ``idx`` as a flat float32 tensor."""
        flat_row = self.queries[idx].reshape(-1)
        return torch.tensor(flat_row, dtype=torch.float32)

        
# Bayesian Personalized Ranking loss
class BPRLoss(nn.Module):
    """Sum over the batch of ``-log(sigmoid(positive - negative))``.

    The loss is small when positives are scored above their paired negatives.
    It is computed with ``logsigmoid`` rather than ``log(sigmoid(...))``: the
    latter underflows to ``log(0) = -inf`` for strongly negative margins,
    which yields an infinite loss and NaN gradients.
    """

    def __init__(self):
        super(BPRLoss, self).__init__()

    def forward(self, positive, negative):
        """Return the summed loss, reduced over dim 0 with that dim kept.

        Args:
            positive: Scores of the observed interactions.
            negative: Scores of the paired negatives, same shape as ``positive``.
        """
        return - torch.sum(nn.functional.logsigmoid(positive - negative), dim=0, keepdim=True)
    

def train(model: nn.Module, dataset: Dataset, epochs=10, batch_size=32, lr=0.01, weight_decay=1e-3):
    """Fit ``model`` on (positive, negative) pairs with Adam and the BPR loss.

    Args:
        model: Module that scores a batch of encoded rows.
        dataset: Dataset yielding ``(positive, negative)`` tensor pairs.
        epochs: Number of passes over ``dataset``.
        batch_size: Pairs per optimisation step.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        The same ``model`` instance, moved to ``DEVICE`` and trained in place.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = BPRLoss()
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model.to(DEVICE)
    model.train()

    epoch_progress = tqdm(range(epochs))
    for _ in epoch_progress:
        batch_progress = tqdm(dataloader, leave=False)
        for positive, negative in batch_progress:
            optimizer.zero_grad()
            positive_scores = model(positive.to(DEVICE))
            negative_scores = model(negative.to(DEVICE))
            loss = criterion(positive_scores, negative_scores)
            loss.backward()
            optimizer.step()
            batch_progress.set_description(f'Loss: {loss.item():.4f}')
        # The outer bar shows the loss of the epoch's last batch.
        epoch_progress.set_description(f'Loss: {loss.item():.4f}')

    return model


def predict(model: nn.Module, dataset: Dataset, batch_size=32):
    """Score every row of ``dataset`` with ``model`` in evaluation mode.

    Args:
        model: Module that scores a batch of encoded rows.
        dataset: Dataset yielding one encoded row per index.
        batch_size: Rows scored per forward pass.

    Returns:
        A flat numpy array of scores, in dataset order (the loader does not shuffle).
    """
    model.to(DEVICE)
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    scores = []
    with torch.no_grad():
        for batch in tqdm(loader):
            batch_scores = model(batch.to(DEVICE)).detach().cpu().numpy()
            scores.extend(batch_scores)

    return np.array(scores).flatten()

In [None]:
# The model's input width is the number of columns the encoder produces
# for the training rows; 8 latent factors per feature.
model = FactorizationMachine(
    n=encoders.transform(X_train).shape[1],
    k=8,
)
dataset = FMTrainingDataset(observations=X_train)

train(
    model,
    dataset,
    epochs=10,
    batch_size=32,
    lr=0.01,
    weight_decay=1e-3,
)

In [None]:
from itertools import product

# Category lists learned by the first two fitted transformers of `encoders`
# (presumably the playerId and item_id encoders, in that order -- TODO confirm).
# NOTE(review): this rebinds `users` and `items`, which earlier cells define
# as DataFrames; re-running those earlier cells after this one needs care.
users, items = (encoders.transformers_[position][1].categories_ for position in (0, 1))

# Score every (player, item) combination.
queries = pd.DataFrame(
    data=list(product(users[0], items[0])),
    columns=['playerId', 'item_id'],
)
queries = queries.assign(
    rating=predict(model, FMTRecDataset(queries), batch_size=32)
)

In [None]:
# Top-5 rated items per player, collapsed into one set per player.
rec_sys = (
    queries.groupby('playerId')
    .apply(lambda x: x.nlargest(5, 'rating'), include_groups=False)
    .reset_index()
    .groupby('playerId')['item_id']
    .apply(set)
    .reset_index()
)

# Attach each player's held-out items by joining on playerId. Assigning the
# ground-truth column by row position would pair recommendations with the
# wrong player whenever the two player sets differ. Players without held-out
# items are dropped, since no metric can be computed for them.
rec_sys = rec_sys.merge(
    X_test.groupby('playerId')['item_id'].apply(set).rename('ground_truth').reset_index(),
    on='playerId',
    how='inner',
)

# Recommended items that actually appear in the player's held-out set.
rec_sys['intersection'] = [
    truth & recommended
    for truth, recommended in zip(rec_sys['ground_truth'], rec_sys['item_id'])
]

rec_sys

In [None]:
# Precision: share of each player's recommended items that were hits, averaged over players.
print(
    'Precision: ',
    rec_sys.apply(lambda row: len(row['intersection']) / len(row['item_id']), axis=1)
    .mean()
    .round(2),
)
# Recall: share of each player's held-out items that were recommended, averaged over players.
print(
    'Recall: ',
    rec_sys.apply(lambda row: len(row['intersection']) / len(row['ground_truth']), axis=1)
    .mean()
    .round(2),
)
# Hit rate: fraction of players with at least one hit.
print(
    'Hit rate: ',
    rec_sys['intersection'].apply(lambda hits: len(hits) > 0).mean().round(2),
)