In [None]:
import pandas as pd
import numpy as np

# Parquet files holding the daily mobility stats; each key is the campaign
# label that gets attached to the rows loaded from that file.
STATS_FILES = dict(
    HSC_Lecco_2023='../data/HSC_Lecco_2023-Players_Daily_Mobility_Stats.parquet',
    HSC_Ferrara_2023='../data/HSC_Ferrara_2023-Players_Daily_Mobility_Stats.parquet',
)

# Parquet files holding the daily mobility scores, keyed by the same campaign labels.
SCORES_FILES = dict(
    HSC_Lecco_2023='../data/HSC_Lecco_2023-Players_Daily_Mobility_Scores.parquet',
    HSC_Ferrara_2023='../data/HSC_Ferrara_2023-Players_Daily_Mobility_Scores.parquet',
)

def load_stats(path: str, c: str) -> pd.DataFrame:
    """Read one daily mobility stats file and reshape it to the shared layout.

    Args:
        path: Location of the parquet file to read.
        c: Campaign label written into the ``campaign`` column of every row.

    Returns:
        A frame with exactly the columns ``user``, ``campaign``, ``ds``,
        ``counter`` and ``score`` (in that order), where ``score`` is the
        source ``distance`` divided by 1000.
    """
    column_map = {'playerId': 'user', 'modeType': 'counter', 'distance': 'score', 'stat_date': 'ds'}
    layout = ['user', 'campaign', 'ds', 'counter', 'score']

    raw = pd.read_parquet(path)
    # convert to km, then tag every row with its campaign
    tagged = raw.assign(distance=raw['distance'] / 1000, campaign=c)
    return tagged.rename(columns=column_map)[layout]

def load_scores(path: str, c: str) -> pd.DataFrame:
    """Read one daily mobility scores file and reshape it to the shared layout.

    Args:
        path: Location of the parquet file to read.
        c: Campaign label written into the ``campaign`` column of every row.

    Returns:
        A frame with exactly the columns ``user``, ``campaign``, ``ds``,
        ``counter`` and ``score`` (in that order). ``counter`` is the constant
        ``'green leaves'`` and ``score`` is the source ``mobilityScore``.
    """
    column_map = {'player_id': 'user', 'mobilityScore': 'score', 'day': 'ds'}
    layout = ['user', 'campaign', 'ds', 'counter', 'score']

    renamed = pd.read_parquet(path).rename(columns=column_map)
    # Scores carry no per-mode counter, so every row gets the same label.
    tagged = renamed.assign(counter='green leaves', campaign=c)
    return tagged[layout]

# Stack every campaign's stats and scores into one long table. All loaders
# return the same user / campaign / ds / counter / score layout.
records_raw = pd.concat(
    [load_stats(path, campaign) for campaign, path in STATS_FILES.items()]
    + [load_scores(path, campaign) for campaign, path in SCORES_FILES.items()],
    ignore_index=True,  # the per-file row labels are meaningless after stacking
)

# Sum each user's score per campaign, counter and calendar week, then round the
# weekly total up to a whole number.
weekly_totals = (
    records_raw
    .assign(ds=lambda d: pd.to_datetime(d['ds']))
    .groupby(['user', 'campaign', pd.Grouper(key='ds', freq='W'), 'counter'])['score']
    .sum()
    .reset_index()
    .assign(score=lambda d: np.ceil(d['score']).astype(int))
)

# Keep only positive weekly totals. `assign` works on a fresh copy, so adding
# columns after the filter does not write into a slice of `weekly_totals`
# (which is what raises pandas' SettingWithCopyWarning).
df = (
    weekly_totals
    .loc[weekly_totals['score'] > 0]
    .assign(
        # An "item" is a counter paired with the weekly amount reached on it.
        item=lambda d: (d['counter'] + '/' + d['score'].astype(str)).astype('category'),
        user=lambda d: d['user'].astype('category'),
        campaign=lambda d: d['campaign'].astype('category'),
    )
    .sort_values(['ds', 'user'], ascending=True)
)

df

In [None]:
# Integer-encode users and items. `cat.codes` uses the narrowest signed integer
# dtype that fits the number of categories (int8, int16, ...), so widen to int64
# here: any later arithmetic on the codes (such as adding per-column offsets)
# must not wrap around.
dataset = pd.DataFrame({
    'user': df['user'].cat.codes,
    'item': df['item'].cat.codes,
}).astype('int64')

# A code of -1 marks a missing category and would be an invalid embedding index.
assert (dataset >= 0).all().all(), 'found a missing user/item category (code -1)'

dataset

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

# Use the GPU when PyTorch reports one as available, otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


class FactorizationMachine(nn.Module):
    """Second-order factorization machine over index-encoded fields.

    All fields share one embedding table of ``sum(field_dims)`` rows, so the
    indices fed to ``forward`` must already be offset into that shared range.

    Args:
        field_dims: Number of distinct indices reserved for each field.
        num_factors: Size of the latent factor vector learned per index.
    """

    def __init__(self, field_dims, num_factors):
        super().__init__()
        # Coerce to a plain int so numpy integer sizes are accepted as well.
        num_inputs = int(sum(field_dims))
        self.embedding = nn.Embedding(num_inputs, num_factors)  # latent factors
        self.fc = nn.Embedding(num_inputs, 1)                   # per-index first-order weight
        self.linear = nn.Linear(1, 1, bias=True)                # scale + global bias

    def forward(self, x):
        """Score a batch of index tuples.

        Args:
            x: Long tensor of shape ``(batch, num_fields)`` holding offset indices.

        Returns:
            Tensor of shape ``(batch, 1)`` with one unbounded score per row.
        """
        # Look the factors up once; both interaction terms reuse them.
        factors = self.embedding(x)
        square_of_sum = factors.sum(dim=1).pow(2)
        sum_of_square = factors.pow(2).sum(dim=1)
        # 0.5 * ((sum v)^2 - sum v^2) equals the sum of all pairwise dot products.
        interaction = 0.5 * (square_of_sum - sum_of_square).sum(1, keepdim=True)
        first_order = self.linear(self.fc(x).sum(1))
        return first_order + interaction


class FMDataWithNegativeSample(Dataset):
    """Pairwise dataset: each observed (user, item) row plus one sampled unseen item.

    The columns of ``data`` are treated as index-encoded fields. Every column is
    shifted by the combined size of the columns before it, so all fields index
    disjoint ranges of one shared embedding table.

    Args:
        data: Frame of non-negative integer codes with at least the columns
            ``user`` and ``item``. It is not modified.

    Attributes:
        data: int64 copy of the input with the offsets applied.
        field_dims: Per-column index range size (largest raw code + 1); the
            running sum of these is the offset applied to each column.
        all_items: Set of every offset item index present in the data.
    """

    def __init__(self, data: pd.DataFrame):
        # Work on an int64 copy: category codes often arrive as int8/int16 and
        # adding offsets in that dtype can silently wrap around.
        self.data = data.astype('int64')

        # Apply offsets to all columns
        self.field_dims = []
        offset = 0
        for column in self.data.columns:
            size = int(self.data[column].max()) + 1 if len(self.data) else 0
            self.data[column] = self.data[column] + offset
            self.field_dims.append(size)
            offset += size

        # Both lookups below use the OFFSET item indices so that they live in
        # the same index space as the positives returned by __getitem__.
        users = self.data['user'].tolist()
        items = self.data['item'].tolist()
        self.all_items = set(items)

        # Items each user has interacted with, built once instead of rescanning
        # the whole frame for every sample.
        self._seen_items = {}
        for user, item in zip(users, items):
            self._seen_items.setdefault(user, set()).add(item)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(positive, negative)`` long tensors, each ``[user, item]``.

        The negative item is drawn uniformly (via ``np.random``) from the items
        the user has not interacted with.

        Raises:
            ValueError: If the user has interacted with every item, so there is
                nothing to sample.
        """
        row = self.data.iloc[idx]
        user = int(row['user'])
        positive_item = int(row['item'])

        candidates = list(self.all_items - self._seen_items[user])
        if not candidates:
            raise ValueError(f'user index {user} has interacted with every item; cannot sample a negative')
        negative_item = int(np.random.choice(candidates))

        return torch.tensor((user, positive_item), dtype=torch.long), torch.tensor((user, negative_item), dtype=torch.long)


class BPRLoss(nn.Module):
    """Bayesian Personalized Ranking loss: ``-sum(log(sigmoid(positive - negative)))``.

    The loss shrinks as positive scores exceed the scores of their paired
    negatives.
    """

    def __init__(self):
        super().__init__()

    def forward(self, positive: torch.Tensor, negative: torch.Tensor):
        """Sum the pairwise loss over the batch dimension.

        Args:
            positive: Scores of the observed pairs.
            negative: Scores of the sampled pairs, same shape as ``positive``.

        Returns:
            The loss summed over dim 0 with that dimension kept (a ``(1, 1)``
            tensor for ``(batch, 1)`` inputs).
        """
        # logsigmoid is the numerically stable form of log(sigmoid(x)): the naive
        # version underflows to log(0) = -inf for large negative margins.
        margin = positive - negative
        return -nn.functional.logsigmoid(margin).sum(dim=0, keepdim=True)


def train(model: nn.Module, data: Dataset, epochs: int = 10, batch_size: int = 32, lr: float = 1e-2, weight_decay: float = 0.0, shuffle: bool = False):
    """Fit ``model`` on (positive, negative) pairs with Adam and the BPR loss.

    Args:
        model: Scoring module; it is moved to ``DEVICE`` and called on each
            half of a pair separately.
        data: Dataset whose samples are ``(positive, negative)`` tensor pairs.
        epochs: Number of full passes over ``data``.
        batch_size: Number of pairs per optimisation step.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            ``False``, i.e. batches follow the dataset's own order.

    Returns:
        The same ``model`` instance, trained in place and left in training mode.
    """
    model.to(DEVICE)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr,
                           weight_decay=weight_decay)
    criterion = BPRLoss()
    data_loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle)

    for _ in (bar := tqdm(range(epochs))):
        loss_total, batches_seen = 0.0, 0
        for positive, negative in (epoch_bar := tqdm(data_loader, leave=False)):
            positive, negative = positive.to(DEVICE), negative.to(DEVICE)
            optimizer.zero_grad()
            positive_score = model(positive)
            negative_score = model(negative)
            loss = criterion(positive_score, negative_score)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            loss_total += batch_loss
            batches_seen += 1
            epoch_bar.set_postfix(loss=batch_loss)

        # Report the epoch's mean batch loss rather than whatever the last batch
        # happened to be; skip it when the loader produced no batches, in which
        # case no loss exists at all.
        if batches_seen:
            bar.set_postfix(loss=loss_total / batches_seen)
    return model

In [None]:
# Seed both random sources used below: torch drives the weight initialisation
# and np.random drives the negative sampling. This makes a Restart & Run All
# repeatable (GPU kernels may still introduce small nondeterminism).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Training hyperparameters, gathered here so they are easy to find and tune.
NUM_FACTORS = 8
EPOCHS = 10
BATCH_SIZE = 32
LEARNING_RATE = 1e-2
WEIGHT_DECAY = 1e-6

ds = FMDataWithNegativeSample(dataset)
model = FactorizationMachine(ds.field_dims, NUM_FACTORS).to(DEVICE)
model = train(model, ds, epochs=EPOCHS, batch_size=BATCH_SIZE, lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)