In [1]:
import pandas as pd
import numpy as np

STATS_FILES = {
    'HSC_Lecco_2023': '../data/HSC_Lecco_2023-Players_Daily_Mobility_Stats.parquet',
    'HSC_Ferrara_2023': '../data/HSC_Ferrara_2023-Players_Daily_Mobility_Stats.parquet',
}

SCORES_FILES = {
    'HSC_Lecco_2023': '../data/HSC_Lecco_2023-Players_Daily_Mobility_Scores.parquet',
    'HSC_Ferrara_2023': '../data/HSC_Ferrara_2023-Players_Daily_Mobility_Scores.parquet',
}

def load_stats(path: str, c: str) -> pd.DataFrame:
    df = pd.read_parquet(path)
    df['distance'] = df['distance'] / 1000 # convert to km
    df['campaign'] = c
    df.rename(columns={'playerId': 'user', 'modeType': 'counter', 'distance': 'score', 'stat_date': 'ds'}, inplace=True)
    return df[['user', 'campaign', 'ds', 'counter', 'score']]

def load_scores(path: str, c: str) -> pd.DataFrame:
    df = pd.read_parquet(path)
    df.rename(columns={'player_id': 'user', 'mobilityScore': 'score', 'day': 'ds'}, inplace=True)
    df['counter'] = 'green leaves'
    df['campaign'] = c
    return df[['user', 'campaign', 'ds', 'counter', 'score']]

df = pd.concat([
    *[load_stats(f, k) for k, f in STATS_FILES.items()],
    *[load_scores(f, k) for k, f in SCORES_FILES.items()]
])
df['ds'] = pd.to_datetime(df['ds'])
df = df.groupby(['user', 'campaign', pd.Grouper(key='ds', freq='W'), 'counter']).sum().reset_index()
df['score'] = np.ceil(df['score']).astype(int)
df = df[df['score'] > 0]

df['item'] = (df['counter'] + '/' + df['score'].astype(str)).astype('category')
df['user'] = df['user'].astype('category')
df['campaign'] = df['campaign'].astype('category')

df.sort_values(['ds', 'user'], inplace=True, ascending=True, ignore_index=True)

df

Unnamed: 0,user,campaign,ds,counter,score,item
0,u_00144002f1614ee9a45f7822760e3746,HSC_Lecco_2023,2023-03-05,green leaves,15,green leaves/15
1,u_00144002f1614ee9a45f7822760e3746,HSC_Lecco_2023,2023-03-05,walk,2,walk/2
2,u_00567a7bce8c4d09bea7db9bae375af4,HSC_Lecco_2023,2023-03-05,bus,7,bus/7
3,u_00567a7bce8c4d09bea7db9bae375af4,HSC_Lecco_2023,2023-03-05,green leaves,47,green leaves/47
4,u_00567a7bce8c4d09bea7db9bae375af4,HSC_Lecco_2023,2023-03-05,walk,2,walk/2
...,...,...,...,...,...,...
10522,u_e6a0be423c7e417aab5cff748e8d3685,HSC_Ferrara_2023,2023-06-04,walk,7,walk/7
10523,u_e6eeaa10dd9a4477be1039b97cb61cc1,HSC_Ferrara_2023,2023-06-04,green leaves,101,green leaves/101
10524,u_e6eeaa10dd9a4477be1039b97cb61cc1,HSC_Ferrara_2023,2023-06-04,walk,5,walk/5
10525,u_eb38ede33d7e49deae2d346646d642d6,HSC_Ferrara_2023,2023-06-04,green leaves,50,green leaves/50


In [15]:
df.groupby('counter')['score'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
counter,Unnamed: 1_level_1,Unnamed: 2_level_1
bike,1,139
bus,1,380
car,2,964
green leaves,1,1371
train,1,687
walk,1,77


In [None]:
items = [
    *[('bike', i) for i in range(1, 6) ]
]

In [2]:
dataset = pd.DataFrame({
    'user': df['user'].cat.codes,
    'item': df['item'].cat.codes,
})

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)
np.random.seed(0)

class FactorizationMachine(nn.Module):
    def __init__(self, field_dims, num_factors):
        super(FactorizationMachine, self).__init__()
        num_inputs = sum(field_dims)
        self.offsets = torch.tensor((0, *np.cumsum(field_dims)[:-1]), dtype=torch.long)
        self.offsets = nn.Parameter(self.offsets, requires_grad=False)
        print(self.offsets)

        self.embedding = nn.Embedding(num_inputs, num_factors)
        self.fc = nn.Embedding(num_inputs, 1)
        self.linear = nn.Linear(1, 1, bias=True)

    def forward(self, x: torch.Tensor):
        x = x + self.offsets
        square_of_sum = torch.sum(self.embedding(x), dim=1).pow(2)
        sum_of_square = torch.sum(self.embedding(x).pow(2), dim=1)
        x = self.linear(self.fc(x).sum(1)) + 0.5 * \
            (square_of_sum - sum_of_square).sum(1, keepdim=True)
        return x


class FMDataWithNegativeSample(Dataset):
    def __init__(self, data: pd.DataFrame):
        self.data = data.copy()

        self.all_items = set(data['item'])
        self.field_dims = [len(set(data[c])) for c in data.columns]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        user = row['user']
        I_u = set(self.data.loc[self.data['user'] == user, 'item'])

        positive_item = row['item']
        negative_item = np.random.choice(list(self.all_items - I_u))

        return torch.tensor((user, positive_item), dtype=torch.long), torch.tensor((user, negative_item), dtype=torch.long)


class BPRLoss(nn.Module):
    def __init__(self):
        super(BPRLoss, self).__init__()

    def forward(self, positive: torch.Tensor, negative: torch.Tensor):
        return - torch.sum(torch.log(torch.sigmoid(positive - negative)), dim=0, keepdim=True)


def train(model: nn.Module, data: Dataset, epochs: int = 10, batch_size: int = 32, lr: float = 1e-2, weight_decay: float = 0.0):
    model.to(DEVICE)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr,
                           weight_decay=weight_decay)
    criterion = BPRLoss()
    data_loader = DataLoader(data, batch_size=batch_size, shuffle=False)

    for _ in (bar := tqdm(range(epochs))):
        for positive, negative in (epoch_bar := tqdm(data_loader, leave=False)):
            positive, negative = positive.to(DEVICE), negative.to(DEVICE)
            optimizer.zero_grad()
            positive_score = model(positive)
            negative_score = model(negative)
            loss = criterion(positive_score, negative_score)
            loss.backward()
            optimizer.step()
            epoch_bar.set_postfix(loss=loss.item())
        bar.set_postfix(loss=loss.item())
    return model

In [4]:
ds = FMDataWithNegativeSample(dataset)
model = FactorizationMachine(ds.field_dims, 8).to(DEVICE)
model = train(model, ds, epochs=10, batch_size=32, lr=1e-2, weight_decay=1e-6)

Parameter containing:
tensor([  0, 689])


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

  0%|          | 0/329 [00:00<?, ?it/s]

In [13]:
USER = np.random.choice(dataset['user'].unique())
ITEMS = dataset['item'].unique()

with torch.no_grad():
    scores = model(torch.tensor([(USER, i) for i in ITEMS], dtype=torch.long).to(DEVICE)).cpu().numpy().flatten()
    
scores = list(zip(ITEMS, scores))
scores = sorted(scores, key=lambda x: x[1], reverse=True)
scores = [(df['item'].cat.categories[i], s) for i, s in scores][:10]

print(f'User: {df["user"].cat.categories[USER]}')
scores

User: u_51493203bd174219808f5a8d129b8a0d


[('walk/10', np.float32(9.045524)),
 ('walk/4', np.float32(8.897294)),
 ('walk/2', np.float32(8.202766)),
 ('bike/7', np.float32(6.866133)),
 ('green leaves/19', np.float32(6.669152)),
 ('green leaves/56', np.float32(6.449106)),
 ('walk/9', np.float32(6.3551283)),
 ('walk/1', np.float32(6.3482113)),
 ('bike/1', np.float32(6.323927)),
 ('green leaves/167', np.float32(6.305286))]

In [14]:
[df['item'].cat.categories[i] for i in dataset[dataset['user'] == USER]['item']]

['bike/4',
 'green leaves/146',
 'walk/9',
 'bike/7',
 'bus/2',
 'green leaves/278',
 'walk/22',
 'green leaves/56',
 'walk/5']