In this notebook, we build some wrapper classes around a simple Pytorch Logit model.  We find that we are able to replicate (and even outperform) the sklearn logit model.

In [1]:
from tennis_new.fetch.tennis_explorer.combiner import read_joined

jd = read_joined()

  if (yield from self.run_code(code, result)):


#### Run Set ELO

Run SetELO first so that we have easy access to training set and validation set and all that

In [2]:
from tennis_new.model.config.elo.global_set_elo import SetELO

set_elo = SetELO()
set_elo.run(jd)

In [3]:
set_elo.validation_evaluation

{'DummyFilter_prediction_AUCMetric': 0.8187031847302881,
 'DummyFilter_prediction_AccuracyMetric': 0.7358520800135314,
 'DummyFilter_prediction_LogLikelihoodMetric': -0.5226366377611569,
 'HasOddsFilter_prediction_AUCMetric': 0.7839029874196454,
 'HasOddsFilter_prediction_AccuracyMetric': 0.7056423354253945,
 'HasOddsFilter_prediction_LogLikelihoodMetric': -0.5594758958654537,
 'DummyFilter_odds_implied_probability_AUCMetric': None,
 'DummyFilter_odds_implied_probability_AccuracyMetric': None,
 'DummyFilter_odds_implied_probability_LogLikelihoodMetric': None,
 'HasOddsFilter_odds_implied_probability_AUCMetric': 0.7937506478103871,
 'HasOddsFilter_odds_implied_probability_AccuracyMetric': 0.7114980299325661,
 'HasOddsFilter_odds_implied_probability_LogLikelihoodMetric': -0.5501844612492598}

#### Define Logit Training X, y

Now we'll need to create a sparse dataset for the logistic regression.  We'll start by making sure we have the right date filtering.  Recall that for ELO models, our training data is the full date range.  We'll have to manually cut the dates for our logit model.

In [4]:
elo_training_set = set_elo.training_filter.filter_data(jd)
elo_validation_set = set_elo.validation_filter.filter_data(set_elo.all_jd)
elo_test_set = set_elo.test_filter.filter_data(set_elo.all_jd)
logit_training_set = elo_training_set[
    elo_training_set['date'] < elo_validation_set['date'].min()
].copy()
(
    (logit_training_set['date'].min(), logit_training_set['date'].max()),
    (elo_validation_set['date'].min(), elo_validation_set['date'].max()),
    (elo_test_set['date'].min(), elo_test_set['date'].max())
)

(('1997-01-01', '2010-12-31'),
 ('2011-01-01', '2014-12-31'),
 ('2015-01-01', '2020-12-21'))

#### Mess Around with Pytorch DataLoaders

In [5]:
import pandas as pd

all_players = pd.concat([
    logit_training_set[['p1_link', 'date']].rename(columns={'p1_link': 'pid'}).drop_duplicates('pid', keep='first'),
    logit_training_set[['p2_link', 'date']].rename(columns={'p2_link': 'pid'}).drop_duplicates('pid', keep='first')
]).sort_values('date', ascending=True)['pid'].drop_duplicates(keep='first')
player_map = dict(enumerate(all_players))
inv_player_map = {v: k for k, v in player_map.items()}

In [6]:
torch_training_set = logit_training_set[[
    'p1_link',
    'p2_link',
    'p1_sets_won',
    'p2_sets_won'
]].copy()
torch_training_set['p1_id'] = torch_training_set['p1_link'].map(inv_player_map)
torch_training_set['p2_id'] = torch_training_set['p2_link'].map(inv_player_map)

In [7]:
import torch

torch_validation_set = elo_validation_set[
    elo_validation_set['p1_link'].isin(torch_training_set['p1_link']) &
    elo_validation_set['p2_link'].isin(torch_training_set['p2_link']) &
    (elo_validation_set['date'] < '2012-01-01')
].copy()

torch_val_X = torch.from_numpy(
    pd.DataFrame({
        'p1_id': torch_validation_set['p1_link'].map(inv_player_map),
        'p2_id': torch_validation_set['p2_link'].map(inv_player_map)
    }).values
)

In [8]:
# Calculate embedding size
embedding_size = torch_training_set[['p1_id', 'p2_id']].max().max() + 1
embedding_size

26545

In [9]:
import numpy as np

class MyDataSet(torch.utils.data.Dataset):
    def __init__(self, X, w1, w2):
        self.X = torch.from_numpy(X.values)
        self.w1 = torch.from_numpy(w1.values.astype(np.float32))
        self.w2 = torch.from_numpy(w2.values.astype(np.float32))
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.w1[idx], self.w2[idx]

batch_size = 1024
train_ds = MyDataSet(
    torch_training_set[['p1_id', 'p2_id']], 
    torch_training_set['p1_sets_won'],
    torch_training_set['p2_sets_won']
)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [10]:
class TorchRunner(object):
    
    def __init__(
        self,
        train_dl,
        validation_set=None,
        test_set=None,
    ):
        self.train_dl = train_dl
        self.n_epochs = 0
        self.epoch_loss = 0
        self.validation_set = validation_set
        self.test_set = test_set
        self.training_initialized=False
        self.model = None
        
        
    def on_epoch_end(self):
        # Callback to do at the end of an epoch
        pass
    
    def on_minibatch_end(self):
        # Callback to perform at end of a minibatch
        pass
   
    @property
    def model_cls(self):
        raise NotImplementedError()

    @property
    def model_kwargs(self):
        return {}

    @property
    def optimizer_cls(self):
        raise NotImplementedError()

    @property
    def optimizer_kwargs(self): 
        return {}

    '''
    @property
    def loss_criterion(self):
        raise NotImplementedError()
    '''
    
    def loss(self, minibatch_data):
        raise NotImplementedError()

    def init_training(self):
        # Instantiate model and optimizer
        self.training_initialized = True
        self.model = self.model_cls(**self.model_kwargs)
        self.optimizer = self.optimizer_cls(self.model.parameters(), **self.optimizer_kwargs)
        
    def train(self, n_epochs):
        if not self.training_initialized:
            self.init_training()
        for epoch in range(n_epochs):
            self.epoch_loss = 0
            for minibatch_data in self.train_dl:
                self.optimizer.zero_grad()
                loss = self.loss(minibatch_data)
                self.epoch_loss += loss.item()
                loss.backward()
                self.optimizer.step()
                self.on_minibatch_end()
            self.n_epochs += 1
            self.on_epoch_end()

In [11]:
class EmbeddedLogisticModel(torch.nn.Module):
    def __init__(self, n_players):
        super(EmbeddedLogisticModel, self).__init__()
        self.embedding = torch.nn.Embedding(n_players, 1)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, x):
        embedded = self.embedding(x)
        return self.sigmoid(embedded[:, 0] - embedded[:, 1])

In [12]:
INITIAL_LR = 200.


class MyLogitFitter(TorchRunner):

    def __init__(self, *args, **kwargs):
        super(MyLogitFitter, self).__init__(*args, **kwargs)
        self.last_epoch_loss = 9999999999999
        self.lr = INITIAL_LR
        
    @property
    def model_cls(self):
        return EmbeddedLogisticModel

    @property
    def model_kwargs(self):
        return {
            'n_players': embedding_size
        }
        
    @property
    def optimizer_cls(self):
        return torch.optim.SGD

    @property
    def optimizer_kwargs(self):
        return {'lr': INITIAL_LR}

    @property
    def loss_criterion(self):
        return torch.nn.BCELoss(reduction='none')
    
    def on_epoch_end(self):
        val_preds = self.model(self.validation_set)
        accuracy = (val_preds[:, 0].detach().numpy() > 0.5).mean()
        print("Iteration: {}, Loss: {}, Accuracy: {}.".format(self.n_epochs, self.epoch_loss, accuracy))
        if self.epoch_loss > self.last_epoch_loss:  # If training loss is getting worse, halve learning rate
            self.lr /= 2.
            print("Reducing learning rate to %0.2f" % self.lr)
            for pg in self.optimizer.param_groups:
                pg['lr'] = self.lr
        self.last_epoch_loss = self.epoch_loss
        
    def loss(self, minibatch_data):
        # Set-Weighted Loss
        X, w1, w2 = minibatch_data
        outputs = self.model(X)[:, 0]
        y_1 = torch.from_numpy(np.ones(X.shape[0], dtype=np.float32))
        y_2 = torch.from_numpy(np.zeros(X.shape[0], dtype=np.float32))
        loss_1 = torch.mean(torch.mul(w1, self.loss_criterion(outputs, y_1)))
        loss_2 = torch.mean(torch.mul(w2, self.loss_criterion(1. - outputs, y_2)))
        loss = loss_1 + loss_2
        return loss

In [13]:
logit_fitter = MyLogitFitter(train_dl, validation_set=torch_val_X)

In [14]:
logit_fitter.train(20)

Iteration: 1, Loss: 644.2845195531845, Accuracy: 0.6670013234153243.
Iteration: 2, Loss: 582.9165781736374, Accuracy: 0.6811025418701228.
Iteration: 3, Loss: 569.1016620397568, Accuracy: 0.6855975904714097.
Iteration: 4, Loss: 562.0199350118637, Accuracy: 0.6934924474056496.
Iteration: 5, Loss: 557.941135764122, Accuracy: 0.6956144754255464.
Iteration: 6, Loss: 555.3975894451141, Accuracy: 0.6957285629534979.
Iteration: 7, Loss: 552.8754503726959, Accuracy: 0.6984666636243326.
Iteration: 8, Loss: 551.0330901145935, Accuracy: 0.6993337288367636.
Iteration: 9, Loss: 549.9968975782394, Accuracy: 0.7003376990827362.
Iteration: 10, Loss: 548.9059181213379, Accuracy: 0.6996303564094373.
Iteration: 11, Loss: 548.3309115171432, Accuracy: 0.6982841235796102.
Iteration: 12, Loss: 546.7757701873779, Accuracy: 0.7032355222927029.
Iteration: 13, Loss: 547.0961427688599, Accuracy: 0.7020033769908274.
Reducing learning rate to 100.00
Iteration: 14, Loss: 520.3741438388824, Accuracy: 0.706635330625656