In this notebook, we extend our pytorch logistic regression model to include surface information.  We find that including surface information results in only very modest (barely noticeable, if at all) model improvement, but that the player-specific surface information largely makes sense.  We suspect that we need:
* To tune our coefficient-specific regularization (more regularization for surface offsets than main effects)
* To get surface data for all matches.

This notebook is also no longer in perfect working shape, and will need some editing to make operational again.

In [1]:
from tennis_new.fetch.tennis_explorer.combiner import read_joined

jd = read_joined()

  if (yield from self.run_code(code, result)):


#### Run Set ELO

Run SetELO first so that we have easy access to training set and validation set and all that

In [2]:
from tennis_new.model.config.elo.global_set_elo import SetELO

set_elo = SetELO()
set_elo.run(jd)

In [3]:
set_elo.validation_evaluation

{'DummyFilter_prediction_AUCMetric': 0.8187031847302881,
 'DummyFilter_prediction_AccuracyMetric': 0.7358520800135314,
 'DummyFilter_prediction_LogLikelihoodMetric': -0.5226366377611569,
 'HasOddsFilter_prediction_AUCMetric': 0.7839029874196454,
 'HasOddsFilter_prediction_AccuracyMetric': 0.7056423354253945,
 'HasOddsFilter_prediction_LogLikelihoodMetric': -0.5594758958654537,
 'DummyFilter_odds_implied_probability_AUCMetric': None,
 'DummyFilter_odds_implied_probability_AccuracyMetric': None,
 'DummyFilter_odds_implied_probability_LogLikelihoodMetric': None,
 'HasOddsFilter_odds_implied_probability_AUCMetric': 0.7937506478103871,
 'HasOddsFilter_odds_implied_probability_AccuracyMetric': 0.7114980299325661,
 'HasOddsFilter_odds_implied_probability_LogLikelihoodMetric': -0.5501844612492598}

#### Define Logit Training X, y

Now we'll need to create a sparse dataset for the logistic regression.  We'll start by making sure we have the right date filtering.  Recall that for ELO models, our training data is the full date range.  We'll have to manually cut the dates for our logit model.

In [4]:
elo_training_set = set_elo.training_filter.filter_data(jd)
elo_validation_set = set_elo.validation_filter.filter_data(set_elo.all_jd)
elo_test_set = set_elo.test_filter.filter_data(set_elo.all_jd)
logit_training_set = elo_training_set[
    elo_training_set['date'] < elo_validation_set['date'].min()
].copy()
(
    (logit_training_set['date'].min(), logit_training_set['date'].max()),
    (elo_validation_set['date'].min(), elo_validation_set['date'].max()),
    (elo_test_set['date'].min(), elo_test_set['date'].max())
)

(('1997-01-01', '2010-12-31'),
 ('2011-01-01', '2014-12-31'),
 ('2015-01-01', '2020-12-21'))

#### Mess Around with Pytorch DataLoaders

In [5]:
# Get Player Mapping

import pandas as pd

all_players = pd.concat([
    logit_training_set[['p1_link', 'date']].rename(columns={'p1_link': 'pid'}).drop_duplicates('pid', keep='first'),
    logit_training_set[['p2_link', 'date']].rename(columns={'p2_link': 'pid'}).drop_duplicates('pid', keep='first')
]).sort_values('date', ascending=True)['pid'].drop_duplicates(keep='first')
player_map = dict(enumerate(all_players))
inv_player_map = {v: k for k, v in player_map.items()}

In [6]:
SURFACE_LIST = logit_training_set['surface'].dropna().unique()

def ohe_surface(df):
    return np.array([
        (df['surface'] == s).astype(int).values for s in SURFACE_LIST 
    ]).T

In [11]:
import numpy as np
import torch

torch_val_df = elo_validation_set[
    elo_validation_set['p1_link'].isin(logit_training_set['p1_link']) &
    elo_validation_set['p2_link'].isin(logit_training_set['p2_link']) &
    (elo_validation_set['date'] < '2012-01-01')
].copy()

def get_torch_set(df):
    X = torch.from_numpy(
        pd.DataFrame({
            'p1_id': df['p1_link'].map(inv_player_map),
            'p2_id': df['p2_link'].map(inv_player_map)
        }).values
    )
    s = torch.from_numpy(ohe_surface(df))
    w1 = torch.from_numpy(df['p1_sets_won'].values)
    w2 = torch.from_numpy(df['p2_sets_won'].values)
    return {'X': X, 's': s, 'w1': w1, 'w2': w2}
    
torch_train = get_torch_set(logit_training_set)
torch_val = get_torch_set(torch_val_df)
torch_val_w_surface = get_torch_set(
    torch_val_df[
        torch_val_df['surface'].isin([
            'clay',
            'hard',
            'indoors',
            'grass'
        ])
    ]
)

In [12]:
import numpy as np

class MyDataSet(torch.utils.data.Dataset):
    def __init__(self, X, s, w1, w2):
        self.X = X
        self.s = s 
        self.w1 = w1
        self.w2 = w2
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return {
            'X': self.X[idx],
            's': self.s[idx],
            'w1': self.w1[idx],
            'w2': self.w2[idx]
        }
    
batch_size = 1024
train_ds = MyDataSet(
    torch_train['X'],
    torch_train['s'],
    torch_train['w1'],
    torch_train['w2']
)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [13]:
class TorchRunner(object):
    
    def __init__(
        self,
        train_dl,
        validation_sets=None,
        test_set=None,
    ):
        self.train_dl = train_dl
        self.n_epochs = 0
        self.epoch_loss = 0
        self.validation_sets = validation_sets
        if self.validation_sets is None:
            self.validation_sets = []
        self.test_set = test_set
        self.training_initialized=False
        # self.model = None
        
        
    def on_epoch_end(self):
        # Callback to do at the end of an epoch
        pass
    
    def on_minibatch_end(self):
        # Callback to perform at end of a minibatch
        pass
   
    @property
    def model_cls(self):
        raise NotImplementedError()

    @property
    def model_kwargs(self):
        return {}

    @property
    def optimizer_cls(self):
        raise NotImplementedError()

    @property
    def optimizer_params_kwargs(self): 
        return {}
    
    @property
    def loss_criterion(self):
        raise NotImplementedError()
    
    def loss(self, minibatch_data):
        raise NotImplementedError()

    def init_training(self):
        # Instantiate model and optimizer
        self.training_initialized = True
        self.model = self.model_cls(**self.model_kwargs)
        self.optimizer = self.optimizer_cls(
            self.optimizer_params_kwargs[0],
            **self.optimizer_params_kwargs[1]
        )
        
    def train(self, n_epochs):
        if not self.training_initialized:
            self.init_training()
        for epoch in range(n_epochs):
            self.epoch_loss = 0
            for minibatch_data in self.train_dl:
                self.optimizer.zero_grad()
                loss = self.loss(minibatch_data)
                self.epoch_loss += loss.item()
                loss.backward()
                self.optimizer.step()
                self.on_minibatch_end()
            self.n_epochs += 1
            self.on_epoch_end()

In [14]:
class SurfaceAgnosticLogisticModel(torch.nn.Module):
    # Normal Logistic Regression
    def __init__(self, n_players):
        super(SurfaceAgnosticLogisticModel, self).__init__()
        self.main_embedding = torch.nn.Embedding(n_players, 1)
        self.surface_embedding = torch.nn.Embedding(n_players, torch_val_surfaces.shape[1])  # TODO: Declare an N_SURFACES
        
    def forward(self, data):
        main_embedded = self.main_embedding(data['X'])
        surface_embedded = self.surface_embedding(data['X'])
        s0 = torch.mul(surface_embedded[:, 0, :], data['s'])
        s1 = torch.mul(surface_embedded[:, 1, :], data['s'])
        surface_diff = (s0 - s1).sum(1)
        return torch.sigmoid(main_embedded[:, 0, 0] - main_embedded[:, 1, 0])


class SurfaceLogisticModel(torch.nn.Module):
    # Logit with Surfaces
    def __init__(self, n_players):
        super(SurfaceLogisticModel, self).__init__()
        self.main_embedding = torch.nn.Embedding(n_players, 1)
        self.surface_embedding = torch.nn.Embedding(n_players, torch_val_surfaces.shape[1])  # TODO: Declare an N_SURFACES
        
    def forward(self, data):
        main_embedded = self.main_embedding(data['X'])
        surface_embedded = self.surface_embedding(data['X'])
        s0 = torch.mul(surface_embedded[:, 0, :], data['s'])
        s1 = torch.mul(surface_embedded[:, 1, :], data['s'])
        surface_diff = (s0 - s1).sum(1)
        return torch.sigmoid(main_embedded[:, 0, 0] - main_embedded[:, 1, 0] + surface_diff)

In [19]:
INITIAL_LR = 200.


class SurfaceLogitFitter(TorchRunner):

    def __init__(self, *args, **kwargs):
        super(SurfaceLogitFitter, self).__init__(*args, **kwargs)
        self.last_epoch_loss = 9999999999999
        self.lr = INITIAL_LR

    @property
    def model_cls(self):
        return SurfaceLogisticModel 

    @property
    def model_kwargs(self):
        return {
            'n_players': len(player_map) 
        }
        
    @property
    def optimizer_cls(self):
        return torch.optim.SGD

    @property
    def optimizer_params_kwargs(self):
        params = [
            {'params': self.model.main_embedding.parameters(), 'weight_decay': 0.000001},
            {'params': self.model.surface_embedding.parameters(), 'weight_decay': 0.00001},
        ]
        kwargs = {
            'lr': INITIAL_LR
        }
        return params, kwargs

    @property
    def loss_criterion(self):
        return torch.nn.BCELoss(reduction='none')
    
    def on_epoch_end(self):
        print("Iteration: {}, Loss: {}".format(self.n_epochs, self.epoch_loss))
        val_accuracies = []
        for val_name, val_set in self.validation_sets:
            val_preds = self.model(val_set)
            accuracy = (val_preds.detach().numpy() > 0.5).mean()
            print("Val Set: {}, Accuracy: {}".format(val_name, accuracy))
        if self.epoch_loss > self.last_epoch_loss:  # If training loss is getting worse, halve learning rate
            self.lr /= 2.
            print("Reducing learning rate to %0.2f" % self.lr)
            for pg in self.optimizer.param_groups:
                pg['lr'] = self.lr
        self.last_epoch_loss = self.epoch_loss
        
    def loss(self, minibatch_data):
        # Set-Weighted Loss
        outputs = self.model(minibatch_data)
        y_1 = torch.from_numpy(np.ones(minibatch_data['X'].shape[0], dtype=np.float32))
        y_2 = torch.from_numpy(np.zeros(minibatch_data['X'].shape[0], dtype=np.float32))
        loss_1 = torch.mean(torch.mul(minibatch_data['w1'], self.loss_criterion(outputs, y_1)))
        loss_2 = torch.mean(torch.mul(minibatch_data['w2'], self.loss_criterion(1. - outputs, y_2)))
        loss = loss_1 + loss_2
        return loss
    
class SurfaceAgnosticLogitFitter(SurfaceLogitFitter):

    @property
    def model_cls(self):
        return SurfaceAgnosticLogisticModel
        
    @property
    def optimizer_params_kwargs(self):
        params = [
            {'params': self.model.main_embedding.parameters()},
        ]
        kwargs = {
            'lr': INITIAL_LR
        }
        return params, kwargs
    

In [20]:
surface_agnostic_logit_fitter = SurfaceAgnosticLogitFitter(
    train_dl, validation_sets=[
        ('full_validation', torch_val),
        ('surface_validation', torch_val_w_surface)
    ]
)
surface_logit_fitter = SurfaceLogitFitter(
    train_dl, validation_sets=[
        ('full_validation', torch_val),
        ('surface_validation', torch_val_w_surface)
    ]
)

In [21]:
surface_agnostic_logit_fitter.train(10)

NameError: name 'torch_val_surfaces' is not defined

In [None]:
surface_logit_fitter.train(10)

In [None]:
main_coefs = surface_logit_fitter.model.main_embedding.weight.detach().numpy()[:, 0]
surface_coefs = surface_logit_fitter.model.surface_embedding.weight.detach().numpy()

In [None]:
mean_surface_coef = surface_coefs.mean(axis=1)
surface_coefs -= mean_surface_coef.reshape((len(mean_surface_coef), 1))
main_coefs += mean_surface_coef

In [None]:
embedding_df = pd.DataFrame(
    surface_coefs,
    columns=logit_training_set['surface'].dropna().drop_duplicates()
)
embedding_df['main_effect'] = main_coefs
embedding_df['player_idx'] = range(len(embedding_df))
embedding_df['pid'] = embedding_df['player_idx'].map(player_map)

In [None]:
logit_training_set['surface'].value_counts()

In [None]:
embedding_df.sort_values('main_effect', ascending=False).head()

In [None]:
by_surface_df = embedding_df[['pid']].copy()
by_surface_df['clay'] = embedding_df['main_effect'] + embedding_df['clay']
by_surface_df['hard'] = embedding_df['main_effect'] + embedding_df['hard']
by_surface_df['indoors'] = embedding_df['main_effect'] + embedding_df['indoors']
by_surface_df['grass'] = embedding_df['main_effect'] + embedding_df['grass']

In [None]:
pcounts = logit_training_set['p1_link'].value_counts()
by_surface_df['pcount'] = by_surface_df['pid'].map(pcounts).fillna(0)

In [None]:
stronk = logit_training_set[
    (logit_training_set['p1_link'] == '/player/nadal/') |
    (logit_training_set['p2_link'] == '/player/nadal/')
][[
    'date',
    'p1_link',
    'p2_link',
    'surface'
]]
stronk[stronk['surface'] == 'grass']

In [None]:
by_surface_df[
    by_surface_df['pcount'] > 20
].sort_values('clay', ascending=False).head(20)

In [None]:
by_surface_df[
    by_surface_df['pcount'] > 20
].sort_values('hard', ascending=False).head(20)

In [None]:
by_surface_df[
    by_surface_df['pcount'] > 20
].sort_values('grass', ascending=False).head(20)