The goal of this notebook is to provide a lightweight and fast starting point for training MLP models. It uses a very simple model and is optimized for speed, so the whole thing, including data loading and training, takes around a minute and a half to run on Kaggle. Still, using the same parameters, I got a respectable single-model score of `0.143` when training on the whole dataset.

Things to note:
- All the data is preloaded into GPU and then directly passed to model without any copying. 
- I'm using a custom implementation of the Pearson correlation coefficient as the loss.
- There is no regularization, just a small model and small number of epochs to prevent overfitting.
- A custom callback allows me to track the competition metric while training.
- A flat, constant learning rate; I found that LR scheduling didn't help with just five epochs.
- A large batch size makes training fast.

In [None]:
from fastai.tabular.all import *

### Load the data from feather and stick on GPU

In [None]:
%%time
# Row index where the validation rows begin; everything before it is used for training.
SPLIT_IDX = 2493988 # train/val split at 80% of time_ids
data_df = pd.read_feather('../input/ubiquant-trainfeather-32-bit/train32.feather')
# Independent copy of the validation rows, so it stays usable after data_df is deleted below.
val_df = data_df.iloc[SPLIT_IDX:].copy()

# Names of the 300 feature columns: f_0 ... f_299.
ftrs = [f'f_{i}' for i in range(300)]
# Build the feature matrix and the target vector for ALL rows (train + validation)
# and move each to the GPU in a single transfer.
feature_tensor = torch.tensor(data_df[ftrs].to_numpy()).cuda()
target_tensor = torch.tensor(data_df.target.to_numpy()).cuda()

# Everything needed from the frame now lives in the tensors and val_df; drop the reference.
del data_df

### Barebone lightweight dataloading

In [None]:
class UbiquantDataset:
    """Indexable wrapper around a feature tensor and its matching targets.

    Indexing returns a 3-tuple ``(empty, features, targets)``: a zero-element
    placeholder tensor, the selected feature rows, and the selected targets
    with an extra axis inserted after the index axis.
    """

    def __init__(self, feature_tensor, targets):
        # fastcore helper; the methods below rely on it having stored the
        # constructor arguments as self.feature_tensor and self.targets.
        store_attr()
        # Presumably read by fastai as "the first two tuple items are model
        # inputs, the rest are targets" -- confirm against the fastai docs.
        self.n_inp = 2

    def __len__(self):
        """Return the number of rows in the feature tensor."""
        return len(self.feature_tensor)

    def __getitem__(self, idx):
        """Return ``(placeholder, features[idx], targets[idx, None])``."""
        placeholder = torch.empty(0)
        selected_features = self.feature_tensor[idx]
        selected_targets = self.targets[idx, None]
        return placeholder, selected_features, selected_targets
    
class UbiDL(DataLoader):
    """DataLoader that builds each batch with one tensor-index lookup.

    Instead of collating individual items, every batch is produced by
    indexing ``self.dataset`` with a whole tensor of row indices.
    """

    def __iter__(self):
        """Yield batches of up to ``self.bs`` rows covering ``self.n`` rows.

        When ``self.shuffle`` is true the rows are visited in a fresh random
        order on every pass; otherwise they are visited sequentially. When
        ``self.drop_last`` is true a trailing batch shorter than ``self.bs``
        is skipped.
        """
        if self.shuffle:
            # New permutation per pass, so each epoch sees a different order.
            self.__idxs = torch.randperm(self.n)
        else:
            self.__idxs = torch.arange(self.n)
        for batch_start in range(0, self.n, self.bs):
            if batch_start + self.bs > self.n and self.drop_last:
                return
            indices = self.__idxs[batch_start:batch_start + self.bs]
            yield self.dataset[indices]

### A custom metric and loss function for training

In [None]:
def pearson_coef(data):
    """Return the Pearson correlation between the 'target' and 'preds' columns of ``data``."""
    correlations = data.corr()
    return correlations.loc['preds', 'target']

class CompMetric(AccumMetric):
    """Competition metric: mean over time_ids of the per-time_id Pearson correlation.

    Args:
        val_df: validation DataFrame with 'time_id' and 'target' columns.
            It is assumed to have one row per accumulated validation
            prediction, in the same order. A 'preds' column is written into
            it each time ``value`` is read.
    """

    def __init__(self, val_df):
        # No scoring function is passed to the base class; ``value`` is
        # overridden below instead.
        super().__init__(None)
        self.val_df = val_df

    @property
    def name(self):
        """Label for this metric."""
        return 'Pears'

    @property
    def value(self):
        """Return the mean of the per-time_id correlation between target and predictions."""
        preds = torch.cat(self.preds)
        # Write to the frame given at construction, not to a notebook-level
        # global that merely happens to share its name.
        self.val_df['preds'] = preds.cpu().numpy()
        per_time_id = (
            self.val_df[['time_id', 'target', 'preds']]
            .groupby('time_id')
            .apply(pearson_coef)
        )
        return np.mean(per_time_id)

In [None]:
def pearson_loss(x, y, eps=1e-12):
    """Return ``1 - r``, where ``r`` is the Pearson correlation of ``x`` and ``y``.

    Both inputs are flattened first, so shapes such as ``(N, 1)`` and ``(N,)``
    are compared element by element instead of being broadcast to ``(N, N)``.

    Args:
        x: predictions tensor.
        y: targets tensor with the same number of elements as ``x``.
        eps: lower bound for the product of the two sums of squared
            deviations. It keeps the loss finite (equal to 1) when either
            input has zero variance, and leaves the result unchanged
            otherwise.

    Returns:
        A scalar tensor in ``[0, 2]`` (0 = perfectly correlated).
    """
    x = x.reshape(-1)
    y = y.reshape(-1)
    xd = x - x.mean()
    yd = y - y.mean()
    nom = (xd * yd).sum()
    # Clamp before the square root so a zero-variance input gives a finite value.
    denom = ((xd ** 2).sum() * (yd ** 2).sum()).clamp_min(eps).sqrt()
    return 1 - nom / denom

### Use fast.ai Learner for training

In [None]:
# Split the GPU-resident tensors at SPLIT_IDX: rows before it form the training set,
# rows from it onward form the validation set.
ds_train = UbiquantDataset(feature_tensor[:SPLIT_IDX], target_tensor[:SPLIT_IDX])
ds_val = UbiquantDataset(feature_tensor[SPLIT_IDX:], target_tensor[SPLIT_IDX:])

# Wrap both datasets in UbiDL loaders with a batch size of 4096 and no worker processes.
dls = DataLoaders.from_dsets(ds_train, ds_val , bs = 4096,dl_type=UbiDL, num_workers=0)

In [None]:
# Tabular MLP with no embeddings (emb_szs is empty), 300 continuous inputs,
# layer sizes 128 -> 64 -> 32 -> 16 and a single output, moved to the GPU.
model = TabularModel(emb_szs={}, n_cont=300, out_sz=1, layers = [128,64, 32,16]).cuda()

# Optimise pearson_loss and report CompMetric, which is built from val_df.
learn = Learner(dls, model, loss_func=pearson_loss, metrics = CompMetric(val_df))

In [None]:
%%time
# Train for 5 epochs at a learning rate of 1e-3.
learn.fit(5, 1e-3)