Fastai tutorial Notebook by @slawekbiel
Notebook link >
https://www.kaggle.com/slawekbiel/fast-fastai-training

Cross Validation in Time Series Idea by @lucasmorin
Discussion Link >
https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302710


In [None]:
from fastai.tabular.all import *
import gc
import random
import os

import numpy as np
from scipy.stats import pearsonr as p
import scipy.stats

# Function to seed everything
def seed_everything(seed):
    """Seed every random number generator this notebook draws from.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy's global
            generator and PyTorch's CPU/CUDA generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only inherited by child processes: the running interpreter fixed its
    # own hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Model weight initialisation uses torch's generators, so without these
    # two calls repeated runs start from different weights.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [None]:
# Report how many CUDA devices torch can see.
num_gpus = torch.cuda.device_count()
print('gpu count : ', num_gpus)

# Select each device in turn; the last one visited stays the current device.
for device_index in range(num_gpus):
    print(f'set gpu id : {device_index}')
    torch.cuda.set_device(device_index)

# Barebone lightweight dataloading

In [None]:
class UbiquantDataset:
    """Dataset over a pre-built feature tensor and its target tensor.

    Indexing returns a 3-tuple ``(empty tensor, features, targets)``.
    ``n_inp = 2`` marks the first two items as inputs; presumably the empty
    tensor stands in for the categorical input the model is called with --
    TODO confirm against fastai's TabularModel signature.
    """

    def __init__(self, feature_tensor, targets):
        # fastai helper: stores both constructor arguments on self by name.
        store_attr()
        self.n_inp = 2

    def __len__(self):
        return len(self.feature_tensor)

    def __getitem__(self, idx):
        no_categoricals = torch.empty(0)
        batch_features = self.feature_tensor[idx]
        # The extra None index appends a trailing axis to the selected targets.
        batch_targets = self.targets[idx, None]
        return no_categoricals, batch_features, batch_targets
    
class UbiDL(DataLoader):
    """DataLoader that builds each batch with a single tensor-index lookup.

    ``__iter__`` is overridden so that a whole batch is fetched from the
    dataset with one index tensor instead of item by item.

    The ``shuffle`` flag is honoured: when set, a fresh permutation of the
    row indices is drawn for every pass; otherwise rows are visited in
    order. Pass ``shuffle=False`` when batches must stay in dataset order.
    """

    def __iter__(self):
        if self.shuffle:
            # NumPy's global generator is used so np.random.seed controls
            # the batch order.
            self.__idxs = torch.from_numpy(np.random.permutation(self.n)).long()
        else:
            self.__idxs = torch.arange(self.n)
        for batch_start in range(0, self.n, self.bs):
            # Skip the trailing partial batch when drop_last is requested.
            if batch_start + self.bs > self.n and self.drop_last:
                return
            indices = self.__idxs[batch_start:batch_start + self.bs]
            yield self.dataset[indices]

# A custom metric and loss function for training

In [None]:
def pearson_coef(data):
    """Return the correlation between the 'target' and 'preds' columns of ``data``."""
    correlation_matrix = data.corr()
    return correlation_matrix.loc['preds', 'target']

class CompMetric(AccumMetric):
    """Validation metric: mean per-``time_id`` correlation of predictions and targets.

    ``val_df`` must contain 'time_id' and 'target' columns and have one row
    per validation prediction, in the same order the predictions arrive.
    """

    def __init__(self, val_df):
        super().__init__(None)
        self.val_df = val_df

    @property
    def name(self):
        return 'Valid_Pearson'

    @property
    def value(self):
        # self.preds is presumably the list of prediction tensors collected
        # by AccumMetric -- TODO confirm against fastai.
        stacked_preds = torch.cat(self.preds)
        # Note: this writes a 'preds' column into the frame given at construction.
        self.val_df['preds'] = stacked_preds.cpu().numpy()
        scored = self.val_df[['time_id', 'target', 'preds']]
        per_time_id = scored.groupby('time_id').apply(pearson_coef)
        return np.mean(per_time_id)

In [None]:
from sklearn.metrics import mean_squared_error

def pearson_loss(x, y):
    """Return ``1 - r``, where ``r`` is the Pearson correlation of tensors ``x`` and ``y``.

    Both tensors are centred on their own mean over all elements. The result
    is 0 for perfectly correlated inputs and 2 for perfectly anti-correlated
    ones.
    """
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    cross_sum = torch.sum(x_centered * y_centered)
    x_sq_sum = torch.sum(x_centered ** 2)
    y_sq_sum = torch.sum(y_centered ** 2)
    return 1 - cross_sum / torch.sqrt(x_sq_sum * y_sq_sum)

def rmse(y_true, y_pred):
    """Return the square root of sklearn's mean squared error of ``y_pred`` against ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Inference Function

In [None]:
def inference(models, data):
    """Return the element-wise mean of every model's flattened prediction for ``data``.

    Each model is called as ``model([], data)`` with gradients disabled, and
    its output is flattened and moved to a NumPy array before averaging.
    """
    with torch.no_grad():
        per_model = [net([], data).view(-1).cpu().numpy() for net in models]
    return np.stack(per_model).mean(axis=0)

# Load the data from feather

In [None]:
from sklearn.model_selection import GroupKFold

# Fix the RNG seeds before any data handling
seed_everything(42)

FOLDS = 5
# Load the training frame from the feather file
train = pd.read_feather('../input/ubiquant-trainfeather-32-bit/train32.feather')

# Every column that is neither an identifier nor the target is a model input
non_feature_cols = ['row_id', 'time_id', 'investment_id', 'target']
features = [col for col in train.columns if col not in non_feature_cols]

# Assign each row a group id from the half-open time_id range [start, stop) it falls in
time_id_edges = [0, 280, 585, 825, 1030, 1400]
for group_id, (start, stop) in enumerate(zip(time_id_edges[:-1], time_id_edges[1:])):
    in_range = (train['time_id'] >= start) & (train['time_id'] < stop)
    train.loc[in_range, 'group'] = group_id
train['group'] = train['group'].astype(np.int16)

# Use fast.ai Learner for training with Sequential Cross Validation

![CV](https://miro.medium.com/max/753/1*qvdnPF8ETV9mFdMT0Y_BBA.png)

### train : validation = 8 : 2

> ### And 

### test -> next group data

In [None]:
%%time

def _score_on_group(loaded_models, fold, group_id):
    """Print Pearson and RMSE on one group for the newest model and for the ensemble.

    Args:
        loaded_models: Models of folds 0..``fold`` in order; the last entry
            is the model trained in the current fold.
        fold: Index of the fold that was just trained (used in the log text).
        group_id: Value of ``train['group']`` selecting the rows to score on.
    """
    test_df = train[train['group'] == group_id]
    test_data = torch.tensor(test_df[features].to_numpy(), dtype=torch.float).cuda()

    ensemble_preds = inference(loaded_models, test_data)
    preds = inference(loaded_models[-1:], test_data)

    print('=' * 50)
    pearson_score = p(preds, test_df.target)[0]
    print(f'fold {fold} model Pearson at group {group_id} data : ', pearson_score)
    rmse_score = rmse(test_df.target, preds)
    print(f'fold {fold} model RMSE at group {group_id} data : ', rmse_score)

    pearson_score = p(ensemble_preds, test_df.target)[0]
    print(f'fold 0 ~ {fold} model ensemble Pearson at group {group_id} data : ', pearson_score)
    rmse_score = rmse(test_df.target, ensemble_preds)
    print(f'fold 0 ~ {fold} model ensemble RMSE at group {group_id} data : ', rmse_score)
    print('=' * 50)


for fold in range(FOLDS):
    # Expanding window: take every row of groups 0..fold, then use the first
    # 80% of those rows (in index order) for training and the rest for validation.
    group_index_list = train[train['group'] <= fold].index.values.tolist()
    split_at = int(len(group_index_list) * 0.8)
    trn_ind = group_index_list[:split_at]
    val_ind = group_index_list[split_at:]

    print(f'Training fold {fold}')
    print(f'Training with {len(trn_ind)} rows')
    print(f'Validating with {len(val_ind)} rows')
    print(f'Training fastai TabularModel with {len(features)} features...')
    print('train time ids is ... ', train['time_id'].loc[trn_ind].values[:5], ' ~ ', train['time_id'].loc[trn_ind].values[-5:])
    print('train idx is ... ', trn_ind[:5], ' ~ ', trn_ind[-5:])
    print('valid time ids is ... ', train['time_id'].loc[val_ind].values[:5], ' ~ ', train['time_id'].loc[val_ind].values[-5:])
    print('valid idx is ... ', val_ind[:5], ' ~ ', val_ind[-5:])

    # Select rows and columns in one .loc call so the full feature frame is
    # not copied before the row selection, then move everything to the GPU.
    feature_tensor_train = torch.tensor(train.loc[trn_ind, features].to_numpy()).cuda()
    target_tensor_train = torch.tensor(train.loc[trn_ind, 'target'].to_numpy()).cuda()
    feature_tensor_valid = torch.tensor(train.loc[val_ind, features].to_numpy()).cuda()
    target_tensor_valid = torch.tensor(train.loc[val_ind, 'target'].to_numpy()).cuda()

    ds_train = UbiquantDataset(feature_tensor_train, target_tensor_train)
    ds_val = UbiquantDataset(feature_tensor_valid, target_tensor_valid)

    # The datasets keep their own references; drop the loop-level names.
    del feature_tensor_train, target_tensor_train, feature_tensor_valid, target_tensor_valid

    dls = DataLoaders.from_dsets(ds_train, ds_val, bs=4096, dl_type=UbiDL, num_workers=0)

    model = TabularModel(emb_szs={}, n_cont=len(features), out_sz=1, layers=[128, 64, 32, 16]).cuda()

    # pearson loss >
    learn = Learner(dls, model, loss_func=pearson_loss, metrics=CompMetric(train.loc[val_ind]))
    # mse loss >
    # learn = Learner(dls, model, loss_func=MSELossFlat(), metrics=CompMetric(train.loc[val_ind]))

    print('=start train=')
    # Up to 10 epochs; early stopping watches the Valid_Pearson metric with patience 2.
    learn.fit(10, 1e-3, cbs=EarlyStoppingCallback(monitor="Valid_Pearson", patience=2))
    # learn.fit_one_cycle(5, 1e-3, cbs=[SaveModelCallback(monitor="Valid_Pearson", comp=np.less)])

    # The whole module object is saved (not only a state_dict), so loading it
    # later needs the same model classes to be importable.
    torch.save(learn.model, f'model_{fold}.pth')

    # Release the training objects before the evaluation tensors are allocated.
    del ds_train, ds_val
    del dls, model, learn

    ###############################################################################################
    # Out-of-sample check: score on the next group and on the final group.
    # The last fold already trained on every group, so it has nothing left to test on.
    final_group = FOLDS - 1
    if fold < final_group:
        # Reload every model trained so far (folds 0..fold) for the ensemble.
        loaded_models = [torch.load(f'model_{i}.pth').cuda().eval() for i in range(fold + 1)]

        print('=' * 50)
        print(f' > fold {fold} model Scores at test data(next fold & final fold) < ')
        print('=' * 50)

        _score_on_group(loaded_models, fold, fold + 1)
        _score_on_group(loaded_models, fold, final_group)

        del loaded_models

    gc.collect()

# Conclusion

In these results, the greater the time difference between the training data and the test data, the lower the score; the smaller the time difference, the higher the score.
(This is especially visible in the fold 0 model's results on the fold 1 data and the fold 4 data.)

Maybe it's the fate we'll face in Private Score...
(The larger the time difference between the LB and PB data, the more the score is likely to drop.)

Is an ensemble always right?

Sometimes it's good and sometimes it's not.
(At least in this result.)

And, it will depend on the environment of the future market. 

We don't know what the environment would be like. 

Maybe, this is why you should use a generalized model.

# Submission

In [None]:
# Reload the saved model of every fold, move it to the GPU and switch it to eval mode.
models = [
    torch.load(f'model_{fold_id}.pth').cuda().eval()
    for fold_id in range(FOLDS)
]

In [None]:
def submission_inference(models, data, weights=None):
    """Return a weighted sum of every model's flattened prediction for ``data``.

    Args:
        models: Models in fold order; each is called as ``model([], data)``.
        data: Feature tensor passed to every model.
        weights: One weight per model, in the same order as ``models``.
            Defaults to ``(0.05, 0.05, 0.1, 0.2, 0.6)``, which favours the
            later folds and therefore expects exactly five models.

    Returns:
        NumPy array with one blended prediction per row of ``data``.

    Raises:
        ValueError: If the number of weights differs from the number of models.
    """
    if weights is None:
        weights = (0.05, 0.05, 0.1, 0.2, 0.6)
    if len(weights) != len(models):
        raise ValueError(
            f'expected one weight per model, got {len(weights)} weights for {len(models)} models'
        )

    with torch.no_grad():
        preds = [model([], data).view(-1).cpu().numpy() for model in models]

    # Accumulate from the last model to the first so the summation order (and
    # therefore the floating-point result) matches the original expression.
    res = None
    for weight, pred in zip(reversed(weights), reversed(preds)):
        term = pred * weight
        res = term if res is None else res + term
    return res

In [None]:
import ubiquant

# Create the competition environment and iterate over the test batches it serves.
env = ubiquant.make_env()
iter_test = env.iter_test()
for test_df, sub_df in iter_test:
    # Build the feature tensor for this batch on the GPU.
    batch_features = torch.tensor(test_df[features].to_numpy(), dtype=torch.float).cuda()
    # Fill in the blended predictions and hand the frame back to the environment.
    sub_df['target'] = submission_inference(models, batch_features)
    env.predict(sub_df)