# Autoencoder Baseline for Ubiquant Market Prediction
Some notes:
- This notebook uses an autoencoder to obtain lower-dimensional features.
- The regular model uses this lower-dimensional representation to predict the `target` variable.
- A 5-fold GroupTimeSeriesSplit is used for CV, and only the last fold's model is used for inference.
- Make sure to use a GPU when running the code.

In [None]:
import pandas as pd
from fastai.tabular.all import *
import numpy as np
import gc
# import optuna

# Seed the random number generators (helper presumably provided by the fastai star import)
set_seed(42)

# from pympler.tracker import SummaryTracker
# tracker = SummaryTracker()

# Names of the 300 raw continuous feature columns: 'f_0' ... 'f_299'
cont = [f'f_{i}' for i in range(300)]
# Folder the per-fold feather files ('fold_<k>.feather') are read from
PATH = '../input/ubiquant-trainfeather-gtss'
# One row per fold; column 1 is read in the training loop as the row index
# where the validation part of that fold starts
splits = pd.read_csv('../input/ubiquant-trainfeather-gtss/cv_splits.csv')

# Config

In [None]:
class CFG_AE:
    """Hyper-parameters of the autoencoder (passed to TabularAE and its dataloaders)."""
    layers = [300, 600]   # hidden widths; TabularAE uses them reversed for the encoder and as-is for the decoder
    bottleneck = 150      # width of the encoded representation (`hidden_size` of TabularAE)
    ps = 0.025            # dropout probability handed to the model as `ps`
    bswap = 0.1           # probability handed to BatchSwapNoise
    bs = 1024             # batch size of the autoencoder dataloaders
    

class CFG:
    """Hyper-parameters of the regular model trained on the autoencoder embeddings."""
    layers = [450, 600, 600, 450, 300, 150]   # hidden widths of the TabularModel
    ps = 0.05             # dropout probability handed to the model as `ps`
    embed_p = 0.5         # embedding dropout; the model is built with emb_szs=[], so there are no embeddings
    bs = 4096             # batch size intended for the regular model

# Instances used by the rest of the notebook
cfg_ae = CFG_AE()
cfg = CFG()

# Dataset/Dataloader and other functions

In [None]:
# Credits to Slawek Biel
class UbiquantDataset:
    """Dataset backed by two pre-built tensors that can be indexed a whole batch at a time.

    Indexing returns a 3-tuple ``(empty, features, targets)``. ``n_inp = 2``
    presumably tells fastai that the first two items are model inputs
    (TODO confirm); the empty tensor then fills the categorical-input slot.
    """

    def __init__(self, feature_tensor, targets):
        store_attr()  # keeps `feature_tensor` and `targets` as attributes of the same name
        self.n_inp = 2

    def __len__(self):
        return len(self.feature_tensor)

    def __getitem__(self, idx):
        no_categoricals = torch.empty(0)
        features = self.feature_tensor[idx]
        # the extra `None` inserts a new axis right after the selected rows
        labels = self.targets[idx, None]
        return no_categoricals, features, labels
    
class UbiDL(DataLoader):
    """DataLoader that fetches each batch with a single tensor-index call.

    Instead of collating individual samples, ``self.dataset`` is indexed with
    a tensor of row positions, so the dataset must support that kind of
    indexing and return an already-batched result.

    Rows are always served in storage order: ``self.shuffle`` is deliberately
    not consulted. The training loop in this notebook iterates ``dls.train``
    to compute embeddings and writes them back into the dataframe by position,
    so that code relies on the sequential order. Adding a real shuffle here
    requires a separate, unshuffled loader for embedding extraction.
    """

    def __iter__(self):
        n, bs = self.n, self.bs
        # torch.arange builds the index natively; no per-row Python ints
        self.__idxs = torch.arange(n)
        for batch_start in range(0, n, bs):
            # with drop_last, stop before a final batch that would be short
            if self.drop_last and batch_start + bs > n:
                return
            yield self.dataset[self.__idxs[batch_start:batch_start + bs]]


# Pearson Loss and Metric

In [None]:
# Mean Pearson Corr Metric
def pearson_coef(data):
    """Return the correlation between the 'target' and 'preds' columns of `data`.

    The value is read from the full correlation matrix of `data`
    (``DataFrame.corr()`` with its default method).
    """
    correlations = data.corr()
    return correlations.loc['preds', 'target']

class CompMetric(AccumMetric):
    """Validation metric: mean over `time_id` groups of the target/prediction correlation.

    Args:
        val_df: validation dataframe with 'time_id' and 'target' columns, in
            the same row order as the validation predictions. A 'preds'
            column is (over)written on it each time the metric is evaluated.
    """

    def __init__(self, val_df):
        super().__init__(None)
        self.val_df = val_df

    @property
    def name(self):
        # the training callbacks monitor the metric under this name
        return 'Pears'

    @property
    def value(self):
        # self.preds is presumably the list of per-batch predictions collected
        # by AccumMetric during validation -- TODO confirm against fastai
        preds = torch.cat(self.preds)
        # write to the dataframe owned by this metric, not to a global that
        # merely happens to be called `val_df`
        self.val_df['preds'] = preds.cpu().numpy()
        per_time_id = self.val_df[['time_id', 'target', 'preds']].groupby('time_id').apply(pearson_coef)
        return np.mean(per_time_id)
    
# Loss Function
def pearson_loss(x, y):
    """Return ``1 - r`` where ``r`` is the Pearson correlation of `x` and `y`.

    Both tensors are centred with their overall mean and reduced over all
    elements, so the result is a scalar tensor: 0 for perfectly correlated
    inputs, 2 for perfectly anti-correlated ones.
    """
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    covariance = (x_centered * y_centered).sum()
    scale = torch.sqrt(x_centered.pow(2).sum() * y_centered.pow(2).sum())
    return 1 - covariance / scale

# AutoEncoder

In [None]:
class BatchSwapNoise(Module):
    """Swap-noise layer: in training mode, replace some cells with the same column's value from another row.

    Each cell is selected with probability `p`; a selected cell takes the
    value found a random number of rows further down (wrapping around) in the
    same column of the batch. In eval mode the input is returned untouched.
    """
    def __init__(self, p): store_attr()

    def forward(self, x):
        if not self.training:
            return x

        n_rows, n_cols = x.size(0), x.size(1)
        total = x.nelement()

        # which cells get swapped
        swap_mask = torch.rand(x.size()) > (1 - self.p)
        # how many rows down the replacement is taken from
        row_shift = torch.floor(torch.rand(x.size()) * n_rows).long()
        # moving `k` rows down in the flattened tensor is an offset of k * n_cols
        flat_shift = (row_shift * (swap_mask.long() * n_cols)).view(-1)

        source = torch.arange(total) + flat_shift
        overflow = source >= total
        source[overflow] = source[overflow] - total  # wrap past the last row
        return x.flatten()[source].view(x.size())
        
class TabularAE(TabularModel):
    """A simple autoencoder built on top of fastai's TabularModel.

    The inherited TabularModel acts as the encoder (hidden widths are `layers`
    reversed, output width `hidden_size`); a mirrored stack of LinBnDrop
    blocks decodes the bottleneck back to `n_cont` continuous values.

    Args:
        emb_szs, n_cont, ps, embed_p: forwarded to TabularModel.
        hidden_size: width of the bottleneck representation.
        cats: dict whose values are summed into `activation_cats`; neither
            attribute is used by the code visible in this class.
        layers: hidden widths, listed in decoder order.
        bswap: if not None, probability for the BatchSwapNoise applied to the
            continuous inputs.
    """
    def __init__(self, emb_szs, n_cont, hidden_size, cats, layers, ps=0.05, embed_p=0.2, bswap=None):
        # Encoder: TabularModel with the hidden widths reversed and `hidden_size` outputs
        super().__init__(emb_szs, n_cont, layers=layers[::-1], out_sz=hidden_size, embed_p=embed_p, ps=ps, act_cls=nn.ReLU(inplace=True))
        
        self.bswap = bswap
        self.cats = cats
        self.activation_cats = sum([v for k,v in cats.items()])
        
        # Drop the last block TabularModel created and put a LinBnDrop
        # (layers[0] -> hidden_size, ReLU) in its place as the bottleneck layer
        self.layers = nn.Sequential(*L(self.layers.children())[:-1] + nn.Sequential(LinBnDrop(layers[0], hidden_size, p=ps, act=nn.ReLU(inplace=True))))
        
        # The noise module only exists when a swap probability was given
        if(bswap != None): self.noise = BatchSwapNoise(bswap)
        # Decoder trunk: hidden_size -> layers[0] -> layers[1] -> ... -> layers[-1]
        decoder_layers = [LinBnDrop(hidden_size, layers[0], p=ps, act=nn.ReLU(inplace=True))]
        for i in range(1, len(layers)):
            decoder_layers.append(LinBnDrop(layers[i-1], layers[i], p=ps, act=nn.ReLU(inplace=True)))
        self.decoder = nn.Sequential(*decoder_layers)
        
        # Reconstruction head: layers[-1] -> n_cont, without batch norm or activation
        self.decoder_cont = nn.Sequential(
            LinBnDrop(layers[-1], n_cont, p=ps, bn=False, act=None),
        )
        
    def forward(self, x_cat, x_cont=None, encode=False):
        """Return the reconstruction of `x_cont`, or the bottleneck encoding when `encode` is True."""
        # NOTE(review): x_cont defaults to None but is handed to self.noise
        # whenever bswap is set, so callers are expected to always pass it
        if(self.bswap != None):
            x_cont = self.noise(x_cont)
        encoded = super().forward(x_cat, x_cont)
        if encode: 
            return encoded # return the representation
        decoded_trunk = self.decoder(encoded)
        decoded_conts = self.decoder_cont(decoded_trunk)
        
        return decoded_conts

# Dataloaders

In [None]:
# Dataloader for Autoencoder
def get_dls_ae(df, fold, SPLIT_IDX):
    """Build the autoencoder DataLoaders, where the features are both input and target.

    Args:
        df: dataframe holding the raw feature columns listed in `cont`.
        fold: fold number (not used by this function).
        SPLIT_IDX: row position separating the training rows (before it) from
            the validation rows (from it onwards).

    Returns:
        DataLoaders with a train and a validation UbiDL, batch size `cfg_ae.bs`.
    """
    all_features = torch.tensor(df[cont].to_numpy()).cuda()
    train_part = all_features[:SPLIT_IDX]
    valid_part = all_features[SPLIT_IDX:]

    train_ds = UbiquantDataset(train_part, train_part)
    valid_ds = UbiquantDataset(valid_part, valid_part)

    # drop the local reference to the dataframe before building the loaders
    del df
    gc.collect()

    return DataLoaders.from_dsets(
        train_ds, valid_ds,
        bs=cfg_ae.bs, dl_type=UbiDL, num_workers=0, drop_last=False,
    )

# Dataloader for regular model
def get_dls(df, fold, SPLIT_IDX):
    """Build the DataLoaders of the regular model from the embedding columns.

    Args:
        df: dataframe holding the embedding columns listed in `cont_embed`
            plus a 'target' column.
        fold: fold number (not used by this function).
        SPLIT_IDX: row position separating the training rows (before it) from
            the validation rows (from it onwards).

    Returns:
        (dls, val_df): the DataLoaders and a copy of the validation rows of
        `df`, which the metric uses to group predictions.
    """
    feature_tensor = torch.tensor(df[cont_embed].to_numpy()).cuda()
    target_tensor = torch.tensor(df.target.to_numpy()).cuda()

    ds_train = UbiquantDataset(feature_tensor[:SPLIT_IDX], target_tensor[:SPLIT_IDX])
    ds_val = UbiquantDataset(feature_tensor[SPLIT_IDX:], target_tensor[SPLIT_IDX:])

    val_df = df.iloc[SPLIT_IDX:].copy()

    # drop the local reference to the dataframe before building the loaders
    del df
    gc.collect()

    # use the regular model's own batch size (CFG.bs), not the autoencoder's
    dls = DataLoaders.from_dsets(ds_train, ds_val, bs=cfg.bs, dl_type=UbiDL, num_workers=0, drop_last=False)

    return dls, val_df

# Function to obtain autoencoder embeddings
def obtain_embeddings(learn, dl):
    """Encode every batch of `dl` with the autoencoder and return the stacked result.

    Args:
        learn: Learner whose `model` accepts ``(x_cat, x_cont, encode=True)``.
        dl: iterable of batches; item 1 of each batch is the feature tensor.

    Returns:
        numpy array with the encodings of all batches concatenated along
        axis 0, in the order `dl` yields them.

    The model is left in eval mode.
    """
    model = learn.model
    # eval mode, no_grad and the empty categorical placeholder do not depend
    # on the batch, so they are set up once instead of on every iteration
    model.eval()
    no_cats = tensor([]).cuda()
    with torch.no_grad():
        outs = [model(no_cats, batch[1], encode=True).cpu().numpy() for batch in dl]
    return np.concatenate(outs)

# Training

In [None]:
# Per-fold lists of validation predictions of the regular model
oof = []
# Column names given to the embedding features: 'f_0' ... 'f_<bottleneck-1>'
cont_embed = [f'f_{x}' for x in range(cfg_ae.bottleneck)]    

# For every fold: train an autoencoder on the raw features, encode all rows,
# then train the regular model on the encodings
for fold in range(5):  
    print('#'*15)
    print(f'Fold: {fold}')
    print('#'*15)
    print()
    print('Training Autoencoder')
    
    df = pd.read_feather(f'{PATH}/fold_{fold}.feather')
    # rows before SPLIT_IDX are used for training, rows from it onwards for validation
    SPLIT_IDX = splits.iloc[fold, 1]
    
    dls = get_dls_ae(df, fold, SPLIT_IDX)
    
    # Train Autoencoder and obtain outputs
    autoencoder = TabularAE(emb_szs=[], n_cont=len(cont), ps=cfg_ae.ps, hidden_size=cfg_ae.bottleneck, bswap=cfg_ae.bswap, layers=cfg_ae.layers, cats={}).cuda()
    learn_ae = Learner(dls, autoencoder, loss_func=MSELossFlat())
    learn_ae.fit_one_cycle(50, cbs=[SaveModelCallback(monitor='valid_loss'), EarlyStoppingCallback(monitor='valid_loss', patience=3)])
    
    # Encodings are matched to dataframe rows purely by position: this relies
    # on UbiDL yielding rows in storage order for the train loader as well
    out_train = obtain_embeddings(learn_ae, learn_ae.dls.train)
    out_valid = obtain_embeddings(learn_ae, learn_ae.dls.valid)
    out = np.concatenate((out_train, out_valid), axis=0)    
    
    # the whole model object is saved (not just a state dict)
    torch.save(learn_ae.model, f'model_AE_fold_{fold}.pkl')
    
    print('Training model on embeddings')
    
    # Create dataset with embeddings
    # (only 'target' and 'time_id' are re-read, so the embedding columns can
    # reuse the 'f_<i>' names without clashing with the raw features)
    df = pd.read_feather(f'{PATH}/fold_{fold}.feather', columns=['target', 'time_id'])
    df[cont_embed] = out

    # Release the autoencoder objects before the regular model is built
    del dls
    del autoencoder
    del learn_ae
    del out_train
    del out_valid
    del out
    gc.collect()
        
    # Train regular model
    dls, val_df = get_dls(df, fold, SPLIT_IDX) 
    model = TabularModel(emb_szs=[], n_cont=len(cont_embed), layers=cfg.layers, out_sz=1,
                         ps=cfg.ps, embed_p=cfg.embed_p, use_bn=True, act_cls=nn.ReLU(inplace=True)).cuda()
        
    learn = Learner(dls, model, loss_func=pearson_loss, metrics = CompMetric(val_df))   

    # 'Pears' is the name reported by CompMetric; higher is better, hence np.greater
    learn.fit_one_cycle(30,  
                        cbs=[SaveModelCallback(monitor='Pears', comp=np.greater),
                             EarlyStoppingCallback(monitor='Pears', patience=3, comp=np.greater)])
    
    # Out-of-fold predictions of this fold, flattened from one value per row
    preds, _ = learn.get_preds(dl=learn.dls.valid)
    oof.append([x[0] for x in preds.tolist()])

    torch.save(learn.model, f'model_fold_{fold}.pkl')
    
    # Delete garbage
    del dls
    del model
    del learn
    del val_df
    del df
    gc.collect()

# OOF Score

In [None]:
# Save OOF
# Flatten the per-fold prediction lists into one list
oof = [y for x in oof for y in x]

# Row labels for the predictions: consecutive integers starting at fold 0's
# split index. NOTE(review): this assumes the validation parts of successive
# folds are contiguous and line up with the row index of train.feather -- confirm
idx = range(splits.iloc[0, 1], splits.iloc[0, 1]+len(oof))
data = list(zip(idx, oof))
oof = pd.DataFrame(data, columns=['index', 'preds'])

oof.to_csv('oof.csv', index=False)

del data
del idx
del oof
gc.collect()

### Get Score
# Re-read targets and predictions from disk and align them on the row index
df = pd.read_feather('../input/ubiquant-trainfeather/train.feather', columns=['time_id', 'target'])
oof = pd.read_csv('./oof.csv').set_index('index')
df = df.join(oof, how='inner')

# Mean Pearson Coefficient
# one correlation per time_id, then their unweighted mean
score_per_time = df.groupby('time_id').apply(pearson_coef)
score_equal = np.mean(score_per_time)
score_per_time = pd.DataFrame(score_per_time, columns=['preds']).reset_index()

print(f'OOF score:')
print(f'equal weights: {score_equal}')

# `plt` is presumably brought in by the fastai star import
plt.plot(score_per_time['preds'])

del df
del oof
gc.collect()

# Get Predictions

In [None]:
# Only uses the last model (trained on most of the data)
# NOTE: torch.load unpickles complete model objects here; only load files
# produced by this notebook
model_f4 = torch.load('./model_fold_4.pkl').cuda().eval()
# eval mode also switches off the swap noise, which only acts in training mode
ae_f4 = torch.load('./model_AE_fold_4.pkl').cuda().eval()

import ubiquant
env = ubiquant.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission
for (test_df, sample_prediction_df) in iter_test:
    
    # raw features of this test batch as a float tensor on the GPU
    data = torch.tensor(test_df[cont].to_numpy(), dtype=torch.float).cuda()
    with torch.no_grad():        
        # `[]` fills the categorical-input slot of both models
        out = ae_f4([], data, encode=True)   # raw features -> bottleneck encoding
        preds_f4 = model_f4([], out)         # encoding -> predicted target
    
    sample_prediction_df['target'] = preds_f4.view(-1).cpu().numpy()  # make your predictions here
    env.predict(sample_prediction_df)   # register your predictions