In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import os
import sys
sys.path.append('../src')
from torch import nn
from omegaconf import OmegaConf
from torch.utils.data import DataLoader
import torch
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
import numpy as np
from tqdm import tqdm
pd.options.display.max_columns=100
from sklearn.metrics import mean_absolute_error

In [2]:
import modellib
from utils import fc
import datalib
from torch.utils.data import Dataset
from train_meta_regression import get_group_dict,map_dataset
import joblib
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GroupKFold
from litmodellib import Model
import pytorch_lightning as pl
import copy
import glob
import re

In [3]:
class VentilatorDataClassification(Dataset):
    """Map-style dataset that yields one breath per item.

    `group_dict` maps a breath id to a record (column name -> per-time-step
    values). `breath_df` is the positional list of breath ids to serve; it is
    indexed with `.iloc`, so a pandas Series is expected. Each item is a dict
    with the tensors ``num`` (float32), ``cat`` (long), ``u_out`` (long) and,
    when `target_column` is given, ``target`` (long).
    """

    def __init__(
        self, group_dict, breath_df,categorical_columns, numerical_columns, target_column=None
    ):
        self.breath_df = breath_df
        self.group_dict = group_dict
        self.numerical_columns = numerical_columns
        self.categorical_columns = categorical_columns
        self.target_column = target_column

    def __len__(self):
        # One item per entry of breath_df.
        n_breaths = len(self.breath_df)
        return n_breaths

    @staticmethod
    def _as_tensor(record, columns, dtype):
        """Stack `columns` of `record` and transpose, so columns end up on the last axis."""
        stacked = np.array([record[name] for name in columns])
        return torch.tensor(stacked.T, dtype=dtype)

    def __getitem__(self, idx):
        # Positional lookup of the breath id, then fetch its pre-grouped record.
        record = self.group_dict[self.breath_df.iloc[idx]]

        cat = self._as_tensor(record, self.categorical_columns, torch.long)
        num = self._as_tensor(record, self.numerical_columns, torch.float32)
        u_out = torch.tensor(np.array(record["u_out"]), dtype=torch.long)

        sample = {"num": num, "cat": cat}
        if self.target_column is not None:
            # The target is cast to long, like the categorical inputs.
            sample["target"] = torch.tensor(
                np.array(record[self.target_column]), dtype=torch.long
            )
        sample["u_out"] = u_out
        return sample

In [4]:
DATA_DIR = '/mnt/disks/extra_data/kaggle/ventilator_prediction/'
R_MAP = {5: 0, 50: 1, 20: 2}
C_MAP = {20: 0, 50: 1, 10: 2}

In [5]:
# `os` is already imported in the first cell; this repeat is redundant but harmless.
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches synchronously,
# so a device-side error is reported at the call that caused it.
# NOTE(review): this is expected to slow GPU execution — consider removing once debugging is done.
os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [6]:
# train = pd.read_csv(os.path.join(DATA_DIR,'train.csv'))
# test = pd.read_csv(os.path.join(DATA_DIR,'test.csv'))
# train.breath_id = train.breath_id.map(dict(zip(train.breath_id.unique().tolist(),range(train.breath_id.nunique()))))
# test.breath_id = test.breath_id.map(dict(zip(test.breath_id.unique().tolist(),range(test.breath_id.nunique()))))

In [7]:
# train = pd.read_csv('../data/train_oof.csv')

In [8]:
# mean_absolute_error(train[train.u_out==0].pressure,train[train.u_out==0].preds)

In [8]:
def preprocess(config, train_path='../data/train_oof.csv', test_path='../data/test_v61.csv'):
    """Load the train/test CSVs, truncate every breath and optionally standardise features.

    Parameters
    ----------
    config : OmegaConf-like object
        Reads ``seq_len``, ``normalization.is_norm`` and
        ``dataset.train.kwargs.numerical_columns``.
        NOTE: it is mutated — ``config.model.kwargs["output_dim"]`` is set to 1.
    train_path, test_path : str
        CSV locations. The defaults are the paths that used to be hard-coded.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Each frame keeps at most ``config.seq_len`` leading rows per ``breath_id``
        and gains ``R_1`` / ``C_1`` columns holding copies of ``R`` / ``C``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Duplicate R and C under new names so both the original and the copy are available.
    for frame in (train, test):
        frame["R_1"] = frame["R"].values
        frame["C_1"] = frame["C"].values

    # .copy(): groupby().head() returns a selection of the parent frame; copying it
    # keeps the column assignments below from raising SettingWithCopyWarning.
    train = train.groupby("breath_id").head(config.seq_len).copy()
    test = test.groupby("breath_id").head(config.seq_len).copy()
    config.model.kwargs["output_dim"] = 1

    if config.normalization.is_norm:
        numerical_columns = config.dataset.train.kwargs.numerical_columns
        print(numerical_columns)
        for col in numerical_columns:
            # Fresh scaler per column: statistics are fitted on train only and
            # reused for test, with no state shared between columns.
            scl = StandardScaler()
            train[col] = scl.fit_transform(train[[col]])
            test[col] = scl.transform(test[[col]])
    return train,test

def create_path(path):
    """Create directory `path`, including any missing parents, if nothing exists there yet.

    If `path` already exists (as a directory or anything else) this is a no-op.
    """
    if os.path.exists(path):
        return
    # makedirs handles nested paths; exist_ok covers the case where another
    # process creates the directory between the check above and this call.
    os.makedirs(path, exist_ok=True)
        
def predict(model,dl,data,device,is_test=False):
    """Run `model` over `dl` with a Lightning trainer and return `data` plus a ``preds`` column.

    `data` is deep-copied, so the caller's frame is not modified. `device` is
    passed to the trainer as the single GPU index. `is_test` is accepted but
    not used here.

    Predictions are read back from ``prediction.pt`` in the working directory
    — presumably written by the model's test hooks (not visible here; verify
    in litmodellib). They are assigned to the copy as a whole column, so one
    entry per row of `data` is expected.
    """
    result = copy.deepcopy(data)

    runner = pl.Trainer(gpus=[device])
    runner.test(model = model,test_dataloaders=dl)

    saved_outputs = torch.load('prediction.pt')
    result['preds'] = [entry['preds'] for entry in saved_outputs]
    return result

def get_model_path(model_dir, metric_name, mode=None):
    """Return the checkpoint path whose file name carries the best `metric_name` value.

    Parameters
    ----------
    model_dir : str
        Glob pattern for candidate checkpoints (e.g. ``'.../fold_0/*.ckpt'``).
    metric_name : str
        Metric embedded in the file name as ``<metric_name>=<number>``.
    mode : {'min', 'max', None}
        Whether lower or higher is better. With ``None`` (default) it is inferred
        from the name: metrics containing ``loss``, ``mae``, ``mse`` or ``error``
        (case-insensitive) are minimised, everything else is maximised.
        The previous implementation always took the maximum, which selects the
        *worst* checkpoint for error metrics such as ``val_MAE``.

    Raises
    ------
    ValueError
        If `mode` is invalid or no matching file carries the metric.
    """
    if mode is None:
        lowered = metric_name.lower()
        lower_is_better = any(tag in lowered for tag in ('loss', 'mae', 'mse', 'error'))
        mode = 'min' if lower_is_better else 'max'
    if mode not in ('min', 'max'):
        raise ValueError("mode must be 'min', 'max' or None, got {!r}".format(mode))

    # Raw string plus re.escape: the metric name is matched literally; the number
    # may be an integer or a decimal (the old pattern needed at least two digits).
    pattern = re.compile(r'{}=(\d+(?:\.\d+)?)'.format(re.escape(metric_name)))

    scored = []
    for path in glob.glob(model_dir):
        match = pattern.search(path)
        if match is not None:  # skip files that do not carry the metric
            scored.append((float(match.group(1)), path))
    if not scored:
        raise ValueError(
            "no checkpoint matching {!r} contains '{}=<value>'".format(model_dir, metric_name)
        )

    pick = min if mode == 'min' else max
    return pick(scored, key=lambda pair: pair[0])[1]

In [9]:
config = OmegaConf.load('../experiments/meta_LSTM/config.yaml')

In [10]:
# config.topk=1
# config.topk

In [11]:
%%time
train,test = preprocess(config)

['R_1', 'C_1', 'time_step', 'u_in', 'preds']
CPU times: user 7.44 s, sys: 1.47 s, total: 8.91 s
Wall time: 8.91 s


In [12]:
train.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,preds,R_1,C_1
0,1,1,20,50,-1.685675,-0.614677,0,5.837492,-1.11315,-0.359072,1.394522
1,2,1,20,50,-1.597761,0.397359,0,5.907794,-1.11315,-0.359072,1.394522
2,3,1,20,50,-1.509299,0.625553,0,7.876254,-0.883184,-0.359072,1.394522
3,4,1,20,50,-1.420404,0.642119,0,11.742872,-0.461581,-0.359072,1.394522
4,5,1,20,50,-1.331024,0.782979,0,12.234987,-0.407922,-0.359072,1.394522


In [13]:
test.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,preds,R_1,C_1
0,1,0,5,20,-1.685675,-0.619286,0,-1.067157,-1.124554,-0.354513
1,2,0,5,20,-1.602329,-0.203678,0,-1.105485,-1.124554,-0.354513
2,3,0,5,20,-1.518931,0.191001,0,-0.967505,-1.124554,-0.354513
3,4,0,5,20,-1.435533,0.554839,0,-0.913847,-1.124554,-0.354513
4,5,0,5,20,-1.352216,0.836352,0,-0.752871,-1.124554,-0.354513


In [14]:
train = map_dataset(train)
test = map_dataset(test)

In [15]:
train.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,preds,R_1,C_1
0,1,0,2,1,-1.685675,-0.614677,0,5.837492,-1.11315,-0.359072,1.394522
1,2,0,2,1,-1.597761,0.397359,0,5.907794,-1.11315,-0.359072,1.394522
2,3,0,2,1,-1.509299,0.625553,0,7.876254,-0.883184,-0.359072,1.394522
3,4,0,2,1,-1.420404,0.642119,0,11.742872,-0.461581,-0.359072,1.394522
4,5,0,2,1,-1.331024,0.782979,0,12.234987,-0.407922,-0.359072,1.394522


In [16]:
test.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,preds,R_1,C_1
0,1,0,0,0,-1.685675,-0.619286,0,-1.067157,-1.124554,-0.354513
1,2,0,0,0,-1.602329,-0.203678,0,-1.105485,-1.124554,-0.354513
2,3,0,0,0,-1.518931,0.191001,0,-0.967505,-1.124554,-0.354513
3,4,0,0,0,-1.435533,0.554839,0,-0.913847,-1.124554,-0.354513
4,5,0,0,0,-1.352216,0.836352,0,-0.752871,-1.124554,-0.354513


In [17]:
%%time
train_grp_dict = get_group_dict(train)

100%|██████████| 75450/75450 [01:18<00:00, 961.26it/s] 

CPU times: user 1min 17s, sys: 2.63 s, total: 1min 20s
Wall time: 1min 18s





In [18]:
%%time
test_grp_dict = get_group_dict(test)

100%|██████████| 50300/50300 [00:47<00:00, 1057.95it/s]

CPU times: user 47.3 s, sys: 1.49 s, total: 48.8 s
Wall time: 47.6 s





In [19]:
len(train_grp_dict),len(test_grp_dict)

(75450, 50300)

In [20]:
# Group-wise CV: every row of a breath lands in the same fold. Each entry of
# `folds` is a (train_indices, validation_indices) pair of positional row indices.
splitter = GroupKFold(n_splits=15)
folds = list(splitter.split(train, groups=train["breath_id"]))

In [21]:
len(folds)

15

In [22]:
# Resolve one checkpoint path per fold directory of this experiment, picked by the
# val_MAE embedded in the file name. range(1) restricts this to fold 0 only.
models = [get_model_path('../experiments/{}/fold_{}/*.ckpt'.format(config.experiment_name,x),'val_MAE') for x in range(1)]
models

['../experiments/meta_LSTM/fold_0/model-epoch=99-val_MAE=0.1087-val_loss=0.1093.ckpt']

In [23]:
oof_preds=[]
test_preds=[]

In [24]:
# Test-set loader. No target_column is passed, so items carry only num / cat / u_out.
# Despite its name, `test_df` is a torch Dataset, not a DataFrame.
test_breath_ids = test[['breath_id']].drop_duplicates()['breath_id']
test_df = VentilatorDataClassification(
    group_dict=test_grp_dict,
    breath_df=test_breath_ids,
    categorical_columns=config.dataset.train.kwargs.categorical_columns,
    numerical_columns=config.dataset.train.kwargs.numerical_columns,
)
# shuffle=False: predictions are later attached to `test` rows by position.
test_dl = DataLoader(
    dataset=test_df,
    batch_size=128,
    num_workers=8,
    pin_memory=True,
    shuffle=False,
)

In [25]:
len(test_dl)

393

In [26]:
# Score every resolved checkpoint on its own validation fold and on the test set.
# Iterating over `models` keeps the fold index and the checkpoint list in sync.
for i, ckpt_path in enumerate(models):
    # folds[i][1] holds the positional row indices of fold i's validation split.
    val = train.iloc[folds[i][1]]
    val_df = VentilatorDataClassification(
        group_dict=train_grp_dict,
        breath_df=val[['breath_id']].drop_duplicates()['breath_id'],
        categorical_columns=config.dataset.train.kwargs.categorical_columns,
        numerical_columns=config.dataset.train.kwargs.numerical_columns,
        target_column=config.dataset.train.kwargs.target_column,
    )
    val_dl = DataLoader(dataset=val_df, batch_size=256, num_workers=8, pin_memory=True, shuffle=False)

    # map_location='cpu': deserialise the weights on the CPU regardless of the GPU
    # the checkpoint was saved from; the Lightning trainer moves the model afterwards.
    wt_dict = torch.load(ckpt_path, map_location='cpu')
    lit_model = Model(config,topk = 3)
    lit_model.load_state_dict(state_dict=wt_dict['state_dict'])

    # NOTE(review): validation runs on GPU index 1 and test on GPU index 0 — confirm this is intentional.
    preds = predict(lit_model,val_dl,val[['pressure','id','breath_id','R','C','u_out']],1)
    oot_preds = predict(lit_model,test_dl,test[['id','breath_id','R','C','u_out']],0,is_test=True)

    # MAE is reported on rows with u_out == 0 only.
    scored = preds[preds.u_out==0]
    print('MAE:  ',mean_absolute_error(scored['pressure'],scored['preds']))
    oof_preds.append(preds)
    test_preds.append(oot_preds)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_deprecation(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


{'init_type': 'xavier'}
tensor(-60.9272)
tensor(97.7812)


Testing: 0it [00:00, ?it/s]

  rank_zero_deprecation(


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------
MAE:   0.10871265711626527


In [27]:
oof_preds = pd.concat(oof_preds,axis=0)

In [28]:
mean_absolute_error(oof_preds[oof_preds.u_out==0]['pressure'],oof_preds[oof_preds.u_out==0]['preds'])

0.10871265711626527

In [29]:
oof_preds.head()

Unnamed: 0,pressure,id,breath_id,R,C,u_out,preds
1200,4.712657,1201,15,1,2,0,4.732855
1201,7.665347,1202,15,1,2,0,7.355991
1202,13.219217,1203,15,1,2,0,13.249399
1203,22.780309,1204,15,1,2,0,23.013123
1204,24.819071,1205,15,1,2,0,25.049059


In [30]:
# Snap predictions onto the grid of pressure values observed in train.
# Assumes the observed values are evenly spaced, since a single step is used.
# A dedicated name is used so the validation DataFrame `val` from the evaluation
# loop is no longer overwritten by this array.
pressure_levels = train['pressure'].sort_values().unique()
p_min = pressure_levels[0]
p_max = pressure_levels[-1]
# Positive spacing between adjacent levels (the sign was previously inverted;
# the snapped values are unchanged because rounding is symmetric).
step = pressure_levels[1] - pressure_levels[0]
oof_preds['processed_pred'] = np.clip(np.round((oof_preds.preds - p_min)/step) * step + p_min, p_min, p_max)


In [31]:
oof_preds.head()

Unnamed: 0,pressure,id,breath_id,R,C,u_out,preds,processed_pred
1200,4.712657,1201,15,1,2,0,4.732855,4.712657
1201,7.665347,1202,15,1,2,0,7.355991,7.384139
1202,13.219217,1203,15,1,2,0,13.249399,13.219217
1203,22.780309,1204,15,1,2,0,23.013123,22.991215
1204,24.819071,1205,15,1,2,0,25.049059,25.029977


In [32]:
mean_absolute_error(oof_preds[oof_preds.u_out==0]['pressure'],oof_preds[oof_preds.u_out==0]['processed_pred'])

0.10724795910338807

In [33]:
# Combine the per-fold test predictions: row-wise median across the `preds` columns.
# NOTE: this rebinds `oot_preds`, which the evaluation loop used for a single fold's frame.
oot_preds = pd.concat([x['preds'] for x in test_preds],axis=1).median(axis=1)
# Re-attach the row ids, taken from the first fold's frame (concat aligns on the index).
oot_preds = pd.concat([test_preds[0]['id'],oot_preds],axis=1)
oot_preds.columns = ['id','pressure']

In [34]:
oot_preds.head()

Unnamed: 0,id,pressure
0,1,6.270146
1,2,5.92106
2,3,7.170955
3,4,7.660649
4,5,9.136593


In [35]:
# Same grid snapping as for the OOF predictions, reusing p_min / p_max / step
# from the earlier cell; overwrites the raw test predictions in place.
oot_preds['pressure'] = np.clip(np.round((oot_preds.pressure - p_min)/step) * step + p_min, p_min, p_max)

In [36]:
# Left-join predictions onto the ids of the sample submission.
sample_sub = pd.read_csv(DATA_DIR+'sample_submission.csv')
sub = sample_sub[['id']].merge(oot_preds,on='id',how='left')
# Ids without a prediction get 0.
# NOTE(review): presumably these are rows dropped by the seq_len truncation — confirm.
sub['pressure'] = sub['pressure'].fillna(0)

In [37]:
sub.to_csv('regression_sub_correct.csv',index=False)

In [35]:
sub.head()

Unnamed: 0,id,pressure
0,1,6.259305
1,2,5.907794
2,3,7.102931
3,4,7.595046
4,5,9.176844


In [28]:
sub.to_csv('../subs/{}.csv'.format(config.experiment_name),index=False)

In [29]:
oof_preds.reset_index(drop=True,inplace=True)

In [30]:
oof_preds.head()

Unnamed: 0,pressure,id,breath_id,R,C,u_out,preds
0,6.259305,241,3,1,1,0,6.259305
1,5.76719,242,3,1,1,0,5.696887
2,6.540513,243,3,1,1,0,6.540513
3,8.649578,244,3,1,1,0,8.860484
4,10.758642,245,3,1,1,0,11.03985


In [31]:
# Keep only rows with u_out == 0 (the rows the MAE above is computed on),
# then renumber the index from 0.
oof_preds = oof_preds[oof_preds.u_out==0]
oof_preds.reset_index(drop=True,inplace=True)

In [32]:
# Persist the filtered OOF predictions under the experiment name.
# NOTE(review): the reset_index above is presumably required by to_feather
# (default index expected) — confirm for the installed pandas version.
oof_preds.to_feather('../oofs/{}.feather'.format(config.experiment_name))

In [33]:
oof_preds = oof_preds[oof_preds.u_out==0]

In [34]:
oof_preds.head()

Unnamed: 0,pressure,id,breath_id,R,C,u_out,preds
0,6.259305,241,3,1,1,0,6.259305
1,5.76719,242,3,1,1,0,5.696887
2,6.540513,243,3,1,1,0,6.540513
3,8.649578,244,3,1,1,0,8.860484
4,10.758642,245,3,1,1,0,11.03985


In [38]:
(pd.read_csv('../subs/baseline_LSTM_Classification.csv')['pressure']+pd.read_csv('../subs/v2-RNN-classification-top3.csv')['pressure']+
pd.read_csv('../subs/v0-LSTM-classification-top3-smooth-loss.csv')['pressure'])/3

0          6.282739
1          5.907794
2          7.126365
3          7.618480
4          9.141692
             ...   
4023995    0.000000
4023996    0.000000
4023997    0.000000
4023998    0.000000
4023999    0.000000
Name: pressure, Length: 4024000, dtype: float64

In [50]:
# Collect the `pressure` column of each earlier submission side by side (one column per file).
ensemble_files = [
    '../subs/baseline_LSTM_Classification.csv',
    '../subs/v2-RNN-classification-top3.csv',
    '../subs/v0-LSTM-classification-top3-smooth-loss.csv',
]
m = pd.concat([pd.read_csv(path)['pressure'] for path in ensemble_files], axis=1)

In [51]:
# m

In [52]:
sub['pressure'] = m.mean(axis=1)

In [53]:
sub.to_csv("avg_lstm_0.17_0.167_0.162.csv",index=False)

In [35]:
(pd.read_csv('../subs/baseline_LSTM_Classification.csv')['pressure']+pd.read_csv('../subs/v2-RNN-classification-top3.csv')['pressure'])/2

0          6.329607
1          5.907794
2          7.173233
3          7.665348
4          9.141692
             ...   
4023995    0.000000
4023996    0.000000
4023997    0.000000
4023998    0.000000
4023999    0.000000
Name: pressure, Length: 4024000, dtype: float64

In [57]:
config = OmegaConf.load('../configs/cnn-lstm-classification.yaml')

In [58]:
config.model

{'class': 'LSTMCNNClassfier', 'kwargs': {'embedding_layer': {'u_out': {'num_embeddings': 2, 'embedding_dim': 32}, 'R': {'num_embeddings': 3, 'embedding_dim': 32}, 'C': {'num_embeddings': 3, 'embedding_dim': 32}}, 'input_dim': 148, 'rnn_layer': {'class': 'LSTM', 'kwargs': {'input_size': 148, 'hidden_size': 512, 'num_layers': 2, 'batch_first': True, 'bidirectional': True, 'dropout': 0.2}}, 'rnn_init': {'class': 'InitRNNWeights', 'kwargs': {'init_type': 'yakama'}}, 'cnn_layer': {'input_layer': {'class': 'Conv1DBnRelu', 'kwargs': {'in_channels': 1024, 'out_channels': 512, 'kernel_size': 3, 'padding': 1}}, 'block1': {'class': 'Conv1DBasicBlock', 'kwargs': {'in_channels': 512, 'out_channels': 512, 'kernel_size': 3, 'padding': 1, 'is_bn': False}}, 'block2': {'class': 'Conv1DBasicBlock', 'kwargs': {'in_channels': 512, 'out_channels': 512, 'kernel_size': 3, 'padding': 1, 'is_bn': False}}, 'block3': {'class': 'Conv1DBasicBlock', 'kwargs': {'in_channels': 512, 'out_channels': 512, 'kernel_size': 

In [59]:
from modellib import LSTMCNNClassfier

In [60]:
config.model.kwargs['output_dim'] = 100

In [61]:
mod = LSTMCNNClassfier(config.model.kwargs)

{'init_type': 'yakama'}
tensor(-8.9367)
tensor(22.9660)


In [62]:
# mod(m)

In [53]:
# Pull a single batch from the validation loader for a manual forward pass.
# NOTE: rebinds `m`, which earlier held the blended-submission DataFrame, and relies
# on `val_dl` left over from the evaluation loop (hidden state on a fresh kernel).
m = next(iter(val_dl))

In [63]:
preds = mod(m)