In [1]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn

from pl_trainers import ModelTrainer
from config import Config
import gc

In [2]:
# from torchviz import make_dot
from torchsummary import summary
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [3]:
# Sanity check: confirm PyTorch can see a CUDA device before training starts
# (the Trainer created in KFoldTraining.train is configured with gpus=1).
torch.cuda.is_available()

True

#### Training #####

In [4]:
class KFoldTraining():
    """Train one model per cross-validation fold from numbered files on disk.

    The processed data is read as file pairs ``data_<i>.npy`` /
    ``targets_<i>.pqt`` with ``i`` in ``1..n_files`` from ``cfg.prc_data_dir``.
    Fold ``f`` (0-based) validates on the ``files_per_fold`` consecutive files
    starting at ``f * files_per_fold + 1`` and trains on all remaining files.
    With the defaults this is the layout fold 0 -> files [1, 2],
    fold 1 -> files [3, 4], ..., fold 4 -> files [9, 10].

    Args:
        k: Number of folds to run (folds ``0 .. k-1``).
        n_files: Total number of numbered file pairs available.
        files_per_fold: Number of files held out for validation in each fold.

    Raises:
        ValueError: If the requested folds do not fit into ``n_files`` or a
            fold would be left without any training file.
    """

    def __init__(self, k=5, n_files=10, files_per_fold=2):
        # Fail here rather than with a missing-file error after several folds
        # have already been trained.
        if k < 1 or files_per_fold < 1:
            raise ValueError('k and files_per_fold must both be >= 1')
        if k * files_per_fold > n_files:
            raise ValueError(
                f'{k} folds of {files_per_fold} validation files need '
                f'{k * files_per_fold} files, but only {n_files} are available')
        if files_per_fold >= n_files:
            raise ValueError('files_per_fold must leave at least one training file')
        self.k = k
        self.n_files = n_files
        self.files_per_fold = files_per_fold

    @staticmethod
    def _load_files(cfg, file_ids):
        """Load and concatenate the features and targets of the given file ids.

        Returns:
            Tuple ``(X, y)``: the ``.npy`` arrays concatenated along axis 0 and
            the ``target`` column of the concatenated parquet frames.
        """
        import os  # local import so the class does not rely on a new file-level import

        features, targets = [], []
        for file_id in file_ids:
            # os.path.join gives the same path as plain concatenation when
            # prc_data_dir ends with a separator, and a valid one when it does not.
            features.append(np.load(os.path.join(cfg.prc_data_dir, f'data_{file_id}.npy')))
            targets.append(pd.read_parquet(os.path.join(cfg.prc_data_dir, f'targets_{file_id}.pqt')))
        return np.concatenate(features, axis=0), pd.concat(targets).target.values

    def get_data(self, cfg, fold):
        """Read the training and validation arrays for one fold.

        Args:
            cfg: Object exposing ``prc_data_dir``, the directory of the data files.
            fold: 0-based fold index.

        Returns:
            Tuple ``(X_train, y_train, X_valid, y_valid)``.

        Raises:
            ValueError: If ``fold`` addresses files outside ``1..n_files``.
        """
        first_valid = fold * self.files_per_fold + 1
        last_valid = first_valid + self.files_per_fold - 1
        if fold < 0 or last_valid > self.n_files:
            raise ValueError(
                f'fold {fold} is out of range for {self.n_files} files with '
                f'{self.files_per_fold} validation files per fold')

        # INDICES OF TRAIN AND VALID FOLDS
        valid_idx = list(range(first_valid, last_valid + 1))
        train_idx = [i for i in range(1, self.n_files + 1) if i not in valid_idx]

        print('#'*25)
        print(f'### Fold {fold+1} with valid files', valid_idx)

        # READ TRAIN DATA FROM DISK
        X_train, y_train = self._load_files(cfg, train_idx)
        print('### Training data shapes', X_train.shape, y_train.shape)

        # READ VALID DATA FROM DISK
        X_valid, y_valid = self._load_files(cfg, valid_idx)
        print('### Validation data shapes', X_valid.shape, y_valid.shape)
        print('#'*25)

        return X_train, y_train, X_valid, y_valid

    def train(self, num_workers=15):
        """Fit a fresh model on every fold, checkpointing the best epoch of each.

        Args:
            num_workers: Forwarded to ``ModelTrainer`` as ``num_workers``.
        """
        print('='*30, '\n')
        cfg = Config()
        # Metric watched by both checkpointing and early stopping (higher is better).
        monitor = "valid/epoch/amex_metric_mod"
        for fold in range(self.k):
            X_train, y_train, X_valid, y_valid = self.get_data(cfg, fold)
            model = ModelTrainer(cfg, X_train, y_train, X_valid, y_valid, num_workers=num_workers)

            # Checkpoint names keep the 0-based fold index so existing
            # checkpoint files and any code that loads them stay valid.
            checkpoint_callback = ModelCheckpoint(monitor=monitor,
                                                  dirpath="checkpoints",
                                                  filename=f"{cfg.model_name}_{fold}_{cfg.version}",
                                                  save_top_k=1,
                                                  mode="max")

            es_callback = EarlyStopping(monitor=monitor, mode="max", min_delta=0.0005, patience=cfg.es_patience)

            trainer = Trainer(max_epochs=cfg.EPOCHS,
                              fast_dev_run=False,
                              callbacks=[checkpoint_callback, es_callback],
                              gpus=1)

            trainer.fit(model)

            # Report the 1-based fold number, matching the header printed by get_data.
            print('='*30, f'Fold {fold+1} Training Ended')

            # The trainer and its callbacks also reference the model, so drop
            # them too; otherwise the previous fold's model stays alive while
            # the next fold is being loaded and trained.
            del trainer, checkpoint_callback, es_callback
            del model, X_train, y_train, X_valid, y_valid
            gc.collect()
            if torch.cuda.is_available():
                # Hand cached, now-unused GPU blocks back before the next fold.
                torch.cuda.empty_cache()
            
        
        

In [5]:
# Build the cross-validation driver for five folds; nothing is read from disk
# or trained until .train() is called.
five_fold_training = KFoldTraining(k=5)

In [6]:
# Run training for every fold in sequence.
# NOTE(review): the recorded output of this cell ends with
# "RuntimeError: CUDA error: unknown error" during fold 5, so the saved run
# did not complete; re-run (optionally with CUDA_LAUNCH_BLOCKING=1) to confirm.
five_fold_training.train()


#########################
### Fold 1 with valid files [1, 2]
### Training data shapes (367131, 13, 188) (367131,)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


### Validation data shapes (91782, 13, 188) (91782,)
#########################



  | Name      | Type                 | Params
---------------------------------------------------
0 | model     | AttentionModelConv1d | 138 K 
1 | criterion | BCELoss              | 0     
---------------------------------------------------
138 K     Trainable params
0         Non-trainable params
138 K     Total params
0.554     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Eval metric value: 36.392% Validation Set


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Eval metric value: 77.410% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.671% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.825% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.830% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.929% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.200% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.117% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.963% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.566% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.437% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 76.995% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.315% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 76.391% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 76.287% Validation Set
#########################
### Fold 2 with valid files [3, 4]
### Training data shapes (367131, 13, 188) (367131,)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


### Validation data shapes (91782, 13, 188) (91782,)
#########################



  | Name      | Type                 | Params
---------------------------------------------------
0 | model     | AttentionModelConv1d | 138 K 
1 | criterion | BCELoss              | 0     
---------------------------------------------------
138 K     Trainable params
0         Non-trainable params
138 K     Total params
0.554     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Eval metric value: -3.521% Validation Set


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Eval metric value: 76.898% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.341% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.463% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.708% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.920% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.894% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.858% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.567% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.278% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.246% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 76.909% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 76.559% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 76.410% Validation Set
#########################
### Fold 3 with valid files [5, 6]
### Training data shapes (367131, 13, 188) (367131,)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


### Validation data shapes (91782, 13, 188) (91782,)
#########################



  | Name      | Type                 | Params
---------------------------------------------------
0 | model     | AttentionModelConv1d | 138 K 
1 | criterion | BCELoss              | 0     
---------------------------------------------------
138 K     Trainable params
0         Non-trainable params
138 K     Total params
0.554     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Eval metric value: -1.430% Validation Set


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Eval metric value: 77.590% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.050% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.118% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.202% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.236% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.260% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.902% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.929% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.753% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.011% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.632% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.462% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.228% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.191% Validation Set
#########################
### Fold 4 with valid files [7, 8]
### Training data shapes (367131, 13, 188) (367131,)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


### Validation data shapes (91782, 13, 188) (91782,)
#########################



  | Name      | Type                 | Params
---------------------------------------------------
0 | model     | AttentionModelConv1d | 138 K 
1 | criterion | BCELoss              | 0     
---------------------------------------------------
138 K     Trainable params
0         Non-trainable params
138 K     Total params
0.554     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Eval metric value: -14.379% Validation Set


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Eval metric value: 77.582% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.833% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.126% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.061% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.281% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.817% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.252% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.891% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.024% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.428% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.436% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.326% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.001% Validation Set
#########################
### Fold 5 with valid files [9, 10]
### Training data shapes (367128, 13, 188) (367128,)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


### Validation data shapes (91785, 13, 188) (91785,)
#########################



  | Name      | Type                 | Params
---------------------------------------------------
0 | model     | AttentionModelConv1d | 138 K 
1 | criterion | BCELoss              | 0     
---------------------------------------------------
138 K     Trainable params
0         Non-trainable params
138 K     Total params
0.554     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Eval metric value: 4.486% Validation Set


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Eval metric value: 77.588% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 77.943% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.271% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.506% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.500% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.385% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.114% Validation Set


Validating: 0it [00:00, ?it/s]

Eval metric value: 78.015% Validation Set


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.