In [1]:
import os
import re
import pandas as pd
import numpy as np 

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import ExponentialLR,CosineAnnealingWarmRestarts

from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from tokenizers import BertWordPieceTokenizer
from transformers import BertForSequenceClassification, BertForMultipleChoice,  BertTokenizer, AdamW

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tqdm import tqdm

In [2]:
class LogDataModule(LightningDataModule):
    def __init__(self, hparams):
        super().__init__()
        self.hparams = hparams
    
    def read_data(self, path):
        if path.endswith('csv'):
            return pd.read_csv(path)
        elif path.endswith('pkl'):
            return pd.read_pickle(path)
        
    def split_data(self, df):
        trn_df, val_df = train_test_split(df, stratify=df['level'], test_size=0.3, random_state=42, shuffle=True)
        return trn_df, val_df
    
    def make_dataset(self, df, stage):   
        if stage=='Train' or stage=='Valid':
            return TensorDataset(torch.tensor(df['input_ids'].to_list(), dtype=torch.long),
                                 torch.tensor(df['attention_mask'].to_list(), dtype=torch.long),
                                 torch.tensor(df['postion_ids'].to_list(), dtype=torch.long),
                                 torch.tensor(df['level'].to_list(), dtype=torch.long))
        else:
            return TensorDataset(torch.tensor(df['input_ids'].to_list(), dtype=torch.long),
                                 torch.tensor(df['attention_mask'].to_list(), dtype=torch.long),
                                 torch.tensor(df['postion_ids'].to_list(), dtype=torch.long))
        
    def setup(self):
        train_df = self.read_data(self.hparams['train_data_path'])
        test_df = self.read_data(self.hparams['test_data_path'])
        
        train_df['postion_ids'] = train_df['input_ids'].map(lambda x: [i for i in range(len(x))])
        test_df['postion_ids'] = test_df['input_ids'].map(lambda x: [i for i in range(len(x))])
        
        trn_df, val_df = self.split_data(train_df)
        
        self.train_ds = self.make_dataset(trn_df, 'Train')
        self.val_ds = self.make_dataset(val_df, 'Valid')
        self.test_ds = self.make_dataset(test_df, 'Test')
        
        
    def dataloader(self, dataset):
        return DataLoader(
            dataset,
            batch_size = self.hparams['batch_size'],
            shuffle = False,
            num_workers=self.hparams['num_workers']
        )
        
    def train_dataloader(self):  
        return self.dataloader(self.train_ds)
    
    def val_dataloader(self):
        return self.dataloader(self.val_ds)
    
    def test_dataloader(self):
        return  DataLoader(self.test_ds,
                           batch_size = self.hparams['test_batch_size'],
                           num_workers=self.hparams['num_workers'],
                           pin_memory=True)

In [3]:
dm_args = {
    'train_data_path':'./dataset/clean_train.pkl',
    'test_data_path':'./dataset/clean_drop_test.pkl',
    'batch_size': 16,
    'test_batch_size': 128,
    'num_workers': 4,
}

log_dm = LogDataModule(dm_args)

In [4]:
log_dm.setup()

In [5]:
class Model(LightningModule):
    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters() 
        self.bert = BertForSequenceClassification.from_pretrained(self.hparams.pretrained_model, 
                                                                  num_labels = 7)
    def forward(self, **kwargs):
        return self.bert(**kwargs)

    def step(self, batch, batch_idx):
        output = self.bert(input_ids = batch[0], 
                           attention_mask = batch[1], 
                           position_ids = batch[2],
                           labels = batch[-1])

        # Transformers 4.0.0+
        loss = output.loss
        logits = output.logits

        preds = logits.argmax(dim=-1)

        y_true = list(batch[-1].cpu().numpy())
        y_pred = list(preds.cpu().numpy())

        return {
            'loss': loss,
            'y_true': y_true,
            'y_pred': y_pred,
        }

    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx)

    def validation_step(self, batch, batch_idx):
        return self.step(batch, batch_idx)

    def epoch_end(self, outputs, state='train'):
        loss = 0.0
        y_true = []
        y_pred = []
        
        for i in outputs:
            loss += i['loss'].item()
            y_true += i['y_true']
            y_pred += i['y_pred']
            
        loss = loss / len(outputs)

        self.log(state+'_loss', float(loss), 
                 on_epoch=True, prog_bar=True)
        self.log(state+'_acc', accuracy_score(y_true, y_pred), 
                 on_epoch=True, prog_bar=True, logger=True)
        self.log(state+'_precision', precision_score(y_true, y_pred, average='macro', labels=np.unique(y_true)),
                 on_epoch=True, prog_bar=True, logger=True)
        self.log(state+'_recall', recall_score(y_true, y_pred, average='macro', labels=np.unique(y_true)),
                 on_epoch=True, prog_bar=True, logger=True)
        self.log(state+'_f1', f1_score(y_true, y_pred, average='macro', labels=np.unique(y_true)),
                 on_epoch=True, prog_bar=True, logger=True)
        return {'loss': loss}

    def train_epoch_end(self, outputs):
        return self.epoch_end(outputs, state='train')

    def validation_epoch_end(self, outputs):
        return self.epoch_end(outputs, state='val')

    def configure_optimizers(self):
        if self.hparams.optimizer == 'AdamW':
            optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        elif self.hparams.optimizer == 'AdamP':
            from adamp import AdamP
            optimizer = AdamP(self.parameters(), lr=self.hparams.lr)
        else:
            raise NotImplementedError('Only AdamW and AdamP is Supported!')
        if self.hparams.lr_scheduler == 'cos':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)
        elif self.hparams.lr_scheduler == 'exp':
            scheduler = ExponentialLR(optimizer, gamma=0.5)
        else:
            raise NotImplementedError(
                'Only cos and exp lr scheduler is Supported!')
        return {
            'optimizer': optimizer,
            'scheduler': scheduler,
        }

In [7]:
args = {
    'random_seed': 42,  # Random Seed
    'pretrained_model': 'bert-base-uncased',  # Transformers PLM name
    'lr': 5e-5,  # Starting Learning Rate
    'epochs': 30,  # Max Epochs
    'optimizer': 'AdamW',  # AdamW vs AdamP
    'lr_scheduler': 'exp',  # ExponentialLR vs CosineAnnealingWarmRestarts
    'fp16': True,  # Enable train on FP16
}

model = Model(**args)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
checkpoint_callback = ModelCheckpoint(
    filename='{epoch}-{val_f1:.4f}',
    monitor='val_f1',
    save_top_k=1,
    mode='max',
)

early_stop_callback = EarlyStopping(monitor='val_f1', 
                                    patience=3, 
                                    verbose=True, 
                                    mode='max')

In [9]:
seed_everything(42)

trainer = Trainer(
        callbacks=[checkpoint_callback, early_stop_callback],
        max_epochs=args['epochs'],
        fast_dev_run=False,
        deterministic=torch.cuda.is_available(),
        gpus=-1 if torch.cuda.is_available() else None,
        precision= 16 if args['fp16'] else 32
    )

Global seed set to 42
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
Using native 16bit precision.


In [10]:
trainer.fit(model, log_dm)


  | Name | Type                          | Params
-------------------------------------------------------
0 | bert | BertForSequenceClassification | 109 M 
-------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.950   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Validating: 0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Validating: 0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


1

## INFERENCE

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
test_loader = log_dm.test_dataloader()

In [10]:
# best_model_path = trainer.checkpoint_callback.best_model_path
best_model_path = './lightning_logs/version_5/checkpoints/epoch=3-val_f1=0.9486.ckpt'
best_model = model.load_from_checkpoint(best_model_path).to(device)
best_model.freeze()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
seed_everything(42)
best_model.eval()

result = np.zeros(shape=(1, 7))

with torch.no_grad():
    for step, batch in enumerate(tqdm(test_loader)):
        output = best_model(input_ids = batch[0].to(device),
                            attention_mask = batch[1].to(device)
                           )
        predict_proba = torch.nn.functional.softmax(output.logits, dim=1)
        result = np.concatenate((result, predict_proba.detach().cpu().numpy()), axis=0)

Global seed set to 42
100%|██████████| 953/953 [15:08<00:00,  1.05it/s]


In [16]:
final = np.argmax(result[1:], axis=1)
# final[np.where(np.max(result[1:], axis=1) < 0.9)] = 7

In [17]:
from collections import Counter

Counter(final)

Counter({1: 121970})

In [19]:
len(test_drop_df)

121970

In [24]:
output.logits

tensor([[ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.0341,  0.0111, -4.8290],
        [ 1.2143,  3.7178, -6.2712, -0.7919, -6.034

In [20]:
predict_proba

tensor([[7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.1974e-02,
         1.7372e-04],
        [7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.1974e-02,
         1.7372e-04],
        [7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.1974e-02,
         1.7372e-04],
        [7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.1974e-02,
         1.7372e-04],
        [7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.1974e-02,
         1.7372e-04],
        [7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.1974e-02,
         1.7372e-04],
        [7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.1974e-02,
         1.7372e-04],
        [7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.1974e-02,
         1.7372e-04],
        [7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.1974e-02,
         1.7372e-04],
        [7.3190e-02, 8.9473e-01, 4.1069e-05, 9.8442e-03, 5.2059e-05, 2.19

In [18]:
test_df = pd.read_pickle('./dataset/clean_test.pkl')
test_drop_df = pd.read_pickle('./dataset/clean_drop_test.pkl')

In [18]:
test_drop_df['level'] = np.argmax(result[1:], axis=1)

In [20]:
test_log_df['level'].value_counts()

1    133639
0     11415
Name: level, dtype: int64

In [33]:
test_df = pd.merge(test_df, test_log_df, how='left', on='clean_log')

In [36]:
pred = test_df[['id', 'level']]
pred.to_csv('./bert_baseline.csv', index=False)