# BoolQ

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset files referenced under /content/drive/MyDrive/... can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project folder on the mounted Drive; the dataset paths in `config` point
# into this same directory.
cur_path = "/content/drive/MyDrive/Colab Notebooks/BoolQ_"

### Requirements

In [None]:
!pip install transformers
!pip install wandb
!pip install pytorch-lightning
!pip install tqdm

### Import packages

In [None]:
import os
import sys
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

import numpy as np

import wandb
import re

from tqdm import tqdm

### Configuration

In [None]:
class config():
  """Central configuration for the BoolQ experiments.

  Holds the dataset locations, the Hugging Face checkpoint name for each
  supported backbone, and the hyper-parameters read by the dataset and
  training code. Attributes are read directly from the class (the class
  object itself is passed around, it is not instantiated).
  """
  # paths
  data_dir = "/content/drive/MyDrive/Colab Notebooks/BoolQ_"
  train_path = data_dir + "/SKT_BoolQ_Train.tsv"
  dev_path = data_dir + "/SKT_BoolQ_Dev.tsv"
  test_path = data_dir + "/SKT_BoolQ_Test.tsv"
  train_dev_crop = False

  # model: short name -> Hugging Face checkpoint identifier
  model_list = {
      'roberta': "xlm-roberta-large",
      'bert': "monologg/kobert",
      'electra': 'monologg/koelectra-base-v3-discriminator',
  }

  num_classes = 2
  learning_rate = 8e-6
  # Read by the training code; 64 matches the loader batch size used there.
  batch_size = 64

  # dataset
  k_fold = 5

### Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader

class BoolQ_Dataset(Dataset):
  """BoolQ passages, questions and integer labels loaded from a TSV file.

  Args:
    config: configuration object exposing `train_path` and `dev_path`.
    training: when True the file at `config.train_path` is loaded (used for
      K-fold splitting); otherwise the file at `config.dev_path` is loaded.
  """

  # Literal substitutions applied, in this order, to the lower-cased text.
  # The degree-Celsius pairs are spelled with a lower-case "c" because the
  # text has already been lower-cased by the time they are applied; they must
  # also precede the bare degree signs so that no stray "c" is left behind.
  _REPLACEMENTS = (
      ('／', '/'),
      ('℃', '도'),
      ('→', '에서'),
      ('!', ''),
      ('，', ','),
      ('㎢', 'km'),
      ('∼', '~'),
      ('㎜', 'mm'),
      ('×', '곱하기'),
      ('=', '는'),
      ('®', ''),
      ('㎖', 'ml'),
      ('ℓ', 'l'),
      ('˚c', '도'),
      ('˚', '도'),
      ('°c', '도'),
      ('°', '도'),
      ('＋', '+'),
      ('*', ''),
      (';', '.'),
  )

  def __init__(self, config, training=True):
    """ Configuration """
    self.config = config

    if training: # for K folding
      self.dataset = self.load_data(config.train_path)
    else: # test data
      self.dataset = self.load_data(config.dev_path)

  def __len__(self):
    """Return the number of rows in the loaded table."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return the passage, question and label at position `idx` as a dict."""
    # Positional lookup: correct even if the frame's index is not 0..n-1.
    return {
        "text": self.dataset["text"].iloc[idx],
        "question": self.dataset["question"].iloc[idx],
        "label": self.dataset["label"].iloc[idx]
    }

  def load_data(self, dataset_dir):
    """Read the TSV at `dataset_dir` and return it as a DataFrame.

    The first line of the file is treated as a header and replaced by the
    column names ID / text / question / answer. An integer `label` column is
    derived from `answer`, and `text` is normalised with `pre_process`.
    """
    dataset = pd.read_csv(dataset_dir, delimiter='\t', names=['ID', 'text', 'question', 'answer'], header=0)
    dataset["label"] = dataset["answer"].astype(int)
    dataset['text'] = dataset['text'].apply(self.pre_process)
    return dataset

  def pre_process(self, st):
    """Normalise one passage string and return the cleaned string.

    Steps: strip bracketed spans and " - ..." tails, lower-case, unify quote /
    bracket / dash / middle-dot variants, then spell out or simplify a fixed
    set of symbols and units (see `_REPLACEMENTS`).
    """
    # NOTE(review): `.*` is greedy, so everything between the first opening
    # and the last closing bracket on a line is removed - confirm that this is
    # intended before tightening the pattern.
    st = re.sub(r'\(.*\)|\s-\s.*', '', st)
    st = re.sub(r'\[.*\]|\s-\s.*', '', st)
    st = st.lower()

    st = re.sub('[”“]', '"', st)
    st = re.sub('[’‘]', "'", st)
    st = re.sub('[≫〉》＞』」]', '>', st)
    st = re.sub('[《「『〈≪＜]', '<', st)
    st = re.sub('[−–—]', '−', st)
    st = re.sub('[･•・‧]', '·', st)

    for old, new in self._REPLACEMENTS:
      st = st.replace(old, new)
    return st
    

In [None]:
# Smoke test: load the training split (the default `training=True`) and print
# how many examples it contains.
test_data = BoolQ_Dataset(config)
print(len(test_data))

# Show only the first example; the loop exits right after printing it.
for data in test_data:
  print(data)
  break

3665
{'text': '로마 시대의 오리엔트의 범위는 제국 내에 동부 지방은 물론 제국 외부에 있는 다른 국가에 광범위하게 쓰이는 단어였다. 그 후에 로마 제국이 분열되고 서유럽이 그들의 중심적인 세계를 형성하는 과정에서 자신들을 옥시덴트, 서방이라 부르며 오리엔트는 이와 대조되는 문화를 가진 동방세계라는 뜻이 부가되어, 인도와 중국, 일본을 이루는 광범위한 지역을 지칭하는 단어가 되었다.', 'question': '오리엔트는 인도와 중국, 일본을 이루는 광범위한 지역을 지칭하는 단어로 쓰인다.', 'label': 1}


### Define Model

In [None]:
from transformers import (
    BertModel, 
    BertPreTrainedModel, 
    ElectraModel, 
    ElectraPreTrainedModel, 
    XLMRobertaModel, 
    BartModel, 
    BartPretrainedModel, 
    T5Model, 
    RobertaModel 
)

""" KoBert Pre-trained Model """

class Bert_BoolQ(BertPreTrainedModel):
    """KoBERT encoder followed by a two-layer classification head.

    The pooled encoder output goes through a hidden-size `FCLayer`
    (dropout -> tanh -> linear) and then through a second `FCLayer` without
    activation that projects to `config.num_labels` logits.

    Args:
        config: model configuration; `hidden_size` and `num_labels` are read.
    """

    def __init__(self, config):
        # The original referenced an undefined name `Bert` here.
        super().__init__(config)
        self.bert = BertModel.from_pretrained(
            'monologg/kobert',
            config=config
        )  # Load pretrained bert

        self.num_labels = config.num_labels

        # The original read `args.dropout_rate`, but no `args` exists in this
        # scope; 0.1 matches the dropout used by the classifier layer below.
        self.cls_fc_layer = FCLayer(config.hidden_size, config.hidden_size, dropout_rate=0.1)
        self.label_classifier = FCLayer(
            config.hidden_size,
            config.num_labels,
            dropout_rate=0.1,
            use_activation=False,
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            token_type_ids: segment ids passed to the encoder.
            labels: accepted for interface parity with the sibling models;
                not used, no loss is computed here.

        Returns:
            Tuple whose first element is the logits, followed by whatever the
            encoder returned after its first two outputs (hidden states and
            attentions when those are enabled).
        """
        outputs = self.bert(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        pooled_output = outputs[1]  # [CLS]

        # Dropout -> tanh -> linear on the pooled representation.
        pooled_output = self.cls_fc_layer(pooled_output)

        # Dropout -> linear projection to the label logits.
        logits = self.label_classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        return outputs  # logits, (hidden_states), (attentions)



""" KoElectra Pre-trained Model """

class Electra_BoolQ(ElectraPreTrainedModel):
    """KoELECTRA encoder with a pooling head and a linear classifier.

    The hidden state at sequence position 0 is passed through `PoolingHead`
    (dropout -> linear -> tanh) and a linear layer that outputs
    `config.num_labels` logits.

    Args:
        config: model configuration; `hidden_size` and `num_labels` are read.
    """

    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.model = ElectraModel.from_pretrained(
            'monologg/koelectra-base-v3-discriminator', config=config)
        self.pooling = PoolingHead(input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1)
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        """Run the encoder and the classification head.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            token_type_ids: segment ids passed to the encoder.
            labels: accepted for interface parity; not used, no loss is
                computed here.

        Returns:
            Tuple whose first element is the logits, followed by the encoder's
            remaining outputs (hidden states and attentions when enabled).
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0][:,0,:] #cls
        sequence_output = self.pooling(sequence_output)
        logits = self.qa_classifier(sequence_output)
        # ElectraModel returns no pooled output, so the optional extras start
        # at index 1 (slicing from 2 would drop the hidden states).
        outputs = (logits,) + outputs[1:]  # add hidden states and attention if they are here

        return outputs  # logits, (hidden_states), (attentions)



""" XLMRoberta Pre-trained Model """

class XLMRoberta_BoolQ(XLMRobertaModel):
    """XLM-RoBERTa encoder with a pooling head and a linear classifier.

    The hidden state at sequence position 0 is passed through `PoolingHead`
    (dropout -> linear -> tanh) and a linear layer that outputs
    `config.num_labels` logits.

    Args:
        config: model configuration; `hidden_size` and `num_labels` are read.
        args: unused; kept optional so existing two-argument calls still work
            while the class can also be built from the config alone.
    """

    def __init__(self, config, args=None):
        # The original referenced an undefined name `XLMRoberta` here.
        # NOTE(review): the parent constructor builds its own encoder in
        # addition to `self.xlmroberta` below, so two sets of encoder weights
        # appear to be held - consider deriving from a pre-trained base class.
        super().__init__(config)
        self.xlmroberta = XLMRobertaModel.from_pretrained("xlm-roberta-large", config=config)  # Load pretrained XLM-RoBERTa

        self.num_labels = config.num_labels

        self.pooling = PoolingHead(input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1)
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            token_type_ids: accepted for interface parity; not forwarded to
                the encoder.
            labels: accepted for interface parity; not used, no loss is
                computed here.

        Returns:
            Tuple whose first element is the logits, followed by whatever the
            encoder returned after its first two outputs.
        """
        outputs = self.xlmroberta(
            input_ids, attention_mask=attention_mask
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        pooled_output = outputs[0][:, 0, :]  # [CLS]

        pooled_output = self.pooling(pooled_output)
        logits = self.qa_classifier(pooled_output)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        return outputs  # logits, (hidden_states), (attentions)



""" Additional Layers """


class FCLayer(nn.Module):
    """Dropout, an optional tanh, then one linear projection.

    Args:
        input_dim: size of the incoming feature dimension.
        output_dim: size of the projected feature dimension.
        dropout_rate: probability handed to `nn.Dropout`.
        use_activation: when True, tanh is applied between the dropout and
            the linear layer.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=True):
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Return the linear projection of the (optionally tanh-ed) dropped input."""
        dropped = self.dropout(x)
        activated = self.tanh(dropped) if self.use_activation else dropped
        return self.linear(activated)


class PoolingHead(nn.Module):
    """Dropout, a dense projection, then tanh.

    Args:
        input_dim: size of the incoming feature dimension.
        inner_dim: size of the projected feature dimension.
        pooler_dropout: probability handed to `nn.Dropout`.
    """

    def __init__(self, input_dim: int, inner_dim: int, pooler_dropout: float):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)

    def forward(self, hidden_states: torch.Tensor):
        """Return tanh(dense(dropout(hidden_states)))."""
        projected = self.dense(self.dropout(hidden_states))
        return torch.tanh(projected)



### Training Center

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedKFold as SK
from transformers import AutoConfig, AutoTokenizer

# https://visionhong.tistory.com/30
# Here is the code for pl.

class BoolQ_Model_Train():
  """Builds the model, tokenizer and K-fold splits, and drives training.

  Args:
    config: configuration object; `model_list`, `num_classes`, `k_fold` and
      (optionally) `batch_size` are read, plus whatever `BoolQ_Dataset` needs.
    model_name: key of `config.model_list` selecting the backbone
      ("roberta", "bert" or "electra").

  `training_step` and `collate_fn` are not implemented yet; they raise
  `NotImplementedError` so that the gap is explicit instead of leaving the
  class unparseable.
  """

  def __init__(self, config, model_name):
    super().__init__()
    # Kept on the instance so later steps can move batches to the same device.
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    #####################
    ### Configuration ###
    #####################

    """ define model """

    assert model_name in config.model_list.keys(), "[Training] Give Model Name that have been listed."

    # load configuration of pretrained model
    MODEL_CONFIG = AutoConfig.from_pretrained(config.model_list[model_name])
    MODEL_CONFIG.num_labels = config.num_classes

    if model_name == "roberta":
      self.model = XLMRoberta_BoolQ(MODEL_CONFIG)
    elif model_name == "bert":
      self.model = Bert_BoolQ(MODEL_CONFIG)
    elif model_name == "electra":
      self.model = Electra_BoolQ(MODEL_CONFIG)

    self.model.to(self.device)

    """ Tokenizer """

    self.tokenizer = AutoTokenizer.from_pretrained(config.model_list[model_name])

    """ Dataset """
    # train_dataset
    self.train_dataset = BoolQ_Dataset(config)

    # k_fold index; `SK` is the alias under which StratifiedKFold is imported.
    skf = SK(n_splits=config.k_fold)
    self.kfold = config.k_fold
    self.KFold_index = list(skf.split(
        self.train_dataset.dataset['text'], self.train_dataset.dataset['label']))

    # batch_size; falls back to 64 when the config does not define one.
    self.batch_size = getattr(config, "batch_size", 64)

  def fit(self, epoch):
    """Run `epoch` passes, rotating through the stratified folds.

    Pass number `epo` trains on fold `epo % k_fold`'s training indices; a
    loader over the matching validation indices is built alongside it.
    """
    # Local import keeps this block independent of the file's import cells.
    from torch.utils.data import Subset

    for epo in tqdm(range(epoch)):
      ### Stratified KFold
      train_idx, val_idx = self.KFold_index[epo % self.kfold]

      # Subset keeps per-item access; indexing the dataset with an index
      # array would return a dict of Series rather than a dataset.
      training_set = Subset(self.train_dataset, train_idx.tolist())
      validation_set = Subset(self.train_dataset, val_idx.tolist())

      ### make dataloader
      train_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)
      # TODO: hand val_loader to a validation step once one exists.
      val_loader = DataLoader(validation_set, batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_fn)

      ### train
      self.training_step(train_loader)

  def training_step(self, loader):
    """Train on one pass over `loader`. Not implemented yet."""
    raise NotImplementedError("BoolQ_Model_Train.training_step is not implemented yet")

  # def validation_step(self):

  def collate_fn(self, batch):
    """Turn a list of dataset items into a model-ready batch. Not implemented yet."""
    raise NotImplementedError("BoolQ_Model_Train.collate_fn is not implemented yet")




class Model(pl.LightningModule):
    """Lightning wrapper around a `ClassifierCNN` text classifier.

    NOTE(review): `ClassifierCNN`, `Corpus_Dataset` and the module-level
    `collate_fn` used below are not defined in the visible part of this
    notebook - confirm where they come from.

    Args:
        vocab_size: vocabulary size forwarded to `ClassifierCNN`.
        Config: object providing `embed_dim`, `hidden_dim`, `_weight`,
            `learning_rate` and `batch_size`.
    """

    def __init__(self, vocab_size, Config):
        super().__init__()
        self.model = ClassifierCNN(
            vocab_size,
            Config.embed_dim,
            Config.hidden_dim,
            pretrained_embed_weight=Config._weight,
        )
        self.lr = Config.learning_rate
        self.batch_size = Config.batch_size

    @staticmethod
    def _accuracy(probs, targets):
        """Fraction of rows whose arg-max class equals the target, as a float."""
        return probs.argmax(dim=1).eq(targets).float().mean().item()

    def forward(self, sentence):
        """Delegate to the wrapped classifier."""
        return self.model(sentence)

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters with the configured lr."""
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

    def step(self, batch): # forward and calculate loss
        """Return (cross-entropy loss, labels, softmax probabilities) for a batch."""
        # Only the first two entries of the converted batch are used here.
        inputs, targets, _, _ = self._att_tensor(batch)
        logits = self(inputs)
        loss = F.cross_entropy(logits, targets)
        return loss, targets, F.softmax(logits, dim=1)

    def training_step(self, batch, batch_nb):
        """Compute loss and accuracy for one training batch."""
        loss, targets, probs = self.step(batch)
        accuracy = self._accuracy(probs, targets)
        return {
            'loss': loss,
            'accuracy': accuracy,
            'tensorboard_log': {'train_loss': loss, 'accuracy': accuracy},
        }

    def validation_step(self, batch, batch_nb):
        """Compute the loss for one validation batch and keep detached outputs."""
        loss, targets, probs = self.step(batch)
        return {'val_loss': loss, 'label': targets.detach(), 'y_hat': probs.detach()}

    def validation_epoch_end(self, outputs):
        """Aggregate per-batch validation results into epoch loss and accuracy."""
        losses, labels, predictions = [], [], []
        for result in outputs:
            losses.append(result['val_loss'])
            labels.append(result['label'])
            predictions.append(result['y_hat'])

        mean_loss = torch.stack(losses).mean()
        accuracy = self._accuracy(torch.cat(predictions), torch.cat(labels))
        print("Epoch {} || acc:{}".format(self.current_epoch, accuracy))
        return {
            'avg_val_loss': mean_loss,
            'val_acc': accuracy,
            'tensorboard_log': {'val_loss': mean_loss, 'val_acc': accuracy},
        }

    def train_dataloader(self):
        """Shuffled loader over `Corpus_Dataset(True)` with batch size 64."""
        return DataLoader(
            Corpus_Dataset(True),
            batch_size=64,
            shuffle=True,
            collate_fn=collate_fn,
        )

    def val_dataloader(self):
        """Unshuffled loader over `Corpus_Dataset(False)` with batch size 64."""
        return DataLoader(
            Corpus_Dataset(False),
            batch_size=64,
            shuffle=False,
            collate_fn=collate_fn,
        )

    def _att_tensor(self, batch):
        """Convert batch[0] and batch[1] to long tensors on the active device.

        batch[2] and batch[3] are passed through unchanged.
        """
        device = 'cuda' if torch.cuda.is_available() else "cpu"
        sentences = torch.tensor(batch[0]).long().to(device)
        labels = torch.tensor(batch[1]).long().to(device)
        return (sentences, labels, batch[2], batch[3])

