# BoolQ (판정 의문문, 정현진)

### 기본 세팅 (colab pro)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Dec  7 16:23:40 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, expressed in units of 1e9 bytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data and result paths used by this notebook live below it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project folder on the mounted Drive.
# NOTE(review): no cell visible in this notebook reads cur_path; the config class defines its own copy of this prefix.
cur_path = "/content/drive/MyDrive/Colab Notebooks/BoolQ_"

### Requirements

In [None]:
!pip install transformers
!pip install wandb
!pip install pytorch-lightning
!pip install tqdm
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 558 kB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

### Import packages

In [None]:
import os
import sys
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

import numpy as np

import wandb
import re

from tqdm import tqdm

### Configuration

In [None]:
class config:
  """Single namespace holding every tunable setting of this notebook."""

  # --- data locations -------------------------------------------------
  # Temporary helper for the shared Drive prefix; removed again at the end of the class body.
  _root = "/content/drive/MyDrive/Colab Notebooks/BoolQ_"
  train_path = f"{_root}/SKT_BoolQ_Train.tsv"
  dev_path = f"{_root}/SKT_BoolQ_Dev.tsv"
  test_path = f"{_root}/SKT_BoolQ_Test.tsv"
  train_dev_crop = False

  # --- pretrained checkpoints, keyed by the short name used when building a trainer ---
  model_list = dict(
      roberta="klue/roberta-large",
      bigbird="monologg/kobigbird-bert-base",
      electra="monologg/koelectra-base-v3-discriminator",
  )

  num_classes = 2

  # --- dataset ----------------------------------------------------------
  k_fold = 5
  batch_size = 2

  # --- optimizer and scheduler -------------------------------------------
  learning_rate = 8e-6
  weight_decay = 0.01
  warmup_steps = 500

  # --- logging and saving -------------------------------------------------
  log_interval = 10
  mode_wandb = True
  save_dir = f"{_root}/result/"

  del _root


### Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader

class BoolQ_Dataset(Dataset):
  """BoolQ examples (passage, question, label) read from a tab-separated file.

  Each item is a dict with the keys "text", "question" and "label".
  The passage text is normalised by `pre_process` at load time; the question
  is left untouched.
  """

  # Bracketed asides are matched NON-greedily so every "(...)" / "[...]" pair is
  # removed on its own. A greedy ".*" would delete everything between the first
  # opening and the last closing bracket of the passage. The second alternative
  # drops everything from the first " - " onwards.
  _PAREN_OR_DASH_TAIL = re.compile(r'\(.*?\)|\s-\s.*')
  _BRACKET_OR_DASH_TAIL = re.compile(r'\[.*?\]|\s-\s.*')

  # Character-class normalisations, applied in order.
  _CHAR_CLASS_RULES = (
      (re.compile('[”“]'), '"'),
      (re.compile('[’‘]'), "'"),
      (re.compile('[≫〉》＞』」]'), '>'),
      (re.compile('[《「『〈≪＜]'), '<'),
      (re.compile('[−–—]'), '−'),
      (re.compile('[･•・‧]'), '·'),
  )

  # Literal replacements, applied in order AFTER lower-casing. The degree
  # entries therefore use a lower-case "c"; the two-character forms must come
  # before the bare degree sign so that "30°C" becomes "30도" and not "30도c".
  _LITERAL_RULES = (
      ('／', '/'),
      ('℃', '도'),
      ('→', '에서'),
      ('!', ''),
      ('，', ','),
      ('㎢', 'km'),
      ('∼', '~'),
      ('㎜', 'mm'),
      ('×', '곱하기'),
      ('=', '는'),
      ('®', ''),
      ('㎖', 'ml'),
      ('ℓ', 'l'),
      ('˚c', '도'),
      ('˚', '도'),
      ('°c', '도'),
      ('°', '도'),
      ('＋', '+'),
      ('*', ''),
      (';', '.'),
  )

  def __init__(self, config, training=True):
    """Load one split.

    Args:
      config: object exposing `train_path` and `dev_path`.
      training: when True read `config.train_path` (the data that is later
        split into folds); otherwise read `config.dev_path`.
    """
    self.config = config

    if training:
      self.dataset = self.load_data(config.train_path)
    else:
      self.dataset = self.load_data(config.dev_path)

  def __len__(self):
    """Number of rows in the loaded split."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return the example(s) at `idx` as a dict of text, question and label."""
    return {
        "text": self.dataset["text"].values[idx],
        "question": self.dataset["question"].values[idx],
        "label": self.dataset["label"].values[idx]
    }

  def load_data(self, dataset_dir):
    """Read a TSV file into a DataFrame and attach the derived columns.

    The file's own header row is replaced by the column names
    ID / text / question / answer. "label" is "answer" cast to int and
    "text" is passed through `pre_process`.
    """
    dataset = pd.read_csv(dataset_dir, delimiter='\t', names=['ID', 'text', 'question', 'answer'], header=0)
    dataset["label"] = dataset["answer"].astype(int)
    dataset['text'] = dataset['text'].apply(self.pre_process)
    return dataset

  def pre_process(self, st):
    """Normalise one passage string.

    Steps: strip parenthesised / bracketed asides and any " - ..." tail,
    lower-case, unify quote / bracket / dash / middle-dot variants, then
    rewrite or drop a fixed list of symbols (units, degree signs, etc.).

    Args:
      st: the raw passage (str).

    Returns:
      The cleaned passage (str).
    """
    st = self._PAREN_OR_DASH_TAIL.sub('', st)
    st = self._BRACKET_OR_DASH_TAIL.sub('', st)
    st = st.lower()

    for pattern, replacement in self._CHAR_CLASS_RULES:
      st = pattern.sub(replacement, st)

    for old, new in self._LITERAL_RULES:
      st = st.replace(old, new)

    return st
    

In [None]:
# Smoke test: load the training split and report how many examples it holds.
test_data = BoolQ_Dataset(config)
print(len(test_data))

# Print only the first example to inspect the text / question / label layout.
for data in test_data:
  print(data)
  break

# Slice indexing: __getitem__ applies the index to each column's value array,
# so this yields a single dict holding the first 8 texts, questions and labels.
batch = test_data[:8]

3665
{'text': '로마 시대의 오리엔트의 범위는 제국 내에 동부 지방은 물론 제국 외부에 있는 다른 국가에 광범위하게 쓰이는 단어였다. 그 후에 로마 제국이 분열되고 서유럽이 그들의 중심적인 세계를 형성하는 과정에서 자신들을 옥시덴트, 서방이라 부르며 오리엔트는 이와 대조되는 문화를 가진 동방세계라는 뜻이 부가되어, 인도와 중국, 일본을 이루는 광범위한 지역을 지칭하는 단어가 되었다.', 'question': '오리엔트는 인도와 중국, 일본을 이루는 광범위한 지역을 지칭하는 단어로 쓰인다.', 'label': 1}


### Define Model

In [None]:
from transformers import (
    BartModel,
    BartPretrainedModel,
    BigBirdModel,
    BigBirdPreTrainedModel,
    ElectraModel,
    ElectraPreTrainedModel,
    RobertaModel,
    RobertaPreTrainedModel,
    T5Model,
    XLMRobertaModel,
)

""" KoBigBird Pre-trained Model """

class BigBird_BoolQ(BigBirdPreTrainedModel):
    """KoBigBird encoder followed by a two-stage classification head.

    The head consumes the encoder's pooled output: first a hidden-size
    FCLayer (dropout -> tanh -> linear), then a second FCLayer without
    activation that projects to `config.num_labels` logits.
    """

    def __init__(self, config):
        super().__init__(config)

        # Pretrained KoBigBird weights, built with the caller-supplied config.
        self.bigbird = BigBirdModel.from_pretrained(
            "monologg/kobigbird-bert-base",
            config=config
        )

        self.num_labels = config.num_labels

        hidden = config.hidden_size
        self.cls_fc_layer = FCLayer(hidden, hidden, dropout_rate=0.1)
        self.label_classifier = FCLayer(
            hidden,
            config.num_labels,
            dropout_rate=0.1,
            use_activation=False,
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return `(logits, *extras)`, where extras are the encoder outputs from index 2 on."""
        encoder_out = self.bigbird(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )

        # Index 1 of the encoder output is the pooled representation.
        cls_features = self.cls_fc_layer(encoder_out[1])
        logits = self.label_classifier(cls_features)

        # Pass through whatever the encoder returned after the pooled output
        # (hidden states / attentions, when they were requested).
        return (logits,) + encoder_out[2:]



""" KoElectra Pre-trained Model """

class Electra_BoolQ(ElectraPreTrainedModel):
    """KoELECTRA discriminator with a pooled first-token classification head.

    The hidden state of the first token is passed through PoolingHead
    (dropout -> linear -> tanh) and a final linear layer that produces
    `config.num_labels` logits.
    """

    def __init__(self, config):
        super(Electra_BoolQ, self).__init__(config)

        self.num_labels = config.num_labels
        # Pretrained KoELECTRA weights, built with the caller-supplied config.
        self.model = ElectraModel.from_pretrained(
            'monologg/koelectra-base-v3-discriminator', config=config)
        self.pooling = PoolingHead(input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1)
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return `(logits, *extras)`; extras are the optional encoder outputs."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        cls_state = outputs[0][:, 0, :]  # hidden state of the first token
        pooled = self.pooling(cls_state)
        logits = self.qa_classifier(pooled)

        # ElectraModel returns no pooled output, so the optional extras
        # (hidden states, attentions) start at index 1. Slicing from 2 would
        # silently drop the first of them.
        outputs = (logits,) + outputs[1:]

        return outputs  # logits, (hidden_states), (attentions)



""" Roberta Pre-trained Model """
class Roberta_BoolQ(RobertaPreTrainedModel):
    """KLUE RoBERTa-large with a pooled first-token classification head.

    Derives from RobertaPreTrainedModel (like the BigBird / ELECTRA wrappers
    derive from their *PreTrainedModel bases) so that the only encoder held
    by the module is `self.roberta`. Deriving from RobertaModel would build a
    second, unused encoder inside the wrapper itself.
    """

    def __init__(self, config):
        # Zero-argument super(): the previous explicit form named a class
        # ("Roberta") that does not exist and raised NameError.
        super().__init__(config)
        self.roberta = RobertaModel.from_pretrained("klue/roberta-large", config=config)  # Load pretrained RoBERTa

        self.num_labels = config.num_labels

        self.pooling = PoolingHead(input_dim=config.hidden_size,
            inner_dim=config.hidden_size,
            pooler_dropout=0.1)
        self.qa_classifier = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return `(logits, *extras)`.

        `token_type_ids` is accepted so all wrappers share one call signature,
        but it is not forwarded to the encoder.
        """
        outputs = self.roberta(
            input_ids, attention_mask=attention_mask
        )  # sequence_output, pooled_output, (hidden_states), (attentions)
        cls_state = outputs[0][:, 0, :]  # hidden state of the first token

        pooled_output = self.pooling(cls_state)
        logits = self.qa_classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        return outputs  # logits, (hidden_states), (attentions)



""" Additional Layers """


class FCLayer(nn.Module):
    """Dropout, an optional tanh, then a linear projection.

    Args:
        input_dim: size of the incoming feature dimension.
        output_dim: size of the projected feature dimension.
        dropout_rate: dropout probability applied to the input.
        use_activation: when True, tanh is applied between dropout and the
            linear layer.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=True):
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Apply dropout -> (tanh) -> linear to `x`."""
        dropped = self.dropout(x)
        activated = self.tanh(dropped) if self.use_activation else dropped
        return self.linear(activated)


class PoolingHead(nn.Module):
    """Dropout, a linear layer, then tanh.

    Args:
        input_dim: size of the incoming feature dimension.
        inner_dim: size of the output feature dimension.
        pooler_dropout: dropout probability applied before the linear layer.
    """

    def __init__(
        self,
        input_dim: int,
        inner_dim: int,
        pooler_dropout: float,
    ):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)

    def forward(self, hidden_states: torch.Tensor):
        """Return tanh(dense(dropout(hidden_states)))."""
        projected = self.dense(self.dropout(hidden_states))
        return torch.tanh(projected)



### Training Center

In [None]:
import transformers
from transformers import AutoConfig, AutoTokenizer

from sklearn.model_selection import StratifiedKFold

from torch.utils.data import Subset

# https://visionhong.tistory.com/30
# Here is the code for pl.

class BoolQ_Model_Train():
  """Fine-tuning harness for the BoolQ classifiers defined in this notebook.

  Builds the model / tokenizer / dataset selected by `model_name`, prepares a
  stratified K-fold split of the training data, and runs a plain PyTorch
  training loop with per-epoch validation, optional Weights & Biases logging
  and best-checkpoint saving.

  NOTE(review): `fit` rotates through the folds, using fold `epoch % k_fold`
  for validation in each epoch while the same model keeps training. From the
  second epoch on, the validation samples were already trained on in an
  earlier epoch, so the reported validation accuracy is optimistic.
  """

  def __init__(self, config, model_name):
    """Create model, tokenizer, dataset, folds, optimizer and loss.

    Args:
      config: settings object (model_list, k_fold, batch_size, learning_rate,
        warmup_steps, log_interval, mode_wandb, save_dir, data paths, ...).
      model_name: key of `config.model_list`; one of "roberta", "bigbird",
        "electra".

    Raises:
      AssertionError: if `model_name` is not a key of `config.model_list`.
      ValueError: if `model_name` is listed but has no wrapper class here.
    """
    super().__init__()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = device
    self.config = config

    #####################
    ### Configuration ###
    #####################

    # --- model ---
    assert model_name in config.model_list.keys(), "[Training] Please Give Correct Model Name which have been listed."
    self.model_name = model_name

    # Load the configuration of the pretrained checkpoint and set the head size.
    MODEL_CONFIG = AutoConfig.from_pretrained(config.model_list[model_name])
    MODEL_CONFIG.num_labels = getattr(config, "num_classes", 2)

    if model_name == "roberta":
      self.model = Roberta_BoolQ(MODEL_CONFIG)
    elif model_name == "bigbird":
      self.model = BigBird_BoolQ(MODEL_CONFIG)
    elif model_name == "electra":
      self.model = Electra_BoolQ(MODEL_CONFIG)
    else:
      # Listed in config.model_list but no wrapper exists for it in this notebook.
      raise ValueError(f"[Training] No model wrapper is defined for '{model_name}'.")

    self.model.to(device)

    # --- tokenizer ---
    self.tokenizer = AutoTokenizer.from_pretrained(config.model_list[model_name])

    # --- dataset ---
    self.train_dataset = BoolQ_Dataset(config)

    # Pre-compute the (train_idx, val_idx) pairs of a label-stratified K-fold split.
    skf_iris = StratifiedKFold(n_splits=config.k_fold)
    self.kfold = config.k_fold
    self.KFold_index = list(skf_iris.split(
        self.train_dataset.dataset['text'], self.train_dataset.dataset['label']))

    self.batch_size = config.batch_size

    # --- optimizer and loss (the scheduler is created in fit()) ---
    self.optimizer = torch.optim.AdamW(
        self.model.parameters(),
        lr=config.learning_rate,
        # Honour the configured value; 0.01 matches AdamW's own default.
        weight_decay=getattr(config, "weight_decay", 0.01),
    )
    self.criterion = nn.CrossEntropyLoss()

    # --- logging / saving state ---
    self.log_interval = config.log_interval
    self.load_step = 0  # optimizer steps taken so far, across all epochs
    self.best_acc = 0
    self.wandb = config.mode_wandb
    self.save_dir = config.save_dir

    # Defined up front so validation logging never hits a missing attribute
    # when fewer than `log_interval` training steps have run.
    self.train_loss = 0.
    self.train_acc = 0.
    self.current_lr = config.learning_rate
    self.val_acc = 0.

  def fit(self, epoch):
    """Train for `epoch` epochs, validating and checkpointing after each one.

    Args:
      epoch: number of epochs to run.
    """
    # Both spellings are kept: `epoch` was the attribute originally assigned,
    # `epochs` is the one read by the progress messages.
    self.epoch = epoch
    self.epochs = epoch

    # The scheduler counts optimizer steps (batches), not samples: add up the
    # number of batches of the training fold used in each epoch.
    total_steps = 0
    for epo in range(epoch):
      fold_train_idx, _ = self.KFold_index[epo % self.kfold]
      total_steps += (len(fold_train_idx) + self.batch_size - 1) // self.batch_size

    self.scheduler = transformers.get_linear_schedule_with_warmup(
      self.optimizer,
      num_warmup_steps=self.config.warmup_steps,
      num_training_steps=total_steps,
      last_epoch=-1
    )

    for epo in tqdm(range(epoch)):
      ### Stratified KFold: rotate through the pre-computed folds
      train_idx, val_idx = self.KFold_index[epo % self.kfold]

      training_set = Subset(self.train_dataset, train_idx)
      validation_set = Subset(self.train_dataset, val_idx)

      ### make dataloaders (validation order does not affect the metrics)
      train_loader = DataLoader(training_set, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)
      val_loader = DataLoader(validation_set, batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_fn)

      ### train
      self.training_step(train_loader, epo)

      ### val
      self.validation_step(val_loader, epo)

      ### Best model save
      if self.best_acc < self.val_acc:
        self.best_acc = self.val_acc

        print("Best Model Saving!")

        best_dir = os.path.join(self.save_dir, "best", self.model_name)
        os.makedirs(best_dir, exist_ok=True)

        # Unwrap a DataParallel-style container if the model is wrapped in one.
        model_to_save = self.model.module if hasattr(self.model, "module") else self.model
        model_to_save.save_pretrained(best_dir)
        torch.save(self.config, os.path.join(best_dir, "training_config.bin"))

  def training_step(self, train_loader, epo):
    """Run one training epoch over `train_loader`.

    Every `log_interval` global steps the running loss / accuracy of the
    current window are stored in `self.train_loss`, `self.train_acc` and
    `self.current_lr`, and shown on the progress bar.

    Args:
      train_loader: iterable of `(encoded_inputs, labels)` batches.
      epo: zero-based epoch index (used for display only).
    """
    self.model.train()

    # Running totals of the current logging window.
    window_loss, window_correct, window_samples, window_steps = 0., 0, 0, 0

    # One progress bar per epoch (not one per batch).
    pbar = tqdm(total=len(train_loader), desc="[Training] Epoch {}".format(epo+1))

    for step, (texts, labels) in enumerate(train_loader, start=1):
      # texts: {input_ids, token_type_ids, attention_mask}; labels: list of ints.
      texts = {key: torch.as_tensor(value).to(self.device) for key, value in texts.items()}
      labels = torch.as_tensor(labels).to(self.device)

      ###########################################
      # 1) zero_grad
      self.optimizer.zero_grad()

      # 2) forward
      y_pred = self.model(**texts)[0]

      # 3) calculate loss
      loss = self.criterion(y_pred, labels)

      # 4) backward
      loss.backward()

      # 5) optimizer step
      self.optimizer.step()

      # 6) scheduler step
      self.scheduler.step()
      ###########################################

      pbar.update()
      self.load_step += 1

      preds = torch.argmax(y_pred, dim=-1)
      window_loss += loss.item()
      window_correct += (preds == labels).sum().item()
      # Count real samples so a shorter final batch does not skew the accuracy.
      window_samples += labels.size(0)
      window_steps += 1

      ### saving to log
      if self.load_step % self.log_interval == 0:
        # Divide by what was actually accumulated: the first window of an
        # epoch can be shorter than log_interval.
        train_loss = window_loss / window_steps
        train_acc = window_correct / window_samples
        current_lr = self.get_lr(self.optimizer)

        pbar.set_description(f"Epoch: [{epo}/{self.epochs}]({step}/{len(train_loader)}) || loss: {train_loss:4.4} || acc: {train_acc:4.2%} || lr {current_lr:4.4}")

        self.train_loss = train_loss
        self.train_acc = train_acc
        self.current_lr = current_lr

        window_loss, window_correct, window_samples, window_steps = 0., 0, 0, 0

    pbar.close()

  def validation_step(self, val_loader, epo):
    """Evaluate on `val_loader`, log to wandb if enabled, and set `self.val_acc`.

    Args:
      val_loader: iterable of `(encoded_inputs, labels)` batches.
      epo: zero-based epoch index (used for display only).
    """
    self.model.eval()
    tot_loss, tot_correct, tot_samples = 0., 0, 0

    pbar = tqdm(total=len(val_loader), desc="[Validation] Epoch {}".format(epo+1))

    with torch.no_grad():
      for texts, labels in val_loader:
        # texts: {input_ids, token_type_ids, attention_mask}; labels: list of ints.
        texts = {key: torch.as_tensor(value).to(self.device) for key, value in texts.items()}
        labels = torch.as_tensor(labels).to(self.device)

        # 1) forward
        y_pred = self.model(**texts)[0]

        # 2) calculate loss
        loss = self.criterion(y_pred, labels)

        pbar.update()

        preds = torch.argmax(y_pred, dim=-1)
        tot_loss += loss.item()
        tot_correct += (preds == labels).sum().item()
        tot_samples += labels.size(0)

    # Mean of the per-batch losses; accuracy over the real number of samples.
    # max(..., 1) keeps an empty loader from dividing by zero.
    val_loss = tot_loss / max(len(val_loader), 1)
    val_acc = tot_correct / max(tot_samples, 1)

    pbar.set_description(f"Validation: [{epo}/{self.epochs}] || loss: {val_loss:4.4} || acc: {val_acc:4.2%}")
    pbar.close()

    if self.wandb:
        wandb.log({"train_loss": self.train_loss, "train_acc": self.train_acc,
            "lr":self.current_lr, "valid_loss":val_loss, "valid_acc":val_acc
        })

    self.val_acc = val_acc

  def collate_fn(self, batch):
    """Tokenize a list of examples into one padded batch.

    Args:
      batch: list of dicts with the keys "text", "question" and "label".

    Returns:
      `(encoded, labels)`: the tokenizer output for the (text, question)
      pairs, requested as NumPy arrays padded to the longest pair, and the
      list of labels in the same order.
    """
    text_list = [b['text'] for b in batch]
    query_list = [b['question'] for b in batch]
    label_list = [b['label'] for b in batch]

    # Each passage is paired with its question.
    text_query_list = list(zip(text_list, query_list))

    # The bigbird setup is given a longer truncation limit than the other models.
    if self.model_name == 'bigbird':
      max_length = 1024
    else:
      max_length = 512

    tokenized_sentence = self.tokenizer(
        text_query_list,
        return_tensors="np",
        padding=True,
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
        return_token_type_ids = True
    )

    # output of tokenized_sentence: {input_ids, token_type_ids, attention_mask}
    return tokenized_sentence, label_list

  def get_lr(self, optimizer):
    """Return the learning rate of the optimizer's first parameter group."""
    for param_group in optimizer.param_groups:
      return param_group['lr']




In [None]:
# Start a Weights & Biases run only when logging is switched on in the config.
if config.mode_wandb:
    wandb.login()
    wandb.init(project='HyunJin-BoolQ', name="hello")

# Build the trainer for the 'bigbird' entry of config.model_list and train for 10 epochs.
Trainer = BoolQ_Model_Train(config, 'bigbird')
Trainer.fit(epoch = 10)



VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: ignored

##### Test Code

In [None]:
from transformers import AutoTokenizer, BigBirdTokenizer
tokenizer = AutoTokenizer.from_pretrained("monologg/kobigbird-bert-base")
#tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
#tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

from torch.utils.data import Subset


def collate_fn(batch):
  """Tokenize a list of {text, question, label} examples into one padded batch.

  Stand-alone counterpart of the trainer's collate method so this cell can be
  run by itself; it uses the module-level `tokenizer` defined above and the
  1024-token limit of the bigbird setup.

  Returns:
    (encoded, labels): tokenizer output as NumPy arrays and the label list.
  """
  pairs = [(b['text'], b['question']) for b in batch]
  labels = [b['label'] for b in batch]
  encoded = tokenizer(
      pairs,
      return_tensors="np",
      padding=True,
      truncation=True,
      max_length=1024,
      add_special_tokens=True,
      return_token_type_ids=True
  )
  return encoded, labels


dataset = BoolQ_Dataset(config)
print(dataset)
idx = np.asarray([1, 3, 5, 6])
print(Subset(dataset, idx))
loader = DataLoader(
    dataset,
    batch_size = 8,
    shuffle = True,
    collate_fn = collate_fn
)

# Inspect a single batch: array shapes and the text decoded back from the ids.
for batch, label_list in loader:
  print(batch)
  print(batch['input_ids'].shape)
  print(batch['token_type_ids'].shape)
  print(batch['attention_mask'].shape)

  print(tokenizer.batch_decode(batch['input_ids'].tolist()))
  break