<a href="https://colab.research.google.com/github/minshyee/NLP_project/blob/baseline_song/STS_baseline_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face `transformers` library, pinned to version 3 so that the
# tokenizer/model APIs used below stay stable across runs.
!pip install transformers==3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import sys
import pandas as pd
import numpy as np 
import tarfile


import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader,  RandomSampler, SequentialSampler
from torch.nn.utils import clip_grad_norm_

from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
import gc
# Collect unreachable Python objects first, then ask PyTorch to drop the unused
# GPU memory it is still caching.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reproducibility: seed the CPU generator and the generator of every CUDA device.
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Compute device: use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report what hardware was found before training starts.
    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
print(device)

# available GPUs : 1
GPU name : Tesla T4
cuda


### 경로설정

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder can be used as the
# working directory (Colab only).
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/AIBootCamp/NLP

/content/drive/MyDrive/AIBootCamp/NLP


In [None]:
# Absolute path of the current working directory; the archive is downloaded here.
_CUR_DIR = os.path.abspath(os.curdir)
print(f"My current directory : {_CUR_DIR}")
# Directory the dataset archive is extracted into.
_DATA_DIR = os.path.join(_CUR_DIR, "data")

My current directory : /content/drive/MyDrive/AIBootCamp/NLP


### data download

In [None]:
# 데이터 다운로드
!wget https://aistages-prod-server-public.s3.amazonaws.com/app/Competitions/000067/data/klue-sts-v1.1.tar.gz

--2022-05-25 22:38:56--  https://aistages-prod-server-public.s3.amazonaws.com/app/Competitions/000067/data/klue-sts-v1.1.tar.gz
Resolving aistages-prod-server-public.s3.amazonaws.com (aistages-prod-server-public.s3.amazonaws.com)... 52.218.216.130
Connecting to aistages-prod-server-public.s3.amazonaws.com (aistages-prod-server-public.s3.amazonaws.com)|52.218.216.130|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1349881 (1.3M) [application/x-gzip]
Saving to: ‘klue-sts-v1.1.tar.gz’


2022-05-25 22:38:58 (1.29 MB/s) - ‘klue-sts-v1.1.tar.gz’ saved [1349881/1349881]



In [None]:
# Extract the downloaded archive into the data directory.
# The context manager closes the archive even when extraction raises.
# (The variable name is kept for compatibility; the archive itself is gzip-compressed.)
with tarfile.open(os.path.join(_CUR_DIR, "klue-sts-v1.1.tar.gz")) as tar_bz2_file:
    tar_bz2_file.extractall(path=_DATA_DIR)

### data load

In [None]:
# Full training file; a slice of it is held out as the validation set further down.
train_all = pd.read_json('./data/klue-sts-v1.1/klue-sts-v1.1_train.json')
# NOTE: `test` is read from the *dev* file (klue-sts-v1.1_dev.json).
test = pd.read_json('./data/klue-sts-v1.1/klue-sts-v1.1_dev.json')

In [None]:
# Preview only: the sampled frame is displayed but never assigned, so `train_all`
# keeps its original row order. `replace=True` draws rows with replacement.
train_all.sample(frac=1, random_state = seed, replace=True)

Unnamed: 0,guid,source,sentence1,sentence2,labels,annotations
7270,klue-sts-v1_train_07270,paraKQC-para,네이버 블로그 내용을 백업없이 두지 마세요. 꼭 컴퓨터에 백업하세요.,반드시 네이버 블로그 내용을 노트북 대신 컴퓨터에 백업하시오.,"{'label': 3.5, 'real-label': 3.5, 'binary-labe...","{'agreement': '0:0:1:1:4:0', 'annotators': ['0..."
860,klue-sts-v1_train_00860,airbnb-rtt,위치도 시설도 호스트분도 부족함 없이 완벽했습니다.,"위치, 시설, 호스트 모두 완벽했습니다.","{'label': 4.1, 'real-label': 4.142857142857143...","{'agreement': '0:0:0:2:2:3', 'annotators': ['0..."
5390,klue-sts-v1_train_05390,airbnb-sampled,하나 단점이라면 방에서의 와이파이가 좀 약한편입니다.,완전 도심지는 아닌데 교통이 굉장히 편리한 편입니다.,"{'label': 0.0, 'real-label': 0.0, 'binary-labe...","{'agreement': '6:0:0:0:0:0', 'annotators': ['0..."
5191,klue-sts-v1_train_05191,policy-rtt,"정부는 우리 경제와 산업, 민생을 반드시 지켜낼 것입니다.","정부는 우리의 경제, 산업, 그리고 민생을 보호할 것입니다.","{'label': 4.1, 'real-label': 4.142857142857143...","{'agreement': '0:0:0:2:2:3', 'annotators': ['1..."
11284,klue-sts-v1_train_11284,policy-rtt,한편으로는 병사 급여 인상 등 장병 처우 개선에도 3조8천억 원을 반영했습니다.,"반면, 3조 8천억 원은 군인들의 처우 개선에도 반영되었습니다.","{'label': 3.6, 'real-label': 3.571428571428572...","{'agreement': '0:0:0:3:4:0', 'annotators': ['0..."
...,...,...,...,...,...,...
1660,klue-sts-v1_train_01660,airbnb-sampled,저는 빈에 온다면 꼭 다시 그녀의 숙소를 이용할거예요,밀라노에 다시온다면 이 숙소를 또 이용할 것입니다.,"{'label': 2.0, 'real-label': 2.0, 'binary-labe...","{'agreement': '0:1:4:1:0:0', 'annotators': ['0..."
5692,klue-sts-v1_train_05692,paraKQC-sampled,조명등 밝기는 높게 말고 낮게 조정해서 써주시기 바랍니다.,너희들이 좋아하는 클래식 곡이 뭔지 알고싶구나,"{'label': 0.0, 'real-label': 0.0, 'binary-labe...","{'agreement': '7:0:0:0:0:0', 'annotators': ['1..."
2354,klue-sts-v1_train_02354,paraKQC-para,중국에서 지메일을 사용하는 방법을 알려줘,지메일을 중국에서 사용하는 방법을 알려줘,"{'label': 4.5, 'real-label': 4.5, 'binary-labe...","{'agreement': '0:0:0:1:1:4', 'annotators': ['1..."
8457,klue-sts-v1_train_08457,airbnb-sampled,기본적인 요리도구와 향신료가 있습니다.,감각적인 인테리어가 돋보이던 숙소였습니다.,"{'label': 0.0, 'real-label': 0.0, 'binary-labe...","{'agreement': '6:0:0:0:0:0', 'annotators': ['0..."


In [None]:
# 90% / 10% train / validation split.
# Rows are shuffled first (without replacement, fixed seed) so the split does not
# simply follow the order of the file; every row lands in exactly one of the two parts.
train_length = int(len(train_all)*0.9)
train_all_shuffled = train_all.sample(frac=1, random_state=seed)
train = train_all_shuffled[:train_length]
valid = train_all_shuffled[train_length:]

In [None]:
train.shape, valid.shape, test.shape

((10501, 6), (1167, 6), (519, 6))

# DATASET

In [None]:
def make_input_ex_sts(dataset):
  """Flatten an STS frame into one model-ready example per row.

  Args:
    dataset: frame with the columns 'guid', 'sentence1', 'sentence2' and 'labels',
      where each 'labels' entry is a mapping holding 'label' and 'binary-label'.

  Returns:
    A DataFrame with the columns 'guid', 'sent_pair' (both sentences joined by a
    single space), 'label' and 'bi' (the binary label).
  """
  label_entries = list(dataset['labels'])
  joined_sentences = [
      first + ' ' + second
      for first, second in zip(dataset['sentence1'], dataset['sentence2'])
  ]

  return pd.DataFrame({
      'guid': dataset['guid'],
      'sent_pair': joined_sentences,
      'label': [entry['label'] for entry in label_entries],
      'bi': [entry['binary-label'] for entry in label_entries],
  })

In [None]:
# Turn every split into a (guid, sent_pair, label, bi) example frame.
sts_train_examples = make_input_ex_sts(train)
sts_valid_examples = make_input_ex_sts(valid)
sts_test_examples = make_input_ex_sts(test)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each input with the target at the same index.

    Args:
        input_data: list of input strings.
        target_data: list of targets (documented as int), aligned with ``input_data``.
    """

    def __init__(self, input_data:list, target_data:list):
        self.X, self.Y = input_data, target_data

    def __len__(self):
        # The number of inputs defines the dataset size.
        return len(self.X)

    def __getitem__(self, index):
        sample, target = self.X[index], self.Y[index]
        return sample, target

In [None]:
# Inputs are the joined sentence pairs; targets are the `label` column produced by
# make_input_ex_sts (the similarity score).
# NOTE(review): CustomDataset documents its targets as ints, but the scores shown in
# the data preview are floats (e.g. 3.5) — confirm this matches what the loss expects.
train_dataset = CustomDataset(sts_train_examples.sent_pair.to_list(), sts_train_examples.label.to_list())
valid_dataset = CustomDataset(sts_valid_examples.sent_pair.to_list(), sts_valid_examples.label.to_list())

In [None]:
len(train_dataset), len(valid_dataset)

(10501, 1167)

# MODEL

#### 접기 - 실패


In [None]:
from tqdm import tqdm
def batch_encode(tokenizer, sent_pairs):
    """Encode all of ``sent_pairs`` with one ``tokenizer.batch_encode_plus`` call.

    Sequences are truncated at 512 tokens, padded to the longest sequence of the
    call, wrapped in special tokens and returned as PyTorch tensors.

    Args:
        tokenizer: object exposing ``batch_encode_plus``.
        sent_pairs: the texts to encode.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for these options.
    """
    encode_options = dict(
        max_length=512,
        padding="longest",
        add_special_tokens=True,
        truncation=True,
        return_tensors="pt",
    )
    return tokenizer.batch_encode_plus(sent_pairs, **encode_options)

In [None]:
# Encode every split in one call each.
transformers.logging.set_verbosity_error() # https://github.com/huggingface/transformers/issues/14285
# The tokenizer is created here so this cell also runs on a fresh kernel; previously
# `tokenizer` was not defined anywhere at notebook level. The checkpoint name is
# spelled out because the run configuration is only defined further down.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Plain Python lists are passed, matching how the datasets are built above.
t_batch_encoding = batch_encode(tokenizer, sts_train_examples['sent_pair'].to_list())
v_batch_encoding = batch_encode(tokenizer, sts_valid_examples['sent_pair'].to_list())
d_batch_encoding = batch_encode(tokenizer, sts_test_examples['sent_pair'].to_list())

100%|██████████| 10501/10501 [00:00<00:00, 503773.19it/s]
100%|██████████| 1167/1167 [00:00<00:00, 714041.25it/s]
100%|██████████| 519/519 [00:00<00:00, 769202.75it/s]


In [None]:
# Check the number of encoded inputs per split (train / valid / dev).
print("\t\t   train\tvalid\t\tdev\n")
print(f"input_ids        : {len(t_batch_encoding['input_ids'])}\t{len(v_batch_encoding['input_ids'])}\t\t{len(d_batch_encoding['input_ids'])}")
print(f"attention_mask   : {len(t_batch_encoding['attention_mask'])}\t{len(v_batch_encoding['attention_mask'])}\t\t{len(d_batch_encoding['attention_mask'])}")

		   train	valid		dev

input_ids        : 10501	1167		519
attention_mask   : 10501	1167		519


### 여기부터

In [None]:
# Re-derive the working directory (same statement as in the path-setup cell earlier).
# NOTE(review): presumably repeated so the notebook can be resumed from this section — confirm.
_CUR_DIR = os.path.abspath(os.curdir)
print(f"My current directory : {_CUR_DIR}")

My current directory : /content/drive/MyDrive/AIBootCamp/NLP


In [None]:
# Run configuration
task = "sts"
# NOTE(review): `roberta-base` is the English RoBERTa checkpoint, while the KLUE-STS
# sentences are Korean — confirm the checkpoint choice.
model_checkpoint = "roberta-base" 
train_batch_size = 32
eval_batch_size = 32
epochs=3
# NOTE(review): CrossEntropyLoss is a classification loss, whereas the datasets above
# are built from the float `label` score — confirm the intended target encoding.
loss_fct = nn.CrossEntropyLoss()

In [None]:
# Custom collate_fn
# Tokenizers already loaded by custom_collate_fn, keyed by checkpoint name.
_TOKENIZER_CACHE = {}


def _get_tokenizer(checkpoint):
    """Return the tokenizer for ``checkpoint``, loading it only on first use."""
    if checkpoint not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[checkpoint] = RobertaTokenizer.from_pretrained(checkpoint)
    return _TOKENIZER_CACHE[checkpoint]


def custom_collate_fn(batch):
    """Tokenize the sentences of one batch and convert inputs and targets to tensors.

    Dynamic padding is applied: every sequence is padded to the longest sequence of
    this batch, and sequences longer than 512 tokens are truncated.

    The tokenizer is loaded once per checkpoint and reused, instead of being
    re-created from the pretrained files for every single batch.

    Args:
        batch: list of tuples (input_data(string), target_data(int)).

    Returns:
        A tuple ``(tensorized_input, tensorized_label)``: the tokenizer output with
        PyTorch tensors, and a tensor holding the batch targets.
    """
    tokenizer = _get_tokenizer(model_checkpoint)

    input_list = [_input for _input, _target in batch]
    target_list = [_target for _input, _target in batch]

    tensorized_input = tokenizer(
        input_list,
        add_special_tokens=True,
        padding="longest", # pad shorter sentences up to the longest one in the batch
        truncation=True, # drop the tokens beyond max_length
        max_length=512,
        return_tensors='pt' # return the tokenized result as tensors
    )

    tensorized_label = torch.tensor(target_list)

    return tensorized_input, tensorized_label

In [None]:
# Training batches are drawn in random order.
train_dataloader = DataLoader(train_dataset,
                             sampler = RandomSampler(train_dataset),
                             batch_size=train_batch_size,
                             collate_fn = custom_collate_fn)

# The validation sampler has to be built from valid_dataset itself: a sampler over
# train_dataset yields indices up to len(train_dataset) - 1, which are out of range
# for the smaller validation set. Order does not matter for evaluation, so the
# examples are visited sequentially.
valid_dataloader = DataLoader(valid_dataset,
                             sampler = SequentialSampler(valid_dataset),
                             batch_size=eval_batch_size,
                             collate_fn = custom_collate_fn)


In [None]:
class CustomClassifier(nn.Module):
    """Pretrained RoBERTa encoder followed by a single linear layer.

    The encoder is loaded from the notebook-level ``model_checkpoint``. No dropout
    and no intermediate hidden layer are applied on top of the encoder output.

    Args:
        hidden_size: input width of the linear layer.
        n_label: output width of the linear layer (number of logits).
    """

    def __init__(self, hidden_size: int, n_label: int):
        super().__init__()

        self.roberta = RobertaModel.from_pretrained(model_checkpoint)

        # Wrapped in nn.Sequential so the parameters are registered as `classifier.0.*`.
        output_layer = nn.Linear(hidden_size, n_label)
        self.classifier = nn.Sequential(output_layer)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
        """Return the logits computed from the first token of the last encoder layer."""
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # encoder_outputs[0] holds the last hidden states, indexed as
        # (batch, position, feature); keep position 0 of every sequence.
        first_token_state = encoder_outputs[0][:, 0, :]

        return self.classifier(first_token_state)

In [None]:
def initializer(train_dataloader, epochs=2):
    """Create the model, the optimizer and the learning-rate scheduler.

    Args:
        train_dataloader: training loader; its length (batches per epoch) is used
            to compute the total number of optimisation steps.
        epochs: number of training epochs the schedule should span.

    Returns:
        A tuple ``(model, optimizer, scheduler)``.
    """

    # The linear head must accept vectors as wide as the encoder output. Read that
    # width from the checkpoint configuration instead of hard-coding it: the former
    # value of 128 did not match the encoder, so the first forward pass failed with
    # a shape-mismatch error.
    encoder_hidden_size = transformers.AutoConfig.from_pretrained(model_checkpoint).hidden_size

    # Alternative considered: AutoModelForSequenceClassification
    # (https://github.com/huggingface/transformers/issues/8879)
    # NOTE(review): n_label=5 is kept as-is — confirm it matches the target encoding.
    model = CustomClassifier(hidden_size=encoder_hidden_size, n_label=5)

    optimizer = AdamW(
        model.parameters(), # parameters to be updated
        lr=2e-5,
        eps=1e-8
    )

    total_steps = len(train_dataloader) * epochs
    print(f"Total train steps with {epochs} epochs: {total_steps}")

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = 0, # no warmup is used here
        num_training_steps = total_steps
    )

    return model, optimizer, scheduler

In [None]:
model, optimizer, scheduler = initializer(train_dataloader, epochs)

Total train steps with 3 epochs: 987


In [None]:
def save_checkpoint(path, model, optimizer, scheduler, epoch, loss):
    """Write a training checkpoint for ``epoch`` to ``{path}/model.ckpt.{epoch}``.

    The file stores the epoch number, the state dicts of ``model``, ``optimizer``
    and ``scheduler``, and the given ``loss`` value.
    """
    file_name = f'{path}/model.ckpt.{epoch}'

    # Gather everything needed to resume training into a single dictionary.
    snapshot = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'loss' : loss
    }
    torch.save(snapshot, file_name)

    print(f"Saving epoch {epoch} checkpoint at {file_name}")

In [None]:
def validate(model, valid_dataloader):
    """Evaluate ``model`` on ``valid_dataloader``.

    Uses the notebook-level ``device`` and ``loss_fct``.

    Args:
        model: module that is called as ``model(**batch_input)`` and returns logits.
        valid_dataloader: iterable yielding ``(batch_input, batch_label)`` pairs
            whose items provide ``.to(device)``.

    Returns:
        A tuple ``(mean_loss, accuracy)``: the loss averaged over batches and the
        accuracy in percent computed over all samples, so a shorter final batch is
        weighted by its actual size. Both values are ``nan`` when the loader yields
        no batch (previously this case raised a NameError).
    """
    # Switch the model to evaluation mode and move it to the target device.
    model.eval()
    model.to(device)

    total_loss = 0.0
    n_batches = 0
    n_correct = 0
    n_samples = 0

    # No gradient is needed anywhere during evaluation.
    with torch.no_grad():
        for batch in valid_dataloader:
            # Move the tensors to the device before any computation.
            batch_input, batch_label = tuple(item.to(device) for item in batch)

            logits = model(**batch_input)

            # loss
            total_loss += loss_fct(logits, batch_label).item()
            n_batches += 1

            # accuracy: softmax does not change the ordering of the logits, so the
            # argmax of the raw logits gives the same predictions.
            preds = torch.argmax(logits, dim=1)
            n_correct += (preds == batch_label).sum().item()
            n_samples += batch_label.numel()

    if n_batches == 0 or n_samples == 0:
        return float("nan"), float("nan")

    return total_loss / n_batches, n_correct / n_samples * 100

In [None]:
def train_model(model, train_dataloader, valid_dataloader=None, epochs=2):
        """Train ``model`` for ``epochs`` passes over ``train_dataloader``.

        After every epoch the model is optionally evaluated with ``validate`` and a
        checkpoint is written to the current directory with ``save_checkpoint``.

        NOTE: ``optimizer``, ``scheduler``, ``loss_fct`` and ``device`` are not
        parameters of this function; they are read from the enclosing notebook scope.

        Args:
            model: module called as ``model(**batch_input)``, returning logits.
            train_dataloader: iterable yielding ``(batch_input, batch_label)`` pairs.
            valid_dataloader: optional loader passed to ``validate`` after each epoch.
            epochs: number of passes over ``train_dataloader``.
        """
        
        # Repeat the pass over train_dataloader `epochs` times
        for epoch in range(epochs):
            print(f"*****Epoch {epoch} Train Start*****")
            
            # Accumulators for the running (per logging window) loss and the epoch loss
            total_loss, batch_loss, batch_count = 0,0,0
        
            # Put the model in train mode and move it to the device
            model.train()
            model.to(device)
            
            # Iterate over the data, training on one batch at a time
            for step, batch in enumerate(train_dataloader):
                batch_count+=1
                
                # Move every tensor to the device before computing with it
                batch = tuple(item.to(device) for item in batch)
            
                batch_input, batch_label = batch
            
                # Clear the gradients left over from the previous batch
                model.zero_grad()
            
                # forward
                logits = model(**batch_input)
            
                # loss
                loss = loss_fct(logits, batch_label)
                batch_loss += loss.item()
                total_loss += loss.item()
            
                # backward -> computes the parameter gradients automatically
                loss.backward()
                
                # Apply gradient clipping (maximum norm 1.0)
                clip_grad_norm_(model.parameters(), 1.0)
                
                # Update the optimizer and the scheduler (both taken from the notebook scope)
                optimizer.step()
                scheduler.step()
                
                # Print the average loss and the learning rate every 10 steps
                if (step % 10 == 0 and step != 0):
                    learning_rate = optimizer.param_groups[0]['lr']
                    print(f"Epoch: {epoch}, Step : {step}, LR : {learning_rate}, Avg Loss : {batch_loss / batch_count:.4f}")

                    # reset 
                    batch_loss, batch_count = 0,0
            
            # `step` is the index of the last batch, so step+1 is the number of batches
            print(f"Epoch {epoch} Total Mean Loss : {total_loss/(step+1):.4f}")
            print(f"*****Epoch {epoch} Train Finish*****\n")
            
            if valid_dataloader is not None:
                print(f"*****Epoch {epoch} Valid Start*****")
                valid_loss, valid_acc = validate(model, valid_dataloader)
                print(f"Epoch {epoch} Valid Loss : {valid_loss:.4f} Valid Acc : {valid_acc:.2f}")
                print(f"*****Epoch {epoch} Valid Finish*****\n")
            
            # Save a checkpoint for this epoch
            save_checkpoint(".", model, optimizer, scheduler, epoch, total_loss/(step+1))
                
        print("Train Completed. End Program.")

In [None]:
train_model(model, train_dataloader, valid_dataloader, epochs)

*****Epoch 0 Train Start*****


RuntimeError: ignored

In [None]:
!nvidia-smi

Thu May 26 08:18:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0    31W /  70W |  15068MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces