<a href="https://colab.research.google.com/github/yeonhee-ryou/sigpl23-tutorial/blob/main/2_Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 환경설정

* 라이브러리 설치
* 구글 드라이브 마운트
* 텐서보드 연결

In [None]:
! pip install transformers datasets evaluate accelerate

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive.
drive.mount('/content/gdrive')

# Base folder on the mounted drive and the sub-folder for this tutorial's models.
# model_dir is later used as the parent of the Trainer output directories and
# as the TensorBoard log directory.
home_dir = "/content/gdrive/MyDrive/Colab-Data"
model_dir = f"{home_dir}/models/codebert-refinement"

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [None]:
# Open TensorBoard with model_dir as its log directory.
%tensorboard --logdir $model_dir

# CodeBERT Fine-tuning 학습하기

### 토크나이저, 데이터셋, 사전학습된 CodeBERT 모델 준비

#### `RobertaForCausalLM` 모델 구조
* Roberta 모델 + Causal Language Model 구조 사용
* Embedding Layer + 12 x Encoder Layer (Causal LM 구조에서는 Pooler Layer를 사용하지 않음)
  * Embedding Layer: batch_size * 514 * 50,265 -> batch_size * 514 * 768
  * Encoder Layer: batch_size * 514 * 768 -> batch_size * 514 * 768
* LM Layer: batch_size * 514 * 768 -> batch_size * 514 * 50,265 (어휘별 logits; argmax를 취하면 batch_size * 514 크기의 토큰 ID)

In [None]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the microsoft/codebert-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

In [None]:
from transformers import AutoModelForCausalLM

# Load the microsoft/codebert-base checkpoint through the causal-LM auto class,
# passing is_decoder=True, then print the resulting module structure.
model = AutoModelForCausalLM.from_pretrained("microsoft/codebert-base", is_decoder=True)
print(repr(model))

In [None]:
from datasets import load_dataset

# Load the "small" configuration of the code_x_glue_cc_code_refinement dataset.
ds = load_dataset("code_x_glue_cc_code_refinement", "small")

In [None]:
# Display the first training example to inspect its fields.
ds["train"][0]

### 데이터 전처리

In [None]:
def tokenize(examples):
  """Tokenize one batch of examples for training.

  The "buggy" texts become the model inputs and the token ids of the "fixed"
  texts become the labels. Both are tokenized with padding="max_length" and
  truncation enabled.

  Args:
    examples: a batch (mapping of column name to list of values) containing
      the "buggy" and "fixed" columns.

  Returns:
    A dict with "labels" followed by every field the tokenizer produced for
    the buggy inputs.
  """
  source_enc = tokenizer(examples["buggy"], padding="max_length", truncation=True)
  target_enc = tokenizer(examples["fixed"], padding="max_length", truncation=True)
  features = {"labels": target_enc.input_ids}
  features.update(source_enc)
  return features

# Apply the tokenization batch-wise to every split, then display the result.
tokenized_datasets = ds.map(tokenize, batched=True)
tokenized_datasets

### 샘플 데이터 준비

In [None]:
from datasets import DatasetDict

# Keep a fixed 1% of every split (shuffled with a fixed seed) for quick trial runs.
sample_ratio = 0.01
sample = {
  split: subset.shuffle(seed=1234).select(range(round(subset.num_rows * sample_ratio)))
  for split, subset in tokenized_datasets.items()
}

sample_datasets = DatasetDict(sample)
sample_datasets.num_rows

In [None]:
# Show which columns a tokenized training example carries.
sample_datasets["train"][0].keys()

### 모델 학습 설정 정의
* 평가식
* Hyperparameters

In [None]:
import evaluate
import numpy as np

bleu = evaluate.load("bleu")

def compute_metrics(eval_preds):
  """Compute BLEU between decoded predictions and decoded labels.

  Args:
    eval_preds: a ``(preds, labels)`` pair. ``preds`` may be per-token logits
      (3-D; the argmax over the last axis is taken) or already-selected token
      ids (2-D). A tuple of model outputs is reduced to its first element.
      ``labels`` are token ids.

  Returns:
    ``{"bleu": score}``. The score is 0.0 when every prediction decodes to an
    empty string.
  """
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]
  preds = np.asarray(preds)
  # Accept both raw logits and ids that were already reduced upstream.
  preds_ids = np.argmax(preds, axis=-1) if preds.ndim == 3 else preds

  # Negative ids (the -100 ignore index) cannot be decoded; map them to the pad
  # token, which skip_special_tokens then removes.
  pad_id = tokenizer.pad_token_id
  preds_ids = np.where(preds_ids < 0, pad_id, preds_ids)
  label_ids = np.asarray(labels)
  label_ids = np.where(label_ids < 0, pad_id, label_ids)

  decoded_preds = tokenizer.batch_decode(preds_ids, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

  # Avoid a division by zero inside BLEU when no prediction contains any text.
  if not any(text.strip() for text in decoded_preds):
    return {"bleu": 0.0}

  res = bleu.compute(predictions=decoded_preds, references=decoded_labels)
  return {"bleu": res["bleu"]}

In [None]:
import inspect

from transformers import TrainingArguments

# transformers 4.41 renamed `evaluation_strategy` to `eval_strategy`. Because the
# library version is not pinned, pick whichever keyword the installed version accepts.
eval_strategy_key = (
  "eval_strategy"
  if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
  else "evaluation_strategy"
)

# Hyperparameters for the trial run on the sampled data: evaluate and save a
# checkpoint once per epoch, with a linear schedule and 10% warmup.
args = TrainingArguments(
  output_dir=f"{model_dir}/sampled",
  save_strategy="epoch",
  num_train_epochs=3.0,
  per_device_train_batch_size=8,
  per_device_eval_batch_size =16,
  learning_rate=2e-5,
  lr_scheduler_type="linear",
  warmup_ratio=0.1,
  logging_steps=10,
  **{eval_strategy_key: "epoch"},
)



### 샘플 데이터에서 학습해보기

In [None]:
from transformers import Trainer

# Train on the sampled train split and evaluate on the sampled validation split.
# NOTE(review): compute_metrics receives the predictions for the whole evaluation
# set at once; if these are full logits, memory grows with
# (examples x sequence length x vocabulary size) — confirm it fits the runtime.
trainer = Trainer(
  model=model,
  args=args,
  train_dataset=sample_datasets["train"],
  eval_dataset=sample_datasets["validation"],
  compute_metrics=compute_metrics
)
# Start training from scratch (no checkpoint to resume from), then write the
# final model to the configured output directory.
trainer.train(resume_from_checkpoint=None)
trainer.save_model(args.output_dir)

### 전체 데이터에서 학습하기

In [None]:
import inspect

from transformers import TrainingArguments, Trainer

# transformers 4.41 renamed `evaluation_strategy` to `eval_strategy`. Because the
# library version is not pinned, pick whichever keyword the installed version accepts.
eval_strategy_key = (
  "eval_strategy"
  if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
  else "evaluation_strategy"
)

# Hyperparameters for the full run: evaluate and save a checkpoint once per epoch.
args = TrainingArguments(
  output_dir=f"{model_dir}/full",
  save_strategy="epoch",
  num_train_epochs=5.0,
  per_device_train_batch_size=8,
  per_device_eval_batch_size =16,
  learning_rate=2e-5,
  lr_scheduler_type="linear",
  warmup_ratio=0.1,
  logging_steps=20,
  seed=1234,
  **{eval_strategy_key: "epoch"},
)

def logits_to_token_ids(logits, labels):
  """Reduce each evaluation batch's logits to argmax token ids.

  The Trainer calls this per batch, before it accumulates predictions, so the
  full evaluation set is kept as token ids instead of vocabulary-sized logits.
  `labels` is part of the hook signature and is not used.
  """
  if isinstance(logits, tuple):
    logits = logits[0]
  return logits.argmax(dim=-1)

def bleu_from_token_ids(eval_preds):
  """Compute BLEU from predicted token ids and label token ids.

  Args:
    eval_preds: a ``(pred_ids, label_ids)`` pair, where ``pred_ids`` is the
      output of `logits_to_token_ids` accumulated over the evaluation set.

  Returns:
    ``{"bleu": score}``.
  """
  pred_ids, label_ids = eval_preds
  decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
  res = bleu.compute(predictions=decoded_preds, references=decoded_labels)
  return {"bleu": res["bleu"]}

# NOTE(review): `model` is the object created when the pretrained checkpoint was
# loaded; if the sampled-data training cell ran first, this run starts from those
# weights rather than from the pretrained ones — confirm that is intended.
trainer = Trainer(
  model=model,
  args=args,
  train_dataset=tokenized_datasets['train'],
  eval_dataset=tokenized_datasets['validation'],
  compute_metrics=bleu_from_token_ids,
  preprocess_logits_for_metrics=logits_to_token_ids,
)
trainer.train(resume_from_checkpoint=None)
trainer.save_model(args.output_dir)

### 테스트 데이터에서 정확도 검토하기

※ 참고: CodeXGLUE 리더보드 https://microsoft.github.io/CodeXGLUE/

In [None]:
# Evaluate the current trainer's model on the sampled test split.
trainer.evaluate(sample_datasets["test"])

In [None]:
from torch.utils.data import DataLoader

def stack_column(column):
  """Stack one collated batch column into a single 2-D tensor.

  Assumes the DataLoader's default collation delivers a list-valued column as
  a sequence of 1-D tensors (one per token position) — stacking on dim=1 then
  yields one row per example.
  """
  return torch.stack(column, dim=1)

# Put the model in evaluation mode so dropout is disabled, and send inputs to
# the device the model actually lives on.
model.eval()
model_device = model.device

predicts = []
labels = []
ds_test = sample_datasets["test"]
for batch in DataLoader(ds_test, batch_size=32):
  input_ids = stack_column(batch["input_ids"]).to(model_device)
  attention_mask = stack_column(batch["attention_mask"]).to(model_device)
  with torch.no_grad():
    model_out = model(input_ids=input_ids, attention_mask=attention_mask)
  # Take the highest-scoring token at every position and decode it to text.
  batch_pred_ids = torch.argmax(model_out.logits, dim=-1)
  batch_preds = tokenizer.batch_decode(batch_pred_ids, skip_special_tokens=True)
  batch_labels = tokenizer.batch_decode(stack_column(batch["labels"]), skip_special_tokens=True)
  predicts.extend(batch_preds)
  labels.extend(batch_labels)
  # Release cached GPU memory between batches.
  torch.cuda.empty_cache()

bleu.compute(predictions=predicts, references=labels)

### 저장된 모델 읽어서 실행해보기

In [None]:
import torch
from transformers import RobertaForCausalLM

ds_test = tokenized_datasets['test']

# Rebuild the model directory from the Drive base folder so this cell does not
# depend on a value computed much earlier in the notebook.
home_dir = "/content/gdrive/MyDrive/Colab-Data"
model_dir = f"{home_dir}/models/codebert-refinement"

# Use the GPU when present; fall back to the CPU instead of failing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model saved by the full training run and switch it to evaluation mode.
model = RobertaForCausalLM.from_pretrained(f"{model_dir}/full")
model.eval()
model.to(device)

In [None]:
# Place the inputs on whichever device the model currently lives on.
device = model.device
ex = ds_test[0]
input_data = dict(
  input_ids=torch.tensor([ex["input_ids"]]).to(device),
  attention_mask=torch.tensor([ex["attention_mask"]]).to(device)
)
# Inference only: no gradients are needed.
with torch.no_grad():
  model_out = model(**input_data)
# Show the reference fix (the "fixed" column) next to the decoded argmax prediction.
ex["fixed"], tokenizer.batch_decode(torch.argmax(model_out.logits, dim=-1), skip_special_tokens=True)