<a href="https://colab.research.google.com/github/soavril/graduation_thesis/blob/main/graduation_ver3_distilKobert_cost_weight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
!pip install kobert-transformers

In [None]:
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
# Download the standalone `tokenization_kobert.py` module from the
# KoBERT-Transformers repository into the current working directory.
# NOTE(review): no visible cell imports `tokenization_kobert` — confirm this
# download is still needed.
!wget https://raw.githubusercontent.com/monologg/KoBERT-Transformers/master/kobert_transformers/tokenization_kobert.py

In [None]:
!wget https://github.com/agaldran/cost_sensitive_loss_classification/blob/master/utils/losses.py

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from mxnet.gluon import nn
from mxnet import gluon
import mxnet as mx
import gluonnlp as nlp
from tqdm import tqdm, tqdm_notebook

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertModel, DistilBertModel
from kobert_transformers import get_kobert_model, get_distilkobert_model
from kobert_transformers import get_tokenizer

from kobert import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.metrics import f1_score

In [None]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the
# CPU so the notebook still runs (slowly) on a runtime without a GPU instead of
# failing at the first `.to(device)` call.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the pretrained DistilKoBERT encoder identified by 'monologg/distilkobert'.
distilbert_model = DistilBertModel.from_pretrained(
    'monologg/distilkobert'
)

In [None]:
# get_pytorch_kobert_model() returns a pair; only the second element, `vocab`,
# is used by the visible cells (it is passed to the tokenizer). The first
# element is bound to `no_use` and not referenced again in the visible cells.
no_use, vocab = get_pytorch_kobert_model()

In [None]:
# Read the train and test TSV files. Only fields 0 and 1 of every row are
# kept, and the first line of each file is discarded (presumably a header row).
dataset_train = nlp.data.TSVDataset(
    '/content/drive/MyDrive/train_data_cost.tsv',
    field_indices=[0, 1],
    num_discard_samples=1,
)
dataset_test = nlp.data.TSVDataset(
    '/content/drive/MyDrive/test_data_cost.tsv',
    field_indices=[0, 1],
    num_discard_samples=1,
)

# Model setup and training

In [None]:
# --- Data ---
max_len = 300          # maximum sequence length handed to the sentence transform
batch_size = 16        # examples per DataLoader batch

# --- Optimisation ---
num_epochs = 3         # full passes over the training set
learning_rate = 5e-5   # learning rate passed to AdamW
warmup_ratio = 0.1     # fraction of all scheduler steps used for warmup
max_grad_norm = 1      # gradient-norm clipping threshold

# --- Logging ---
log_interval = 200     # print training progress every this many batches

In [None]:
class BERTDataset(mx.gluon.data.Dataset):
    """Pairs BERT-transformed sentences with their integer labels.

    Args:
        dataset: Iterable of rows; each row is indexed with ``sent_idx`` and
            ``label_idx``.
        sent_idx: Position of the sentence text inside a row.
        label_idx: Position of the label inside a row.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Maximum sequence length for the transform.
        pad: Forwarded to ``BERTSentenceTransform`` as ``pad``.
        pair: Forwarded to ``BERTSentenceTransform`` as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        sentence_transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # Each sentence is wrapped in a one-element list before being
        # handed to the transform.
        wrapped_sentences = [[row[sent_idx]] for row in dataset]
        self.sentences = gluon.data.SimpleDataset(
            wrapped_sentences).transform(sentence_transform)

        # Labels are converted to int32 and stored as NumPy arrays.
        int_labels = [np.array(np.int32(row[label_idx])) for row in dataset]
        self.labels = gluon.data.SimpleDataset(int_labels)

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label, )

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

In [None]:
# Tokenization: build the BERT SentencePiece tokenizer from the tokenizer
# returned by get_tokenizer() and the KoBERT vocabulary (case is preserved).
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Wrap the raw rows with BERTDataset: field 0 is the sentence, field 1 the
# label; padding is enabled and sentence-pair mode is disabled.
data_train = BERTDataset(
    dataset=dataset_train,
    sent_idx=0,
    label_idx=1,
    bert_tokenizer=tok,
    max_len=max_len,
    pad=True,
    pair=False,
)
data_test = BERTDataset(
    dataset=dataset_test,
    sent_idx=0,
    label_idx=1,
    bert_tokenizer=tok,
    max_len=max_len,
    pad=True,
    pair=False,
)

In [None]:
# Batch both datasets with PyTorch DataLoaders.
# The training loader reshuffles the examples every epoch so that batches are
# not tied to the row order of the TSV file. The test loader keeps the file
# order so that the saved predictions line up with the input rows.
train_dataloader = torch.utils.data.DataLoader(
    data_train, batch_size=batch_size, num_workers=2, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(
    data_test, batch_size=batch_size, num_workers=2, shuffle=False)

In [None]:
class distilBERTClassifier(nn.Module):
    """Linear classification head on top of a DistilBERT-style encoder.

    The hidden state at sequence position 0 of the encoder output is optionally
    passed through dropout and then through one linear layer that produces
    ``num_classes`` logits.

    Args:
        bert: Encoder module, called as
            ``bert(input_ids=..., attention_mask=...)``; element 0 of its
            output is used as the hidden states.
        hidden_size: Width of the hidden states fed to the linear layer.
        num_classes: Number of output logits. Change this to match the number
            of classes in the dataset.
        dr_rate: Dropout probability applied before the linear layer. A falsy
            value (``None`` or ``0``) disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes= 2,
                 dr_rate=None,
                 params=None):
        super(distilBERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, token_ids, valid_length, segment_ids = None):
        """Return the class logits for a batch.

        Args:
            token_ids: Integer tensor of token ids; dimension 0 indexes the
                examples and dimension 1 the sequence positions.
            valid_length: Per-example count of leading positions to attend to
                (tensor or sequence of ints, one entry per example).
            segment_ids: Accepted for interface compatibility; not used.

        Returns:
            Tensor of logits with one row per example and ``num_classes``
            columns.
        """
        # Build the attention mask in one vectorised comparison: position j of
        # example i is attended to iff j < valid_length[i].
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        attention_mask = (positions.unsqueeze(0) < lengths).float()

        output = self.bert(input_ids = token_ids, attention_mask = attention_mask)
        hidden_state = output[0]
        output = hidden_state[:, 0]
        # `self.dropout` only exists when a dropout rate was given, so it must
        # not be called unconditionally.
        if self.dr_rate:
            output = self.dropout(output)
        output = self.classifier(output)

        return output

In [None]:
# Attach the classification head (dropout 0.5) to the pretrained encoder and
# move the whole model to `device`.
model = distilBERTClassifier(bert=distilbert_model, dr_rate=0.5)
model = model.to(device)

In [None]:
# Prepare the optimizer and the learning-rate schedule (warmup, then cosine decay).
# Biases and layer-norm weights are excluded from weight decay. DistilBERT names
# the norms inside its transformer blocks `sa_layer_norm` / `output_layer_norm`,
# which the BERT-style pattern 'LayerNorm.weight' does not match on its own, so
# 'layer_norm.weight' is listed as well.
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Per-class weights for the cross-entropy loss (class 0, class 1).
# NOTE(review): presumably derived from the class frequencies — confirm.
class_weights = torch.Tensor([0.1597, 0.8403]).to(device)

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss(weight = class_weights)

# The scheduler is stepped once per training batch, so the total number of
# scheduler steps is (batches per epoch) x (epochs).
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max column equals ``Y``.

    Args:
        X: 2-D tensor of scores, one row per example.
        Y: 1-D tensor of integer labels, one per row of ``X``.

    Returns:
        The number of matching rows divided by the number of rows, as a NumPy
        scalar.
    """
    _, predicted = torch.max(X, 1)
    n_rows = predicted.size()[0]
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    return n_correct / n_rows

In [None]:
# Fine-tune for `num_epochs` epochs; after every epoch evaluate on the test set.
# `output` is rebuilt on every evaluation pass, so after the loop it holds one
# logit vector per test example from the final epoch.
for e in range(num_epochs):
    # Running sums of per-batch accuracy; divided by the batch count when printed.
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        # `segment_ids` is not consumed by the model, so it is left on the host
        # instead of being copied to the device on every step.
        token_ids = token_ids.long().to(device)
        valid_length = valid_length.to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule (once per batch)
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    with torch.no_grad():
        model.eval()
        output = []
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            valid_length = valid_length.to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length)
            test_acc += calc_accuracy(out, label)
            # Move the whole batch of logits to the host in one transfer and
            # append its rows (one 1-D array per example).
            output.extend(out.detach().cpu().numpy())
        print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Predicted class per test example: index of the largest logit in each row of
# the logits collected during the last evaluation pass.
result = np.asarray(output).argmax(axis=1)

In [None]:
# Wrap the predicted class indices in a single-column DataFrame.
result_data = pd.DataFrame(data=result)

In [None]:
# Save the predictions as UTF-8 CSV without a header row. The row index is
# still written as the first column (pandas default).
result_data.to_csv(
    '/content/drive/MyDrive/result_cost.csv',
    header=False,
    encoding='utf-8',
)

In [None]:
# Block this cell forever (presumably to keep the runtime busy after the
# results are saved — confirm). Sleeping inside the loop avoids the 100% CPU
# spin of an empty `while True: pass`; interrupt the kernel to stop it.
import time

while True:
    time.sleep(60)