# Setting

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


라이브러리 다운로드

In [2]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-r34_bkn_
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-r34_bkn_
Building wheels for collected packages: kobert
  Building wheel for kobert (setup.py) ... [?25l[?25hdone
  Created wheel for kobert: filename=kobert-0.1.2-cp37-none-any.whl size=12708 sha256=5cdd899cf8e229d4e5f102352103be525521b80bad313493489aedecfbdaa865
  Stored in directory: /tmp/pip-ephem-wheel-cache-frur2tdb/wheels/a2/b0/41/435ee4e918f91918be41529283c5ff86cd010f02e7525aecf3
Successfully built kobert


라이브러리 불러오기

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import pandas as pd
import numpy as np
import re
import tarfile
import pickle as pickle
from tqdm import tqdm
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split

GPU 설정

In [4]:
device = torch.device("cuda:0")

kobert 불러오기

In [5]:
bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


# Preprocessing

In [6]:
def load_data(dataset_dir):
    with open('/content/drive/MyDrive/Colab Notebooks/label_type.pkl', 'rb') as f:
        label_type = pickle.load(f)
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
    dataset = preprocessing_dataset(dataset, label_type)
    return dataset

def preprocessing_dataset(dataset, label_type):
    label = []
    for i in dataset[8]:
        if i == 'blind':
            label.append(100)
        else:
            label.append(label_type[i])
    out_dataset = pd.DataFrame({'sentence':dataset[1],'entity_01':dataset[2],'entity_02':dataset[5],'label':label,})
    return out_dataset

In [7]:
dataset_path = r"/content/drive/MyDrive/Colab Notebooks/train.tsv"

dataset = load_data(dataset_path)

dataset['sentence'] = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02'] + ' [SEP] ' + dataset['sentence']

In [8]:
train, vali = train_test_split(dataset, test_size=0.2, random_state=42)
train[['sentence','label']].to_csv("/content/drive/MyDrive/Colab Notebooks/train_train.txt", sep='\t', index=False)
vali[['sentence','label']].to_csv("/content/drive/MyDrive/Colab Notebooks/train_vali.txt", sep='\t', index=False)

In [9]:
dataset_train = nlp.data.TSVDataset("/content/drive/MyDrive/Colab Notebooks/train_train.txt", field_indices=[0,1], num_discard_samples=1)
dataset_vali = nlp.data.TSVDataset("/content/drive/MyDrive/Colab Notebooks/train_vali.txt", field_indices=[0,1], num_discard_samples=1)

In [10]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [11]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [12]:
max_len = 128
batch_size = 64
num_epochs = 20
max_grad_norm = 1
log_interval = 50
learning_rate = 5e-5

In [13]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_vali = BERTDataset(dataset_vali, 0, 1, tok, max_len, True, False)

In [14]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
vali_dataloader = torch.utils.data.DataLoader(data_vali, batch_size=batch_size, num_workers=5)

  cpuset_checked))


# Classification

In [15]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 hidden_size_lstm = 84,
                 num_classes = 42,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.hidden_size = hidden_size
        self.hidden_size_lstm = hidden_size_lstm
        self.lstm = nn.LSTM(hidden_size, hidden_size_lstm, 1, bias=True, batch_first=True, dropout=dr_rate, bidirectional=True)
        self.linear = nn.Linear(hidden_size_lstm*2, num_classes)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        out, _ = self.lstm(pooler.view(-1,1,self.hidden_size))
        out = self.linear(out.view(-1,self.hidden_size_lstm*2))
        return out

In [16]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

  "num_layers={}".format(dropout, num_layers))


In [17]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [18]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes=42, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [19]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = LabelSmoothingLoss()

In [20]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [21]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    best_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(vali_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    if test_acc >= best_acc:
        best_acc = test_acc
        torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/model_state_dict.pt")

  cpuset_checked))


epoch 1 batch id 1 loss 3.7276082038879395 train acc 0.0
epoch 1 batch id 51 loss 2.763206720352173 train acc 0.47824754901960786
epoch 1 batch id 101 loss 2.268035888671875 train acc 0.47957920792079206
epoch 1 train acc 0.4831305309734513
epoch 1 test acc 0.5
epoch 2 batch id 1 loss 2.3801136016845703 train acc 0.4375
epoch 2 batch id 51 loss 2.3603382110595703 train acc 0.49356617647058826
epoch 2 batch id 101 loss 2.034639835357666 train acc 0.4969059405940594
epoch 2 train acc 0.5026272123893806
epoch 2 test acc 0.5366379310344828
epoch 3 batch id 1 loss 2.1176462173461914 train acc 0.484375
epoch 3 batch id 51 loss 1.999138355255127 train acc 0.5373774509803921
epoch 3 batch id 101 loss 1.5816508531570435 train acc 0.5454826732673267
epoch 3 train acc 0.5529590707964602
epoch 3 test acc 0.5818965517241379
epoch 4 batch id 1 loss 1.7162022590637207 train acc 0.578125
epoch 4 batch id 51 loss 1.781417727470398 train acc 0.6243872549019608
epoch 4 batch id 101 loss 1.307090401649475

# Predict

In [22]:
dataset_path = r"/content/drive/MyDrive/Colab Notebooks/test.tsv"

dataset = load_data(dataset_path)

dataset['sentence'] = dataset['entity_01'] + ' [SEP] ' + dataset['entity_02'] + ' [SEP] ' + dataset['sentence']

dataset[['sentence','label']].to_csv("/content/drive/MyDrive/Colab Notebooks/test.txt", sep='\t', index=False)

In [23]:
dataset_test = nlp.data.TSVDataset("/content/drive/MyDrive/Colab Notebooks/test.txt", field_indices=[0,1], num_discard_samples=1)

data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

  cpuset_checked))


In [24]:
model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/model_state_dict.pt"))

model.eval()

Predict = []

for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length = valid_length
    label = label.long().to(device)
    out = model(token_ids, valid_length, segment_ids)
    _, predict = torch.max(out,1)
    Predict.extend(predict.tolist())

  cpuset_checked))


In [25]:
output = pd.DataFrame(Predict, columns=['pred'])
output.to_csv('/content/drive/MyDrive/Colab Notebooks/submission.csv', index=False)