In [1]:
import os
import random
import pickle
import numpy as np
import pandas as pd
import transformers
from tqdm import tqdm
from importlib import import_module

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader
from transformers import ElectraModel, ElectraPreTrainedModel, ElectraTokenizer, AdamW

In [2]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Seed every random number generator the notebook touches with the same value
# so that repeated runs start from the same state.
seed = 108
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
# Training hyperparameters shared by the cells below.
num_epochs = 30  # number of passes over the training DataLoader
lr = 1e-5  # learning rate handed to the AdamW optimizer
batch_size = 32  # batch size of both the training and validation DataLoaders

In [4]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs encoded inputs with one label per example.

    Args:
        dataset: mapping from field name to an indexable container; each
            container is indexed with the example index in ``__getitem__``.
        labels: indexable sequence of labels; its length defines the
            dataset length.
    """

    def __init__(self, dataset, labels):
        self.dataset = dataset
        self.labels = labels

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every field at ``idx`` plus a ``'labels'`` tensor."""
        item = dict()
        for key, val in self.dataset.items():
            item[key] = val[idx]
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [5]:
def load_data(data_dir):
    """Read a tab-separated COPA file and build two sentence pairs per row.

    The file is read with its first column as the index and must contain the
    columns ``'sentence'``, ``'question'``, ``'1'`` and ``'2'``. For every row
    two (first segment, second segment) pairs are produced, one per candidate:

    * ``question == '결과'``: the sentence, prefixed with ``'[결과]'``, is the
      first segment and the candidate is the second segment.
    * any other ``question`` value: the candidate, prefixed with ``'[원인]'``,
      is the first segment and the sentence is the second segment.

    Args:
        data_dir: path of the TSV file to read (a file path, despite the name).

    Returns:
        The loaded DataFrame with four added columns: ``'1_1'``/``'1_2'`` hold
        the pair built from candidate ``'1'`` and ``'2_1'``/``'2_2'`` the pair
        built from candidate ``'2'``.
    """
    data = pd.read_csv(data_dir, sep='\t', index_col=0)

    first_of_pair1, second_of_pair1 = [], []
    first_of_pair2, second_of_pair2 = [], []

    # Walk the columns positionally instead of using data.loc[idx, col]:
    # a label lookup returns a Series when the index contains duplicates
    # (which makes the `if` below raise) and is much slower per row.
    rows = zip(data['sentence'], data['question'], data['1'], data['2'])
    for sentence, question, one, two in rows:
        if question == '결과':
            first_of_pair1.append('[결과]' + sentence)
            second_of_pair1.append(one)
            first_of_pair2.append('[결과]' + sentence)
            second_of_pair2.append(two)
        else:
            first_of_pair1.append('[원인]' + one)
            second_of_pair1.append(sentence)
            first_of_pair2.append('[원인]' + two)
            second_of_pair2.append(sentence)

    data['1_1'] = first_of_pair1
    data['1_2'] = second_of_pair1
    data['2_1'] = first_of_pair2
    data['2_2'] = second_of_pair2
    return data

In [6]:
def tokenize_data(data, tokenizer, max_length=150):
    """Tokenize both candidate sentence pairs and merge them into one encoding.

    Args:
        data: DataFrame with the string columns ``'1_1'``, ``'1_2'``,
            ``'2_1'`` and ``'2_2'``.
        tokenizer: callable invoked as ``tokenizer(first_texts, second_texts,
            **options)`` that returns a mutable mapping supporting ``.items()``
            and item assignment.
        max_length: truncation length passed to the tokenizer. Defaults to
            150, the value that was previously hard-coded.

    Returns:
        The mapping returned for the (``'1_1'``, ``'1_2'``) pair, extended in
        place with every entry of the (``'2_1'``, ``'2_2'``) encoding stored
        under the same key with a ``'2'`` suffix (e.g. ``'input_ids2'``).
    """
    def encode(first_col, second_col):
        # Single place for the tokenizer options so both pairs are always
        # encoded with identical settings.
        return tokenizer(
            data[first_col].tolist(),
            data[second_col].tolist(),
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=True,
            return_token_type_ids=True
        )

    encoded = encode('1_1', '1_2')
    encoded_second = encode('2_1', '2_2')

    # The two encodings are produced by separate calls and merged under
    # suffixed keys.
    for key, value in encoded_second.items():
        encoded[key + '2'] = value

    return encoded

# tokenizer 불러오기

In [7]:
# Load the tokenizer from the same checkpoint name that ElectraClassifier
# loads its encoder weights from.
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

# data 불러오기

In [8]:
# Both TSV files are expected under ./data/task3 relative to the working directory.
base_path = os.path.join(os.curdir, 'data', 'task3')
# Training split.
file_name = 'SKT_COPA_Train.tsv'
train_data = load_data(os.path.join(base_path, file_name))
# Dev split, used as the validation set; note that `file_name` is reused here.
file_name = 'SKT_COPA_Dev.tsv'
val_data = load_data(os.path.join(base_path, file_name))

# label 생성

In [9]:
# Cast the 'Answer' column to int and subtract 1 to obtain class indices as a
# NumPy array (presumably answers 1/2 become 0/1 — verify against the data).
train_label = (train_data['Answer'].astype(int) - 1).values
val_label = (val_data['Answer'].astype(int) - 1).values

# data tokenize

In [10]:
# Encode both candidate pairs of every row; each split is tokenized (and
# padded) in one call over the whole frame.
tokenized_train_data = tokenize_data(train_data, tokenizer)
tokenized_val_data = tokenize_data(val_data, tokenizer)

# dataset 생성

In [11]:
# Wrap the encodings and their labels so they can be indexed per example.
train_dataset = MyDataset(tokenized_train_data, train_label)
val_dataset = MyDataset(tokenized_val_data, val_label)

In [12]:
# Training batches are shuffled and the last incomplete batch is dropped; the
# training-accuracy formula in the training cell divides by
# batch_size * len(train_data_loader), which is only exact for full batches.
train_data_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True
)
# Validation keeps the original order and every example.
val_data_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False
)

# Pooling Layer

In [13]:
class MyPooling(nn.Module):
    """Pooling head: dropout, then a linear projection, then ``tanh``.

    Args:
        input_dim: size of the last dimension of the input.
        output_dim: size of the last dimension of the output.
        dropout: dropout probability applied to the input before the
            linear projection.
    """

    def __init__(self, input_dim, output_dim, dropout):
        super().__init__()
        self.dense = nn.Linear(input_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``tanh(dense(dropout(x)))``."""
        projected = self.dense(self.dropout(x))
        return torch.tanh(projected)

# My Model

In [14]:
class ElectraClassifier(ElectraPreTrainedModel):
    """Two-candidate scorer built on a shared ELECTRA encoder.

    Both candidate sequences are passed through the same encoder, pooling
    layer and linear head; the per-candidate outputs are concatenated along
    dim 1 to form the logits. With ``config.num_labels == 2`` the head emits
    one score per candidate, so the logits have two columns.
    """

    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        # The encoder weights are always loaded from this checkpoint name;
        # `config` only supplies the configuration object.
        self.model = ElectraModel.from_pretrained('monologg/koelectra-base-v3-discriminator', config=config)

        self.pooling = MyPooling(
            input_dim=config.hidden_size,
            output_dim=config.hidden_size,
            dropout=0.1
        )

        # num_labels - 1 outputs per candidate; forward() concatenates the
        # outputs of both candidates.
        self.fc = nn.Linear(config.hidden_size, self.num_labels - 1)

    def forward(
        self, input_ids, attention_mask, token_type_ids, input_ids2, attention_mask2, token_type_ids2, labels=None
    ):
        """Score both candidates and return ``(logits, ...)``.

        Args:
            input_ids, attention_mask, token_type_ids: encoding of the first
                candidate pair.
            input_ids2, attention_mask2, token_type_ids2: encoding of the
                second candidate pair.
            labels: accepted so a whole batch dict can be splatted into the
                call, but not used here; the loss is computed by the caller.
                Optional, so inference does not need to supply labels.

        Returns:
            A tuple whose first element is the concatenated logits, followed
            by whatever the first encoder output holds from index 2 onward.
        """
        # Keep this call order (both encoder passes, then both pooling
        # passes) so dropout consumes random numbers in the same sequence.
        encoded1 = self.model(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )
        encoded2 = self.model(
            input_ids2, attention_mask=attention_mask2, token_type_ids=token_type_ids2
        )

        # Representation at sequence position 0 (presumably the [CLS] token).
        first_token1 = encoded1[0][:, 0, :]
        first_token2 = encoded2[0][:, 0, :]

        pooled_x1 = self.pooling(first_token1)
        pooled_x2 = self.pooling(first_token2)

        logits1 = self.fc(pooled_x1)
        logits2 = self.fc(pooled_x2)

        logits = torch.cat([logits1, logits2], dim=1)

        # NOTE(review): the slice starts at index 2, so the element at index 1
        # of the encoder output (if any) is not forwarded — confirm intended.
        return (logits,) + encoded1[2:]

# Model 생성

In [15]:
# Look up the `ElectraConfig` attribute on the `transformers` module by name.
config_module = getattr(import_module("transformers"), "ElectraConfig")

In [16]:
# Load the configuration published with the checkpoint used for the encoder.
model_config = config_module.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [17]:
# Two labels: ElectraClassifier sizes its head as num_labels - 1 outputs per
# candidate and concatenates the two candidates' outputs.
model_config.num_labels = 2

In [18]:
# Build the classifier; its __init__ loads the pretrained encoder weights.
model = ElectraClassifier(config=model_config)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Move the model to the selected device; as the cell's last expression this
# also echoes the module structure.
model.to(device)

ElectraClassifier(
  (model): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

# Criterion, Optimizer, Scheduler

In [19]:
# Cross-entropy over the concatenated per-candidate logits.
criterion = nn.CrossEntropyLoss()
criterion.to(device)
# AdamW here is the class imported from `transformers` at the top of the
# notebook; weight decay is applied to every parameter of the model.
# NOTE(review): check that the installed transformers version still exports
# AdamW; torch.optim.AdamW is the usual substitute.
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01, eps=1e-8)

In [23]:
# Linear warmup followed by linear decay, stepped once per batch in the
# training loop. The total step count is batches-per-epoch * num_epochs.
# NOTE(review): 500 warmup steps is a sizeable share of the schedule if an
# epoch has 96 batches, as the recorded progress bars show — confirm intended.
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,
    num_training_steps=len(train_data_loader) * num_epochs,
    last_epoch=-1
)

# 학습

In [22]:
# Train for `num_epochs` epochs, validate after each one and keep the weights
# of the epoch with the highest validation accuracy.
best_val_acc = 0

# torch.save() does not create missing parent directories, so make sure the
# checkpoint folder exists before the first "best model" is written.
os.makedirs('./model', exist_ok=True)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    loss_value = 0
    matches = 0
    for temp_items in tqdm(train_data_loader, desc='training: %d/%d' % (epoch+1, num_epochs)):
        items = {key: val.to(device) for key, val in temp_items.items()}

        optimizer.zero_grad()
        outs = model(**items)
        # .long() changes only the dtype and keeps the labels on their current
        # device. The previous .type('torch.LongTensor') produced a CPU
        # tensor, which mismatches the logits whenever the model runs on CUDA.
        loss = criterion(outs[0], items['labels'].long())

        preds = torch.argmax(outs[0], dim=-1)

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step

        loss_value += loss.item()
        matches += (preds == items['labels']).sum().item()
    train_loss = loss_value / len(train_data_loader)
    # Exact only because the training loader drops the last incomplete batch.
    train_acc = matches / batch_size / len(train_data_loader)
    print("Epoch: %d/%d" % (epoch+1, num_epochs))
    print("train loss: %4.4f" % train_loss)
    print("train accuracy: %.4f" % train_acc)

    # ---- validation pass ----
    with torch.no_grad():
        model.eval()
        val_loss_list = list()
        correct = 0
        n = 0

        for temp_items in val_data_loader:
            items = {key: val.to(device) for key, val in temp_items.items()}
            outs = model(**items)
            preds = torch.argmax(outs[0], dim=-1)
            val_loss = criterion(outs[0], items['labels'].long()).item()
            val_acc = (items['labels'] == preds).sum().item()

            val_loss_list.append(val_loss)
            correct += val_acc
            n += len(preds)

        # Mean of the per-batch losses (batches are weighted equally).
        val_loss = np.sum(val_loss_list) / len(val_loss_list)
        val_acc = correct / n

        # The validation loss used to be computed and then discarded.
        print("validation loss: %4.4f" % val_loss)
        print("validation accuracy: %.4f" % val_acc)
        if val_acc > best_val_acc:
            print("New best model!!")
            best_val_acc = val_acc
            torch.save(model.state_dict(), './model/task3_best_model.pt')

training: 1/30: 100%|██████████████████████████████████████████████████████████████████| 96/96 [16:02<00:00, 10.03s/it]


Epoch: 1/30
train loss: 0.6903
train accuracy: 0.5290
validation accuracy: 0.7060
New best model!!


training: 2/30: 100%|██████████████████████████████████████████████████████████████████| 96/96 [17:05<00:00, 10.68s/it]


Epoch: 2/30
train loss: 0.6520
train accuracy: 0.6751
validation accuracy: 0.7960
New best model!!


training: 3/30: 100%|██████████████████████████████████████████████████████████████████| 96/96 [16:22<00:00, 10.24s/it]


Epoch: 3/30
train loss: 0.4389
train accuracy: 0.8203
validation accuracy: 0.8480
New best model!!


training: 4/30: 100%|██████████████████████████████████████████████████████████████████| 96/96 [16:15<00:00, 10.16s/it]


Epoch: 4/30
train loss: 0.2879
train accuracy: 0.8812
validation accuracy: 0.8720
New best model!!


training: 5/30: 100%|██████████████████████████████████████████████████████████████████| 96/96 [16:08<00:00, 10.09s/it]


Epoch: 5/30
train loss: 0.2077
train accuracy: 0.9170
validation accuracy: 0.8680


training: 6/30: 100%|██████████████████████████████████████████████████████████████████| 96/96 [16:15<00:00, 10.16s/it]


Epoch: 6/30
train loss: 0.1443
train accuracy: 0.9453
validation accuracy: 0.8660


training: 7/30: 100%|██████████████████████████████████████████████████████████████████| 96/96 [16:06<00:00, 10.07s/it]


Epoch: 7/30
train loss: 0.1024
train accuracy: 0.9671
validation accuracy: 0.8680


training: 8/30: 100%|██████████████████████████████████████████████████████████████████| 96/96 [16:16<00:00, 10.17s/it]


Epoch: 8/30
train loss: 0.0835
train accuracy: 0.9730
validation accuracy: 0.8600


training: 9/30: 100%|██████████████████████████████████████████████████████████████████| 96/96 [16:22<00:00, 10.23s/it]


Epoch: 9/30
train loss: 0.0606
train accuracy: 0.9801
validation accuracy: 0.8580


training: 10/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:36<00:00, 10.38s/it]


Epoch: 10/30
train loss: 0.0416
train accuracy: 0.9863
validation accuracy: 0.8700


training: 11/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:08<00:00, 10.09s/it]


Epoch: 11/30
train loss: 0.0396
train accuracy: 0.9857
validation accuracy: 0.8740
New best model!!


training: 12/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:03<00:00, 10.03s/it]


Epoch: 12/30
train loss: 0.0316
train accuracy: 0.9889
validation accuracy: 0.8740


training: 13/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:14<00:00, 10.15s/it]


Epoch: 13/30
train loss: 0.0276
train accuracy: 0.9925
validation accuracy: 0.8640


training: 14/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:08<00:00, 10.09s/it]


Epoch: 14/30
train loss: 0.0223
train accuracy: 0.9938
validation accuracy: 0.8720


training: 15/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:09<00:00, 10.10s/it]


Epoch: 15/30
train loss: 0.0244
train accuracy: 0.9919
validation accuracy: 0.8720


training: 16/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:12<00:00, 10.13s/it]


Epoch: 16/30
train loss: 0.0200
train accuracy: 0.9941
validation accuracy: 0.8760
New best model!!


training: 17/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:15<00:00, 10.16s/it]


Epoch: 17/30
train loss: 0.0216
train accuracy: 0.9935
validation accuracy: 0.8720


training: 18/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:21<00:00, 10.23s/it]


Epoch: 18/30
train loss: 0.0175
train accuracy: 0.9935
validation accuracy: 0.8720


training: 19/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:16<00:00, 10.17s/it]


Epoch: 19/30
train loss: 0.0160
train accuracy: 0.9958
validation accuracy: 0.8660


training: 20/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:10<00:00, 10.11s/it]


Epoch: 20/30
train loss: 0.0108
train accuracy: 0.9971
validation accuracy: 0.8680


training: 21/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:18<00:00, 10.19s/it]


Epoch: 21/30
train loss: 0.0133
train accuracy: 0.9961
validation accuracy: 0.8700


training: 22/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:18<00:00, 10.19s/it]


Epoch: 22/30
train loss: 0.0097
train accuracy: 0.9984
validation accuracy: 0.8700


training: 23/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:16<00:00, 10.17s/it]


Epoch: 23/30
train loss: 0.0089
train accuracy: 0.9967
validation accuracy: 0.8740


training: 24/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:17<00:00, 10.18s/it]


Epoch: 24/30
train loss: 0.0090
train accuracy: 0.9974
validation accuracy: 0.8700


training: 25/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:15<00:00, 10.16s/it]


Epoch: 25/30
train loss: 0.0070
train accuracy: 0.9977
validation accuracy: 0.8740


training: 26/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:16<00:00, 10.17s/it]


Epoch: 26/30
train loss: 0.0099
train accuracy: 0.9974
validation accuracy: 0.8760


training: 27/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:15<00:00, 10.16s/it]


Epoch: 27/30
train loss: 0.0110
train accuracy: 0.9958
validation accuracy: 0.8780
New best model!!


training: 28/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:15<00:00, 10.16s/it]


Epoch: 28/30
train loss: 0.0069
train accuracy: 0.9990
validation accuracy: 0.8760


training: 29/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:15<00:00, 10.16s/it]


Epoch: 29/30
train loss: 0.0067
train accuracy: 0.9987
validation accuracy: 0.8760


training: 30/30: 100%|█████████████████████████████████████████████████████████████████| 96/96 [16:17<00:00, 10.18s/it]


Epoch: 30/30
train loss: 0.0095
train accuracy: 0.9971
validation accuracy: 0.8780


# Load Saved Model

In [20]:
# Restore the best checkpoint written by the training loop, mapping the stored
# tensors onto the active device, then echo the model structure.
model.load_state_dict(
    torch.load('./model/task3_best_model.pt', map_location=device)
)
model.to(device)

ElectraClassifier(
  (model): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [23]:
# Validation accuracy of the model currently held in `model`.
# NOTE(review): the recorded output below (0.8840) is higher than the best
# accuracy in the training log (0.8780), and the execution counts are out of
# order — re-run the notebook top to bottom to confirm which run it reflects.
with torch.no_grad():
    model.eval()
    correct = 0
    n = 0

    for temp_items in val_data_loader:
        items = {key: val.to(device) for key, val in temp_items.items()}
        outs = model(**items)
        preds = torch.argmax(outs[0], dim=-1)

        correct += (items['labels'] == preds).sum().item()
        n += len(preds)

    val_acc = correct / n

    print("validation accuracy: %.4f" % val_acc)

validation accuracy: 0.8840
