<a href="https://colab.research.google.com/github/seopp/DACON_Sentence_Type_Classification/blob/main/DACON_%EB%AC%B8%EC%9E%A5%EC%9C%A0%ED%98%95%EB%B6%84%EB%A5%98_%EB%B2%A0%EC%9D%B4%EC%8A%A4%EB%9D%BC%EC%9D%B8%EC%88%98%EC%A0%95_0210.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# !pip install transformers

## Import

In [7]:
import pandas as pd
import numpy as np
import torch
import os
import random
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer



# for graphing
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [9]:
CFG = {
    'EPOCHS':10,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':256,
    'SEED':41
}

## Fixed RandomSeed

In [10]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Load

In [11]:
df = pd.read_csv('/content/drive/MyDrive/DACON/문장유형분류/train.csv')
df.drop(columns=['ID'], inplace=True)

test = pd.read_csv('/content/drive/MyDrive/DACON/문장유형분류/test.csv')
test.drop(columns=['ID'], inplace=True)

submission = pd.read_csv('/content/drive/MyDrive/DACON/문장유형분류/sample_submission.csv')

In [12]:
df.tail()

Unnamed: 0,문장,유형,극성,시제,확실성,label
16536,"＇신동덤＇은 ＇신비한 동물사전＇과 ＇해리 포터＇ 시리즈를 잇는 마법 어드벤처물로, ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
16537,"수족냉증은 어릴 때부터 심했으며 관절은 어디 한 곳이 아니고 목, 어깨, 팔꿈치, ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실
16538,김금희 소설가는 ＂계약서 조정이 그리 어려운가 작가를 격려한다면서 그런 문구 하나 ...,사실형,긍정,과거,확실,사실형-긍정-과거-확실
16539,1만명이 넘는 방문자수를 기록한 이번 전시회는 총 77개 작품을 넥슨 사옥을 그대로...,사실형,긍정,과거,불확실,사실형-긍정-과거-불확실
16540,《목민심서》의 내용이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실


## Train / Validation Split

In [13]:
# 제공된 학습데이터를 학습 / 검증 데이터셋으로 재 분할
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CFG['SEED'])
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)

In [None]:
model_nm = 'klue/roberta-small'
base_model = AutoModel.from_pretrained(model_nm)
tokenizer = AutoTokenizer.from_pretrained(model_nm)

## Pre-processing

In [15]:
type_le = LabelEncoder()
train["유형"] = type_le.fit_transform(train["유형"].values)
val["유형"] = type_le.transform(val["유형"].values)

polarity_le = LabelEncoder()
train["극성"] = polarity_le.fit_transform(train["극성"].values)
val["극성"] = polarity_le.transform(val["극성"].values)

tense_le = LabelEncoder()
train["시제"] = tense_le.fit_transform(train["시제"].values)
val["시제"] = tense_le.transform(val["시제"].values)

certainty_le = LabelEncoder()
train["확실성"] = certainty_le.fit_transform(train["확실성"].values)
val["확실성"] = certainty_le.transform(val["확실성"].values)


In [16]:
train_type = train["유형"].values # sentence type
train_polarity = train["극성"].values # sentence polarity
train_tense = train["시제"].values # sentence tense
train_certainty = train["확실성"].values # sentence certainty

train_labels = {
    'type' : train_type,
    'polarity' : train_polarity,
    'tense' : train_tense,
    'certainty' : train_certainty
}

In [17]:
val_type = val["유형"].values # sentence type
val_polarity = val["극성"].values # sentence polarity
val_tense = val["시제"].values # sentence tense
val_certainty = val["확실성"].values # sentence certainty

val_labels = {
    'type' : val_type,
    'polarity' : val_polarity,
    'tense' : val_tense,
    'certainty' : val_certainty
}

## CustomDataset

In [18]:
class SentenceTypeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, labels=None):
        texts = dataframe['문장'].values.tolist()
        
        # return_tensors='pt'는 tensor로 리턴
        self.texts = [tokenizer(text, padding='max_length', max_length=90, truncation=True, return_tensors='pt') for text in texts] 
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]

        if self.labels is not None:
            type_tmp = self.labels['type'][idx]
            polarity_tmp = self.labels['polarity'][idx]
            tense_tmp = self.labels['tense'][idx]
            certainty_tmp = self.labels['certainty'][idx]
            # return text, torch.Tensor(type_tmp), torch.Tensor(polarity_tmp), torch.Tensor(tense_tmp), torch.Tensor(certainty_tmp)
            return text, type_tmp, polarity_tmp, tense_tmp, certainty_tmp

        else:
            return text

In [19]:
train_dataset = SentenceTypeDataset(train, tokenizer, labels = train_labels)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = SentenceTypeDataset(val, tokenizer, labels = val_labels)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

## Model Define

In [20]:
class SentenceClassifier(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.klue = base_model # from transformers package

        self.fc1 = nn.Linear(768, 32)
        self.relu = nn.ReLU()
        self.type_clf = nn.Linear(32,4)
        self.polarity_clf = nn.Linear(32,3)
        self.tense_clf = nn.Linear(32,3)
        self.certainty_clf = nn.Linear(32,2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        # input_ids : token's id / attention_mask : make a model to focus on which token
        klue_out = self.klue(input_ids= input_ids, attention_mask = attention_mask)[0][:,0]

        x = self.fc1(klue_out)
        x = self.relu(x)

        type_output = self.type_clf(x)
        type_output = self.softmax(type_output)
        polarity_output = self.polarity_clf(x)
        polarity_output = self.softmax(polarity_output)
        tense_output = self.tense_clf(x)
        tense_output = self.softmax(tense_output)
        certainty_output = self.certainty_clf(x)
        certainty_output = self.softmax(certainty_output)

        return type_output, polarity_output, tense_output, certainty_output

## Train

In [21]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    
    criterion = {
        'type' : nn.CrossEntropyLoss().to(device),
        'polarity' : nn.CrossEntropyLoss().to(device),
        'tense' : nn.CrossEntropyLoss().to(device),
        'certainty' : nn.CrossEntropyLoss().to(device)
    }
    
    best_loss = 999999
    best_model = None
    
    # optimizer = torch.optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])

    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for sentence, type_label, polarity_label, tense_label, certainty_label in tqdm(iter(train_loader)):

            attention_mask = sentence['attention_mask'].to(device)
            input_ids = sentence['input_ids'].squeeze(1).to(device)

            type_label = type_label.to(device)
            polarity_label = polarity_label.to(device)
            tense_label = tense_label.to(device)
            certainty_label = certainty_label.to(device)
            
            optimizer.zero_grad()
            
            type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids, attention_mask)
            
            loss = 0.25 * criterion['type'](type_logit, type_label) + \
                    0.25 * criterion['polarity'](polarity_logit, polarity_label) + \
                    0.25 * criterion['tense'](tense_logit, tense_label) + \
                    0.25 * criterion['certainty'](certainty_logit, certainty_label)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
        
        val_loss, val_type_f1, val_polarity_f1, val_tense_f1, val_certainty_f1 = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] 유형 F1 : [{val_type_f1:.5f}] 극성 F1 : [{val_polarity_f1:.5f}] 시제 F1 : [{val_tense_f1:.5f}] 확실성 F1 : [{val_certainty_f1:.5f}]')
        
        if scheduler is not None:
            scheduler.step(val_loss)
            
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
            
    return best_model

## Validation


In [22]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    
    type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []
    type_labels, polarity_labels, tense_labels, certainty_labels = [], [], [], []
    
    
    with torch.no_grad():
        for sentence, type_label, polarity_label, tense_label, certainty_label in tqdm(iter(val_loader)):
            attention_mask = sentence['attention_mask'].to(device)
            input_ids = sentence['input_ids'].squeeze(1).to(device)

            type_label = type_label.to(device)
            polarity_label = polarity_label.to(device)
            tense_label = tense_label.to(device)
            certainty_label = certainty_label.to(device)
            
            type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids, attention_mask)
            
            loss = 0.25 * criterion['type'](type_logit, type_label) + \
                    0.25 * criterion['polarity'](polarity_logit, polarity_label) + \
                    0.25 * criterion['tense'](tense_logit, tense_label) + \
                    0.25 * criterion['certainty'](certainty_logit, certainty_label)
            
            val_loss.append(loss.item())
            
            type_preds += type_logit.argmax(1).detach().cpu().numpy().tolist()
            type_labels += type_label.detach().cpu().numpy().tolist()
            
            polarity_preds += polarity_logit.argmax(1).detach().cpu().numpy().tolist()
            polarity_labels += polarity_label.detach().cpu().numpy().tolist()
            
            tense_preds += tense_logit.argmax(1).detach().cpu().numpy().tolist()
            tense_labels += tense_label.detach().cpu().numpy().tolist()
            
            certainty_preds += certainty_logit.argmax(1).detach().cpu().numpy().tolist()
            certainty_labels += certainty_label.detach().cpu().numpy().tolist()
    
    type_f1 = f1_score(type_labels, type_preds, average='weighted')
    polarity_f1 = f1_score(polarity_labels, polarity_preds, average='weighted')
    tense_f1 = f1_score(tense_labels, tense_preds, average='weighted')
    certainty_f1 = f1_score(certainty_labels, certainty_preds, average='weighted')
    
    return np.mean(val_loss), type_f1, polarity_f1, tense_f1, certainty_f1

## Run!!

In [23]:
model = SentenceClassifier(base_model)
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

100%|██████████| 52/52 [01:27<00:00,  1.69s/it]
100%|██████████| 13/13 [00:07<00:00,  1.82it/s]


Epoch : [1] Train Loss : [0.83772] Val Loss : [0.76403] 유형 F1 : [0.73452] 극성 F1 : [0.93030] 시제 F1 : [0.80505] 확실성 F1 : [0.87274]


100%|██████████| 52/52 [01:24<00:00,  1.63s/it]
100%|██████████| 13/13 [00:07<00:00,  1.82it/s]


Epoch : [2] Train Loss : [0.72375] Val Loss : [0.68951] 유형 F1 : [0.77461] 극성 F1 : [0.93030] 시제 F1 : [0.88923] 확실성 F1 : [0.91219]


100%|██████████| 52/52 [01:24<00:00,  1.63s/it]
100%|██████████| 13/13 [00:07<00:00,  1.81it/s]


Epoch : [3] Train Loss : [0.66695] Val Loss : [0.65931] 유형 F1 : [0.78562] 극성 F1 : [0.93030] 시제 F1 : [0.88362] 확실성 F1 : [0.89582]


100%|██████████| 52/52 [01:24<00:00,  1.63s/it]
100%|██████████| 13/13 [00:07<00:00,  1.81it/s]


Epoch : [4] Train Loss : [0.64005] Val Loss : [0.64049] 유형 F1 : [0.81445] 극성 F1 : [0.93030] 시제 F1 : [0.89356] 확실성 F1 : [0.91461]


100%|██████████| 52/52 [01:24<00:00,  1.63s/it]
100%|██████████| 13/13 [00:07<00:00,  1.82it/s]


Epoch : [5] Train Loss : [0.62303] Val Loss : [0.63266] 유형 F1 : [0.84537] 극성 F1 : [0.93030] 시제 F1 : [0.89126] 확실성 F1 : [0.91707]


100%|██████████| 52/52 [01:24<00:00,  1.63s/it]
100%|██████████| 13/13 [00:07<00:00,  1.81it/s]


Epoch : [6] Train Loss : [0.61146] Val Loss : [0.62962] 유형 F1 : [0.87214] 극성 F1 : [0.93030] 시제 F1 : [0.88641] 확실성 F1 : [0.92717]


100%|██████████| 52/52 [01:24<00:00,  1.63s/it]
100%|██████████| 13/13 [00:07<00:00,  1.81it/s]


Epoch : [7] Train Loss : [0.60282] Val Loss : [0.62809] 유형 F1 : [0.87074] 극성 F1 : [0.93030] 시제 F1 : [0.88507] 확실성 F1 : [0.91880]


100%|██████████| 52/52 [01:24<00:00,  1.62s/it]
100%|██████████| 13/13 [00:07<00:00,  1.81it/s]


Epoch : [8] Train Loss : [0.60103] Val Loss : [0.63133] 유형 F1 : [0.86753] 극성 F1 : [0.93030] 시제 F1 : [0.88011] 확실성 F1 : [0.92177]


100%|██████████| 52/52 [01:24<00:00,  1.63s/it]
100%|██████████| 13/13 [00:07<00:00,  1.82it/s]


Epoch : [9] Train Loss : [0.59657] Val Loss : [0.62534] 유형 F1 : [0.87152] 극성 F1 : [0.93030] 시제 F1 : [0.88692] 확실성 F1 : [0.92598]


100%|██████████| 52/52 [01:24<00:00,  1.63s/it]
100%|██████████| 13/13 [00:07<00:00,  1.82it/s]

Epoch : [10] Train Loss : [0.59546] Val Loss : [0.62608] 유형 F1 : [0.86746] 극성 F1 : [0.93030] 시제 F1 : [0.88805] 확실성 F1 : [0.92742]





## Inference

In [24]:
test_dataset = SentenceTypeDataset(test, tokenizer, None)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [28]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    
    type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []
    
    with torch.no_grad():
        for sentence in tqdm(test_loader):
            attention_mask = sentence['attention_mask'].to(device)
            input_ids = sentence['input_ids'].squeeze(1).to(device)
                        
            type_logit, polarity_logit, tense_logit, certainty_logit = model(input_ids, attention_mask)
            
            type_preds += type_logit.argmax(1).detach().cpu().numpy().tolist()
            polarity_preds += polarity_logit.argmax(1).detach().cpu().numpy().tolist()
            tense_preds += tense_logit.argmax(1).detach().cpu().numpy().tolist()
            certainty_preds += certainty_logit.argmax(1).detach().cpu().numpy().tolist()
            
    return type_preds, polarity_preds, tense_preds, certainty_preds

In [29]:
type_preds, polarity_preds, tense_preds, certainty_preds = inference(model, test_loader, device)

100%|██████████| 28/28 [00:16<00:00,  1.71it/s]


In [30]:
type_preds = type_le.inverse_transform(type_preds)
polarity_preds = polarity_le.inverse_transform(polarity_preds)
tense_preds = tense_le.inverse_transform(tense_preds)
certainty_preds = certainty_le.inverse_transform(certainty_preds)

In [31]:
predictions = []
for type_pred, polarity_pred, tense_pred, certainty_pred in zip(type_preds, polarity_preds, tense_preds, certainty_preds):
    predictions.append(type_pred+'-'+polarity_pred+'-'+tense_pred+'-'+certainty_pred)

## Submission

In [32]:
submission['label'] = predictions

In [33]:
submission.head()

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-현재-확실
4,TEST_0004,사실형-긍정-과거-확실


In [34]:
submission.to_csv('baseline_submit.csv', index=False)