In [1]:
#!pip install transformers
#!pip install torch
#!pip install pandas
#!pip install scikit-learn
#!pip install accelerate -U
#!pip install ipdb

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

import warnings
warnings.filterwarnings("ignore")

import logging
logging.disable(logging.INFO)
logging.disable(logging.WARNING)

In [3]:
import pdb
import ipdb

from transformers import BertTokenizer, BertPreTrainedModel
from transformers import AdamW, EarlyStoppingCallback
from transformers import TrainingArguments, Trainer, TrainerCallback
from transformers import BertModel, BertLayer
from transformers.modeling_outputs import SequenceClassifierOutput

import pandas as pd
import numpy as np
from datasets import load_dataset
from torch import nn


import math
import os
import random
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import copy

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from scipy.stats import pearsonr, spearmanr

import torch
import torch.utils.checkpoint
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

In [6]:
dataset_n = 'stsb'
            #['cola', 'sst2', 'mrpc', 'stsb', 'qqp', 
            #'mnli', 'ax', 'qnli', 'rte', 'wnli']
file_n = 'STS-B'
            #['CoLA', 'SST-2', 'MRPC', 'STS-B', 'QQP', 
            #'MNLI-m', 'AX', 'MNLI-mm', 'QNLI', 'RTE', 'WNLI']

In [7]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [8]:
class BERT_Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset1, dataset2, label, tokenizer): #, dataset2
        self.dataset1 = dataset1
        self.dataset2 = dataset2
        self.label = label
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        text1 = self.dataset1[idx]
        text2 = self.dataset2[idx]
        tokens = self.tokenizer(text1, text2,
                               max_length=128,
                               padding="max_length",
                               truncation=True,
                               )
        tokens['label'] = torch.FloatTensor([self.label[idx]])
        #LongTensor FloatTensor 
        
        return tokens

    def __len__(self):
        return len(self.label)

In [9]:
train = load_dataset('glue', dataset_n, split='train')
valid = load_dataset('glue', dataset_n, split='validation')
# train validation validation_matched validation_mismatched

train_dataset = BERT_Dataset(train['sentence1'],
                             train['sentence2'],
                             train['label'],
                             tokenizer,
                            )
valid_dataset = BERT_Dataset(valid['sentence1'],
                             valid['sentence2'],
                             valid['label'],
                             tokenizer,
                            )
# sentence sentence1 sentence2 question question1 question2 premise hypothesis 

In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    #pco = pearsonr(labels, predictions)[0]
    #spco = spearmanr(labels, predictions)[0]
    #matthew = matthews_corrcoef(labels, predictions)
    #_, _, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    #acc = accuracy_score(labels, predictions)
    return {
        #'mat': matthew
        #'f1': f1,
        #'accuracy': acc,
        #'pco': pco,
        #'spco': spco
    }

In [11]:
class Attention(nn.Module):
    def __init__(self, hidden_dim, method):
        super().__init__()
        self.method = method
        self.hidden_dim = hidden_dim

        if method == 'general':
            self.w = nn.Linear(self.hidden_dim, self.hidden_dim)
        elif method == 'concat':
            self.w = nn.Linear(self.hidden_dim*2, self.hidden_dim)
            self.v = torch.nn.Parameter(torch.FloatTensor(self.hidden_dim))
        
    def forward(self, dec_out, enc_outs):
        if self.method == 'dot' or self.method == 'scaled_dot':
            attn_energies = self.dot(dec_out, enc_outs)
            if self.method == 'scaled_dot': #scaled dot product attention
                attn_energies = attn_energies / math.sqrt(self.hidden_dim)
            else: # dot product attention
                pass
        elif self.method == 'general': #luong attention
            attn_energies = self.general(dec_out, enc_outs)
        elif self.method == 'concat': #bahdanau attention
            attn_energies = self.concat(dec_out, enc_outs)
        elif self.method == 'cosine': #content-based attention
            attn_energies = self.cosine(dec_out, enc_outs)
        
        attn_weights = F.softmax(attn_energies, dim=0)
        attn_weights = attn_weights.view([len(attn_weights), -1, 1])
        return torch.sum(attn_weights*enc_outs, dim=0)

    def dot(self, dec_out, enc_outs):
        return torch.sum(dec_out*enc_outs, dim=2)

    def general(self, dec_out, enc_outs):
        energy = self.w(enc_outs)
        return torch.sum(dec_out*energy, dim=2)

    def concat(self, dec_out, enc_outs): #todo
        dec_out = dec_out.expand(enc_outs.shape[0], -1, -1)
        energy = torch.cat((dec_out, enc_outs), 2)
        return torch.sum(self.v * self.w(energy).tanh(), dim=2)
    
    def cosine(self, dec_out, enc_outs):
        norm = torch.norm(dec_out) * torch.norm(enc_outs)
        return torch.sum(dec_out*enc_outs, dim=2) / norm

In [12]:
class AttentionLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.key = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size, bias=False),
            nn.RReLU()
        )
        self.query = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size, bias=False),
            nn.RReLU()
        )
        self.value = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size, bias=False),
            nn.RReLU()
        )

    def forward(self, source_pooler_outputs, target_outputs):
        #source_pooler_outputs: 각 계층의 CLS 토큰들
        #target_outputs: 각 계층의 토큰 평균
        target_pooler_output_list = []
        for idx, source_pooler_output in enumerate(source_pooler_outputs):
            concated = target_outputs
            num_target_outputs = len(concated)
            concated = torch.stack(concated, dim=1)
            
            K = self.key(concated)
            V = self.value(concated)
            Q = self.query(source_pooler_output)

            score_list = torch.bmm(K, Q.unsqueeze(dim=-1)).squeeze(dim=-1)
            score_list /= num_target_outputs
            score_list = F.softmax(score_list, dim=-1)
            
            V = torch.mul(V, score_list.unsqueeze(dim=-1))
            target_pooler_output = torch.sum(V, dim=1)
            target_pooler_output_list.append(target_pooler_output)

        target_pooler_outputs = torch.stack(target_pooler_output_list, dim=1)
        target_pooler_output = torch.mean(target_pooler_outputs, dim=1)

        pooler_output = target_pooler_output
        #pooler_output = torch.cat([source_pooler_outputs.pop(-1), target_pooler_output], dim=1)
        return pooler_output

In [13]:
class BertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

In [14]:
class CustomForSequenceClassification(BertPreTrainedModel):
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        #self.attention_layer = BertLayer(config)
        self.layerwise_attention = AttentionLayer(config)
        self.pooler = BertPooler(config)
        self.attn_func = 'cosine'
        #dot, scaled_dot, concat, cosine, general, layerwise
        self.attn = Attention(hidden_dim=config.hidden_size, method=self.attn_func)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.layerForAttention = 'pooled'
        #pooled, input, mean

        self.post_init()
        
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        #return_dict=False일 때, outputs에서 받아오는 tuple 내부
        #outputs = tuple(sequence_output, pooled_output, all_hidden_states,)

        #최종 인코더 출력
        last_hidden_states = outputs[1]
        #각 인코더의 출력 받아오는 부분추가
        all_hidden_states = outputs[2]
        
        all_hidden_states = list(all_hidden_states)
        for i in range(len(all_hidden_states)):
            all_hidden_states[i] = self.pooler(all_hidden_states[i])
        all_hidden_states = torch.stack(all_hidden_states)
        input_hidden_states = all_hidden_states[0]
        all_hidden_states = all_hidden_states[1:]
        
        # 이전 연구 실험하기
        if self.attn_func == 'layerwise':
            target_pooled_result = [((hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1)) for hidden in outputs[2][1:]]
            pooled_output = self.layerwise_attention(all_hidden_states, target_pooled_result)
            
        else:
            #TODO: 받아온 인코더 출력으로 layer self-attention 코드 구현
            #tuple을 list로 변경, pooler 계층 통과


            #입력 임베딩과 인코더 출력끼리의 attention
            if self.layerForAttention == 'input':
                pooled_output = self.attn(input_hidden_states, all_hidden_states)

            #최종 인코더 출력과 인코더 출력끼리의 attention
            elif self.layerForAttention == 'pooled':
                pooled_output = self.attn(last_hidden_states, all_hidden_states)

            #인코더 출력의 평균과 인코더 출력끼리의 attention
            elif self.layerForAttention == 'mean':
                mean_hidden_states = self.mean(all_hidden_states, len(all_hidden_states))
                pooled_output = self.attn(mean_hidden_states, all_hidden_states)

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)


        loss = None
        self.contrastive = None
        if self.contrastive is not None:
            pass
            
        elif labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
                
        if not return_dict:
            output = (logits,) #+ outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return CustomSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    
    def mean(self, all_hidden_states, num_layers):
        return torch.sum(all_hidden_states, dim=0) / num_layers

In [15]:
model = CustomForSequenceClassification.from_pretrained(model_name,
                                                        output_hidden_states=True,
                                                        return_dict=False,
                                                        num_labels=1,
                                                       )

In [26]:
training_ars = TrainingArguments(
    output_dir="./checkpoint/stsb_time",
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    learning_rate=1e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    #logging_strategy = "epoch",
    logging_steps = 1,
    save_strategy = "epoch",
    eval_accumulation_steps = 30,
    load_best_model_at_end = True,
)


trainer = Trainer(
    model=model,
    args=training_ars,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

In [27]:
import time

start = time.time()
trainer.train()
end = time.time()

Epoch,Training Loss,Validation Loss
1,0.8139,0.680018


In [28]:
print(end-start)

24.321671962738037


In [None]:
class BERT_Dataset_forpredict(torch.utils.data.Dataset):
    def __init__(self, dataset1, dataset2, tokenizer): #, dataset2
        self.dataset1 = dataset1
        self.dataset2 = dataset2
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        text1 = self.dataset1[idx]
        text2 = self.dataset2[idx]
        tokens = self.tokenizer(text1, text2,
                               max_length=128,
                               padding="max_length",
                               truncation=True,
                               )
        
        return tokens

    def __len__(self):
        return len(self.dataset1)
    
    
test = load_dataset('glue', dataset_n, split='test')
# test test_matched test_mismatched
test_dataset = BERT_Dataset_forpredict(test['sentence1'], 
                                       test['sentence2'], 
                                       tokenizer)
# sentence sentence1 sentence2 question question1 question2 premise hypothesis 

In [None]:
training_ars_for_test = TrainingArguments(
    output_dir="./checkpoint",
    learning_rate=1e-5,
    weight_decay=0.01,
)

trainer_forpredict = Trainer(
                    model = model,
                    args=training_ars_for_test,
                    )

In [None]:
predictions = trainer_forpredict.predict(test_dataset,)

In [None]:
pred_list = []

for i in range(len(predictions.predictions)):
    '''
    if predictions.predictions[i] < 0:
        pred_list.append('0.000')
    elif predictions.predictions[i] > 5:
        pred_list.append('5.000')
    else:
        pred_list.append('{0:0.3f}'.format(float(predictions.predictions[i])))
    '''    
    pred_list.append(predictions.predictions[i].argmax())

In [None]:
'''
for i in range(len(pred_list)):
    if pred_list[i] == 0:
        pred_list[i] = 'entailment'
    elif pred_list[i] == 1:
        pred_list[i] = 'neutral' #not_entailment neutral
    elif pred_list[i] == 2:
        pred_list[i] = 'contradiction'
'''

In [None]:
result = pd.DataFrame(pred_list, columns=['prediction'])

In [None]:
result.to_csv("./GLUE_pred/general/mini_mean_3/"+file_n+".tsv", sep='\t', index_label='index')