In [1]:
"""BERT finetuning runner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv
import os
import logging
import argparse
import random
from tqdm import tqdm, trange

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
#from transformers import BertAdam
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

from transformers import PYTORCH_PRETRAINED_BERT_CACHE

# Emit INFO-and-above records prefixed with timestamp, level and logger name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)
# Module-level logger used by the data processors defined below.
logger = logging.getLogger(__name__)



In [2]:
class InputExample(object):
    """One raw (untokenized) example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example; one instance is built per sample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For
                single-sequence tasks only this sequence needs to be given.
            text_b: (Optional) string. The untokenized text of the second
                sequence. Only needed for sequence-pair tasks.
            label: (Optional) string. The label of the example. Should be set
                for train and dev examples, but not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

In [3]:
class InputFeatures(object):
    """The model-ready features of a single example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        """Store the features of one example; one instance is built per sample.

        Args:
            input_ids: the tokenized text converted to vocabulary indices.
            input_mask: attention mask; padded positions are set to 0.
            segment_ids: marks each position as belonging to the first (0) or
                the second (1) sentence.
            label_id: the example's label as an integer id.
        """
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id

In [4]:
class DataProcessor(object):
    """Abstract base for converters that turn a data set into `InputExample`s."""

    def get_train_examples(self, data_dir):
        """Return the collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Return the collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Return the collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Return the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a UTF-8 tab-separated file and return its rows.

        The first row is treated as a header and dropped; every remaining row
        is returned as a list of column strings.
        """
        with open(input_file, "r", encoding='utf-8') as handle:
            rows = csv.reader(handle, delimiter="\t", quotechar=quotechar)
            next(rows, None)  # skip the header row (no-op on an empty file)
            return list(rows)

In [5]:
"""이 코드는 thyroid 데이터를 사용해 학습하기 때문에 ThyProcess만을 다룹니다 """

class ThyProcessor(DataProcessor):
    """Processor for the Thyroid Sonography data set."""

    def get_train_examples(self, data_dir):
        """See base class. Reads `train.tsv` from `data_dir`."""
        train_path = os.path.join(data_dir, "train.tsv")
        logger.info("LOOKING AT {}".format(train_path))
        return self._create_examples(self._read_tsv(train_path), "train")

    def get_dev_examples(self, data_dir):
        """See base class. Reads `dev.tsv` from `data_dir`."""
        dev_rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
        return self._create_examples(dev_rows, "dev")

    def get_test_examples(self, data_dir):
        """See base class. Reads `test.tsv` from `data_dir`."""
        test_rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
        return self._create_examples(test_rows, "test")

    def get_labels(self):
        """See base class."""
        # One entry per document class; the order defines the integer id
        # each label string is mapped to, so keep it consistent with the data.
        return ["0", "1", "2"]

    def _create_examples(self, lines, set_type):
        """Build one `InputExample` per row.

        Column 0 of each row is used as `text_a` and column 1 as the label;
        `text_b` is always None. The guid is "<set_type>-<row index>".
        """
        return [
            InputExample(guid="%s-%s" % (set_type, row_index),
                         text_a=row[0],
                         text_b=None,
                         label=row[1])
            for row_index, row in enumerate(lines)
        ]

In [6]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncate a pair of token lists in place until they fit `max_length`.

    While the combined length exceeds `max_length`, the last token of the
    longer list is removed; on a tie the token is taken from `tokens_b`.
    Nothing is returned: both lists are modified in place.
    """
    # Trimming the longer sequence one token at a time makes more sense than
    # cutting an equal percentage from each: a token dropped from a very short
    # sequence likely carries more information than one from a long sequence.
    while len(tokens_a) + len(tokens_b) > max_length:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        longer.pop()  # pop() with no index removes the last element

In [7]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Convert `InputExample`s into fixed-length `InputFeatures`.

    Each example is tokenized, truncated so that it fits `max_seq_length`
    including the special tokens, wrapped in [CLS]/[SEP], converted to ids and
    zero-padded to exactly `max_seq_length`.

    Args:
        examples: iterable of objects with `text_a`, `text_b` and `label`.
        label_list: label strings; a label's position is its integer id.
        max_seq_length: length of every returned id/mask/segment list.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        A list with one `InputFeatures` per example, in input order.
    """
    # e.g. label_list ['0', '1', '2'] -> {'0': 0, '1': 1, '2': 2}
    label_to_id = dict((label, index) for index, label in enumerate(label_list))

    converted = []
    for example in examples:
        first_tokens = tokenizer.tokenize(example.text_a)

        if example.text_b:
            # Sentence pair: tokenize the second text and trim both lists in
            # place, reserving 3 slots for [CLS], [SEP], [SEP].
            second_tokens = tokenizer.tokenize(example.text_b)
            _truncate_seq_pair(first_tokens, second_tokens, max_seq_length - 3)
        else:
            # Single sentence: reserve 2 slots for [CLS] and [SEP] and cut the
            # tail of the sentence when it is too long.
            second_tokens = None
            single_limit = max_seq_length - 2
            if len(first_tokens) > single_limit:
                first_tokens = first_tokens[:single_limit]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # "type_ids" (segment ids) mark whether a position belongs to the first
        # or the second sequence. For classification the vector at [CLS] is
        # used as the "sentence vector".
        tokens = ["[CLS]"] + first_tokens + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if second_tokens:
            tokens = tokens + second_tokens + ["[SEP]"]
            segment_ids += [1] * (len(second_tokens) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # 1 for real tokens, 0 for padding; only real tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad all three lists up to the sequence length.
        # NOTE(original author): positional embeddings are presumably handled
        # inside BERT itself (or not used) -- they are not built here.
        zero_pad = [0] * (max_seq_length - len(input_ids))
        input_ids += zero_pad
        input_mask += zero_pad
        segment_ids += zero_pad

        # All three lists must have exactly max_seq_length entries.
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_to_id[example.label]

        converted.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))
    return converted


In [8]:
# def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
#     """InputExample >> InputFeatures로 변환해주는 기능을 합니다."""
#     """Loads a data file into a list of `InputBatch`s."""

#     # label_list: 0,1 label 의 종류
#     label_map = {label : i for i, label in enumerate(label_list)}
#     #print("label_map: ", label_map) 
#     # label_map:  {'0': 0, '1': 1, '2':2}

#     features = []
#     for (ex_index, example) in enumerate(examples):
#         """첫문장 a를 white_space단위로 토큰화"""
#         tokens_a = tokenizer.tokenize(example.text_a)
#          """첫문장 b는 없으므로 생략"""
#         tokens_b = None
#         """문장 길이 최대값에 맞게 문장을 자르기"""
#         # Account for [CLS] and [SEP] with "- 2"
#         #만약 첫문장이 special tokens를 고려했을 때 max_seq_length보다 더 크다면?
#         # 첫문장 뒷부분을 자른다. 
#         if len(tokens_a) > max_seq_length - 2:
#             tokens_a = tokens_a[:(max_seq_length - 2)]
                
#         """token embedding 생성"""
#         tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
#         input_ids = tokenizer.convert_tokens_to_ids(tokens)
        
#         """segment embedding 생성"""
#         segment_ids = [0] * len(tokens) #cls, sep 토큰까지 모두 0으로

#         # The mask has 1 for real tokens and 0 for padding tokens. Only real
#         # tokens are attended to.
#         input_mask = [1] * len(input_ids)

#         # Zero-pad up to the sequence length.
#         padding = [0] * (max_seq_length - len(input_ids))
#         input_ids += padding
#         input_mask += padding
#         segment_ids += padding

#         # 세 가지 임베딩 벡터의 길이가 max_seq_length와 맞는지 확인 
#         assert len(input_ids) == max_seq_length
#         assert len(input_mask) == max_seq_length
#         assert len(segment_ids) == max_seq_length

#         label_id = label_map[example.label]
        
#         #5개의 examples에 대해서 처리된 데이터 출력하기 
# #         if ex_index < 5:
# #             logger.info("*** Example ***")
# #             logger.info("guid: %s" % (example.guid))
# #             logger.info("tokens: %s" % " ".join(
# #                     [str(x) for x in tokens]))
# #             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
# #             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
# #             logger.info(
# #                     "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
# #             logger.info("label: %s (id = %d)" % (example.label, label_id))
#             # print("label: ", example.label, label_id)
        
#         #생성된 features는 features리스트에 저장되어 리턴됨. 
#         features.append(
#                 InputFeatures(input_ids=input_ids,
#                               input_mask=input_mask,
#                               segment_ids=segment_ids,
#                               label_id=label_id))
#     return features


In [9]:
def accuracy(out, labels):
    """Count correct predictions in a batch.

    Args:
        out: 2-D scores, one row per sample; the arg-max along axis 1 is the
            predicted class.
        labels: the true class ids, compared element-wise with the predictions.

    Returns:
        The number (not the fraction) of predictions equal to their label.
    """
    predicted_classes = np.argmax(out, axis=1)
    is_correct = predicted_classes == labels
    return np.sum(is_correct)

def warmup_linear(x, warmup=0.002):
    """Learning-rate multiplier: linear warm-up followed by linear decay.

    Args:
        x: training progress, used as a fraction of the total steps.
        warmup: fraction of training during which the multiplier ramps up.

    Returns:
        `x / warmup` while `x < warmup`, otherwise `1.0 - x`.
    """
    return x / warmup if x < warmup else 1.0 - x

# train function

In [10]:
def _save_model_weights(model, model_dir):
    """Save the state dict of `model` to `<model_dir>/pytorch_model.bin`.

    A DataParallel/DDP wrapper is unwrapped through `.module` so only the
    model itself is stored. `model_dir` is created when missing.
    """
    os.makedirs(model_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(), os.path.join(model_dir, "pytorch_model.bin"))


def do_train_func(args, device, processor, num_labels, label_list, tokenizer, n_gpu):
    """Fine-tune a BERT sequence classifier and keep the best checkpoint.

    After every epoch the mean training loss is appended to
    `<output_dir>/train_results_epoch.txt`. When `args.do_eval` is set, the
    dev set is evaluated after each epoch and the weights are saved to
    `<bert_model>/pytorch_model.bin` whenever dev accuracy improves. Without
    `args.do_eval` the final weights are saved once training finishes, so a
    training run never ends without a checkpoint.

    Args:
        args: run configuration. Read here: `case`, `bert_models`, `data_dirs`,
            `output_dirs`, `bert_model_type`, `local_rank`, `fp16`,
            `loss_scale`, `learning_rate`, `warmup_proportion`,
            `train_batch_size`, `gradient_accumulation_steps`,
            `num_train_epochs`, `max_seq_length`, `do_eval`.
            `args.bert_model`, `args.data_dir` and `args.output_dir` are
            overwritten with the entries selected by `args.case`.
        device: torch device the model and batches are moved to.
        processor: data processor providing `get_train_examples`.
        num_labels: number of output classes of the classifier head.
        label_list: label strings, passed to `convert_examples_to_features`.
        tokenizer: tokenizer passed to `convert_examples_to_features`.
        n_gpu: number of GPUs; more than one enables `DataParallel` when not
            running distributed.

    Returns:
        None.
    """
    args.bert_model = args.bert_models[args.case]
    args.data_dir = args.data_dirs[args.case]
    args.output_dir = args.output_dirs[args.case]

    logger = logging.getLogger(__name__)

    # Read the training file and derive the number of optimizer steps.
    train_examples = processor.get_train_examples(args.data_dir)
    num_train_steps = int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model. Each local rank gets its own cache sub-directory.
    # The cache constant is a str in some transformers releases and a Path in
    # others; str() + os.path.join works for both, whereas `+ "/"` raises a
    # TypeError for a Path.
    cache_dir = os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                             'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model_type,
                                                          cache_dir=cache_dir,
                                                          num_labels=num_labels)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0

    # Prepare optimizer: weight decay 0.01 for everything except biases and
    # LayerNorm parameters, which get no decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    t_total = num_train_steps
    if args.local_rank != -1:
        # Each process only performs its share of the steps.
        t_total = t_total // torch.distributed.get_world_size()

    # Only the AdamW path has a scheduler. It must still be defined for the
    # fp16 path, otherwise the training loop fails with a NameError.
    lr_scheduler = None
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate)
        # The first steps are trained with a reduced learning rate.
        warmup_steps = int(t_total * args.warmup_proportion)
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    # One feature set per example: token ids, attention mask, segment ids, label id.
    train_features = convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    # The per-epoch result file is appended to below; make sure its folder exists.
    os.makedirs(args.output_dir, exist_ok=True)

    # Best dev-set accuracy seen so far; -1 so the first evaluation always wins.
    best_eval_accuracy = -1

    model.train()
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        if args.local_rank != -1:
            # Without set_epoch a DistributedSampler yields the same order every epoch.
            train_sampler.set_epoch(epoch)

        epoch_loss = 0
        epoch_steps = 0

        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            # With labels given, element 0 of the output is the loss and
            # element 1 the logits.
            outputs = model(input_ids=input_ids, token_type_ids=segment_ids,
                            attention_mask=input_mask, labels=label_ids)
            loss = outputs[0]

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            epoch_loss += loss.item()
            epoch_steps += 1

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if lr_scheduler is None:
                    # fp16 path: there is no scheduler, so apply BERT's
                    # warm-up/decay by hand. Doing this on the AdamW path too
                    # would overwrite the scheduler's learning rate right
                    # before every step, making the scheduler a no-op.
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / max(t_total, 1), args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step

                optimizer.step()
                if lr_scheduler is not None:
                    lr_scheduler.step()
                global_step += 1
                # Gradients accumulate across backward() calls, so reset them
                # after every optimizer update.
                optimizer.zero_grad()

        # End of epoch: append the mean training loss to the result file.
        print("epoch ", epoch, " 결과 출력")
        mean_epoch_loss = epoch_loss / epoch_steps if epoch_steps else float("nan")
        result_epoch = {'epoch': epoch, 'loss': mean_epoch_loss}
        output_eval_file_epoch = os.path.join(args.output_dir, "train_results_epoch.txt")
        with open(output_eval_file_epoch, "a") as writer:
            logger.info("***** Train results *****")
            for key in sorted(result_epoch.keys()):
                writer.write("%s = %s\n" % (key, str(result_epoch[key])))

        # Evaluate on the dev set and keep the weights of the best epoch.
        if args.do_eval:
            model.eval()
            eval_loss, eval_accuracy = do_eval_func(args, device, processor,
                                                    num_labels, label_list, tokenizer, n_gpu,
                                                    model=model, epoch=epoch)

            if eval_accuracy > best_eval_accuracy:
                _save_model_weights(model, args.bert_model)
                best_eval_accuracy = eval_accuracy
            model.train()

    if not args.do_eval:
        # No dev-set selection took place, so keep the final weights;
        # otherwise the whole training run would leave no checkpoint behind.
        _save_model_weights(model, args.bert_model)
        

#     # Save a trained model
#     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
#     output_model_file = os.path.join(args.bert_model, "pytorch_model.bin")
#     torch.save(model_to_save.state_dict(), output_model_file)

#     ## test code
#     print("train:: output_model_file: ", output_model_file)
#     #output_model_file = os.path.join(args.bert_model, "pytorch_model.bin")
#     output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
#     model_state_dict = torch.load(output_model_file)
#     model = BertForSequenceClassification.from_pretrained(args.bert_model_type, state_dict=model_state_dict, num_labels = num_labels)
#     model.to(device)
#     print("로드 정상적으로 됨")

# evaluation function

In [12]:
 ######################### Evaluation ######################### 
    # Load a trained model that you have fine-tuned
    # 평가를 위해 fine-tuned model 불러오기 
    
def do_eval_func(args, device, processor, num_labels, label_list, tokenizer, n_gpu, model=None, epoch=0):
    """Evaluate a classifier on the dev set and write the results to disk.

    The metrics are appended to `<output_dir>/eval_results.txt`, and the
    per-example logits plus true label are written (overwriting) to
    `<output_dir>/eval_logits.txt` as a tab-separated table.

    Args:
        args: run configuration. Read here: `case`, `bert_models`, `data_dirs`,
            `output_dirs`, `bert_model_type`, `local_rank`, `max_seq_length`,
            `eval_batch_size`. `args.bert_model`, `args.data_dir` and
            `args.output_dir` are overwritten with the entries selected by
            `args.case`.
        device: torch device the model and batches are moved to.
        processor: data processor providing `get_dev_examples`.
        num_labels: number of classes; also the number of `classN` columns in
            the header of the logits file.
        label_list: label strings, passed to `convert_examples_to_features`.
        tokenizer: tokenizer passed to `convert_examples_to_features`.
        n_gpu: unused here; kept for a signature parallel to the train function.
        model: model to evaluate. When None, the fine-tuned weights are loaded
            from `<bert_model>/pytorch_model.bin`.
        epoch: epoch number recorded in the result file.

    Returns:
        `(eval_loss, eval_accuracy)`: the mean per-batch loss and the fraction
        of correctly classified examples. Both are NaN when nothing was
        evaluated (non-zero rank in a distributed run, or an empty dev set).
    """
    args.bert_model = args.bert_models[args.case]
    args.data_dir = args.data_dirs[args.case]
    args.output_dir = args.output_dirs[args.case]
    logger = logging.getLogger(__name__)

    if model is None:
        model_path = os.path.join(args.bert_model, "pytorch_model.bin")
        # Load onto the CPU first so a checkpoint saved on a GPU can also be
        # read on a machine without one; the model is moved to `device` below.
        model_state_dict = torch.load(model_path, map_location="cpu")
        model = BertForSequenceClassification.from_pretrained(args.bert_model_type,
                                                              state_dict=model_state_dict,
                                                              num_labels=num_labels)
        model.to(device)

    # Defined up front so the return below is valid on ranks that skip the
    # evaluation. NaN compares False against any threshold, so a caller doing
    # `accuracy > best` never treats it as an improvement.
    eval_loss, eval_accuracy = float("nan"), float("nan")

    # Evaluate when not distributed, or on rank 0 only when distributed.
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        # Build examples and features for the dev data.
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Sequential order keeps the logits file aligned with the dev file.
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        total_loss, correct_count = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        prediction_rows = []
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                outputs = model(input_ids=input_ids, token_type_ids=segment_ids,
                                attention_mask=input_mask, labels=label_ids)
            # With labels given, element 0 is the loss and element 1 the logits.
            tmp_eval_loss, logits = outputs[0], outputs[1]

            # Move the batch to the CPU once instead of calling .item() per value.
            logits = logits.detach().cpu()
            label_ids = label_ids.to('cpu')

            # One output row per example: every class logit, then the true label.
            for example_logits, example_label in zip(logits.tolist(), label_ids.tolist()):
                columns = [str(value) for value in example_logits]
                columns.append(str(example_label))
                prediction_rows.append("\t".join(columns))

            correct_count += accuracy(logits.numpy(), label_ids.numpy())
            total_loss += tmp_eval_loss.mean().item()

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # An empty dev set leaves both metrics at NaN instead of dividing by zero.
        if nb_eval_steps:
            eval_loss = total_loss / nb_eval_steps
        if nb_eval_examples:
            eval_accuracy = correct_count / nb_eval_examples

        result = {'epoch': epoch,
                  'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy}

        os.makedirs(args.output_dir, exist_ok=True)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

        # Dump the logits. The header has one classN column per label, which
        # for three labels is "class0\tclass1\tclass2\tlabel".
        head = "\t".join("class%d" % class_index for class_index in range(num_labels)) + "\tlabel\n"
        output_logits_file = os.path.join(args.output_dir, "eval_logits.txt")
        # newline="\n" writes line endings unchanged on every platform.
        with open(output_logits_file, "w", encoding="utf-8", newline="\n") as logits_writer:
            logits_writer.write(head + "\n".join(prediction_rows))

    return eval_loss, eval_accuracy

# test function

In [13]:
from sklearn.metrics import classification_report
# 수정 참고 코드 : https://www.kaggle.com/qhd0081/crime-classification-bert-k-fold
#batch size 4에서 한 sample씩 들어옴. 
def saving(out, labels, test_preds, test_y):
    """Record one prediction and its label in the running result lists.

    Args:
        out: scores whose arg-max (taken over the flattened input) is the
            predicted class.
        labels: the true label, appended to `test_y` unchanged.
        test_preds: list collecting predicted classes; modified in place.
        test_y: list collecting true labels; modified in place.

    Returns:
        The same two list objects, `(test_preds, test_y)`.
    """
    test_y.append(labels)
    predicted_class = np.argmax(out)
    test_preds.append(predicted_class)
    return test_preds, test_y

def evaluate(prediction, target):
    """Print predictions, targets and their classification report.

    Args:
        prediction: predicted labels.
        target: true labels.

    Returns:
        Whatever `classification_report(target, prediction)` returns; the same
        value is also printed.
    """
    for caption, values in (("prediction:", prediction), ("target:", target)):
        print(caption, values)
    summary = classification_report(target, prediction)
    print(summary)
    return summary

def do_test_func(args, device, processor, num_labels, label_list, tokenizer, n_gpu):
    """Score a fine-tuned checkpoint on the test split and write the results.

    Loads ``pytorch_model.bin`` from ``args.bert_models[args.case]``, runs the
    examples returned by ``processor.get_test_examples`` through the model,
    calls ``evaluate`` on the collected predictions, appends loss/accuracy to
    ``test_results.txt`` and overwrites ``test_logits.txt`` (one tab-separated
    row of logits plus the label per example) in
    ``args.output_dirs[args.case]``.

    Args:
        args: Namespace of run settings. Reads ``case``, ``bert_models``,
            ``data_dirs``, ``output_dirs``, ``bert_model_type``,
            ``max_seq_length``, ``eval_batch_size`` and ``local_rank``; sets
            ``bert_model``, ``data_dir`` and ``output_dir`` for the case.
        device: torch device the model and the batches are moved to.
        processor: Object providing ``get_test_examples(data_dir)``.
        num_labels: Number of output classes; also the number of logit
            columns written to ``test_logits.txt``.
        label_list: Labels passed on to ``convert_examples_to_features``.
        tokenizer: Tokenizer passed on to ``convert_examples_to_features``.
        n_gpu: Unused in this function.

    Raises:
        ValueError: If the test split contains no examples.
    """
    import codecs

    # Select the model, data and output locations of the requested case.
    args.bert_model = args.bert_models[args.case]
    args.data_dir = args.data_dirs[args.case]
    args.output_dir = args.output_dirs[args.case]

    logger = logging.getLogger(__name__)

    model_path = os.path.join(args.bert_model, "pytorch_model.bin")
    # Deserialize onto the CPU so a checkpoint saved from a GPU run can be
    # restored on any machine; model.to(device) moves the weights afterwards.
    model_state_dict = torch.load(model_path, map_location="cpu")
    model = BertForSequenceClassification.from_pretrained(args.bert_model_type,
                                                          state_dict=model_state_dict,
                                                          num_labels=num_labels)
    model.to(device)

    # Only a single-process run or distributed rank 0 evaluates and writes files.
    if not (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        return

    # Header of test_logits.txt: one column per class, then the true label.
    head = "\t".join("class{}".format(i) for i in range(num_labels)) + "\tlabel\n"
    prediction_arr = []

    # Build examples -> features for the test split.
    eval_examples = processor.get_test_examples(args.data_dir)
    if len(eval_examples) == 0:
        # Fail early with a clear message instead of a ZeroDivisionError below.
        raise ValueError("No test examples found in {}".format(args.data_dir))
    eval_features = convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running test *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Sequential order keeps the rows of test_logits.txt aligned with the input file.
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    test_preds = []
    test_y = []
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, token_type_ids=segment_ids,
                            attention_mask=input_mask, labels=label_ids)
        # With labels supplied, the first two outputs are (loss, logits).
        tmp_eval_loss = outputs[0]
        logits = outputs[1].detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

        for row_logits, label in zip(logits.tolist(), label_ids.tolist()):
            logger.debug("row_result: %s label: %s", row_logits, label)
            # Pass numeric scores (not their string form) so the arg-max
            # compares values rather than text.
            test_preds, test_y = saving(np.array(row_logits), label, test_preds, test_y)
            fields = [str(value) for value in row_logits] + [str(label)]
            prediction_arr.append("\t".join(fields))

        # accuracy() is defined elsewhere; its per-batch results are summed and
        # later divided by the example count, so it presumably returns the
        # number of correct predictions in the batch — TODO confirm.
        tmp_eval_accuracy = accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    evaluate(test_preds, test_y)

    result = {'test_loss': eval_loss,
              'test_accuracy': eval_accuracy}

    output_eval_file = os.path.join(args.output_dir, "test_results.txt")
    with open(output_eval_file, "a") as writer:
        logger.info("***** Test results *****")
        for key in sorted(result.keys()):
            writer.write("%s = %s\n" % (key, str(result[key])))

    # Dump the per-example logits; the context manager closes the file even
    # if the write fails.
    output_logits_file = os.path.join(args.output_dir, "test_logits.txt")
    with codecs.open(output_logits_file, "w", "utf-8") as logits_file:
        logits_file.write(head + "\n".join(prediction_arr))

In [14]:
def main(
    ## Required parameters
    case,  # index into data_dirs / bert_models / output_dirs
    data_dirs,  # input data directories
    bert_model_type,  # model type
    bert_models,  # local model directories
    task_name,  # GLUE-style task name
    output_dirs,  # output directories

    ## Other parameters
    max_seq_length=512,  # max total tokens: first sentence + second sentence + special tokens
    do_train=True,
    do_eval=False,
    do_test=False,
    do_lower_case=False,
    train_batch_size=8,  # train on 8 examples at a time
    eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=1,
    warmup_proportion=0.3,
    no_cuda=False,  # False: use CUDA when it is available
    local_rank=-1,
    seed=123,
    gradient_accumulation_steps=1,
    fp16=False,  # 16-bit training is off by default
    loss_scale=0
    ):
    """Configure a run and dispatch to training and/or testing.

    Collects the keyword arguments into an ``argparse.Namespace``, selects
    the device, seeds the random number generators, resolves the task's
    processor, label count and tokenizer, then calls ``do_train_func`` when
    ``do_train`` is set and ``do_test_func`` when ``do_test`` is set.

    Args:
        case: Index selecting one entry of ``data_dirs``, ``bert_models`` and
            ``output_dirs``.
        data_dirs: Sequence of input data directories.
        bert_model_type: Name or path given to ``from_pretrained``.
        bert_models: Sequence of local model directories.
        task_name: Task key, matched case-insensitively against the
            processors registered in this function.
        output_dirs: Sequence of output directories.
        max_seq_length: Maximum number of tokens per example.
        do_train, do_eval, do_test: Stage switches; at least one must be True.
        do_lower_case: Forwarded to the tokenizer.
        train_batch_size: Requested batch size; divided by
            ``gradient_accumulation_steps`` before use.
        eval_batch_size: Batch size for evaluation/testing.
        learning_rate, num_train_epochs, warmup_proportion: Stored on the
            namespace for the training routine.
        no_cuda: Force CPU when True.
        local_rank: ``-1`` for a single process; otherwise the CUDA device
            index used for distributed training.
        seed: Seed for ``random``, NumPy and torch.
        gradient_accumulation_steps: Must be >= 1.
        fp16, loss_scale: Stored on the namespace.

    Raises:
        ValueError: If ``gradient_accumulation_steps`` < 1, if no stage is
            enabled, or if ``task_name`` is not a registered task.
    """
    args = argparse.Namespace()

    ## Required parameters
    args.case = case
    args.data_dirs = data_dirs
    args.bert_model_type = bert_model_type
    args.bert_models = bert_models
    args.task_name = task_name
    args.output_dirs = output_dirs
    args.max_seq_length = max_seq_length
    args.do_train = do_train
    args.do_eval = do_eval
    args.do_test = do_test
    args.do_lower_case = do_lower_case
    args.train_batch_size = train_batch_size
    args.eval_batch_size = eval_batch_size
    args.learning_rate = learning_rate
    args.num_train_epochs = num_train_epochs
    # Store the caller's warmup proportion (previously this was overwritten
    # with the epoch count).
    args.warmup_proportion = warmup_proportion
    args.no_cuda = no_cuda
    args.local_rank = local_rank
    args.seed = seed
    args.gradient_accumulation_steps = gradient_accumulation_steps
    args.fp16 = fp16
    args.loss_scale = loss_scale

    processors = {
#         "cola": ColaProcessor,
#         "mnli": MnliProcessor,
        "thy": ThyProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,  # mrpc has two classes (0, 1)
        "thy": 3,
    }

    # Device selection driven by local_rank / no_cuda.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    # Per-step batch size so that the accumulated batch matches the request.
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_eval` or `do_test` must be True.")

#     for d in args.output_dirs:
#         if os.path.exists(d) and os.listdir(d):
#             raise ValueError("Output directory ({}) already exists and is not empty.".format(d))
#         os.makedirs(d, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()  # the set of label values

    tokenizer = BertTokenizer.from_pretrained(args.bert_model_type, do_lower_case=args.do_lower_case)

    # NOTE(review): do_eval is only stored on args here; presumably the
    # training routine consumes it — verify against do_train_func.
    if args.do_train:
        do_train_func(args, device, processor, num_labels, label_list, tokenizer, n_gpu)
    if args.do_test:
        do_test_func(args, device, processor, num_labels, label_list, tokenizer, n_gpu)

# 실행하기

In [15]:
# Experiment cases in the order addressed by main()'s `case` index.
_case_names = ("uneven-even", "even-even", "upeven-even")

data_dir_list = ["../data/{}/".format(name) for name in _case_names]
bert_model_type = "bert-base-uncased"  # model type to load
output_dir_list = ["./bert_result/{}/".format(name) for name in _case_names]
# Each case's model directory sits in a "model/" folder under its output directory.
model_dir_list = [out_dir + "model/" for out_dir in output_dir_list]

In [16]:

# Training run for case 0 (uneven-even).
main(
    case = 0,# 0: uneven-even / 1: even-even / 2: upeven-even
    data_dirs=data_dir_list, # input data paths
#     data_dir=data_dir_test[0], # input data path
    bert_model_type=bert_model_type, # type of the model to run
    bert_models=model_dir_list, # model paths; the model file and its config (json) must be in the same folder
    
    # config.json, pytorch_model.bin and vocab.txt must all be present.
    # Take care that the file names and the bert_model path setting match.
    task_name="THY", # Glue task name
    output_dirs=output_dir_list, # output paths
    
    # NOTE(review): 5e-3 is 100x main()'s default learning rate of 5e-5 — confirm this is intended.
    learning_rate=5e-3, 
    num_train_epochs=100,
    warmup_proportion=0.1,
    # warmup_proportion matters for training. For example, with 1000 epochs in total and
    # warmup_proportion 0.1, during the first 1000*0.1=100 epochs the rate is lr/100*k, with k growing from 1 to 100.
    
    train_batch_size=8, 
    eval_batch_size=8, 

    do_train=True, 
    do_eval=True,
    do_test=False, 

#     do_train=False, 
#     do_eval=True,
#     do_test=False, 
)

11/23/2020 12:18:26 - INFO - __main__ -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False
11/23/2020 12:18:28 - INFO - __main__ -   LOOKING AT ../data/uneven-even/train.tsv
11/23/2020 12:18:29 - INFO - filelock -   Lock 140276203646096 acquired on /home/yoonjin/.cache/torch/transformers/distributed_-1/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=440473133.0), HTML(value='')))

11/23/2020 12:19:33 - INFO - filelock -   Lock 140276203646096 released on /home/yoonjin/.cache/torch/transformers/distributed_-1/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

[101, 3300, 2365, 2080, 2561, 29610, 22471, 16940, 2110, 100, 1012, 3300, 100, 3742, 100, 100, 100, 100, 1048, 24335, 8458, 13045, 100, 1012, 2187, 16492, 25339, 100, 100, 1048, 24335, 8458, 13045, 100, 100, 1012, 100, 100, 2053, 2695, 7361, 1012, 5866, 4531, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 




RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 7.79 GiB total capacity; 6.41 GiB already allocated; 13.88 MiB free; 6.45 GiB reserved in total by PyTorch)

In [None]:
# Training run for case 1 (even-even).
main(
    case = 1,# 0: uneven-even / 1: even-even / 2: upeven-even
    data_dirs=data_dir_list, # input data paths
#     data_dir=data_dir_test[0], # input data path
    bert_model_type=bert_model_type, # type of the model to run
    bert_models=model_dir_list, # model paths; the model file and its config (json) must be in the same folder
    
    # config.json, pytorch_model.bin and vocab.txt must all be present.
    # Take care that the file names and the bert_model path setting match.
    task_name="THY", # Glue task name
    output_dirs=output_dir_list, # output paths
    
    # NOTE(review): 1e-1 is 2000x main()'s default learning rate of 5e-5 — confirm this is intended.
    learning_rate=1e-1,
    num_train_epochs=10,
    warmup_proportion=0.0,
    
    train_batch_size=8, 
    eval_batch_size=8, 

    do_train=True, 
    do_eval=True,
    do_test=False, 

#     do_train=False, 
#     do_eval=True,
#     do_test=False, 
)

In [None]:
# Training run for case 2 (upeven-even).
main(
    case = 2,# 0: uneven-even / 1: even-even / 2: upeven-even
    data_dirs=data_dir_list, # input data paths
#     data_dir=data_dir_test[0], # input data path
    bert_model_type=bert_model_type, # type of the model to run
    bert_models=model_dir_list, # model paths; the model file and its config (json) must be in the same folder
    
    # config.json, pytorch_model.bin and vocab.txt must all be present.
    # Take care that the file names and the bert_model path setting match.
    task_name="THY", # Glue task name
    output_dirs=output_dir_list, # output paths
    
    # NOTE(review): 1e-1 is 2000x main()'s default learning rate of 5e-5 — confirm this is intended.
    learning_rate=1e-1,
    num_train_epochs=10,
    warmup_proportion=0.0,
    
    train_batch_size=8, 
    eval_batch_size=8, 

    do_train=True, 
    do_eval=True,
    do_test=False, 

#     do_train=False, 
#     do_eval=True,
#     do_test=False, 
)