# PostBERT
- https://github.com/monologg/KoBERT-Transformers
- https://github.com/SKTBrain/KoBERT

In [None]:
## Parameter setting
# Exclusive upper bound of the epoch loop (`range(1, epochs)`), so setEpoch - 1
# epochs are trained; the value is also embedded in the output file name.
setEpoch = 51
# Learning rate handed to AdamW.
setLearningRate = 2e-5
# Epsilon handed to AdamW.
setEpsilon = 1e-8
# Batch size used by the DataLoaders.
setBatch = 16
# Every token-id sequence is padded / truncated to this length.
setMaxLength = 128
# Seed applied to `random`, NumPy and PyTorch before training.
setSeed = 42
# Exclusive upper bound of the trial loop (`range(2, setTry)`).
setTry = 3

In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU it can see and abort the notebook when it is not
# the first GPU device.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

import torch

# Choose the PyTorch device used for the model and every batch.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
!pip install transformers
!pip install kobert_transformers
!pip install tensorflow
!pip install keras
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 649 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 64.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 88.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 68.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime
import sentencepiece as spm

In [None]:
# Mount Google Drive to this Notebook instance.
# The data and output paths used later in the notebook start with
# "drive/My Drive/...", i.e. they are relative paths below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team and Jangwon Park
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for KoBert model."""


import logging
import os
import unicodedata
from shutil import copyfile

from transformers import PreTrainedTokenizer


# Logger used by KoBertTokenizer for warnings and errors.
logger = logging.getLogger(__name__)

# File names of the two vocabulary artefacts: the SentencePiece model and the
# plain-text token list (one token per line).
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer_78b3253a26.model",
                     "vocab_txt": "vocab.txt"}

# Download location of each vocabulary artefact, per pretrained checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/tokenizer_78b3253a26.model",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/tokenizer_78b3253a26.model",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/tokenizer_78b3253a26.model"
    },
    "vocab_txt": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/vocab.txt",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/vocab.txt",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/vocab.txt"
    }
}

# Exposed on the tokenizer class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "monologg/kobert": 512,
    "monologg/kobert-lm": 512,
    "monologg/distilkobert": 512
}

# Default constructor keyword arguments per pretrained checkpoint.
PRETRAINED_INIT_CONFIGURATION = {
    "monologg/kobert": {"do_lower_case": False},
    "monologg/kobert-lm": {"do_lower_case": False},
    "monologg/distilkobert": {"do_lower_case": False}
}

# Marker character that is turned back into a space when tokens are joined
# into a string (see KoBertTokenizer.convert_tokens_to_string).
SPIECE_UNDERLINE = u'▁'


class KoBertTokenizer(PreTrainedTokenizer):
    """
        SentencePiece based tokenizer. Peculiarities:
            - requires `SentencePiece <https://github.com/google/sentencepiece>`_

        Text is split into pieces by a SentencePiece model (``vocab_file``) while
        the piece <-> id mapping comes from a plain-text vocabulary
        (``vocab_txt``, one token per line, line number == id).
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
            self,
            vocab_file,
            vocab_txt,
            do_lower_case=False,
            remove_space=True,
            keep_accents=False,
            unk_token="[UNK]",
            sep_token="[SEP]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            mask_token="[MASK]",
            **kwargs):
        """
        Args:
            vocab_file: path to the SentencePiece model file.
            vocab_txt: path to the text vocabulary (one token per line).
            do_lower_case: lower-case the text in ``preprocess_text``.
            remove_space: collapse whitespace runs in ``preprocess_text``.
            keep_accents: when False, NFKD-normalise and drop combining marks.
            unk_token, sep_token, pad_token, cls_token, mask_token: special
                tokens forwarded to ``PreTrainedTokenizer``.
            **kwargs: forwarded to ``PreTrainedTokenizer``.

        Raises:
            ImportError: if the ``sentencepiece`` package is not installed.
        """
        # Build the vocabulary and load the SentencePiece model BEFORE calling
        # the base constructor: newer transformers releases look up the
        # vocabulary while registering the special tokens inside
        # ``super().__init__``, so these attributes must already exist.
        self.token2idx = dict()
        self.idx2token = []
        with open(vocab_txt, 'r', encoding='utf-8') as f:
            for idx, token in enumerate(f):
                token = token.strip()
                self.token2idx[token] = idx
                self.idx2token.append(token)

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file
        self.vocab_txt = vocab_txt

        self.sp_model = self._load_sp_model(vocab_file)

        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

    @staticmethod
    def _load_sp_model(model_path):
        """Return a SentencePieceProcessor loaded from ``model_path``.

        Raises:
            ImportError: if ``sentencepiece`` is missing. The error is logged
                and re-raised instead of being swallowed, because continuing
                would only fail a few lines later with an unrelated NameError.
        """
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use KoBertTokenizer: "
                           "https://github.com/google/sentencepiece "
                           "(pip install sentencepiece)")
            raise
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(model_path)
        return sp_model

    @property
    def vocab_size(self):
        """Number of entries read from ``vocab_txt`` (added tokens excluded)."""
        return len(self.idx2token)

    def get_vocab(self):
        """Return the token -> id mapping including tokens added after loading."""
        return dict(self.token2idx, **self.added_tokens_encoder)

    def __getstate__(self):
        """Pickle support: copy the instance state without the SentencePiece model."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Pickle support: restore the state and reload the model from ``vocab_file``."""
        self.__dict__ = d
        self.sp_model = self._load_sp_model(self.vocab_file)

    def preprocess_text(self, inputs):
        """Normalise raw text before it is handed to SentencePiece.

        Depending on the instance flags this collapses whitespace, rewrites
        `` and '' to a double quote, strips combining marks after NFKD
        normalisation, and lower-cases the text.
        """
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize('NFKD', outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string.

        Args:
            text: raw text; it is passed through ``preprocess_text`` first.
            return_unicode: unused; kept for signature compatibility.
            sample: when True use SentencePiece sampling
                (``SampleEncodeAsPieces(text, 64, 0.1)``) instead of the
                deterministic segmentation.

        Returns:
            List of SentencePiece pieces.
        """
        text = self.preprocess_text(text)

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            # A piece such as "9," (digit followed by a trailing comma) is
            # re-segmented without the comma, and the comma is emitted as a
            # piece of its own.
            if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                # Re-encoding may introduce a leading underline marker that the
                # original piece did not have; remove it again.
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab.

        Tokens missing from the vocabulary map to the id of ``unk_token``.
        """
        return self.token2idx.get(token, self.token2idx[self.unk_token])

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.idx2token[index]

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A KoBERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
                special tokens for the model
        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        Raises:
            ValueError: if ``token_ids_1`` is given together with ``already_has_special_tokens=True``.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            special_ids = [self.sep_token_id, self.cls_token_id]
            return [1 if x in special_ids else 0 for x in token_ids_0]

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A KoBERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.

        Args:
            save_directory: existing directory to write into.
            filename_prefix: optional prefix; when given, both file names are
                written as ``<prefix>-<name>``. Accepted because the
                transformers 4.x save path passes this keyword.

        Returns:
            Tuple ``(sentencepiece_model_path, vocab_txt_path)``, or ``None``
            when ``save_directory`` is not a directory (an error is logged).
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        prefix = filename_prefix + "-" if filename_prefix else ""

        # 1. Save sentencepiece model
        out_vocab_model = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

        # Copying a file onto itself would fail, so skip when source == target.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
            copyfile(self.vocab_file, out_vocab_model)

        # 2. Save vocab.txt
        index = 0
        out_vocab_txt = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_txt"])
        with open(out_vocab_txt, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return out_vocab_model, out_vocab_txt

In [None]:
for currentTry in range(2,setTry):
  postpositions = ["Eyse","Ey"] #"Lo"
  labelNumber = 0
  
  for postposition in postpositions:

    import pandas as pd

    def _read_split(csv_path):
        """Read one data CSV into a DataFrame with columns index / Label / Sentence.

        The first line is treated as a header and skipped. Each remaining line
        is split on "," and its first three fields are used: row id, integer
        label, sentence. Blank lines are ignored. Rows are labelled 1..n.

        NOTE(review): because of the plain comma split, a sentence that itself
        contains a comma is cut at that comma - confirm the data files never
        contain commas in the sentence column.
        """
        records = []
        # `with` guarantees the handle is closed even if a row fails to parse.
        with open(csv_path, 'r', encoding='utf-8') as handle:
            next(handle, None)  # skip the header line
            for line in handle:
                if not line.strip():
                    continue
                fields = line.split(",")
                records.append((fields[0], int(fields[1]), fields[2].replace("\n", "")))
        # Build the frame once instead of appending row by row.
        return pd.DataFrame(records,
                            columns=('index', 'Label', 'Sentence'),
                            index=range(1, len(records) + 1))

    dataDir = "drive/My Drive/2022/AdverbialPostpositions/Data/"
    test = _read_split(dataDir + "test_" + postposition + ".csv")
    train = _read_split(dataDir + "train_" + postposition + ".csv")


    # Clean the sentences
    def _clean_sentences(column):
        """Return `column` (a Series of strings) with punctuation and whitespace normalised.

        Steps, in order:
          1. the literal two-character sequence backslash + "n" becomes a space.
             This must happen before step 2, which removes backslashes. The
             previous pattern `[\\n]` was a character class and therefore
             deleted every letter "n" as well.
          2. runs of the listed punctuation characters become a space;
          3. any whitespace run (tabs included) collapses to one space;
          4. leading / trailing whitespace is stripped.
        """
        cleaned = column.str.replace(r'(?:\\n)+', " ", regex=True)
        cleaned = cleaned.str.replace(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]+', " ", regex=True)
        cleaned = cleaned.str.replace(r'\s+', " ", regex=True)
        return cleaned.str.strip()

    # Train and test go through exactly the same cleaning.
    train['Sentence'] = _clean_sentences(train['Sentence'])
    test['Sentence'] = _clean_sentences(test['Sentence'])

    # Tokenizer shared by the train, test and ad-hoc encoders below.
    tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

    # Maximum length of an input token sequence.
    MAX_LEN = setMaxLength

    # Batch size.
    batch_size = setBatch

    def _encode_sentences(sentences):
        """Tokenize, index, pad and mask a list of already-wrapped sentences.

        Returns a tuple (tokenized_texts, input_ids, attention_masks):
          - tokenized_texts: list of token lists;
          - input_ids: array of ids, padded / truncated at the end to setMaxLength;
          - attention_masks: list of float lists, 1.0 for real tokens, 0.0 for padding.

        Padding uses the tokenizer's own [PAD] id and the mask compares against
        that id. pad_sequences pads with 0 by default, which is only correct
        when [PAD] happens to have id 0; an `id > 0` mask would also hide any
        genuine token whose id is 0.
        """
        pad_id = tokenizer.pad_token_id
        tokenized = [tokenizer.tokenize(sent) for sent in sentences]
        ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized]
        ids = pad_sequences(ids, maxlen=setMaxLength, dtype="long",
                            truncating="post", padding="post", value=pad_id)
        masks = [[float(token_id != pad_id) for token_id in seq] for seq in ids]
        return tokenized, ids, masks

    # ---------------------------------------------------------------
    # Training / validation data
    # ---------------------------------------------------------------
    # Wrap every training sentence in the BERT input format.
    sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in train['Sentence']]
    labels = list(train['Label'].values)

    tokenized_texts, input_ids, attention_masks = _encode_sentences(sentences)

    # One call splits ids, labels and masks together, so the three can never
    # get out of step (same random_state / test_size as before).
    (train_inputs, validation_inputs,
     train_labels, validation_labels,
     train_masks, validation_masks) = train_test_split(input_ids,
                                                       labels,
                                                       attention_masks,
                                                       random_state=2018,
                                                       test_size=0.1)

    # Convert everything to PyTorch tensors.
    train_inputs = torch.tensor(train_inputs)
    train_labels = torch.tensor(train_labels)
    train_masks = torch.tensor(train_masks)
    validation_inputs = torch.tensor(validation_inputs)
    validation_labels = torch.tensor(validation_labels)
    validation_masks = torch.tensor(validation_masks)

    # Bundle ids, masks and labels; the loaders yield `batch_size` rows at a time.
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    # ---------------------------------------------------------------
    # Test data (same encoding as the training data)
    # ---------------------------------------------------------------
    sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in test['Sentence']]
    labels = list(test['Label'].values)

    tokenized_texts, input_ids, attention_masks = _encode_sentences(sentences)

    test_inputs = torch.tensor(input_ids)
    test_labels = torch.tensor(labels)
    test_masks = torch.tensor(attention_masks)

    test_data = TensorDataset(test_inputs, test_masks, test_labels)
    test_sampler = RandomSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


    # Input data conversion
    def convert_input_data(sentences):
      """Encode `sentences` and return (input id tensor, attention mask tensor).

      The sentences are tokenized as given (no [CLS] / [SEP] is added here) and
      padded / truncated to setMaxLength in the same way as the train and test
      data.
      """
      _, input_ids, attention_masks = _encode_sentences(sentences)

      # Convert to PyTorch tensors.
      inputs = torch.tensor(input_ids)
      masks = torch.tensor(attention_masks)

      return inputs, masks

  
    if postposition == "Lo":#6
      labelNumber = 6

      # Create the BERT classifier with one output per label.
      model = BertForSequenceClassification.from_pretrained("monologg/kobert", num_labels=labelNumber)
      # Move the model to the same `device` object every batch is sent to,
      # instead of unconditionally calling .cuda().
      model.to(device)


      # Accuracy helpers
      def flat_accuracy(preds, labels):
          """Return the fraction of rows whose arg-max score equals the label.

          preds:  2-D array of scores, one row per example.
          labels: NumPy array of integer labels.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()
          n_correct = np.sum(predicted == gold)
          return n_correct / len(gold)
          
      def FNS_flat_accuracy(preds, labels):
          """Share of the label-0 (FNS) examples in this batch that were predicted as 0.

          Returns 0 when the batch holds no label-0 example and also when none
          of them was predicted correctly.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits, members = 0, 0
          for row, guess in enumerate(predicted):
              truth = gold[row]
              if truth != 0:
                  continue
              members += 1
              if guess == truth:
                  hits += 1

          if hits == 0 or members == 0:
              return 0
          return hits / members

      def INS_flat_accuracy(preds, labels):
          """Share of the label-1 (INS) examples in this batch that were predicted as 1.

          Returns 0 when the batch holds no label-1 example and also when none
          of them was predicted correctly.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits, members = 0, 0
          for row, guess in enumerate(predicted):
              truth = gold[row]
              if truth != 1:
                  continue
              members += 1
              if guess == truth:
                  hits += 1

          if hits == 0 or members == 0:
              return 0
          return hits / members

      def DIR_flat_accuracy(preds, labels):
          """Share of the label-2 (DIR) examples in this batch that were predicted as 2.

          Returns 0 when the batch holds no label-2 example and also when none
          of them was predicted correctly.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits, members = 0, 0
          for row, guess in enumerate(predicted):
              truth = gold[row]
              if truth != 2:
                  continue
              members += 1
              if guess == truth:
                  hits += 1

          if hits == 0 or members == 0:
              return 0
          return hits / members

      def EFF_flat_accuracy(preds, labels):
          """Share of the label-3 (EFF) examples in this batch that were predicted as 3.

          Returns 0 when the batch holds no label-3 example and also when none
          of them was predicted correctly.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits, members = 0, 0
          for row, guess in enumerate(predicted):
              truth = gold[row]
              if truth != 3:
                  continue
              members += 1
              if guess == truth:
                  hits += 1

          if hits == 0 or members == 0:
              return 0
          return hits / members

      def CRT_flat_accuracy(preds, labels):
          """Share of the label-4 (CRT) examples in this batch that were predicted as 4.

          Returns 0 when the batch holds no label-4 example and also when none
          of them was predicted correctly.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits, members = 0, 0
          for row, guess in enumerate(predicted):
              truth = gold[row]
              if truth != 4:
                  continue
              members += 1
              if guess == truth:
                  hits += 1

          if hits == 0 or members == 0:
              return 0
          return hits / members

      def LOC_flat_accuracy(preds, labels):
          """Share of the label-5 (LOC) examples in this batch that were predicted as 5.

          Returns 0 when the batch holds no label-5 example and also when none
          of them was predicted correctly.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits, members = 0, 0
          for row, guess in enumerate(predicted):
              truth = gold[row]
              if truth != 5:
                  continue
              members += 1
              if guess == truth:
                  hits += 1

          if hits == 0 or members == 0:
              return 0
          return hits / members

      # Elapsed-time formatting helper
      def format_time(elapsed):
          """Return `elapsed` (seconds) rounded to whole seconds, as an h:mm:ss string."""
          whole_seconds = int(round(elapsed))
          duration = datetime.timedelta(seconds=whole_seconds)
          return str(duration)

      def outreault(guess):
        """Map a predicted class id (0-5) to its function name.

        `guess` is converted with int() first; any id outside 0-5 yields "".
        """
        function_names = ("FNS", "INS", "DIR", "EFF", "CRT", "LOC")
        class_id = int(guess)
        if 0 <= class_id < len(function_names):
          return function_names[class_id]
        return ""

      # Optimizer setup
      optimizer = AdamW(model.parameters(),
                        lr = setLearningRate, # learning rate
                        eps = setEpsilon # epsilon value that guards against division by zero
                      )

      # Number of epochs
      epochs = setEpoch

      # Total training steps: batches per epoch * epochs
      # NOTE(review): the epoch loop below iterates over range(1, epochs), i.e.
      # epochs - 1 times, so fewer scheduler steps are taken than total_steps
      # and the learning rate does not reach the end of the schedule - confirm
      # this is intended.
      total_steps = len(train_dataloader) * epochs

      # Scheduler that lowers the learning rate a little on every step
      scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                  num_warmup_steps = 0,
                                                  num_training_steps = total_steps)
      

      # Fix the random seeds for reproducibility
      seed_val = setSeed
      random.seed(seed_val)
      np.random.seed(seed_val)
      torch.manual_seed(seed_val)
      torch.cuda.manual_seed_all(seed_val)

      # Reset the gradients
      model.zero_grad()

      # Filled per epoch below with that epoch's accuracy / loss summary,
      # keyed "epoch<N>".
      final_info = {}

      # Per-trial result file; the first line written is the CSV header.
      # NOTE(review): the handle stays open past the end of this section -
      # make sure it is closed once training is finished.
      f = open("drive/My Drive/2022/AdverbialPostpositions/Output/BERT/"+postposition+"/Outcomes/"+postposition+"_accuracy_trial_"+str(currentTry)+"_epoch_"+str(epochs)+".txt", 'w')
      f.write("epoch,sentence,originalLabel,predictedLabel,predictedFunction,result"+"\n")

      # 에폭만큼 반복
      for epoch_i in range(1, epochs):
          
          # ========================================
          #               Training
          # ========================================
          
          print("")
          print('======== Epoch {:} / {:} ========'.format(epoch_i, epochs))
          print('Training...')

          # 시작 시간 설정
          t0 = time.time()

          # 로스 초기화
          total_loss = 0

          # 훈련모드로 변경
          model.train()
              
          # 데이터로더에서 배치만큼 반복하여 가져옴
          for step, batch in enumerate(train_dataloader):
              # 경과 정보 표시
              if step % 500 == 0 and not step == 0:
                  elapsed = format_time(time.time() - t0)
                  print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

              # 배치를 GPU에 넣음
              batch = tuple(t.to(device) for t in batch)
              
              # 배치에서 데이터 추출
              b_input_ids, b_input_mask, b_labels = batch

              # Forward 수행                
              outputs = model(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask, 
                              labels=b_labels)
              
              # 로스 구함
              loss = outputs[0]

              # 총 로스 계산
              total_loss += loss.item()

              # Backward 수행으로 그래디언트 계산
              loss.backward()

              # 그래디언트 클리핑
              torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

              # 그래디언트를 통해 가중치 파라미터 업데이트
              optimizer.step()

              # 스케줄러로 학습률 감소
              scheduler.step()

              # 그래디언트 초기화
              model.zero_grad()

          # 평균 로스 계산
          avg_train_loss = total_loss / len(train_dataloader)            

          print("")
          print("  Average training loss: {0:.2f}".format(avg_train_loss))
          print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
              
          # ========================================
          #               Validation
          # ========================================

          print("")
          print("Running Validation...")

          #시작 시간 설정
          t0 = time.time()

          # 평가모드로 변경
          model.eval()

          # 변수 초기화
          eval_loss, eval_accuracy = 0, 0
          nb_eval_steps, nb_eval_examples = 0, 0
          FNS_nb_eval_steps, FNS_eval_accuracy = 0, 0
          INS_nb_eval_steps, INS_eval_accuracy = 0, 0
          DIR_nb_eval_steps, DIR_eval_accuracy = 0, 0
          EFF_nb_eval_steps, EFF_eval_accuracy = 0, 0
          CRT_nb_eval_steps, CRT_eval_accuracy = 0, 0
          LOC_nb_eval_steps, LOC_eval_accuracy = 0, 0

          epoch_info = {}

          # 데이터로더에서 배치만큼 반복하여 가져옴
          for batch in test_dataloader:
              # 배치를 GPU에 넣음
              batch = tuple(t.to(device) for t in batch)
              
              # 배치에서 데이터 추출
              b_input_ids, b_input_mask, b_labels = batch
              
              # 그래디언트 계산 안함
              with torch.no_grad():     
                  # Forward 수행
                  outputs = model(b_input_ids, 
                                  token_type_ids=None, 
                                  attention_mask=b_input_mask)
              
              # 로스 구함
              logits = outputs[0]

              # CPU로 데이터 이동
              logits = logits.detach().cpu().numpy()
              label_ids = b_labels.to('cpu').numpy()
              
              # 출력 로짓과 라벨을 비교하여 정확도 계산
              tmp_eval_accuracy = flat_accuracy(logits, label_ids)
              eval_accuracy += tmp_eval_accuracy
              nb_eval_steps += 1

              FNS_tmp_eval_accuracy = FNS_flat_accuracy(logits, label_ids)
              FNS_eval_accuracy += FNS_tmp_eval_accuracy
              FNS_nb_eval_steps += 1

              INS_tmp_eval_accuracy = INS_flat_accuracy(logits, label_ids)
              INS_eval_accuracy += INS_tmp_eval_accuracy
              INS_nb_eval_steps += 1

              DIR_tmp_eval_accuracy = DIR_flat_accuracy(logits, label_ids)
              DIR_eval_accuracy += DIR_tmp_eval_accuracy
              DIR_nb_eval_steps += 1

              EFF_tmp_eval_accuracy = EFF_flat_accuracy(logits, label_ids)
              EFF_eval_accuracy += EFF_tmp_eval_accuracy
              EFF_nb_eval_steps += 1

              CRT_tmp_eval_accuracy = CRT_flat_accuracy(logits, label_ids)
              CRT_eval_accuracy += CRT_tmp_eval_accuracy
              CRT_nb_eval_steps += 1

              LOC_tmp_eval_accuracy = LOC_flat_accuracy(logits, label_ids)
              LOC_eval_accuracy += LOC_tmp_eval_accuracy
              LOC_nb_eval_steps += 1

          # Report the overall accuracy as the mean of the per-batch accuracies
          # accumulated in the validation loop above.
          print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
          print("  Validation took: {:}".format(format_time(time.time() - t0)))
          print("")
          print("  Detail accuracy  ")
          # Each per-class figure is likewise a mean over batches of a per-batch ratio.
          # NOTE(review): if the per-class helpers of this branch return 0 for a batch
          # that contains no example of the class (as the helpers of the sibling
          # branches do), these means understate the true per-class accuracy -- confirm.
          print("  FNS_Accuracy: {0:.2f}".format(FNS_eval_accuracy/FNS_nb_eval_steps))
          print("  INS_Accuracy: {0:.2f}".format(INS_eval_accuracy/INS_nb_eval_steps))
          print("  DIR_Accuracy: {0:.2f}".format(DIR_eval_accuracy/DIR_nb_eval_steps))
          print("  EFF_Accuracy: {0:.2f}".format(EFF_eval_accuracy/EFF_nb_eval_steps))
          print("  CRT_Accuracy: {0:.2f}".format(CRT_eval_accuracy/CRT_nb_eval_steps))
          print("  LOC_Accuracy: {0:.2f}".format(LOC_eval_accuracy/LOC_nb_eval_steps))

          # Store the same figures (rounded to 3 decimals) plus the average training
          # loss of this epoch.
          epoch_info["Total"] = round(eval_accuracy/nb_eval_steps,3)
          epoch_info["Loss"] = round(avg_train_loss,3)
          epoch_info["FNS"] = round(FNS_eval_accuracy/FNS_nb_eval_steps,3)
          epoch_info["INS"] = round(INS_eval_accuracy/INS_nb_eval_steps,3)
          epoch_info["DIR"] = round(DIR_eval_accuracy/DIR_nb_eval_steps,3)
          epoch_info["EFF"] = round(EFF_eval_accuracy/EFF_nb_eval_steps,3)
          epoch_info["CRT"] = round(CRT_eval_accuracy/CRT_nb_eval_steps,3)
          epoch_info["LOC"] = round(LOC_eval_accuracy/LOC_nb_eval_steps,3)

          # Keyed by epoch number, e.g. "epoch1".
          final_info["epoch"+str(epoch_i)] = epoch_info


          # Imports needed by this section, placed ahead of their first use.
          import numpy as np
          import pandas as pd
          from sklearn.manifold import TSNE
          from plotnine import *  # supplies ggplot, aes and geom_point used below

          # Switch to evaluation mode
          model.eval()

          # `test_data` yields one (input_ids, attention_mask, label) triple per example.
          # Stack the per-example tensors in one call each instead of copying every
          # token id through int(), which costs one scalar conversion per token.
          test_examples = list(test_data)
          test_input_ids = torch.stack([example[0] for example in test_examples]).long().to(device)
          test_input_mask = torch.stack([example[1] for example in test_examples]).long().to(device)
          test_labels = [int(example[2]) for example in test_examples]

          # Forward pass over the whole test set, without gradient tracking
          with torch.no_grad():
              outputs = model(test_input_ids,
                              token_type_ids=None,
                              attention_mask=test_input_mask)

          # First model output, one row per test example (the validation code of this
          # notebook treats the same output as the classification logits).
          sentence_vecs_sum = outputs[0]

          # One bulk device-to-host copy; cast to float64 so t-SNE gets double-precision input.
          initial_df = pd.DataFrame(sentence_vecs_sum.detach().cpu().numpy().astype(np.float64))

          # 2-D t-SNE projection with a fixed random_state
          tsne = TSNE(n_components=2, random_state=0)
          tsne_obj= tsne.fit_transform(initial_df)

          tsne_df = pd.DataFrame({'X':tsne_obj[:,0],'Y':tsne_obj[:,1],'Label':test_labels})

          # One CSV per trial and epoch
          tsne_df.to_csv("drive/My Drive/2022/AdverbialPostpositions/Output/BERT/"+postposition+"/t-SNE/"+postposition+"_tSNE_trial_"+str(currentTry)+"_epoch_"+str(epoch_i)+".csv")

          print("")
          print("  Network visualization  ")
          print(ggplot(tsne_df, aes(x='X', y='Y')) + geom_point(aes(colour = 'Label')))

          # 문장 테스트
          def test_sentences(sentences):
              """Run the current model on raw sentences and return its first output.

              Args:
                  sentences: sequence of sentences passed to ``convert_input_data``.

              Returns:
                  NumPy array holding ``outputs[0]`` of the model (used by the caller
                  as class scores), moved to the CPU.
              """
              # Inference only: evaluation mode, no gradient tracking
              model.eval()

              # Turn the sentences into model inputs
              token_ids, attention = convert_input_data(sentences)

              with torch.no_grad():
                  result = model(token_ids.to(device),
                                 token_type_ids=None,
                                 attention_mask=attention.to(device))

              # Detach the first output and bring it back to host memory
              return result[0].detach().cpu().numpy()


          import csv

          testSentences = test['Sentence']

          # csv.writer quotes any field containing a comma, quote or newline, so a
          # sentence with punctuation can no longer shift the columns of its row.
          # Rows without such characters are written exactly as before.
          result_writer = csv.writer(f, lineterminator="\n")

          totalNum = 0
          correctNum = 0
          for each in range(0, len(testSentences)):
              # Label-based lookup: assumes `test` is indexed 1..N -- TODO confirm
              row_label = each + 1
              sentence = test['Sentence'][row_label]
              gold = str(test['Label'][row_label])

              # Predict one sentence; the arg-max of the scores is the guessed label
              logits = test_sentences([sentence])
              guess = str(np.argmax(logits))

              is_correct = guess == gold
              # Columns: epoch, sentence, originalLabel, predictedLabel, predictedFunction, result
              result_writer.writerow([epoch_i, sentence, gold, guess, outreault(guess), int(is_correct)])
              if is_correct:
                  correctNum = correctNum + 1
              totalNum = totalNum + 1

          # Guard the ratio so an empty test set reports 0.0 instead of raising
          accuracy = (correctNum/totalNum) if totalNum else 0.0
          print("totalNum: ", totalNum, " correctNum: ", correctNum, " accuracy: ", accuracy)

      # All epochs of this branch are done: print the per-epoch summary collected in
      # final_info.
      print("")
      print("Training complete!")
      print("")
      print("Final result is below!")
      print(final_info)

      # Close the per-sentence results file.
      # NOTE(review): not reached if an exception is raised during the epochs above.
      f.close()



    elif postposition == "Eyse":#2
      labelNumber = 2

      # Build the KoBERT sequence classifier with one output per function label
      model = BertForSequenceClassification.from_pretrained("monologg/kobert", num_labels=labelNumber)
      # Place the model on the device selected at the top of the notebook. Every batch
      # is moved with `.to(device)`, so the model has to live on that same device;
      # `model.cuda()` would raise on a CPU-only runtime.
      model.to(device)


      # 정확도 계산 함수
      def flat_accuracy(preds, labels):
          """Return the fraction of rows in ``preds`` whose arg-max equals the label.

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids (flattened before comparing).

          Returns:
              Number of matching positions divided by the number of labels.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          is_hit = predicted == gold
          return np.sum(is_hit) / len(gold)
          
      def SRC_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled SRC (class id 0).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct SRC predictions divided by the number of SRC examples, or
              the int 0 when there is no SRC example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 0:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      def LOC_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled LOC (class id 1).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct LOC predictions divided by the number of LOC examples, or
              the int 0 when there is no LOC example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 1:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      # 시간 표시 함수
      def format_time(elapsed):
          """Format a duration given in seconds as an ``h:mm:ss`` string.

          Args:
              elapsed: duration in seconds (may be fractional).

          Returns:
              ``str`` of a ``datetime.timedelta`` built from the rounded seconds.
          """
          whole_seconds = int(round(elapsed))
          duration = datetime.timedelta(seconds=whole_seconds)
          return str(duration)

      def outreault(guess):
        """Map a predicted class id to its function name for the "Eyse" labels.

        Args:
            guess: class id as an int or as a string accepted by ``int()``.

        Returns:
            "SRC" for 0, "LOC" for 1, and "" for any other id.
        """
        function_names = {0: "SRC", 1: "LOC"}
        return function_names.get(int(guess), "")

      # Optimizer
      optimizer = AdamW(model.parameters(),
                        lr = setLearningRate, # learning rate
                        eps = setEpsilon # epsilon that guards against division by zero
                      )

      # Upper bound of the epoch counter (the loop below iterates `range(1, epochs)`)
      epochs = setEpoch

      # Total optimizer steps = batches per epoch * number of epochs actually run.
      # `range(1, epochs)` performs `epochs - 1` passes, so the schedule horizon must
      # use `epochs - 1`; with `epochs` the linear decay would never reach zero.
      total_steps = len(train_dataloader) * (epochs - 1)

      # Scheduler that linearly decreases the learning rate, with no warm-up
      scheduler = get_linear_schedule_with_warmup(optimizer,
                                                  num_warmup_steps = 0,
                                                  num_training_steps = total_steps)

      # Fix the random seeds of every library used during training
      seed_val = setSeed
      random.seed(seed_val)
      np.random.seed(seed_val)
      torch.manual_seed(seed_val)
      torch.cuda.manual_seed_all(seed_val)

      # Clear any stale gradients
      model.zero_grad()

      # Per-epoch metrics, keyed "epoch<N>"
      final_info = {}

      # Per-sentence prediction log for this trial. The encoding is explicit because
      # the sentences are Korean and the platform default is not always UTF-8.
      f = open("drive/My Drive/2022/AdverbialPostpositions/Output/BERT/"+postposition+"/Outcomes/"+postposition+"_accuracy_trial_"+str(currentTry)+"_epoch_"+str(epochs)+".txt", 'w', encoding="utf-8")
      f.write("epoch,sentence,originalLabel,predictedLabel,predictedFunction,result"+"\n")

      # Epoch loop; `range(1, epochs)` performs epochs - 1 passes, numbered from 1
      for epoch_i in range(1, epochs):

          # ========================================
          #               Training
          # ========================================

          print("")
          print('======== Epoch {:} / {:} ========'.format(epoch_i, epochs))
          print('Training...')

          # Record the start time of this epoch
          t0 = time.time()

          # Reset the accumulated loss
          total_loss = 0

          # Switch to training mode
          model.train()

          # Iterate over the batches served by the data loader
          for step, batch in enumerate(train_dataloader):
              # Progress report every 500 batches (skipping batch 0)
              if step % 500 == 0 and not step == 0:
                  elapsed = format_time(time.time() - t0)
                  print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

              # Move the batch tensors to the compute device
              batch = tuple(t.to(device) for t in batch)

              # Unpack the batch
              b_input_ids, b_input_mask, b_labels = batch

              # Forward pass; labels are supplied, and the first output is used as the loss
              outputs = model(b_input_ids,
                              token_type_ids=None,
                              attention_mask=b_input_mask,
                              labels=b_labels)

              # Loss of this batch
              loss = outputs[0]

              # Accumulate the loss as a Python number
              total_loss += loss.item()

              # Backward pass computes the gradients
              loss.backward()

              # Clip the gradient norm to 1.0
              torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

              # Update the weights from the gradients
              optimizer.step()

              # Advance the learning-rate schedule
              scheduler.step()

              # Reset the gradients for the next batch
              model.zero_grad()

          # Mean loss over the batches of this epoch
          avg_train_loss = total_loss / len(train_dataloader)

          print("")
          print("  Average training loss: {0:.2f}".format(avg_train_loss))
          print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
              
          # ========================================
          #               Validation
          # ========================================

          print("")
          print("Running Validation...")

          # Record the start time of the validation pass
          t0 = time.time()

          # Switch to evaluation mode
          model.eval()

          # Function labels of this branch and their class ids, in reporting order
          class_ids = {"SRC": 0, "LOC": 1}

          # Counts are accumulated over the WHOLE test set. Averaging per-batch ratios
          # instead would score a batch without any example of a class as 0 for that
          # class and give a short final batch the same weight as a full one.
          eval_correct, nb_eval_examples = 0, 0
          class_correct = {class_name: 0 for class_name in class_ids}
          class_total = {class_name: 0 for class_name in class_ids}

          epoch_info = {}

          # Iterate over the batches served by the data loader
          for batch in test_dataloader:
              # Move the batch tensors to the compute device
              batch = tuple(t.to(device) for t in batch)

              # Unpack the batch
              b_input_ids, b_input_mask, b_labels = batch

              # Forward pass without gradient tracking
              with torch.no_grad():
                  outputs = model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_mask)

              # Class scores and gold labels as host-side NumPy arrays
              logits = outputs[0].detach().cpu().numpy()
              label_ids = b_labels.to('cpu').numpy()

              pred_flat = np.argmax(logits, axis=1).flatten()
              labels_flat = label_ids.flatten()

              eval_correct += int(np.sum(pred_flat == labels_flat))
              nb_eval_examples += len(labels_flat)

              for class_name, class_id in class_ids.items():
                  in_class = labels_flat == class_id
                  class_total[class_name] += int(np.sum(in_class))
                  class_correct[class_name] += int(np.sum(pred_flat[in_class] == class_id))

          # Exact ratios; a class (or a test set) with no examples reports 0.0
          total_accuracy = eval_correct / nb_eval_examples if nb_eval_examples else 0.0
          class_accuracy = {
              class_name: (class_correct[class_name] / class_total[class_name]
                           if class_total[class_name] else 0.0)
              for class_name in class_ids
          }

          print("  Accuracy: {0:.2f}".format(total_accuracy))
          print("  Validation took: {:}".format(format_time(time.time() - t0)))
          print("")
          print("  Detail accuracy  ")
          for class_name in class_ids:
              print("  {0}_Accuracy: {1:.2f}".format(class_name, class_accuracy[class_name]))

          # Same figures rounded to 3 decimals, plus the average training loss
          epoch_info["Total"] = round(total_accuracy,3)
          epoch_info["Loss"] = round(avg_train_loss,3)
          for class_name in class_ids:
              epoch_info[class_name] = round(class_accuracy[class_name],3)

          final_info["epoch"+str(epoch_i)] = epoch_info


          # Imports needed by this section, placed ahead of their first use.
          import numpy as np
          import pandas as pd
          from sklearn.manifold import TSNE
          from plotnine import *  # supplies ggplot, aes and geom_point used below

          # Switch to evaluation mode
          model.eval()

          # `test_data` yields one (input_ids, attention_mask, label) triple per example.
          # Stack the per-example tensors in one call each instead of copying every
          # token id through int(), which costs one scalar conversion per token.
          test_examples = list(test_data)
          test_input_ids = torch.stack([example[0] for example in test_examples]).long().to(device)
          test_input_mask = torch.stack([example[1] for example in test_examples]).long().to(device)
          test_labels = [int(example[2]) for example in test_examples]

          # Forward pass over the whole test set, without gradient tracking
          with torch.no_grad():
              outputs = model(test_input_ids,
                              token_type_ids=None,
                              attention_mask=test_input_mask)

          # First model output, one row per test example (the validation code of this
          # notebook treats the same output as the classification logits).
          sentence_vecs_sum = outputs[0]

          # One bulk device-to-host copy; cast to float64 so t-SNE gets double-precision input.
          initial_df = pd.DataFrame(sentence_vecs_sum.detach().cpu().numpy().astype(np.float64))

          # 2-D t-SNE projection with a fixed random_state
          tsne = TSNE(n_components=2, random_state=0)
          tsne_obj= tsne.fit_transform(initial_df)

          tsne_df = pd.DataFrame({'X':tsne_obj[:,0],'Y':tsne_obj[:,1],'Label':test_labels})

          # One CSV per trial and epoch
          tsne_df.to_csv("drive/My Drive/2022/AdverbialPostpositions/Output/BERT/"+postposition+"/t-SNE/"+postposition+"_tSNE_trial_"+str(currentTry)+"_epoch_"+str(epoch_i)+".csv")

          print("")
          print("  Network visualization  ")
          print(ggplot(tsne_df, aes(x='X', y='Y')) + geom_point(aes(colour = 'Label')))

          # 문장 테스트
          def test_sentences(sentences):
              """Run the current model on raw sentences and return its first output.

              Args:
                  sentences: sequence of sentences passed to ``convert_input_data``.

              Returns:
                  NumPy array holding ``outputs[0]`` of the model (used by the caller
                  as class scores), moved to the CPU.
              """
              # Inference only: evaluation mode, no gradient tracking
              model.eval()

              # Turn the sentences into model inputs
              token_ids, attention = convert_input_data(sentences)

              with torch.no_grad():
                  result = model(token_ids.to(device),
                                 token_type_ids=None,
                                 attention_mask=attention.to(device))

              # Detach the first output and bring it back to host memory
              return result[0].detach().cpu().numpy()


          import csv

          testSentences = test['Sentence']

          # csv.writer quotes any field containing a comma, quote or newline, so a
          # sentence with punctuation can no longer shift the columns of its row.
          # Rows without such characters are written exactly as before.
          result_writer = csv.writer(f, lineterminator="\n")

          totalNum = 0
          correctNum = 0
          for each in range(0, len(testSentences)):
              # Label-based lookup: assumes `test` is indexed 1..N -- TODO confirm
              row_label = each + 1
              sentence = test['Sentence'][row_label]
              gold = str(test['Label'][row_label])

              # Predict one sentence; the arg-max of the scores is the guessed label
              logits = test_sentences([sentence])
              guess = str(np.argmax(logits))

              is_correct = guess == gold
              # Columns: epoch, sentence, originalLabel, predictedLabel, predictedFunction, result
              result_writer.writerow([epoch_i, sentence, gold, guess, outreault(guess), int(is_correct)])
              if is_correct:
                  correctNum = correctNum + 1
              totalNum = totalNum + 1

          # Guard the ratio so an empty test set reports 0.0 instead of raising
          accuracy = (correctNum/totalNum) if totalNum else 0.0
          print("totalNum: ", totalNum, " correctNum: ", correctNum, " accuracy: ", accuracy)

      # All epochs of this branch are done: print the per-epoch summary collected in
      # final_info.
      print("")
      print("Training complete!")
      print("")
      print("Final result is below!")
      print(final_info)

      # Close the per-sentence results file.
      # NOTE(review): not reached if an exception is raised during the epochs above.
      f.close()

        
    elif postposition == "Ey":#8
      labelNumber = 8

      # Build the KoBERT sequence classifier with one output per function label
      model = BertForSequenceClassification.from_pretrained("monologg/kobert", num_labels=labelNumber)
      # Place the model on the device selected at the top of the notebook. Every batch
      # is moved with `.to(device)`, so the model has to live on that same device;
      # `model.cuda()` would raise on a CPU-only runtime.
      model.to(device)


      # 정확도 계산 함수
      def flat_accuracy(preds, labels):
          """Return the fraction of rows in ``preds`` whose arg-max equals the label.

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids (flattened before comparing).

          Returns:
              Number of matching positions divided by the number of labels.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          is_hit = predicted == gold
          return np.sum(is_hit) / len(gold)
          
      def FNS_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled FNS (class id 0).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct FNS predictions divided by the number of FNS examples, or
              the int 0 when there is no FNS example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 0:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      def INS_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled INS (class id 1).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct INS predictions divided by the number of INS examples, or
              the int 0 when there is no INS example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 1:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      def GOL_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled GOL (class id 2).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct GOL predictions divided by the number of GOL examples, or
              the int 0 when there is no GOL example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 2:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      def EFF_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled EFF (class id 3).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct EFF predictions divided by the number of EFF examples, or
              the int 0 when there is no EFF example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 3:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      def CRT_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled CRT (class id 4).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct CRT predictions divided by the number of CRT examples, or
              the int 0 when there is no CRT example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 4:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      def LOC_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled LOC (class id 5).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct LOC predictions divided by the number of LOC examples, or
              the int 0 when there is no LOC example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 5:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      def AGT_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled AGT (class id 6).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct AGT predictions divided by the number of AGT examples, or
              the int 0 when there is no AGT example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 6:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      def THM_flat_accuracy(preds, labels):
          """Accuracy restricted to the examples labelled THM (class id 7).

          Args:
              preds: 2-D array of per-class scores, one row per example.
              labels: NumPy array of gold class ids.

          Returns:
              Correct THM predictions divided by the number of THM examples, or
              the int 0 when there is no THM example or none was predicted right.
          """
          predicted = np.argmax(preds, axis=1).flatten()
          gold = labels.flatten()

          hits = 0
          support = 0
          for position, guess in enumerate(predicted):
              truth = gold[position]
              if truth != 7:
                  continue
              support += 1
              if guess == truth:
                  hits += 1

          # hits > 0 implies support > 0, so one check covers both zero cases
          return hits / support if hits else 0

      # 시간 표시 함수
      def format_time(elapsed):
          """Format a duration given in seconds as an ``h:mm:ss`` string.

          Args:
              elapsed: duration in seconds (may be fractional).

          Returns:
              ``str`` of a ``datetime.timedelta`` built from the rounded seconds.
          """
          whole_seconds = int(round(elapsed))
          duration = datetime.timedelta(seconds=whole_seconds)
          return str(duration)

      def outreault(guess):
        """Map a predicted class id to its function name for the "Ey" labels.

        Args:
            guess: class id as an int or as a string accepted by ``int()``.

        Returns:
            The function name for ids 0-7 and "" for any other id.
        """
        function_names = {
            0: "FNS",
            1: "INS",
            2: "GOL",
            3: "EFF",
            4: "CRT",
            5: "LOC",
            6: "AGT",
            7: "THM",
        }
        return function_names.get(int(guess), "")

      # Optimizer
      optimizer = AdamW(model.parameters(),
                        lr = setLearningRate, # learning rate
                        eps = setEpsilon # epsilon that guards against division by zero
                      )

      # Upper bound of the epoch counter (the loop below iterates `range(1, epochs)`)
      epochs = setEpoch

      # Total optimizer steps = batches per epoch * number of epochs actually run.
      # `range(1, epochs)` performs `epochs - 1` passes, so the schedule horizon must
      # use `epochs - 1`; with `epochs` the linear decay would never reach zero.
      total_steps = len(train_dataloader) * (epochs - 1)

      # Scheduler that linearly decreases the learning rate, with no warm-up
      scheduler = get_linear_schedule_with_warmup(optimizer,
                                                  num_warmup_steps = 0,
                                                  num_training_steps = total_steps)

      # Fix the random seeds of every library used during training
      seed_val = setSeed
      random.seed(seed_val)
      np.random.seed(seed_val)
      torch.manual_seed(seed_val)
      torch.cuda.manual_seed_all(seed_val)

      # Clear any stale gradients
      model.zero_grad()

      # Per-epoch metrics, keyed "epoch<N>"
      final_info = {}

      # Per-sentence prediction log for this trial. The encoding is explicit because
      # the sentences are Korean and the platform default is not always UTF-8.
      f = open("drive/My Drive/2022/AdverbialPostpositions/Output/BERT/"+postposition+"/Outcomes/"+postposition+"_accuracy_trial_"+str(currentTry)+"_epoch_"+str(epochs)+".txt", 'w', encoding="utf-8")
      f.write("epoch,sentence,originalLabel,predictedLabel,predictedFunction,result"+"\n")

      # Epoch loop; `range(1, epochs)` performs epochs - 1 passes, numbered from 1
      for epoch_i in range(1, epochs):

          # ========================================
          #               Training
          # ========================================

          print("")
          print('======== Epoch {:} / {:} ========'.format(epoch_i, epochs))
          print('Training...')

          # Record the start time of this epoch
          t0 = time.time()

          # Reset the accumulated loss
          total_loss = 0

          # Switch to training mode
          model.train()

          # Iterate over the batches served by the data loader
          for step, batch in enumerate(train_dataloader):
              # Progress report every 500 batches (skipping batch 0)
              if step % 500 == 0 and not step == 0:
                  elapsed = format_time(time.time() - t0)
                  print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

              # Move the batch tensors to the compute device
              batch = tuple(t.to(device) for t in batch)

              # Unpack the batch
              b_input_ids, b_input_mask, b_labels = batch

              # Forward pass; labels are supplied, and the first output is used as the loss
              outputs = model(b_input_ids,
                              token_type_ids=None,
                              attention_mask=b_input_mask,
                              labels=b_labels)

              # Loss of this batch
              loss = outputs[0]

              # Accumulate the loss as a Python number
              total_loss += loss.item()

              # Backward pass computes the gradients
              loss.backward()

              # Clip the gradient norm to 1.0
              torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

              # Update the weights from the gradients
              optimizer.step()

              # Advance the learning-rate schedule
              scheduler.step()

              # Reset the gradients for the next batch
              model.zero_grad()

          # Mean loss over the batches of this epoch
          avg_train_loss = total_loss / len(train_dataloader)

          print("")
          print("  Average training loss: {0:.2f}".format(avg_train_loss))
          print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
              
          # ========================================
          #               Validation
          # ========================================

          print("")
          print("Running Validation...")

          # Record the start time of the validation pass
          t0 = time.time()

          # Switch to evaluation mode
          model.eval()

          # Function labels of this branch and their class ids, in reporting order
          class_ids = {
              "FNS": 0,
              "INS": 1,
              "GOL": 2,
              "EFF": 3,
              "CRT": 4,
              "LOC": 5,
              "AGT": 6,
              "THM": 7,
          }

          # Counts are accumulated over the WHOLE test set. Averaging per-batch ratios
          # instead would score a batch without any example of a class as 0 for that
          # class and give a short final batch the same weight as a full one.
          eval_correct, nb_eval_examples = 0, 0
          class_correct = {class_name: 0 for class_name in class_ids}
          class_total = {class_name: 0 for class_name in class_ids}

          epoch_info = {}

          # Iterate over the batches served by the data loader
          for batch in test_dataloader:
              # Move the batch tensors to the compute device
              batch = tuple(t.to(device) for t in batch)

              # Unpack the batch
              b_input_ids, b_input_mask, b_labels = batch

              # Forward pass without gradient tracking
              with torch.no_grad():
                  outputs = model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_mask)

              # Class scores and gold labels as host-side NumPy arrays
              logits = outputs[0].detach().cpu().numpy()
              label_ids = b_labels.to('cpu').numpy()

              pred_flat = np.argmax(logits, axis=1).flatten()
              labels_flat = label_ids.flatten()

              eval_correct += int(np.sum(pred_flat == labels_flat))
              nb_eval_examples += len(labels_flat)

              for class_name, class_id in class_ids.items():
                  in_class = labels_flat == class_id
                  class_total[class_name] += int(np.sum(in_class))
                  class_correct[class_name] += int(np.sum(pred_flat[in_class] == class_id))

          # Exact ratios; a class (or a test set) with no examples reports 0.0
          total_accuracy = eval_correct / nb_eval_examples if nb_eval_examples else 0.0
          class_accuracy = {
              class_name: (class_correct[class_name] / class_total[class_name]
                           if class_total[class_name] else 0.0)
              for class_name in class_ids
          }

          print("  Accuracy: {0:.2f}".format(total_accuracy))
          print("  Validation took: {:}".format(format_time(time.time() - t0)))
          print("")
          print("  Detail accuracy  ")
          for class_name in class_ids:
              print("  {0}_Accuracy: {1:.2f}".format(class_name, class_accuracy[class_name]))

          # Same figures rounded to 3 decimals, plus the average training loss
          epoch_info["Total"] = round(total_accuracy,3)
          epoch_info["Loss"] = round(avg_train_loss,3)
          for class_name in class_ids:
              epoch_info[class_name] = round(class_accuracy[class_name],3)

          final_info["epoch"+str(epoch_i)] = epoch_info


          # Imports needed by this section, placed ahead of their first use.
          import numpy as np
          import pandas as pd
          from sklearn.manifold import TSNE
          from plotnine import *  # supplies ggplot, aes and geom_point used below

          # Switch to evaluation mode
          model.eval()

          # `test_data` yields one (input_ids, attention_mask, label) triple per example.
          # Stack the per-example tensors in one call each instead of copying every
          # token id through int(), which costs one scalar conversion per token.
          test_examples = list(test_data)
          test_input_ids = torch.stack([example[0] for example in test_examples]).long().to(device)
          test_input_mask = torch.stack([example[1] for example in test_examples]).long().to(device)
          test_labels = [int(example[2]) for example in test_examples]

          # Forward pass over the whole test set, without gradient tracking
          with torch.no_grad():
              outputs = model(test_input_ids,
                              token_type_ids=None,
                              attention_mask=test_input_mask)

          # First model output, one row per test example (the validation code of this
          # notebook treats the same output as the classification logits).
          sentence_vecs_sum = outputs[0]

          # One bulk device-to-host copy; cast to float64 so t-SNE gets double-precision input.
          initial_df = pd.DataFrame(sentence_vecs_sum.detach().cpu().numpy().astype(np.float64))

          # 2-D t-SNE projection with a fixed random_state
          tsne = TSNE(n_components=2, random_state=0)
          tsne_obj= tsne.fit_transform(initial_df)

          tsne_df = pd.DataFrame({'X':tsne_obj[:,0],'Y':tsne_obj[:,1],'Label':test_labels})

          # One CSV per trial and epoch
          tsne_df.to_csv("drive/My Drive/2022/AdverbialPostpositions/Output/BERT/"+postposition+"/t-SNE/"+postposition+"_tSNE_trial_"+str(currentTry)+"_epoch_"+str(epoch_i)+".csv")

          print("")
          print("  Network visualization  ")
          print(ggplot(tsne_df, aes(x='X', y='Y')) + geom_point(aes(colour = 'Label')))

          # Sentence-level inference helper
          def test_sentences(sentences):
              """Run the model on raw sentences and return its first output.

              Args:
                  sentences: passed unchanged to ``convert_input_data``, which is
                      expected to return an ``(inputs, masks)`` pair of tensors.

              Returns:
                  ``outputs[0]`` of the forward pass as a NumPy array on the CPU
                  (callers take ``argmax`` of it to obtain the predicted label).

              Relies on the module-level ``model``, ``device`` and
              ``convert_input_data``.
              """
              # Evaluation mode
              model.eval()

              # Turn the sentences into model inputs and move them to the device.
              token_ids, attention_masks = convert_input_data(sentences)
              token_ids = token_ids.to(device)
              attention_masks = attention_masks.to(device)

              # Forward pass without gradient tracking.
              with torch.no_grad():
                  model_outputs = model(token_ids,
                                        token_type_ids=None,
                                        attention_mask=attention_masks)

              # Take the first output (not a loss) and hand it back on the CPU.
              return model_outputs[0].detach().cpu().numpy()


          # Local import, in keeping with the other imports in this cell.
          import csv

          testSentences = test['Sentence']

          # Rows go through csv.writer so that a sentence containing a comma, a
          # double quote or a newline is quoted instead of silently breaking the
          # column layout. Rows without such characters come out byte-identical to
          # plain ","-joined text; lineterminator keeps the "\n" line ending.
          result_writer = csv.writer(f, lineterminator="\n")

          totalNum = 0
          correctNum = 0
          for each in range(0, len(testSentences)):
              # NOTE(review): label-based lookup with a +1 offset -- presumably the
              # index of `test` runs from 1 to N; confirm where `test` is loaded.
              sentence = test['Sentence'][each + 1]
              gold_label = str(test['Label'][each + 1])

              # Classify the single sentence; the predicted label is the argmax of
              # the returned array, compared as a string.
              logits = test_sentences([sentence])
              guess = str(np.argmax(logits))
              is_correct = guess == gold_label

              # Columns: epoch, sentence, gold label, predicted label, the
              # outreault() rendering of the prediction, and a 1/0 correctness flag.
              result_writer.writerow([str(epoch_i), sentence, gold_label, guess,
                                      outreault(guess), "1" if is_correct else "0"])

              if is_correct:
                  correctNum = correctNum + 1
              totalNum = totalNum + 1

          print("totalNum: ", totalNum, " correctNum: ", correctNum, " accuracy: ", (correctNum/totalNum))

      # Announce the end of training and dump the collected per-epoch summaries.
      print("\nTraining complete!\n")
      print("Final result is below!")
      print(final_info)

      # Close the output file handle.
      f.close()

      


Output hidden; open in https://colab.research.google.com to view.