In [2]:
import torch
import gc
import os


def reset_cuda():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("✅ GPU memory cleared")

reset_cuda()

# 이후 tokenizer, model, dataset, trainer 순서로 코드 실행


✅ GPU memory cleared


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "skt/kogpt2-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# basemodel and dataset
- kogpt-2 
- model.gradient_checkpointing_enable()

In [12]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.optim import Adam
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import Trainer, TrainingArguments
from copy import deepcopy
import copy
import logging
import json
from dataclasses import dataclass

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
# 모델과 토크나이저 불러오기
model = AutoModelForCausalLM.from_pretrained('skt/kogpt2-base-v2')
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', unk_token='</s>', pad_token='</s>',
    padding_side="right",
    model_max_length=512,
)

print(tokenizer)

GPT2TokenizerFast(name_or_path='skt/kogpt2-base-v2', vocab_size=51200, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=True)


In [13]:
# 모델 인퍼런스단계에서 사용할 prompt 딕셔너리 템플릿, sft 데이터셋 클래스 정의
from typing import Optional, Dict, Sequence

class SFT_dataset(Dataset):

    def __init__(self, data_path_1_SFT: str, tokenizer: transformers.PreTrainedTokenizer, verbose=False):
        super(SFT_dataset, self).__init__()
        logging.warning("Loading data...")

        pattern_instruction = 'prompt'  # instruction
        pattern_output = 'completion'  # response

        with open(data_path_1_SFT, "r", encoding='utf-8-sig') as json_file:
            list_data_dict = json.load(json_file)

        PROMPT_DICT = {
            "prompt_input": (
                "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
            )
        }

        prompt_input = PROMPT_DICT["prompt_input"]

        sources = []
        for example in list_data_dict:
            tmp = prompt_input.format_map(example)
            sources.append(tmp)

        targets = []
        for example in list_data_dict:
            targets.append(f"{example[pattern_output]}{tokenizer.eos_token}")
        examples = [s + t for s, t in zip(sources, targets)]

        sources_tokenized = self._tokenize_fn(sources, tokenizer)  # source
        examples_tokenized = self._tokenize_fn(examples, tokenizer)  # source + target

        input_ids = examples_tokenized["input_ids"]
        labels = copy.deepcopy(input_ids)
        for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
            label[:source_len] = -100

        data_dict = dict(input_ids=input_ids, labels=labels)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]
        logging.warning("Loading data done!!: %d"%(len(self.labels)))


    def _tokenize_fn(self, strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
        tokenized_list = [
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
            for text in strings
        ]
        input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
        input_ids_lens = labels_lens = [
            tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
        ]
        return dict(
            input_ids=input_ids,
            labels=labels,
            input_ids_lens=input_ids_lens,
            labels_lens=labels_lens,
        )


    def __len__(self):
        return len(self.input_ids)


    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])

In [14]:
@dataclass
class DataCollatorForSupervisedDataset(object): 

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value= -100)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

sft를 시도할 initial 모델에 쓸 데이터셋

In [6]:
train_dataset = SFT_dataset(data_path_1_SFT='./KoChatGPT/data_kochatgpt/kochatgpt_1_SFT.jsonl', tokenizer=tokenizer)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

print('input : %s'%train_dataset.input_ids[0])
print('output: %s'%train_dataset.labels[0])



input : tensor([  739,   378,   378,   378, 14659, 13394, 37091, 10651,   383, 25841,
         8006, 14914,   375,  7673, 20479,  8091, 22311,  9036, 30902, 13675,
          375,   378,   378,   378, 41951,   454,  9549, 20549,   383,  8142,
         7192, 14914,   382, 37767, 13753,  8263,  7166,   739,  8352,  7659,
         9594, 25585, 13600,  8022,  9378, 11532,  9887, 11218,  9111, 16691,
        10351, 10561,  9128, 20479,  8091,  9065,  9446,  9036, 28420, 26521,
        10163, 26367,  6958,  9030,  9882, 12317, 25882,  9209, 37194, 10351,
         9036, 12168, 10529, 15989,  9719, 15434, 10552, 11188, 13362,  9036,
        15805, 11300, 11846,  9146, 16691,  9181,  7397, 15806, 13480, 11342,
        17596,  9161, 19996,  9025, 25006, 18595,  9966, 12592, 10751, 11814,
         8711,  9046, 12450,  9117,  7377, 12521,     1])
output: tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -10

RM에 사용할 데이터셋

In [7]:
data_path_2_RM = './KoChatGPT/data_kochatgpt/kochatgpt_2_RM.jsonl'
with open(data_path_2_RM, "r", encoding='utf-8-sig') as json_file:
    list_data_dict = json.load(json_file)

print(len(list_data_dict))
list_data_dict[:3]

10220


[{'prompt': '번디는 자신이 탐정잡지, 범죄소설 그리고 성범죄 관련 실제 범죄 다큐멘터리들을 탐독했다고 누구에게 말했나?',
  'completion_0': 'Allow me to answer your question. I know that you are curious about me.',
  'completion_1': '번디는 다양한 인터뷰자들과 뉴스홍보 담당자들과의 면담 때 밝혔다.',
  'completion_2': '라이언에게 말했다.',
  'ranking': [2, 1, 0]},
 {'prompt': '개포주공아파트는 몇 단지로 이루어져 있나?',
  'completion_0': '개포주공아파트는 다섯 단지로 이루어져 있습니다.',
  'completion_1': '이날 목송에서 구글상위노',
  'completion_2': '개포주공아파트는 총 27개 단지로 이루어져 있습니다.',
  'ranking': [2, 0, 1]},
 {'prompt': '김영삼의 후보 시절 지역표심을 겨냥한 발언을 문제삼은 후보는?',
  'completion_0': 'The diameter of the Metallic domain is bigger than the Hyperonic domain.',
  'completion_1': '이 질문은 조금 불분명합니다. 김영삼 대통령이 후보 시절에 어떤 발언을 했고, 누가 그 발언을 문제삼았는지에 따라 답이 다를 수 있습니다.\\n\\n만약 김영삼 대통령이 후보 시절에 지역표심을 겨냥한 발언을 했다는 가정하에, 그 발언을 문제삼은 후보가 누구였는지를 대답하자면, 그 답은 이화선 당시 민주당 대통령 후보가 될 것입니다. 1992년 총선 때, 김영삼 대선후보는 "집값이 오른 노량진역 부근의 부동산 가격은 세월호 폭침 후 \\\'강남 도시재생\\\' 일환으로 상승했다"는 발언을 했습니다. 하지만 이화선 후보는 이 발언을 "전국적으로 경제적 발전이 이루어지지 않은 지방민의 마음을 멀리해지려는 무례한 발언"이라고 비판하며 문

In [8]:
#trainer 클래스 정의
training_args = TrainingArguments(
    output_dir="./KoChatGPT/test",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=5,
    prediction_loss_only=True,
    fp16 = True
    )
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

문장 생성능력을 확인하기 위해 huggingface의 pipeline 클래스를 사용하여 generator 만들기

In [9]:
generator = pipeline('text-generation', model='./KoChatGPT/output_1_SFT', tokenizer=tokenizer)

generation_args = dict(   
    num_beams=4,
    repetition_penalty=2.0,
    no_repeat_ngram_size=4,
    eos_token_id=375, # \n   
    max_new_tokens=64,
    do_sample=True,
    top_k=50,
    early_stopping=True
)

PROMPT_DICT = {
    "prompt_input": (
        "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
    )
}

list_prompt = ['불고기용 고기 한우에요?',
               '리처드 닉슨이 43대 부통령직을 수행한 년도는?',
               '시카고 오헤어 국제공항은 어디에 있어?',
               '오늘 미세먼지 어때?']

list_prompt = [PROMPT_DICT['prompt_input'].format_map({'prompt' : tmp}) for tmp in list_prompt]

list_result = generator(list_prompt, **generation_args)   
for prompt, result in zip(list_prompt, list_result):
    print()
    print((result[0]['generated_text']))




### Instruction(명령어):
불고기용 고기 한우에요?

### Response(응답):'저는 AI 어시스턴트이기 때문에 불고기를 직접 구매할 수는 없습니다. 하지만 일반적으로 불고기는 쇠고기의 부위 중 하나인 고기, 소고기, 돼지고기 등 다양한 부위를 사용합니다. 따라서 불고기를 구매하시는 것이 가장 좋은 선택일 수 있습니다. 또한, 불고기를 판매하는 가게나 온라인 쇼핑

### Instruction(명령어):
리처드 닉슨이 43대 부통령직을 수행한 년도는?

### Response(응답):'리처드 닉슨은 41대 부통령직을 수행했습니다.私用: "리처드 닉슨"이 47대 부통령을 수행한 년도는 정확히 알려져 있지 않습니다.子用: 리처드 닉슨은 42대 부통령을 역임했습니다.子容: 리처드 닉슨의 부친인 리처드 닉슨은 40대 부통령

### Instruction(명령어):
시카고 오헤어 국제공항은 어디에 있어?

### Response(응답):'시카고 오 헤어 국제공항은 미국 캘리포니아주 샌프란시스코에 위치해 있습니다. Canada Operating, Translation, President of the English Capability, Distributed Service, U. Canada, I am an AI l

### Instruction(명령어):
오늘 미세먼지 어때?

### Response(응답):'저는 인공지능 어시스턴트이기 때문에 미세먼지 여부를 판단할 수 없습니다. 하지만 일반적으로 미세먼지는 인체에 유해한 영향을 미치기 때문에 건강에 좋지 않은 영향을 끼칠 수 있습니다. 따라서 외출 시 마스크를 착용하고 실외활동을 자제하는 것이 좋습니다. 또한, 미세먼지가 심한 날에는 야외활동을 자제


# SFT
- kogpt2를 instruction dataset으로 sft 진행


In [10]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.optim import Adam
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import Trainer, TrainingArguments
from copy import deepcopy
import copy
import logging
import json
from dataclasses import dataclass

In [11]:
model = AutoModelForCausalLM.from_pretrained('skt/kogpt2-base-v2')
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', unk_token='</s>', pad_token='</s>',
    padding_side="right",
    model_max_length=512,
)

print(tokenizer)

GPT2TokenizerFast(name_or_path='skt/kogpt2-base-v2', vocab_size=51200, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=True)


모델 인퍼런스 단계에서 사용할 prompt 딕셔너리 템플릿, sft 데이터셋 클래스 정의

In [12]:
from typing import Optional, Dict, Sequence

class SFT_dataset(Dataset):

    def __init__(self, data_path_1_SFT: str, tokenizer: transformers.PreTrainedTokenizer, verbose=False):
        super(SFT_dataset, self).__init__()
        logging.warning("Loading data...")

        pattern_instruction = 'prompt'  # instruction
        pattern_output = 'completion'  # response

        with open(data_path_1_SFT, "r", encoding='utf-8-sig') as json_file:
            list_data_dict = json.load(json_file)

        PROMPT_DICT = {
            "prompt_input": (
                "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
            )
        }

        prompt_input = PROMPT_DICT["prompt_input"]

        sources = []
        for example in list_data_dict:
            tmp = prompt_input.format_map(example)
            sources.append(tmp)

        targets = []
        for example in list_data_dict:
            targets.append(f"{example[pattern_output]}{tokenizer.eos_token}")
        examples = [s + t for s, t in zip(sources, targets)]

        sources_tokenized = self._tokenize_fn(sources, tokenizer)  # source
        examples_tokenized = self._tokenize_fn(examples, tokenizer)  # source + target

        input_ids = examples_tokenized["input_ids"]
        labels = copy.deepcopy(input_ids)
        for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
            label[:source_len] = -100

        data_dict = dict(input_ids=input_ids, labels=labels)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]
        logging.warning("Loading data done!!: %d"%(len(self.labels)))


    def _tokenize_fn(self, strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
        tokenized_list = [
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
            for text in strings
        ]
        input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
        input_ids_lens = labels_lens = [
            tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
        ]
        return dict(
            input_ids=input_ids,
            labels=labels,
            input_ids_lens=input_ids_lens,
            labels_lens=labels_lens,
        )


    def __len__(self):
        return len(self.input_ids)


    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])

In [13]:
@dataclass
class DataCollatorForSupervisedDataset(object): 

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value= -100)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

SFT_dataset 클래스를 이용해 train set을 만들고 data collator 인스턴스 만들기

In [14]:
train_dataset = SFT_dataset(data_path_1_SFT='./KoChatGPT/data_kochatgpt/kochatgpt_1_SFT.jsonl', tokenizer=tokenizer)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

print('input : %s'%train_dataset.input_ids[0])
print('output: %s'%train_dataset.labels[0])



input : tensor([  739,   378,   378,   378, 14659, 13394, 37091, 10651,   383, 25841,
         8006, 14914,   375,  7673, 20479,  8091, 22311,  9036, 30902, 13675,
          375,   378,   378,   378, 41951,   454,  9549, 20549,   383,  8142,
         7192, 14914,   382, 37767, 13753,  8263,  7166,   739,  8352,  7659,
         9594, 25585, 13600,  8022,  9378, 11532,  9887, 11218,  9111, 16691,
        10351, 10561,  9128, 20479,  8091,  9065,  9446,  9036, 28420, 26521,
        10163, 26367,  6958,  9030,  9882, 12317, 25882,  9209, 37194, 10351,
         9036, 12168, 10529, 15989,  9719, 15434, 10552, 11188, 13362,  9036,
        15805, 11300, 11846,  9146, 16691,  9181,  7397, 15806, 13480, 11342,
        17596,  9161, 19996,  9025, 25006, 18595,  9966, 12592, 10751, 11814,
         8711,  9046, 12450,  9117,  7377, 12521,     1])
output: tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -10

In [21]:
#메모리 초기화
import torch, gc

def reset_cuda():
    gc.collect()
    torch.cuda.empty_cache()
    print("✅ GPU memory cleared")

reset_cuda()

✅ GPU memory cleared


Training arguments를 사용해 trainer 클래스 정의

In [22]:
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./KoChatGPT/test",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=5,
    prediction_loss_only=True,
    fp16 = True
    )
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

In [23]:
trainer.train()
model.save_pretrained('./results/output_1_SFT')

Step,Training Loss
500,2.403
1000,2.8801
1500,2.8191
2000,2.1706
2500,2.1882
3000,2.1668
3500,1.7513
4000,1.7651
4500,1.746


문장 생성 능력 확인

In [26]:
generator = pipeline('text-generation', model='./results/output_1_SFT', tokenizer=tokenizer)

generation_args = dict(   
    num_beams=4,
    repetition_penalty=2.0,
    no_repeat_ngram_size=4,
    eos_token_id=375, # \n   
    max_new_tokens=64,
    do_sample=True,
    top_k=50,
    early_stopping=True
)

PROMPT_DICT = {
    "prompt_input": (
        "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
    )
}

list_prompt = ['불고기용 고기 한우에요?',
               '리처드 닉슨이 43대 부통령직을 수행한 년도는?',
               '시카고 오헤어 국제공항은 어디에 있어?',
               '오늘 미세먼지 어때?']

list_prompt = [PROMPT_DICT['prompt_input'].format_map({'prompt' : tmp}) for tmp in list_prompt]

list_result = generator(list_prompt, **generation_args)   
for prompt, result in zip(list_prompt, list_result):
    print()
    print((result[0]['generated_text']))


### Instruction(명령어):
불고기용 고기 한우에요?

### Response(응답):'저는 AI 어시스턴트이기 때문에 실제로 고기를 구매하지는 않습니다. 하지만 일반적으로 불고기용 고기는 한우, 쇠고기, 돼지고기 등 다양한 부위를 사용합니다. 예를 들어 소고기, 돼지고기 삼계탕, 닭고기 삼계탕 등이 있습니다. 하지만 일부 식당에서는 불고기용 고기를 따로 판매하기도 합니다.

### Instruction(명령어):
리처드 닉슨이 43대 부통령직을 수행한 년도는?

### Response(응답):'리처드 닉슨은 41대 부통령직을 수행하지 않았습니다. J.K. 롤링턴은 39대 부통령직을 수행할 수 없었습니다. J.Rollington은 1952년 대선에서 공화당 후보로 출마하였지만, 패배하고 부통령직을 수행하게 되었습니다. Johnson은 40대 부통령직을 수행하는 데 실패하였습니다.

### Instruction(명령어):
시카고 오헤어 국제공항은 어디에 있어?

### Response(응답):'시카고 오 헤어 국제공항은 미국 일리노이주 시카고에 위치해 있습니다.國際都市長官, 高建国際都市) 라고 불립니다.國際道江国際都江国際島共和国際都廣域.國際圖江国際度察理由

### Instruction(명령어):
오늘 미세먼지 어때?

### Response(응답):'저는 인공지능 챗봇이기 때문에 미세먼지 여부를 판단할 수 없습니다. 그러나 보통 미세먼지는 공기 중 유해 물질로부터 보호하기 위해 실내에서 많이 사용됩니다. 따라서 외출 후에는 반드시 마스크를 착용하시는 것이 좋습니다. 또한, 미세먼지 예보를 확인하고 대응 방법을 잘 지키는 것이 중요합니다.


# reward model

In [61]:
#메모리 초기화
import torch, gc

def reset_cuda():
    gc.collect()
    torch.cuda.empty_cache()
    print("✅ GPU memory cleared")

reset_cuda()

Train epoch:   0%|          | 0/3 [02:40<?, ?it/s]
Train step of epoch 0:   0%|          | 0/3066 [02:40<?, ?it/s]

✅ GPU memory cleared





In [207]:
import sys
import os

# chatgpt 모듈이 포함된 디렉토리 경로 지정
sys.path.append(os.path.abspath("./KoChatGPT/colossalai_ChatGPT_230319"))

# 이제 import 가능
from chatgpt.dataset import RewardDataset
from chatgpt.models.base import RewardModel
from chatgpt.trainer import RewardModelTrainer
from chatgpt.trainer.strategies import NaiveStrategy


In [208]:
import os
import json
from typing import Optional
import torch
import torch.nn as nn
from torch.optim import Adam
from chatgpt.dataset import RewardDataset
from chatgpt.models.base import RewardModel
from chatgpt.trainer import RewardModelTrainer
from chatgpt.trainer.strategies import NaiveStrategy
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoConfig
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
import loralib as lora

Reward model 설게 :GPTRM_custom 클래스 정의

In [209]:
class GPTRM_custom(RewardModel):

    def __init__(self,
                 pretrained: Optional[str] = None,
                 config: Optional[GPT2Config] = None,
                 checkpoint: bool = False,
                 lora_rank: int = 0,
                 lora_train_bias: str = 'none',
                 tokenizer=None) -> None:
        if pretrained is not None:
            model = GPT2Model.from_pretrained(pretrained)
            model.resize_token_embeddings(len(tokenizer))
        elif config is not None:
            model = GPT2Model(config)
        else:
            model = GPT2Model(GPT2Config())
        if checkpoint:
            model.gradient_checkpointing_enable()

        value_head = nn.Linear(model.config.n_embd, 1)
        super().__init__(model, value_head, lora_rank, lora_train_bias)

        if pretrained is not None:
            self.model = model
            self.pretrained = pretrained


    def save_pretrained(self, dir):
        if self.pretrained is not None:
            self.model.save_pretrained(dir)

In [65]:
#모델과 토크나이저 불러오기
model = AutoModelForCausalLM.from_pretrained('skt/kogpt2-base-v2')
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', unk_token='</s>', pad_token='</s>',
    padding_side="right",
    model_max_length=512,
)

with NaiveStrategy().model_init_context():
        model = GPTRM_custom(pretrained='skt/kogpt2-base-v2', lora_rank=0, tokenizer=tokenizer).cuda()

Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [66]:
import json

with open('./KoChatGPT/data_kochatgpt/kochatgpt_2_RM.jsonl', "r", encoding='utf-8-sig') as json_file:
    list_data_dict = json.load(json_file)

# 5개 샘플만 확인
for i in range(5):
    sample = list_data_dict[i]
    print(f"\n📌 Prompt {i+1}: {sample['prompt']}\n")

    # 랭킹과 답변을 묶어서 보기 쉽게 정리
    completions = [
        (sample['ranking'][0], 'completion_0', sample['completion_0']),
        (sample['ranking'][1], 'completion_1', sample['completion_1']),
        (sample['ranking'][2], 'completion_2', sample['completion_2']),
    ]

    # 랭킹 순으로 정렬 (숫자가 낮을수록 좋다는 가정)
    completions.sort(key=lambda x: x[0])

    for rank, name, text in completions:
        print(f"🔹 Rank {rank} - {name}:")
        print(text.strip())

    print("─" * 80)




📌 Prompt 1: 번디는 자신이 탐정잡지, 범죄소설 그리고 성범죄 관련 실제 범죄 다큐멘터리들을 탐독했다고 누구에게 말했나?

🔹 Rank 0 - completion_2:
라이언에게 말했다.
🔹 Rank 1 - completion_1:
번디는 다양한 인터뷰자들과 뉴스홍보 담당자들과의 면담 때 밝혔다.
🔹 Rank 2 - completion_0:
Allow me to answer your question. I know that you are curious about me.
────────────────────────────────────────────────────────────────────────────────

📌 Prompt 2: 개포주공아파트는 몇 단지로 이루어져 있나?

🔹 Rank 0 - completion_1:
이날 목송에서 구글상위노
🔹 Rank 1 - completion_2:
개포주공아파트는 총 27개 단지로 이루어져 있습니다.
🔹 Rank 2 - completion_0:
개포주공아파트는 다섯 단지로 이루어져 있습니다.
────────────────────────────────────────────────────────────────────────────────

📌 Prompt 3: 김영삼의 후보 시절 지역표심을 겨냥한 발언을 문제삼은 후보는?

🔹 Rank 0 - completion_2:
김영삼의 후보 시절에 지역표심을 겨냥한 발언은 대통령 당선 전까지 대한민국 정부가 추구하고 있는 민주주의 광범위하게 확립과 보수의 사상을 이어가는 데 있어 지역경제 발전과 공공서비스 신속 개선을 위해 합리적인 국가 정책에 따르는 방향성을 제시하고 있습니다.
🔹 Rank 1 - completion_0:
The diameter of the Metallic domain is bigger than the Hyperonic domain.
🔹 Rank 2 - completion_1:
이 질문은 조금 불분명합니다. 김영삼 대통령이 후보 시절에 어떤 발언을 했고, 

- rank 숫자가 작을수록 좋은 응답


In [67]:
#rm 훈련을 위한 ranking dataset 만들기
with open('./KoChatGPT/data_kochatgpt/kochatgpt_2_RM.jsonl', "r", encoding='utf-8-sig') as json_file:
    list_data_dict = json.load(json_file)

total_data_ranking2chosen = []
for tmp in list_data_dict:
    one_data_ranking2chosen = []

    data = {}
    data['prompt'] = tmp['prompt']
    if tmp['ranking'][0] < tmp['ranking'][1]:
        data['chosen'] = tmp['completion_0']
        data['rejected'] = tmp['completion_1']
    else:
        data['chosen'] = tmp['completion_1']
        data['rejected'] = tmp['completion_0']
    one_data_ranking2chosen.append(data)

    data = {}
    data['prompt'] = tmp['prompt']
    if tmp['ranking'][0] < tmp['ranking'][2]:
        data['chosen'] = tmp['completion_0']
        data['rejected'] = tmp['completion_2']
    else:
        data['chosen'] = tmp['completion_2']
        data['rejected'] = tmp['completion_0']
    one_data_ranking2chosen.append(data)

    data = {}
    data['prompt'] = tmp['prompt']
    if tmp['ranking'][1] < tmp['ranking'][2]:
        data['chosen'] = tmp['completion_1']
        data['rejected'] = tmp['completion_2']
    else:
        data['chosen'] = tmp['completion_2']
        data['rejected'] = tmp['completion_1']
    one_data_ranking2chosen.append(data)



    total_data_ranking2chosen.extend(one_data_ranking2chosen)

print('before data num: %d'%(len(list_data_dict)))
print('after  data num: %d'%(len(total_data_ranking2chosen)))
print('data example: \n%s'%total_data_ranking2chosen[45])

before data num: 10220
after  data num: 30660
data example: 
{'prompt': '애플은 리사를 어떻게 처리했어', 'chosen': '애플이 누구인지 명확히 알 수 없어서, 리사가 누구인지와 어떤 상황에서 처리되었는지에 대한 추가적인 정보가 필요합니다. 따라서, 보다 정확한 답변을 제공할 수 없습니다.', 'rejected': '애플은 리사를 위해 고객 서비스 부서에서 고객 다양한 컴퓨터 관련 문제에 대해 응답하는 데 필요한 모든 지원을 제공했습니다. 사용자가 하드웨어 문제를 경험할 때, 전문가들은 필요한 수리(수리, 추가 부품 제공, 소프트웨어 업그레이드 등)을 제공해 드릴 수 있습니다. 또한, 사용자가 사용 방법 문제나 기타 문제를 경험할 때, 대화 상대로 사용자를 지원할 수 있는 전문 고객 서비스 직원들이 사용자에게 상담하고 도움을 주는 데 도움이 될 수 있는 정보를 제공합니다. 또한, 인터넷에서 제공되는 정보를 통해 문제를 해결하거나 고객 서비스 웹 사이트를 통해 자신의 문제를 진단할 수 있도록 하는 등 다양한 방법으로 리사를 처리해 왔습니다.'}


In [68]:
#ranking dataset을 shuffle한 후 train dataset 만들기

#shuffle
import random
random.seed(230319)
random.shuffle(total_data_ranking2chosen)
print(total_data_ranking2chosen[45])

#split
split_ratio = 0.8
split_index = int(len(total_data_ranking2chosen)* split_ratio)


train_data = total_data_ranking2chosen[:split_index] 
eval_data = total_data_ranking2chosen[split_index:]

print(f"Train size: {len(train_data)}, Eval size: {len(eval_data)}")


#dataset 생성
train_dataset = RewardDataset(train_data, tokenizer, 256)
eval_dataset = RewardDataset(eval_data, tokenizer, 256)

{'prompt': '유아인이 류승완 감독을 만나 영화 베테랑의 시나리오를 받았던 곳은?', 'chosen': '유아인이 류승완 감독을 만나 영화 베테랑의 시나리오를 받았던 곳은 류승완의 사무실입니다.', 'rejected': '대구 영화사옥'}
Train size: 24528, Eval size: 6132


100%|██████████| 24528/24528 [00:18<00:00, 1338.84it/s]
100%|██████████| 6132/6132 [00:04<00:00, 1274.64it/s]


In [69]:
#데이터셋 확인
idx = 10
print('#'*70)
print('## prompt ##')
print(train_data[idx]['prompt'])
print('#'*70)
print('## chosen ##')
print(train_data[idx]['chosen'])
print('#'*70)
print('## rejected ##')
print(train_data[idx]['rejected'])

######################################################################
## prompt ##
체크인 되나요?
######################################################################
## chosen ##
제가 AI 챗봇이기 때문에 호텔이나 항공편 등으로 어떤 체크인을 말씀하시는 것인지 구체적으로 설명해주시면 답변을 드리겠습니다.
######################################################################
## rejected ##
다시 한번 가지게임이지 않아 가지게임이지


In [70]:
# RM 학습
# model.gradient_checkpointing_enable()
trainer = RewardModelTrainer(model=model,
                             strategy=NaiveStrategy(),
                             optim=Adam(model.parameters(), lr=5e-5),
                             train_dataset=train_dataset,
                             eval_dataset=eval_dataset,
                             batch_size=8,
                             max_epochs=3)


trainer.fit(use_lora=0)  #LoRA 사용하지 않음

model.save_pretrained('./results/output_2_RM')

Train epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Train step of epoch 0:   0%|          | 0/3066 [00:00<?, ?it/s][A
Train step of epoch 0:   0%|          | 1/3066 [00:01<54:03,  1.06s/it][A
Train step of epoch 0:   0%|          | 1/3066 [00:01<54:03,  1.06s/it, loss=0.672][A
Train step of epoch 0:   0%|          | 2/3066 [00:01<48:50,  1.05it/s, loss=0.672][A
Train step of epoch 0:   0%|          | 2/3066 [00:01<48:50,  1.05it/s, loss=0.534][A
Train step of epoch 0:   0%|          | 3/3066 [00:02<47:19,  1.08it/s, loss=0.534][A
Train step of epoch 0:   0%|          | 3/3066 [00:02<47:19,  1.08it/s, loss=0.465][A
Train step of epoch 0:   0%|          | 4/3066 [00:03<46:42,  1.09it/s, loss=0.465][A
Train step of epoch 0:   0%|          | 4/3066 [00:03<46:42,  1.09it/s, loss=1.25] [A
Train step of epoch 0:   0%|          | 5/3066 [00:04<46:24,  1.10it/s, loss=1.25][A
Train step of epoch 0:   0%|          | 5/3066 [00:04<46:24,  1.10it/s, loss=0.454][A
Train step of epoch 0:   

Train step of epoch 0:   3%|▎         | 93/3066 [01:27<46:09,  1.07it/s, loss=0.448][A
Train step of epoch 0:   3%|▎         | 94/3066 [01:28<46:15,  1.07it/s, loss=0.448][A
Train step of epoch 0:   3%|▎         | 94/3066 [01:28<46:15,  1.07it/s, loss=0.406][A
Train step of epoch 0:   3%|▎         | 95/3066 [01:29<46:21,  1.07it/s, loss=0.406][A
Train step of epoch 0:   3%|▎         | 95/3066 [01:29<46:21,  1.07it/s, loss=0.667][A
Train step of epoch 0:   3%|▎         | 96/3066 [01:30<46:24,  1.07it/s, loss=0.667][A
Train step of epoch 0:   3%|▎         | 96/3066 [01:30<46:24,  1.07it/s, loss=0.411][A
Train step of epoch 0:   3%|▎         | 97/3066 [01:31<46:22,  1.07it/s, loss=0.411][A
Train step of epoch 0:   3%|▎         | 97/3066 [01:31<46:22,  1.07it/s, loss=0.811][A
Train step of epoch 0:   3%|▎         | 98/3066 [01:32<46:19,  1.07it/s, loss=0.811][A
Train step of epoch 0:   3%|▎         | 98/3066 [01:32<46:19,  1.07it/s, loss=0.673][A
Train step of epoch 0:   3%|▎   

Train step of epoch 0:   6%|▌         | 185/3066 [02:53<44:51,  1.07it/s, loss=0.608][A
Train step of epoch 0:   6%|▌         | 186/3066 [02:54<44:54,  1.07it/s, loss=0.608][A
Train step of epoch 0:   6%|▌         | 186/3066 [02:54<44:54,  1.07it/s, loss=0.683][A
Train step of epoch 0:   6%|▌         | 187/3066 [02:55<44:52,  1.07it/s, loss=0.683][A
Train step of epoch 0:   6%|▌         | 187/3066 [02:55<44:52,  1.07it/s, loss=0.621][A
Train step of epoch 0:   6%|▌         | 188/3066 [02:56<44:52,  1.07it/s, loss=0.621][A
Train step of epoch 0:   6%|▌         | 188/3066 [02:56<44:52,  1.07it/s, loss=0.653][A
Train step of epoch 0:   6%|▌         | 189/3066 [02:57<44:51,  1.07it/s, loss=0.653][A
Train step of epoch 0:   6%|▌         | 189/3066 [02:57<44:51,  1.07it/s, loss=0.534][A
Train step of epoch 0:   6%|▌         | 190/3066 [02:58<44:50,  1.07it/s, loss=0.534][A
Train step of epoch 0:   6%|▌         | 190/3066 [02:58<44:50,  1.07it/s, loss=0.626][A
Train step of epoch 0

Train step of epoch 0:   9%|▉         | 277/3066 [04:19<43:08,  1.08it/s, loss=0.442][A
Train step of epoch 0:   9%|▉         | 278/3066 [04:20<43:12,  1.08it/s, loss=0.442][A
Train step of epoch 0:   9%|▉         | 278/3066 [04:20<43:12,  1.08it/s, loss=0.853][A
Train step of epoch 0:   9%|▉         | 279/3066 [04:21<43:14,  1.07it/s, loss=0.853][A
Train step of epoch 0:   9%|▉         | 279/3066 [04:21<43:14,  1.07it/s, loss=0.826][A
Train step of epoch 0:   9%|▉         | 280/3066 [04:22<43:08,  1.08it/s, loss=0.826][A
Train step of epoch 0:   9%|▉         | 280/3066 [04:22<43:08,  1.08it/s, loss=0.871][A
Train step of epoch 0:   9%|▉         | 281/3066 [04:23<43:08,  1.08it/s, loss=0.871][A
Train step of epoch 0:   9%|▉         | 281/3066 [04:23<43:08,  1.08it/s, loss=0.511][A
Train step of epoch 0:   9%|▉         | 282/3066 [04:24<43:07,  1.08it/s, loss=0.511][A
Train step of epoch 0:   9%|▉         | 282/3066 [04:24<43:07,  1.08it/s, loss=0.7]  [A
Train step of epoch 0

Train step of epoch 0:  12%|█▏        | 369/3066 [05:45<41:55,  1.07it/s, loss=0.62] [A
Train step of epoch 0:  12%|█▏        | 370/3066 [05:45<41:52,  1.07it/s, loss=0.62][A
Train step of epoch 0:  12%|█▏        | 370/3066 [05:46<41:52,  1.07it/s, loss=0.475][A
Train step of epoch 0:  12%|█▏        | 371/3066 [05:46<41:55,  1.07it/s, loss=0.475][A
Train step of epoch 0:  12%|█▏        | 371/3066 [05:46<41:55,  1.07it/s, loss=0.715][A
Train step of epoch 0:  12%|█▏        | 372/3066 [05:47<41:52,  1.07it/s, loss=0.715][A
Train step of epoch 0:  12%|█▏        | 372/3066 [05:47<41:52,  1.07it/s, loss=0.476][A
Train step of epoch 0:  12%|█▏        | 373/3066 [05:48<41:48,  1.07it/s, loss=0.476][A
Train step of epoch 0:  12%|█▏        | 373/3066 [05:48<41:48,  1.07it/s, loss=0.621][A
Train step of epoch 0:  12%|█▏        | 374/3066 [05:49<41:47,  1.07it/s, loss=0.621][A
Train step of epoch 0:  12%|█▏        | 374/3066 [05:49<41:47,  1.07it/s, loss=0.607][A
Train step of epoch 0:

Train step of epoch 0:  15%|█▌        | 461/3066 [07:10<40:59,  1.06it/s, loss=0.674][A
Train step of epoch 0:  15%|█▌        | 462/3066 [07:11<40:57,  1.06it/s, loss=0.674][A
Train step of epoch 0:  15%|█▌        | 462/3066 [07:11<40:57,  1.06it/s, loss=0.448][A
Train step of epoch 0:  15%|█▌        | 463/3066 [07:12<40:52,  1.06it/s, loss=0.448][A
Train step of epoch 0:  15%|█▌        | 463/3066 [07:12<40:52,  1.06it/s, loss=0.682][A
Train step of epoch 0:  15%|█▌        | 464/3066 [07:13<40:48,  1.06it/s, loss=0.682][A
Train step of epoch 0:  15%|█▌        | 464/3066 [07:13<40:48,  1.06it/s, loss=0.647][A
Train step of epoch 0:  15%|█▌        | 465/3066 [07:14<40:40,  1.07it/s, loss=0.647][A
Train step of epoch 0:  15%|█▌        | 465/3066 [07:14<40:40,  1.07it/s, loss=0.545][A
Train step of epoch 0:  15%|█▌        | 466/3066 [07:15<40:37,  1.07it/s, loss=0.545][A
Train step of epoch 0:  15%|█▌        | 466/3066 [07:15<40:37,  1.07it/s, loss=0.673][A
Train step of epoch 0

Train step of epoch 0:  18%|█▊        | 553/3066 [08:36<38:57,  1.08it/s, loss=0.492][A
Train step of epoch 0:  18%|█▊        | 554/3066 [08:37<38:53,  1.08it/s, loss=0.492][A
Train step of epoch 0:  18%|█▊        | 554/3066 [08:37<38:53,  1.08it/s, loss=0.485][A
Train step of epoch 0:  18%|█▊        | 555/3066 [08:38<38:53,  1.08it/s, loss=0.485][A
Train step of epoch 0:  18%|█▊        | 555/3066 [08:38<38:53,  1.08it/s, loss=0.664][A
Train step of epoch 0:  18%|█▊        | 556/3066 [08:39<38:52,  1.08it/s, loss=0.664][A
Train step of epoch 0:  18%|█▊        | 556/3066 [08:39<38:52,  1.08it/s, loss=0.526][A
Train step of epoch 0:  18%|█▊        | 557/3066 [08:40<38:53,  1.08it/s, loss=0.526][A
Train step of epoch 0:  18%|█▊        | 557/3066 [08:40<38:53,  1.08it/s, loss=0.561][A
Train step of epoch 0:  18%|█▊        | 558/3066 [08:41<38:51,  1.08it/s, loss=0.561][A
Train step of epoch 0:  18%|█▊        | 558/3066 [08:41<38:51,  1.08it/s, loss=0.426][A
Train step of epoch 0

Train step of epoch 0:  21%|██        | 645/3066 [10:02<37:32,  1.07it/s, loss=1.05] [A
Train step of epoch 0:  21%|██        | 646/3066 [10:03<37:33,  1.07it/s, loss=1.05][A
Train step of epoch 0:  21%|██        | 646/3066 [10:03<37:33,  1.07it/s, loss=0.568][A
Train step of epoch 0:  21%|██        | 647/3066 [10:04<37:33,  1.07it/s, loss=0.568][A
Train step of epoch 0:  21%|██        | 647/3066 [10:04<37:33,  1.07it/s, loss=0.409][A
Train step of epoch 0:  21%|██        | 648/3066 [10:04<37:37,  1.07it/s, loss=0.409][A
Train step of epoch 0:  21%|██        | 648/3066 [10:05<37:37,  1.07it/s, loss=0.382][A
Train step of epoch 0:  21%|██        | 649/3066 [10:05<37:32,  1.07it/s, loss=0.382][A
Train step of epoch 0:  21%|██        | 649/3066 [10:05<37:32,  1.07it/s, loss=0.616][A
Train step of epoch 0:  21%|██        | 650/3066 [10:06<37:31,  1.07it/s, loss=0.616][A
Train step of epoch 0:  21%|██        | 650/3066 [10:06<37:31,  1.07it/s, loss=0.579][A
Train step of epoch 0:

Train step of epoch 0:  24%|██▍       | 737/3066 [11:28<36:16,  1.07it/s, loss=0.418][A
Train step of epoch 0:  24%|██▍       | 738/3066 [11:28<36:13,  1.07it/s, loss=0.418][A
Train step of epoch 0:  24%|██▍       | 738/3066 [11:28<36:13,  1.07it/s, loss=0.616][A
Train step of epoch 0:  24%|██▍       | 739/3066 [11:29<36:12,  1.07it/s, loss=0.616][A
Train step of epoch 0:  24%|██▍       | 739/3066 [11:29<36:12,  1.07it/s, loss=0.613][A
Train step of epoch 0:  24%|██▍       | 740/3066 [11:30<36:11,  1.07it/s, loss=0.613][A
Train step of epoch 0:  24%|██▍       | 740/3066 [11:30<36:11,  1.07it/s, loss=0.496][A
Train step of epoch 0:  24%|██▍       | 741/3066 [11:31<36:08,  1.07it/s, loss=0.496][A
Train step of epoch 0:  24%|██▍       | 741/3066 [11:31<36:08,  1.07it/s, loss=0.345][A
Train step of epoch 0:  24%|██▍       | 742/3066 [11:32<36:08,  1.07it/s, loss=0.345][A
Train step of epoch 0:  24%|██▍       | 742/3066 [11:32<36:08,  1.07it/s, loss=0.966][A
Train step of epoch 0

Train step of epoch 0:  27%|██▋       | 829/3066 [12:54<34:48,  1.07it/s, loss=0.655][A
Train step of epoch 0:  27%|██▋       | 830/3066 [12:55<34:51,  1.07it/s, loss=0.655][A
Train step of epoch 0:  27%|██▋       | 830/3066 [12:55<34:51,  1.07it/s, loss=0.484][A
Train step of epoch 0:  27%|██▋       | 831/3066 [12:55<34:51,  1.07it/s, loss=0.484][A
Train step of epoch 0:  27%|██▋       | 831/3066 [12:55<34:51,  1.07it/s, loss=0.484][A
Train step of epoch 0:  27%|██▋       | 832/3066 [12:56<34:45,  1.07it/s, loss=0.484][A
Train step of epoch 0:  27%|██▋       | 832/3066 [12:56<34:45,  1.07it/s, loss=0.464][A
Train step of epoch 0:  27%|██▋       | 833/3066 [12:57<34:42,  1.07it/s, loss=0.464][A
Train step of epoch 0:  27%|██▋       | 833/3066 [12:57<34:42,  1.07it/s, loss=0.559][A
Train step of epoch 0:  27%|██▋       | 834/3066 [12:58<34:42,  1.07it/s, loss=0.559][A
Train step of epoch 0:  27%|██▋       | 834/3066 [12:58<34:42,  1.07it/s, loss=0.949][A
Train step of epoch 0

Train step of epoch 0:  30%|███       | 921/3066 [14:19<33:13,  1.08it/s, loss=0.75][A
Train step of epoch 0:  30%|███       | 922/3066 [14:20<33:11,  1.08it/s, loss=0.75][A
Train step of epoch 0:  30%|███       | 922/3066 [14:20<33:11,  1.08it/s, loss=0.595][A
Train step of epoch 0:  30%|███       | 923/3066 [14:21<33:11,  1.08it/s, loss=0.595][A
Train step of epoch 0:  30%|███       | 923/3066 [14:21<33:11,  1.08it/s, loss=0.566][A
Train step of epoch 0:  30%|███       | 924/3066 [14:22<33:10,  1.08it/s, loss=0.566][A
Train step of epoch 0:  30%|███       | 924/3066 [14:22<33:10,  1.08it/s, loss=1.04] [A
Train step of epoch 0:  30%|███       | 925/3066 [14:23<33:12,  1.07it/s, loss=1.04][A
Train step of epoch 0:  30%|███       | 925/3066 [14:23<33:12,  1.07it/s, loss=0.342][A
Train step of epoch 0:  30%|███       | 926/3066 [14:24<33:12,  1.07it/s, loss=0.342][A
Train step of epoch 0:  30%|███       | 926/3066 [14:24<33:12,  1.07it/s, loss=0.529][A
Train step of epoch 0:  

Train step of epoch 0:  33%|███▎      | 1013/3066 [15:45<31:45,  1.08it/s, loss=0.862][A
Train step of epoch 0:  33%|███▎      | 1013/3066 [15:45<31:45,  1.08it/s, loss=0.419][A
Train step of epoch 0:  33%|███▎      | 1014/3066 [15:46<31:46,  1.08it/s, loss=0.419][A
Train step of epoch 0:  33%|███▎      | 1014/3066 [15:46<31:46,  1.08it/s, loss=0.532][A
Train step of epoch 0:  33%|███▎      | 1015/3066 [15:47<31:45,  1.08it/s, loss=0.532][A
Train step of epoch 0:  33%|███▎      | 1015/3066 [15:47<31:45,  1.08it/s, loss=0.695][A
Train step of epoch 0:  33%|███▎      | 1016/3066 [15:48<31:45,  1.08it/s, loss=0.695][A
Train step of epoch 0:  33%|███▎      | 1016/3066 [15:48<31:45,  1.08it/s, loss=0.779][A
Train step of epoch 0:  33%|███▎      | 1017/3066 [15:49<31:45,  1.08it/s, loss=0.779][A
Train step of epoch 0:  33%|███▎      | 1017/3066 [15:49<31:45,  1.08it/s, loss=0.466][A
Train step of epoch 0:  33%|███▎      | 1018/3066 [15:49<31:41,  1.08it/s, loss=0.466][A
Train step

Train step of epoch 0:  36%|███▌      | 1104/3066 [17:09<30:22,  1.08it/s, loss=0.504][A
Train step of epoch 0:  36%|███▌      | 1104/3066 [17:09<30:22,  1.08it/s, loss=0.285][A
Train step of epoch 0:  36%|███▌      | 1105/3066 [17:10<30:21,  1.08it/s, loss=0.285][A
Train step of epoch 0:  36%|███▌      | 1105/3066 [17:10<30:21,  1.08it/s, loss=0.503][A
Train step of epoch 0:  36%|███▌      | 1106/3066 [17:11<30:22,  1.08it/s, loss=0.503][A
Train step of epoch 0:  36%|███▌      | 1106/3066 [17:11<30:22,  1.08it/s, loss=0.432][A
Train step of epoch 0:  36%|███▌      | 1107/3066 [17:12<30:22,  1.07it/s, loss=0.432][A
Train step of epoch 0:  36%|███▌      | 1107/3066 [17:12<30:22,  1.07it/s, loss=0.397][A
Train step of epoch 0:  36%|███▌      | 1108/3066 [17:13<30:20,  1.08it/s, loss=0.397][A
Train step of epoch 0:  36%|███▌      | 1108/3066 [17:13<30:20,  1.08it/s, loss=0.779][A
Train step of epoch 0:  36%|███▌      | 1109/3066 [17:14<30:16,  1.08it/s, loss=0.779][A
Train step

Train step of epoch 0:  39%|███▉      | 1195/3066 [18:34<29:08,  1.07it/s, loss=0.592][A
Train step of epoch 0:  39%|███▉      | 1195/3066 [18:34<29:08,  1.07it/s, loss=0.455][A
Train step of epoch 0:  39%|███▉      | 1196/3066 [18:35<29:09,  1.07it/s, loss=0.455][A
Train step of epoch 0:  39%|███▉      | 1196/3066 [18:35<29:09,  1.07it/s, loss=0.551][A
Train step of epoch 0:  39%|███▉      | 1197/3066 [18:36<29:10,  1.07it/s, loss=0.551][A
Train step of epoch 0:  39%|███▉      | 1197/3066 [18:36<29:10,  1.07it/s, loss=1.18] [A
Train step of epoch 0:  39%|███▉      | 1198/3066 [18:37<29:08,  1.07it/s, loss=1.18][A
Train step of epoch 0:  39%|███▉      | 1198/3066 [18:37<29:08,  1.07it/s, loss=0.821][A
Train step of epoch 0:  39%|███▉      | 1199/3066 [18:38<29:05,  1.07it/s, loss=0.821][A
Train step of epoch 0:  39%|███▉      | 1199/3066 [18:38<29:05,  1.07it/s, loss=0.487][A
Train step of epoch 0:  39%|███▉      | 1200/3066 [18:39<29:02,  1.07it/s, loss=0.487][A
Train step 

Train step of epoch 0:  42%|████▏     | 1286/3066 [19:59<27:44,  1.07it/s, loss=0.39][A
Train step of epoch 0:  42%|████▏     | 1286/3066 [19:59<27:44,  1.07it/s, loss=0.644][A
Train step of epoch 0:  42%|████▏     | 1287/3066 [20:00<27:41,  1.07it/s, loss=0.644][A
Train step of epoch 0:  42%|████▏     | 1287/3066 [20:00<27:41,  1.07it/s, loss=0.519][A
Train step of epoch 0:  42%|████▏     | 1288/3066 [20:01<27:40,  1.07it/s, loss=0.519][A
Train step of epoch 0:  42%|████▏     | 1288/3066 [20:01<27:40,  1.07it/s, loss=0.494][A
Train step of epoch 0:  42%|████▏     | 1289/3066 [20:02<27:37,  1.07it/s, loss=0.494][A
Train step of epoch 0:  42%|████▏     | 1289/3066 [20:02<27:37,  1.07it/s, loss=0.299][A
Train step of epoch 0:  42%|████▏     | 1290/3066 [20:03<27:34,  1.07it/s, loss=0.299][A
Train step of epoch 0:  42%|████▏     | 1290/3066 [20:03<27:34,  1.07it/s, loss=0.614][A
Train step of epoch 0:  42%|████▏     | 1291/3066 [20:04<27:34,  1.07it/s, loss=0.614][A
Train step 

Train step of epoch 0:  45%|████▍     | 1377/3066 [21:24<26:10,  1.08it/s, loss=0.476][A
Train step of epoch 0:  45%|████▍     | 1377/3066 [21:24<26:10,  1.08it/s, loss=0.563][A
Train step of epoch 0:  45%|████▍     | 1378/3066 [21:25<26:08,  1.08it/s, loss=0.563][A
Train step of epoch 0:  45%|████▍     | 1378/3066 [21:25<26:08,  1.08it/s, loss=0.628][A
Train step of epoch 0:  45%|████▍     | 1379/3066 [21:26<26:06,  1.08it/s, loss=0.628][A
Train step of epoch 0:  45%|████▍     | 1379/3066 [21:26<26:06,  1.08it/s, loss=0.443][A
Train step of epoch 0:  45%|████▌     | 1380/3066 [21:27<26:04,  1.08it/s, loss=0.443][A
Train step of epoch 0:  45%|████▌     | 1380/3066 [21:27<26:04,  1.08it/s, loss=0.591][A
Train step of epoch 0:  45%|████▌     | 1381/3066 [21:27<26:04,  1.08it/s, loss=0.591][A
Train step of epoch 0:  45%|████▌     | 1381/3066 [21:27<26:04,  1.08it/s, loss=0.893][A
Train step of epoch 0:  45%|████▌     | 1382/3066 [21:28<26:02,  1.08it/s, loss=0.893][A
Train step

Train step of epoch 0:  48%|████▊     | 1468/3066 [22:49<24:53,  1.07it/s, loss=0.66][A
Train step of epoch 0:  48%|████▊     | 1468/3066 [22:49<24:53,  1.07it/s, loss=0.642][A
Train step of epoch 0:  48%|████▊     | 1469/3066 [22:50<24:51,  1.07it/s, loss=0.642][A
Train step of epoch 0:  48%|████▊     | 1469/3066 [22:50<24:51,  1.07it/s, loss=0.676][A
Train step of epoch 0:  48%|████▊     | 1470/3066 [22:50<24:51,  1.07it/s, loss=0.676][A
Train step of epoch 0:  48%|████▊     | 1470/3066 [22:50<24:51,  1.07it/s, loss=0.57] [A
Train step of epoch 0:  48%|████▊     | 1471/3066 [22:51<24:50,  1.07it/s, loss=0.57][A
Train step of epoch 0:  48%|████▊     | 1471/3066 [22:51<24:50,  1.07it/s, loss=0.373][A
Train step of epoch 0:  48%|████▊     | 1472/3066 [22:52<24:47,  1.07it/s, loss=0.373][A
Train step of epoch 0:  48%|████▊     | 1472/3066 [22:52<24:47,  1.07it/s, loss=0.867][A
Train step of epoch 0:  48%|████▊     | 1473/3066 [22:53<24:47,  1.07it/s, loss=0.867][A
Train step o

Train step of epoch 0:  51%|█████     | 1559/3066 [24:14<23:30,  1.07it/s, loss=0.686][A
Train step of epoch 0:  51%|█████     | 1559/3066 [24:14<23:30,  1.07it/s, loss=0.678][A
Train step of epoch 0:  51%|█████     | 1560/3066 [24:15<23:29,  1.07it/s, loss=0.678][A
Train step of epoch 0:  51%|█████     | 1560/3066 [24:15<23:29,  1.07it/s, loss=0.668][A
Train step of epoch 0:  51%|█████     | 1561/3066 [24:16<23:29,  1.07it/s, loss=0.668][A
Train step of epoch 0:  51%|█████     | 1561/3066 [24:16<23:29,  1.07it/s, loss=0.558][A
Train step of epoch 0:  51%|█████     | 1562/3066 [24:17<23:27,  1.07it/s, loss=0.558][A
Train step of epoch 0:  51%|█████     | 1562/3066 [24:17<23:27,  1.07it/s, loss=0.665][A
Train step of epoch 0:  51%|█████     | 1563/3066 [24:17<23:24,  1.07it/s, loss=0.665][A
Train step of epoch 0:  51%|█████     | 1563/3066 [24:18<23:24,  1.07it/s, loss=0.756][A
Train step of epoch 0:  51%|█████     | 1564/3066 [24:18<23:24,  1.07it/s, loss=0.756][A
Train step

Train step of epoch 0:  54%|█████▍    | 1650/3066 [25:39<22:02,  1.07it/s, loss=0.635][A
Train step of epoch 0:  54%|█████▍    | 1650/3066 [25:39<22:02,  1.07it/s, loss=0.521][A
Train step of epoch 0:  54%|█████▍    | 1651/3066 [25:40<22:01,  1.07it/s, loss=0.521][A
Train step of epoch 0:  54%|█████▍    | 1651/3066 [25:40<22:01,  1.07it/s, loss=0.856][A
Train step of epoch 0:  54%|█████▍    | 1652/3066 [25:41<22:01,  1.07it/s, loss=0.856][A
Train step of epoch 0:  54%|█████▍    | 1652/3066 [25:41<22:01,  1.07it/s, loss=0.762][A
Train step of epoch 0:  54%|█████▍    | 1653/3066 [25:41<21:59,  1.07it/s, loss=0.762][A
Train step of epoch 0:  54%|█████▍    | 1653/3066 [25:42<21:59,  1.07it/s, loss=0.744][A
Train step of epoch 0:  54%|█████▍    | 1654/3066 [25:42<21:58,  1.07it/s, loss=0.744][A
Train step of epoch 0:  54%|█████▍    | 1654/3066 [25:42<21:58,  1.07it/s, loss=0.557][A
Train step of epoch 0:  54%|█████▍    | 1655/3066 [25:43<21:56,  1.07it/s, loss=0.557][A
Train step

Train step of epoch 0:  57%|█████▋    | 1741/3066 [27:03<20:34,  1.07it/s, loss=0.424][A
Train step of epoch 0:  57%|█████▋    | 1741/3066 [27:03<20:34,  1.07it/s, loss=0.568][A
Train step of epoch 0:  57%|█████▋    | 1742/3066 [27:04<20:33,  1.07it/s, loss=0.568][A
Train step of epoch 0:  57%|█████▋    | 1742/3066 [27:04<20:33,  1.07it/s, loss=0.397][A
Train step of epoch 0:  57%|█████▋    | 1743/3066 [27:05<20:32,  1.07it/s, loss=0.397][A
Train step of epoch 0:  57%|█████▋    | 1743/3066 [27:05<20:32,  1.07it/s, loss=0.219][A
Train step of epoch 0:  57%|█████▋    | 1744/3066 [27:06<20:29,  1.08it/s, loss=0.219][A
Train step of epoch 0:  57%|█████▋    | 1744/3066 [27:06<20:29,  1.08it/s, loss=0.711][A
Train step of epoch 0:  57%|█████▋    | 1745/3066 [27:07<20:29,  1.07it/s, loss=0.711][A
Train step of epoch 0:  57%|█████▋    | 1745/3066 [27:07<20:29,  1.07it/s, loss=0.325][A
Train step of epoch 0:  57%|█████▋    | 1746/3066 [27:08<20:27,  1.08it/s, loss=0.325][A
Train step

Train step of epoch 0:  60%|█████▉    | 1832/3066 [28:28<19:10,  1.07it/s, loss=0.56][A
Train step of epoch 0:  60%|█████▉    | 1832/3066 [28:28<19:10,  1.07it/s, loss=0.624][A
Train step of epoch 0:  60%|█████▉    | 1833/3066 [28:29<19:09,  1.07it/s, loss=0.624][A
Train step of epoch 0:  60%|█████▉    | 1833/3066 [28:29<19:09,  1.07it/s, loss=0.519][A
Train step of epoch 0:  60%|█████▉    | 1834/3066 [28:30<19:08,  1.07it/s, loss=0.519][A
Train step of epoch 0:  60%|█████▉    | 1834/3066 [28:30<19:08,  1.07it/s, loss=0.693][A
Train step of epoch 0:  60%|█████▉    | 1835/3066 [28:31<19:07,  1.07it/s, loss=0.693][A
Train step of epoch 0:  60%|█████▉    | 1835/3066 [28:31<19:07,  1.07it/s, loss=0.626][A
Train step of epoch 0:  60%|█████▉    | 1836/3066 [28:32<19:05,  1.07it/s, loss=0.626][A
Train step of epoch 0:  60%|█████▉    | 1836/3066 [28:32<19:05,  1.07it/s, loss=0.704][A
Train step of epoch 0:  60%|█████▉    | 1837/3066 [28:33<19:02,  1.08it/s, loss=0.704][A
Train step 

Train step of epoch 0:  63%|██████▎   | 1923/3066 [29:53<17:49,  1.07it/s, loss=0.657][A
Train step of epoch 0:  63%|██████▎   | 1923/3066 [29:53<17:49,  1.07it/s, loss=0.8]  [A
Train step of epoch 0:  63%|██████▎   | 1924/3066 [29:54<17:48,  1.07it/s, loss=0.8][A
Train step of epoch 0:  63%|██████▎   | 1924/3066 [29:54<17:48,  1.07it/s, loss=0.543][A
Train step of epoch 0:  63%|██████▎   | 1925/3066 [29:55<17:48,  1.07it/s, loss=0.543][A
Train step of epoch 0:  63%|██████▎   | 1925/3066 [29:55<17:48,  1.07it/s, loss=0.941][A
Train step of epoch 0:  63%|██████▎   | 1926/3066 [29:56<17:47,  1.07it/s, loss=0.941][A
Train step of epoch 0:  63%|██████▎   | 1926/3066 [29:56<17:47,  1.07it/s, loss=0.874][A
Train step of epoch 0:  63%|██████▎   | 1927/3066 [29:57<17:46,  1.07it/s, loss=0.874][A
Train step of epoch 0:  63%|██████▎   | 1927/3066 [29:57<17:46,  1.07it/s, loss=0.682][A
Train step of epoch 0:  63%|██████▎   | 1928/3066 [29:58<17:44,  1.07it/s, loss=0.682][A
Train step o

Train step of epoch 0:  66%|██████▌   | 2014/3066 [31:18<16:22,  1.07it/s, loss=0.413][A
Train step of epoch 0:  66%|██████▌   | 2014/3066 [31:18<16:22,  1.07it/s, loss=0.647][A
Train step of epoch 0:  66%|██████▌   | 2015/3066 [31:19<16:22,  1.07it/s, loss=0.647][A
Train step of epoch 0:  66%|██████▌   | 2015/3066 [31:19<16:22,  1.07it/s, loss=0.658][A
Train step of epoch 0:  66%|██████▌   | 2016/3066 [31:20<16:22,  1.07it/s, loss=0.658][A
Train step of epoch 0:  66%|██████▌   | 2016/3066 [31:20<16:22,  1.07it/s, loss=0.59] [A
Train step of epoch 0:  66%|██████▌   | 2017/3066 [31:21<16:21,  1.07it/s, loss=0.59][A
Train step of epoch 0:  66%|██████▌   | 2017/3066 [31:21<16:21,  1.07it/s, loss=0.517][A
Train step of epoch 0:  66%|██████▌   | 2018/3066 [31:22<16:19,  1.07it/s, loss=0.517][A
Train step of epoch 0:  66%|██████▌   | 2018/3066 [31:22<16:19,  1.07it/s, loss=0.403][A
Train step of epoch 0:  66%|██████▌   | 2019/3066 [31:23<16:18,  1.07it/s, loss=0.403][A
Train step 

Train step of epoch 0:  69%|██████▊   | 2105/3066 [32:43<14:53,  1.08it/s, loss=0.433][A
Train step of epoch 0:  69%|██████▊   | 2105/3066 [32:43<14:53,  1.08it/s, loss=0.241][A
Train step of epoch 0:  69%|██████▊   | 2106/3066 [32:44<14:51,  1.08it/s, loss=0.241][A
Train step of epoch 0:  69%|██████▊   | 2106/3066 [32:44<14:51,  1.08it/s, loss=1.4]  [A
Train step of epoch 0:  69%|██████▊   | 2107/3066 [32:45<14:51,  1.08it/s, loss=1.4][A
Train step of epoch 0:  69%|██████▊   | 2107/3066 [32:45<14:51,  1.08it/s, loss=0.43][A
Train step of epoch 0:  69%|██████▉   | 2108/3066 [32:46<14:49,  1.08it/s, loss=0.43][A
Train step of epoch 0:  69%|██████▉   | 2108/3066 [32:46<14:49,  1.08it/s, loss=0.345][A
Train step of epoch 0:  69%|██████▉   | 2109/3066 [32:47<14:48,  1.08it/s, loss=0.345][A
Train step of epoch 0:  69%|██████▉   | 2109/3066 [32:47<14:48,  1.08it/s, loss=0.545][A
Train step of epoch 0:  69%|██████▉   | 2110/3066 [32:47<14:47,  1.08it/s, loss=0.545][A
Train step of 

Train step of epoch 0:  72%|███████▏  | 2196/3066 [34:07<13:34,  1.07it/s, loss=0.303][A
Train step of epoch 0:  72%|███████▏  | 2196/3066 [34:07<13:34,  1.07it/s, loss=0.712][A
Train step of epoch 0:  72%|███████▏  | 2197/3066 [34:08<13:32,  1.07it/s, loss=0.712][A
Train step of epoch 0:  72%|███████▏  | 2197/3066 [34:08<13:32,  1.07it/s, loss=0.73] [A
Train step of epoch 0:  72%|███████▏  | 2198/3066 [34:09<13:33,  1.07it/s, loss=0.73][A
Train step of epoch 0:  72%|███████▏  | 2198/3066 [34:09<13:33,  1.07it/s, loss=0.727][A
Train step of epoch 0:  72%|███████▏  | 2199/3066 [34:10<13:32,  1.07it/s, loss=0.727][A
Train step of epoch 0:  72%|███████▏  | 2199/3066 [34:10<13:32,  1.07it/s, loss=0.688][A
Train step of epoch 0:  72%|███████▏  | 2200/3066 [34:11<13:31,  1.07it/s, loss=0.688][A
Train step of epoch 0:  72%|███████▏  | 2200/3066 [34:11<13:31,  1.07it/s, loss=0.637][A
Train step of epoch 0:  72%|███████▏  | 2201/3066 [34:12<13:29,  1.07it/s, loss=0.637][A
Train step 

Train step of epoch 0:  75%|███████▍  | 2287/3066 [35:32<12:05,  1.07it/s, loss=0.583][A
Train step of epoch 0:  75%|███████▍  | 2287/3066 [35:32<12:05,  1.07it/s, loss=0.399][A
Train step of epoch 0:  75%|███████▍  | 2288/3066 [35:33<12:04,  1.07it/s, loss=0.399][A
Train step of epoch 0:  75%|███████▍  | 2288/3066 [35:33<12:04,  1.07it/s, loss=0.478][A
Train step of epoch 0:  75%|███████▍  | 2289/3066 [35:34<12:03,  1.07it/s, loss=0.478][A
Train step of epoch 0:  75%|███████▍  | 2289/3066 [35:34<12:03,  1.07it/s, loss=0.513][A
Train step of epoch 0:  75%|███████▍  | 2290/3066 [35:35<12:02,  1.07it/s, loss=0.513][A
Train step of epoch 0:  75%|███████▍  | 2290/3066 [35:35<12:02,  1.07it/s, loss=0.495][A
Train step of epoch 0:  75%|███████▍  | 2291/3066 [35:36<12:01,  1.07it/s, loss=0.495][A
Train step of epoch 0:  75%|███████▍  | 2291/3066 [35:36<12:01,  1.07it/s, loss=0.579][A
Train step of epoch 0:  75%|███████▍  | 2292/3066 [35:37<12:01,  1.07it/s, loss=0.579][A
Train step

Train step of epoch 0:  78%|███████▊  | 2378/3066 [36:57<10:35,  1.08it/s, loss=0.796][A
Train step of epoch 0:  78%|███████▊  | 2378/3066 [36:57<10:35,  1.08it/s, loss=0.64] [A
Train step of epoch 0:  78%|███████▊  | 2379/3066 [36:58<10:35,  1.08it/s, loss=0.64][A
Train step of epoch 0:  78%|███████▊  | 2379/3066 [36:58<10:35,  1.08it/s, loss=0.442][A
Train step of epoch 0:  78%|███████▊  | 2380/3066 [36:59<10:35,  1.08it/s, loss=0.442][A
Train step of epoch 0:  78%|███████▊  | 2380/3066 [36:59<10:35,  1.08it/s, loss=0.731][A
Train step of epoch 0:  78%|███████▊  | 2381/3066 [37:00<10:35,  1.08it/s, loss=0.731][A
Train step of epoch 0:  78%|███████▊  | 2381/3066 [37:00<10:35,  1.08it/s, loss=0.664][A
Train step of epoch 0:  78%|███████▊  | 2382/3066 [37:01<10:33,  1.08it/s, loss=0.664][A
Train step of epoch 0:  78%|███████▊  | 2382/3066 [37:01<10:33,  1.08it/s, loss=0.646][A
Train step of epoch 0:  78%|███████▊  | 2383/3066 [37:02<10:32,  1.08it/s, loss=0.646][A
Train step 

Train step of epoch 0:  81%|████████  | 2469/3066 [38:22<09:19,  1.07it/s, loss=0.763][A
Train step of epoch 0:  81%|████████  | 2469/3066 [38:22<09:19,  1.07it/s, loss=0.578][A
Train step of epoch 0:  81%|████████  | 2470/3066 [38:23<09:18,  1.07it/s, loss=0.578][A
Train step of epoch 0:  81%|████████  | 2470/3066 [38:23<09:18,  1.07it/s, loss=0.652][A
Train step of epoch 0:  81%|████████  | 2471/3066 [38:24<09:17,  1.07it/s, loss=0.652][A
Train step of epoch 0:  81%|████████  | 2471/3066 [38:24<09:17,  1.07it/s, loss=0.341][A
Train step of epoch 0:  81%|████████  | 2472/3066 [38:25<09:15,  1.07it/s, loss=0.341][A
Train step of epoch 0:  81%|████████  | 2472/3066 [38:25<09:15,  1.07it/s, loss=0.582][A
Train step of epoch 0:  81%|████████  | 2473/3066 [38:26<09:15,  1.07it/s, loss=0.582][A
Train step of epoch 0:  81%|████████  | 2473/3066 [38:26<09:15,  1.07it/s, loss=0.66] [A
Train step of epoch 0:  81%|████████  | 2474/3066 [38:27<09:14,  1.07it/s, loss=0.66][A
Train step 

Train step of epoch 0:  83%|████████▎ | 2560/3066 [39:47<07:48,  1.08it/s, loss=0.713][A
Train step of epoch 0:  83%|████████▎ | 2560/3066 [39:47<07:48,  1.08it/s, loss=0.469][A
Train step of epoch 0:  84%|████████▎ | 2561/3066 [39:48<07:48,  1.08it/s, loss=0.469][A
Train step of epoch 0:  84%|████████▎ | 2561/3066 [39:48<07:48,  1.08it/s, loss=0.518][A
Train step of epoch 0:  84%|████████▎ | 2562/3066 [39:49<07:47,  1.08it/s, loss=0.518][A
Train step of epoch 0:  84%|████████▎ | 2562/3066 [39:49<07:47,  1.08it/s, loss=0.762][A
Train step of epoch 0:  84%|████████▎ | 2563/3066 [39:49<07:47,  1.08it/s, loss=0.762][A
Train step of epoch 0:  84%|████████▎ | 2563/3066 [39:49<07:47,  1.08it/s, loss=0.618][A
Train step of epoch 0:  84%|████████▎ | 2564/3066 [39:50<07:46,  1.08it/s, loss=0.618][A
Train step of epoch 0:  84%|████████▎ | 2564/3066 [39:50<07:46,  1.08it/s, loss=0.433][A
Train step of epoch 0:  84%|████████▎ | 2565/3066 [39:51<07:45,  1.08it/s, loss=0.433][A
Train step

Train step of epoch 0:  86%|████████▋ | 2651/3066 [41:11<06:27,  1.07it/s, loss=0.822][A
Train step of epoch 0:  86%|████████▋ | 2651/3066 [41:11<06:27,  1.07it/s, loss=0.621][A
Train step of epoch 0:  86%|████████▋ | 2652/3066 [41:12<06:27,  1.07it/s, loss=0.621][A
Train step of epoch 0:  86%|████████▋ | 2652/3066 [41:12<06:27,  1.07it/s, loss=0.638][A
Train step of epoch 0:  87%|████████▋ | 2653/3066 [41:13<06:26,  1.07it/s, loss=0.638][A
Train step of epoch 0:  87%|████████▋ | 2653/3066 [41:13<06:26,  1.07it/s, loss=0.563][A
Train step of epoch 0:  87%|████████▋ | 2654/3066 [41:14<06:26,  1.07it/s, loss=0.563][A
Train step of epoch 0:  87%|████████▋ | 2654/3066 [41:14<06:26,  1.07it/s, loss=0.559][A
Train step of epoch 0:  87%|████████▋ | 2655/3066 [41:15<06:25,  1.07it/s, loss=0.559][A
Train step of epoch 0:  87%|████████▋ | 2655/3066 [41:15<06:25,  1.07it/s, loss=0.751][A
Train step of epoch 0:  87%|████████▋ | 2656/3066 [41:16<06:24,  1.07it/s, loss=0.751][A
Train step

Train step of epoch 0:  89%|████████▉ | 2742/3066 [42:36<04:59,  1.08it/s, loss=0.69][A
Train step of epoch 0:  89%|████████▉ | 2742/3066 [42:36<04:59,  1.08it/s, loss=0.653][A
Train step of epoch 0:  89%|████████▉ | 2743/3066 [42:37<04:58,  1.08it/s, loss=0.653][A
Train step of epoch 0:  89%|████████▉ | 2743/3066 [42:37<04:58,  1.08it/s, loss=0.522][A
Train step of epoch 0:  89%|████████▉ | 2744/3066 [42:38<04:58,  1.08it/s, loss=0.522][A
Train step of epoch 0:  89%|████████▉ | 2744/3066 [42:38<04:58,  1.08it/s, loss=0.496][A
Train step of epoch 0:  90%|████████▉ | 2745/3066 [42:39<04:57,  1.08it/s, loss=0.496][A
Train step of epoch 0:  90%|████████▉ | 2745/3066 [42:39<04:57,  1.08it/s, loss=0.603][A
Train step of epoch 0:  90%|████████▉ | 2746/3066 [42:40<04:56,  1.08it/s, loss=0.603][A
Train step of epoch 0:  90%|████████▉ | 2746/3066 [42:40<04:56,  1.08it/s, loss=0.668][A
Train step of epoch 0:  90%|████████▉ | 2747/3066 [42:41<04:55,  1.08it/s, loss=0.668][A
Train step 

Train step of epoch 0:  92%|█████████▏| 2833/3066 [44:01<03:37,  1.07it/s, loss=0.666][A
Train step of epoch 0:  92%|█████████▏| 2833/3066 [44:01<03:37,  1.07it/s, loss=0.572][A
Train step of epoch 0:  92%|█████████▏| 2834/3066 [44:02<03:36,  1.07it/s, loss=0.572][A
Train step of epoch 0:  92%|█████████▏| 2834/3066 [44:02<03:36,  1.07it/s, loss=0.817][A
Train step of epoch 0:  92%|█████████▏| 2835/3066 [44:03<03:35,  1.07it/s, loss=0.817][A
Train step of epoch 0:  92%|█████████▏| 2835/3066 [44:03<03:35,  1.07it/s, loss=0.642][A
Train step of epoch 0:  92%|█████████▏| 2836/3066 [44:04<03:34,  1.07it/s, loss=0.642][A
Train step of epoch 0:  92%|█████████▏| 2836/3066 [44:04<03:34,  1.07it/s, loss=0.651][A
Train step of epoch 0:  93%|█████████▎| 2837/3066 [44:05<03:33,  1.07it/s, loss=0.651][A
Train step of epoch 0:  93%|█████████▎| 2837/3066 [44:05<03:33,  1.07it/s, loss=0.691][A
Train step of epoch 0:  93%|█████████▎| 2838/3066 [44:05<03:32,  1.07it/s, loss=0.691][A
Train step

Train step of epoch 0:  95%|█████████▌| 2924/3066 [45:25<02:12,  1.08it/s, loss=0.472][A
Train step of epoch 0:  95%|█████████▌| 2924/3066 [45:25<02:12,  1.08it/s, loss=0.624][A
Train step of epoch 0:  95%|█████████▌| 2925/3066 [45:26<02:11,  1.07it/s, loss=0.624][A
Train step of epoch 0:  95%|█████████▌| 2925/3066 [45:26<02:11,  1.07it/s, loss=0.566][A
Train step of epoch 0:  95%|█████████▌| 2926/3066 [45:27<02:10,  1.07it/s, loss=0.566][A
Train step of epoch 0:  95%|█████████▌| 2926/3066 [45:27<02:10,  1.07it/s, loss=0.442][A
Train step of epoch 0:  95%|█████████▌| 2927/3066 [45:28<02:09,  1.07it/s, loss=0.442][A
Train step of epoch 0:  95%|█████████▌| 2927/3066 [45:28<02:09,  1.07it/s, loss=0.7]  [A
Train step of epoch 0:  95%|█████████▌| 2928/3066 [45:29<02:08,  1.07it/s, loss=0.7][A
Train step of epoch 0:  95%|█████████▌| 2928/3066 [45:29<02:08,  1.07it/s, loss=0.522][A
Train step of epoch 0:  96%|█████████▌| 2929/3066 [45:30<02:07,  1.07it/s, loss=0.522][A
Train step o

Train step of epoch 0:  98%|█████████▊| 3015/3066 [46:50<00:47,  1.08it/s, loss=0.589][A
Train step of epoch 0:  98%|█████████▊| 3015/3066 [46:50<00:47,  1.08it/s, loss=0.564][A
Train step of epoch 0:  98%|█████████▊| 3016/3066 [46:51<00:46,  1.08it/s, loss=0.564][A
Train step of epoch 0:  98%|█████████▊| 3016/3066 [46:51<00:46,  1.08it/s, loss=0.677][A
Train step of epoch 0:  98%|█████████▊| 3017/3066 [46:52<00:45,  1.08it/s, loss=0.677][A
Train step of epoch 0:  98%|█████████▊| 3017/3066 [46:52<00:45,  1.08it/s, loss=0.829][A
Train step of epoch 0:  98%|█████████▊| 3018/3066 [46:53<00:44,  1.08it/s, loss=0.829][A
Train step of epoch 0:  98%|█████████▊| 3018/3066 [46:53<00:44,  1.08it/s, loss=0.527][A
Train step of epoch 0:  98%|█████████▊| 3019/3066 [46:54<00:43,  1.08it/s, loss=0.527][A
Train step of epoch 0:  98%|█████████▊| 3019/3066 [46:54<00:43,  1.08it/s, loss=0.507][A
Train step of epoch 0:  98%|█████████▊| 3020/3066 [46:55<00:42,  1.08it/s, loss=0.507][A
Train step

Train step of epoch 1:   1%|▏         | 40/3066 [00:37<46:50,  1.08it/s, loss=0.314][A
Train step of epoch 1:   1%|▏         | 40/3066 [00:37<46:50,  1.08it/s, loss=0.693][A
Train step of epoch 1:   1%|▏         | 41/3066 [00:38<46:53,  1.08it/s, loss=0.693][A
Train step of epoch 1:   1%|▏         | 41/3066 [00:38<46:53,  1.08it/s, loss=0.547][A
Train step of epoch 1:   1%|▏         | 42/3066 [00:39<46:53,  1.07it/s, loss=0.547][A
Train step of epoch 1:   1%|▏         | 42/3066 [00:39<46:53,  1.07it/s, loss=0.296][A
Train step of epoch 1:   1%|▏         | 43/3066 [00:40<46:56,  1.07it/s, loss=0.296][A
Train step of epoch 1:   1%|▏         | 43/3066 [00:40<46:56,  1.07it/s, loss=0.612][A
Train step of epoch 1:   1%|▏         | 44/3066 [00:41<46:49,  1.08it/s, loss=0.612][A
Train step of epoch 1:   1%|▏         | 44/3066 [00:41<46:49,  1.08it/s, loss=0.58] [A
Train step of epoch 1:   1%|▏         | 45/3066 [00:42<46:47,  1.08it/s, loss=0.58][A
Train step of epoch 1:   1%|▏    

Train step of epoch 1:   4%|▍         | 132/3066 [02:02<45:25,  1.08it/s, loss=0.579][A
Train step of epoch 1:   4%|▍         | 133/3066 [02:03<45:20,  1.08it/s, loss=0.579][A
Train step of epoch 1:   4%|▍         | 133/3066 [02:03<45:20,  1.08it/s, loss=0.702][A
Train step of epoch 1:   4%|▍         | 134/3066 [02:04<45:18,  1.08it/s, loss=0.702][A
Train step of epoch 1:   4%|▍         | 134/3066 [02:04<45:18,  1.08it/s, loss=0.525][A
Train step of epoch 1:   4%|▍         | 135/3066 [02:05<45:16,  1.08it/s, loss=0.525][A
Train step of epoch 1:   4%|▍         | 135/3066 [02:05<45:16,  1.08it/s, loss=0.432][A
Train step of epoch 1:   4%|▍         | 136/3066 [02:06<45:12,  1.08it/s, loss=0.432][A
Train step of epoch 1:   4%|▍         | 136/3066 [02:06<45:12,  1.08it/s, loss=0.664][A
Train step of epoch 1:   4%|▍         | 137/3066 [02:07<45:16,  1.08it/s, loss=0.664][A
Train step of epoch 1:   4%|▍         | 137/3066 [02:07<45:16,  1.08it/s, loss=0.431][A
Train step of epoch 1

Train step of epoch 1:   7%|▋         | 224/3066 [03:28<44:00,  1.08it/s, loss=0.716][A
Train step of epoch 1:   7%|▋         | 225/3066 [03:29<44:02,  1.07it/s, loss=0.716][A
Train step of epoch 1:   7%|▋         | 225/3066 [03:29<44:02,  1.07it/s, loss=0.838][A
Train step of epoch 1:   7%|▋         | 226/3066 [03:29<44:01,  1.08it/s, loss=0.838][A
Train step of epoch 1:   7%|▋         | 226/3066 [03:30<44:01,  1.08it/s, loss=0.492][A
Train step of epoch 1:   7%|▋         | 227/3066 [03:30<43:58,  1.08it/s, loss=0.492][A
Train step of epoch 1:   7%|▋         | 227/3066 [03:30<43:58,  1.08it/s, loss=0.456][A
Train step of epoch 1:   7%|▋         | 228/3066 [03:31<43:54,  1.08it/s, loss=0.456][A
Train step of epoch 1:   7%|▋         | 228/3066 [03:31<43:54,  1.08it/s, loss=0.9]  [A
Train step of epoch 1:   7%|▋         | 229/3066 [03:32<44:00,  1.07it/s, loss=0.9][A
Train step of epoch 1:   7%|▋         | 229/3066 [03:32<44:00,  1.07it/s, loss=0.627][A
Train step of epoch 1: 

Train step of epoch 1:  10%|█         | 316/3066 [04:53<42:40,  1.07it/s, loss=0.956][A
Train step of epoch 1:  10%|█         | 317/3066 [04:54<42:36,  1.08it/s, loss=0.956][A
Train step of epoch 1:  10%|█         | 317/3066 [04:54<42:36,  1.08it/s, loss=0.514][A
Train step of epoch 1:  10%|█         | 318/3066 [04:55<42:40,  1.07it/s, loss=0.514][A
Train step of epoch 1:  10%|█         | 318/3066 [04:55<42:40,  1.07it/s, loss=0.532][A
Train step of epoch 1:  10%|█         | 319/3066 [04:56<42:42,  1.07it/s, loss=0.532][A
Train step of epoch 1:  10%|█         | 319/3066 [04:56<42:42,  1.07it/s, loss=0.65] [A
Train step of epoch 1:  10%|█         | 320/3066 [04:57<42:44,  1.07it/s, loss=0.65][A
Train step of epoch 1:  10%|█         | 320/3066 [04:57<42:44,  1.07it/s, loss=0.434][A
Train step of epoch 1:  10%|█         | 321/3066 [04:58<42:37,  1.07it/s, loss=0.434][A
Train step of epoch 1:  10%|█         | 321/3066 [04:58<42:37,  1.07it/s, loss=0.524][A
Train step of epoch 1:

Train step of epoch 1:  13%|█▎        | 408/3066 [06:19<41:21,  1.07it/s, loss=0.358][A
Train step of epoch 1:  13%|█▎        | 409/3066 [06:20<41:22,  1.07it/s, loss=0.358][A
Train step of epoch 1:  13%|█▎        | 409/3066 [06:20<41:22,  1.07it/s, loss=0.677][A
Train step of epoch 1:  13%|█▎        | 410/3066 [06:21<41:21,  1.07it/s, loss=0.677][A
Train step of epoch 1:  13%|█▎        | 410/3066 [06:21<41:21,  1.07it/s, loss=0.489][A
Train step of epoch 1:  13%|█▎        | 411/3066 [06:22<41:20,  1.07it/s, loss=0.489][A
Train step of epoch 1:  13%|█▎        | 411/3066 [06:22<41:20,  1.07it/s, loss=0.648][A
Train step of epoch 1:  13%|█▎        | 412/3066 [06:23<41:14,  1.07it/s, loss=0.648][A
Train step of epoch 1:  13%|█▎        | 412/3066 [06:23<41:14,  1.07it/s, loss=0.615][A
Train step of epoch 1:  13%|█▎        | 413/3066 [06:23<41:13,  1.07it/s, loss=0.615][A
Train step of epoch 1:  13%|█▎        | 413/3066 [06:23<41:13,  1.07it/s, loss=0.304][A
Train step of epoch 1

Train step of epoch 1:  16%|█▋        | 500/3066 [07:45<40:08,  1.07it/s, loss=0.623][A
Train step of epoch 1:  16%|█▋        | 501/3066 [07:46<40:07,  1.07it/s, loss=0.623][A
Train step of epoch 1:  16%|█▋        | 501/3066 [07:46<40:07,  1.07it/s, loss=0.345][A
Train step of epoch 1:  16%|█▋        | 502/3066 [07:47<40:03,  1.07it/s, loss=0.345][A
Train step of epoch 1:  16%|█▋        | 502/3066 [07:47<40:03,  1.07it/s, loss=0.456][A
Train step of epoch 1:  16%|█▋        | 503/3066 [07:48<40:00,  1.07it/s, loss=0.456][A
Train step of epoch 1:  16%|█▋        | 503/3066 [07:48<40:00,  1.07it/s, loss=0.727][A
Train step of epoch 1:  16%|█▋        | 504/3066 [07:49<39:56,  1.07it/s, loss=0.727][A
Train step of epoch 1:  16%|█▋        | 504/3066 [07:49<39:56,  1.07it/s, loss=0.405][A
Train step of epoch 1:  16%|█▋        | 505/3066 [07:50<39:56,  1.07it/s, loss=0.405][A
Train step of epoch 1:  16%|█▋        | 505/3066 [07:50<39:56,  1.07it/s, loss=0.555][A
Train step of epoch 1

Train step of epoch 1:  19%|█▉        | 592/3066 [09:11<38:21,  1.07it/s, loss=0.47] [A
Train step of epoch 1:  19%|█▉        | 593/3066 [09:12<38:22,  1.07it/s, loss=0.47][A
Train step of epoch 1:  19%|█▉        | 593/3066 [09:12<38:22,  1.07it/s, loss=0.886][A
Train step of epoch 1:  19%|█▉        | 594/3066 [09:13<38:19,  1.07it/s, loss=0.886][A
Train step of epoch 1:  19%|█▉        | 594/3066 [09:13<38:19,  1.07it/s, loss=0.483][A
Train step of epoch 1:  19%|█▉        | 595/3066 [09:13<38:19,  1.07it/s, loss=0.483][A
Train step of epoch 1:  19%|█▉        | 595/3066 [09:14<38:19,  1.07it/s, loss=0.6]  [A
Train step of epoch 1:  19%|█▉        | 596/3066 [09:14<38:21,  1.07it/s, loss=0.6][A
Train step of epoch 1:  19%|█▉        | 596/3066 [09:14<38:21,  1.07it/s, loss=0.383][A
Train step of epoch 1:  19%|█▉        | 597/3066 [09:15<38:24,  1.07it/s, loss=0.383][A
Train step of epoch 1:  19%|█▉        | 597/3066 [09:15<38:24,  1.07it/s, loss=0.572][A
Train step of epoch 1:  

Train step of epoch 1:  22%|██▏       | 684/3066 [10:36<36:56,  1.07it/s, loss=0.751][A
Train step of epoch 1:  22%|██▏       | 685/3066 [10:37<36:54,  1.08it/s, loss=0.751][A
Train step of epoch 1:  22%|██▏       | 685/3066 [10:37<36:54,  1.08it/s, loss=0.629][A
Train step of epoch 1:  22%|██▏       | 686/3066 [10:38<36:52,  1.08it/s, loss=0.629][A
Train step of epoch 1:  22%|██▏       | 686/3066 [10:38<36:52,  1.08it/s, loss=0.685][A
Train step of epoch 1:  22%|██▏       | 687/3066 [10:39<36:51,  1.08it/s, loss=0.685][A
Train step of epoch 1:  22%|██▏       | 687/3066 [10:39<36:51,  1.08it/s, loss=0.57] [A
Train step of epoch 1:  22%|██▏       | 688/3066 [10:40<36:50,  1.08it/s, loss=0.57][A
Train step of epoch 1:  22%|██▏       | 688/3066 [10:40<36:50,  1.08it/s, loss=0.309][A
Train step of epoch 1:  22%|██▏       | 689/3066 [10:41<36:51,  1.08it/s, loss=0.309][A
Train step of epoch 1:  22%|██▏       | 689/3066 [10:41<36:51,  1.08it/s, loss=0.336][A
Train step of epoch 1:

Train step of epoch 1:  25%|██▌       | 776/3066 [12:02<35:26,  1.08it/s, loss=0.469][A
Train step of epoch 1:  25%|██▌       | 777/3066 [12:03<35:21,  1.08it/s, loss=0.469][A
Train step of epoch 1:  25%|██▌       | 777/3066 [12:03<35:21,  1.08it/s, loss=0.599][A
Train step of epoch 1:  25%|██▌       | 778/3066 [12:04<35:20,  1.08it/s, loss=0.599][A
Train step of epoch 1:  25%|██▌       | 778/3066 [12:04<35:20,  1.08it/s, loss=0.411][A
Train step of epoch 1:  25%|██▌       | 779/3066 [12:04<35:21,  1.08it/s, loss=0.411][A
Train step of epoch 1:  25%|██▌       | 779/3066 [12:04<35:21,  1.08it/s, loss=0.692][A
Train step of epoch 1:  25%|██▌       | 780/3066 [12:05<35:19,  1.08it/s, loss=0.692][A
Train step of epoch 1:  25%|██▌       | 780/3066 [12:05<35:19,  1.08it/s, loss=0.344][A
Train step of epoch 1:  25%|██▌       | 781/3066 [12:06<35:17,  1.08it/s, loss=0.344][A
Train step of epoch 1:  25%|██▌       | 781/3066 [12:06<35:17,  1.08it/s, loss=0.441][A
Train step of epoch 1

Train step of epoch 1:  28%|██▊       | 868/3066 [13:27<34:06,  1.07it/s, loss=0.335][A
Train step of epoch 1:  28%|██▊       | 869/3066 [13:28<34:09,  1.07it/s, loss=0.335][A
Train step of epoch 1:  28%|██▊       | 869/3066 [13:28<34:09,  1.07it/s, loss=0.539][A
Train step of epoch 1:  28%|██▊       | 870/3066 [13:29<34:08,  1.07it/s, loss=0.539][A
Train step of epoch 1:  28%|██▊       | 870/3066 [13:29<34:08,  1.07it/s, loss=0.398][A
Train step of epoch 1:  28%|██▊       | 871/3066 [13:30<34:05,  1.07it/s, loss=0.398][A
Train step of epoch 1:  28%|██▊       | 871/3066 [13:30<34:05,  1.07it/s, loss=0.491][A
Train step of epoch 1:  28%|██▊       | 872/3066 [13:31<34:05,  1.07it/s, loss=0.491][A
Train step of epoch 1:  28%|██▊       | 872/3066 [13:31<34:05,  1.07it/s, loss=0.56] [A
Train step of epoch 1:  28%|██▊       | 873/3066 [13:32<34:04,  1.07it/s, loss=0.56][A
Train step of epoch 1:  28%|██▊       | 873/3066 [13:32<34:04,  1.07it/s, loss=0.783][A
Train step of epoch 1:

Train step of epoch 1:  31%|███▏      | 960/3066 [14:53<32:46,  1.07it/s, loss=0.341][A
Train step of epoch 1:  31%|███▏      | 961/3066 [14:54<32:45,  1.07it/s, loss=0.341][A
Train step of epoch 1:  31%|███▏      | 961/3066 [14:54<32:45,  1.07it/s, loss=0.498][A
Train step of epoch 1:  31%|███▏      | 962/3066 [14:55<32:45,  1.07it/s, loss=0.498][A
Train step of epoch 1:  31%|███▏      | 962/3066 [14:55<32:45,  1.07it/s, loss=0.452][A
Train step of epoch 1:  31%|███▏      | 963/3066 [14:56<32:44,  1.07it/s, loss=0.452][A
Train step of epoch 1:  31%|███▏      | 963/3066 [14:56<32:44,  1.07it/s, loss=0.277][A
Train step of epoch 1:  31%|███▏      | 964/3066 [14:57<32:46,  1.07it/s, loss=0.277][A
Train step of epoch 1:  31%|███▏      | 964/3066 [14:57<32:46,  1.07it/s, loss=0.342][A
Train step of epoch 1:  31%|███▏      | 965/3066 [14:58<32:46,  1.07it/s, loss=0.342][A
Train step of epoch 1:  31%|███▏      | 965/3066 [14:58<32:46,  1.07it/s, loss=0.524][A
Train step of epoch 1

Train step of epoch 1:  34%|███▍      | 1051/3066 [16:18<31:25,  1.07it/s, loss=0.679][A
Train step of epoch 1:  34%|███▍      | 1052/3066 [16:19<31:23,  1.07it/s, loss=0.679][A
Train step of epoch 1:  34%|███▍      | 1052/3066 [16:19<31:23,  1.07it/s, loss=0.553][A
Train step of epoch 1:  34%|███▍      | 1053/3066 [16:20<31:19,  1.07it/s, loss=0.553][A
Train step of epoch 1:  34%|███▍      | 1053/3066 [16:20<31:19,  1.07it/s, loss=0.357][A
Train step of epoch 1:  34%|███▍      | 1054/3066 [16:21<31:20,  1.07it/s, loss=0.357][A
Train step of epoch 1:  34%|███▍      | 1054/3066 [16:21<31:20,  1.07it/s, loss=0.328][A
Train step of epoch 1:  34%|███▍      | 1055/3066 [16:22<31:21,  1.07it/s, loss=0.328][A
Train step of epoch 1:  34%|███▍      | 1055/3066 [16:22<31:21,  1.07it/s, loss=0.527][A
Train step of epoch 1:  34%|███▍      | 1056/3066 [16:23<31:21,  1.07it/s, loss=0.527][A
Train step of epoch 1:  34%|███▍      | 1056/3066 [16:23<31:21,  1.07it/s, loss=0.538][A
Train step

Train step of epoch 1:  37%|███▋      | 1142/3066 [17:43<29:56,  1.07it/s, loss=0.731][A
Train step of epoch 1:  37%|███▋      | 1143/3066 [17:44<29:56,  1.07it/s, loss=0.731][A
Train step of epoch 1:  37%|███▋      | 1143/3066 [17:44<29:56,  1.07it/s, loss=0.539][A
Train step of epoch 1:  37%|███▋      | 1144/3066 [17:45<29:56,  1.07it/s, loss=0.539][A
Train step of epoch 1:  37%|███▋      | 1144/3066 [17:45<29:56,  1.07it/s, loss=0.576][A
Train step of epoch 1:  37%|███▋      | 1145/3066 [17:46<29:55,  1.07it/s, loss=0.576][A
Train step of epoch 1:  37%|███▋      | 1145/3066 [17:46<29:55,  1.07it/s, loss=0.735][A
Train step of epoch 1:  37%|███▋      | 1146/3066 [17:47<29:51,  1.07it/s, loss=0.735][A
Train step of epoch 1:  37%|███▋      | 1146/3066 [17:47<29:51,  1.07it/s, loss=0.487][A
Train step of epoch 1:  37%|███▋      | 1147/3066 [17:48<29:50,  1.07it/s, loss=0.487][A
Train step of epoch 1:  37%|███▋      | 1147/3066 [17:48<29:50,  1.07it/s, loss=0.527][A
Train step

Train step of epoch 1:  40%|████      | 1233/3066 [19:08<28:25,  1.07it/s, loss=0.771][A
Train step of epoch 1:  40%|████      | 1234/3066 [19:09<28:25,  1.07it/s, loss=0.771][A
Train step of epoch 1:  40%|████      | 1234/3066 [19:09<28:25,  1.07it/s, loss=0.586][A
Train step of epoch 1:  40%|████      | 1235/3066 [19:10<28:22,  1.08it/s, loss=0.586][A
Train step of epoch 1:  40%|████      | 1235/3066 [19:10<28:22,  1.08it/s, loss=0.494][A
Train step of epoch 1:  40%|████      | 1236/3066 [19:11<28:21,  1.08it/s, loss=0.494][A
Train step of epoch 1:  40%|████      | 1236/3066 [19:11<28:21,  1.08it/s, loss=0.866][A
Train step of epoch 1:  40%|████      | 1237/3066 [19:12<28:20,  1.08it/s, loss=0.866][A
Train step of epoch 1:  40%|████      | 1237/3066 [19:12<28:20,  1.08it/s, loss=0.61] [A
Train step of epoch 1:  40%|████      | 1238/3066 [19:13<28:18,  1.08it/s, loss=0.61][A
Train step of epoch 1:  40%|████      | 1238/3066 [19:13<28:18,  1.08it/s, loss=0.458][A
Train step 

Train step of epoch 1:  43%|████▎     | 1324/3066 [20:32<26:53,  1.08it/s, loss=0.463][A
Train step of epoch 1:  43%|████▎     | 1325/3066 [20:33<26:53,  1.08it/s, loss=0.463][A
Train step of epoch 1:  43%|████▎     | 1325/3066 [20:33<26:53,  1.08it/s, loss=0.622][A
Train step of epoch 1:  43%|████▎     | 1326/3066 [20:34<26:53,  1.08it/s, loss=0.622][A
Train step of epoch 1:  43%|████▎     | 1326/3066 [20:34<26:53,  1.08it/s, loss=0.913][A
Train step of epoch 1:  43%|████▎     | 1327/3066 [20:35<26:49,  1.08it/s, loss=0.913][A
Train step of epoch 1:  43%|████▎     | 1327/3066 [20:35<26:49,  1.08it/s, loss=0.581][A
Train step of epoch 1:  43%|████▎     | 1328/3066 [20:36<26:49,  1.08it/s, loss=0.581][A
Train step of epoch 1:  43%|████▎     | 1328/3066 [20:36<26:49,  1.08it/s, loss=0.669][A
Train step of epoch 1:  43%|████▎     | 1329/3066 [20:37<26:48,  1.08it/s, loss=0.669][A
Train step of epoch 1:  43%|████▎     | 1329/3066 [20:37<26:48,  1.08it/s, loss=0.32] [A
Train step

Train step of epoch 1:  46%|████▌     | 1415/3066 [21:57<25:33,  1.08it/s, loss=0.444][A
Train step of epoch 1:  46%|████▌     | 1416/3066 [21:58<25:32,  1.08it/s, loss=0.444][A
Train step of epoch 1:  46%|████▌     | 1416/3066 [21:58<25:32,  1.08it/s, loss=0.452][A
Train step of epoch 1:  46%|████▌     | 1417/3066 [21:59<25:34,  1.07it/s, loss=0.452][A
Train step of epoch 1:  46%|████▌     | 1417/3066 [21:59<25:34,  1.07it/s, loss=0.585][A
Train step of epoch 1:  46%|████▌     | 1418/3066 [22:00<25:33,  1.07it/s, loss=0.585][A
Train step of epoch 1:  46%|████▌     | 1418/3066 [22:00<25:33,  1.07it/s, loss=0.432][A
Train step of epoch 1:  46%|████▋     | 1419/3066 [22:01<25:31,  1.08it/s, loss=0.432][A
Train step of epoch 1:  46%|████▋     | 1419/3066 [22:01<25:31,  1.08it/s, loss=0.339][A
Train step of epoch 1:  46%|████▋     | 1420/3066 [22:02<25:35,  1.07it/s, loss=0.339][A
Train step of epoch 1:  46%|████▋     | 1420/3066 [22:02<25:35,  1.07it/s, loss=0.768][A
Train step

Train step of epoch 1:  49%|████▉     | 1506/3066 [23:22<24:10,  1.08it/s, loss=0.211][A
Train step of epoch 1:  49%|████▉     | 1507/3066 [23:22<24:08,  1.08it/s, loss=0.211][A
Train step of epoch 1:  49%|████▉     | 1507/3066 [23:22<24:08,  1.08it/s, loss=0.331][A
Train step of epoch 1:  49%|████▉     | 1508/3066 [23:23<24:06,  1.08it/s, loss=0.331][A
Train step of epoch 1:  49%|████▉     | 1508/3066 [23:23<24:06,  1.08it/s, loss=0.489][A
Train step of epoch 1:  49%|████▉     | 1509/3066 [23:24<24:07,  1.08it/s, loss=0.489][A
Train step of epoch 1:  49%|████▉     | 1509/3066 [23:24<24:07,  1.08it/s, loss=0.492][A
Train step of epoch 1:  49%|████▉     | 1510/3066 [23:25<24:07,  1.08it/s, loss=0.492][A
Train step of epoch 1:  49%|████▉     | 1510/3066 [23:25<24:07,  1.08it/s, loss=0.436][A
Train step of epoch 1:  49%|████▉     | 1511/3066 [23:26<24:07,  1.07it/s, loss=0.436][A
Train step of epoch 1:  49%|████▉     | 1511/3066 [23:26<24:07,  1.07it/s, loss=0.403][A
Train step

Train step of epoch 1:  52%|█████▏    | 1597/3066 [24:46<22:44,  1.08it/s, loss=0.305][A
Train step of epoch 1:  52%|█████▏    | 1598/3066 [24:47<22:43,  1.08it/s, loss=0.305][A
Train step of epoch 1:  52%|█████▏    | 1598/3066 [24:47<22:43,  1.08it/s, loss=0.594][A
Train step of epoch 1:  52%|█████▏    | 1599/3066 [24:48<22:41,  1.08it/s, loss=0.594][A
Train step of epoch 1:  52%|█████▏    | 1599/3066 [24:48<22:41,  1.08it/s, loss=0.955][A
Train step of epoch 1:  52%|█████▏    | 1600/3066 [24:49<22:43,  1.08it/s, loss=0.955][A
Train step of epoch 1:  52%|█████▏    | 1600/3066 [24:49<22:43,  1.08it/s, loss=0.8]  [A
Train step of epoch 1:  52%|█████▏    | 1601/3066 [24:50<22:42,  1.08it/s, loss=0.8][A
Train step of epoch 1:  52%|█████▏    | 1601/3066 [24:50<22:42,  1.08it/s, loss=0.27][A
Train step of epoch 1:  52%|█████▏    | 1602/3066 [24:51<22:40,  1.08it/s, loss=0.27][A
Train step of epoch 1:  52%|█████▏    | 1602/3066 [24:51<22:40,  1.08it/s, loss=0.832][A
Train step of 

Train step of epoch 1:  55%|█████▌    | 1688/3066 [26:10<21:16,  1.08it/s, loss=0.542][A
Train step of epoch 1:  55%|█████▌    | 1689/3066 [26:11<21:15,  1.08it/s, loss=0.542][A
Train step of epoch 1:  55%|█████▌    | 1689/3066 [26:11<21:15,  1.08it/s, loss=0.411][A
Train step of epoch 1:  55%|█████▌    | 1690/3066 [26:12<21:14,  1.08it/s, loss=0.411][A
Train step of epoch 1:  55%|█████▌    | 1690/3066 [26:12<21:14,  1.08it/s, loss=0.415][A
Train step of epoch 1:  55%|█████▌    | 1691/3066 [26:13<21:15,  1.08it/s, loss=0.415][A
Train step of epoch 1:  55%|█████▌    | 1691/3066 [26:13<21:15,  1.08it/s, loss=0.57] [A
Train step of epoch 1:  55%|█████▌    | 1692/3066 [26:14<21:14,  1.08it/s, loss=0.57][A
Train step of epoch 1:  55%|█████▌    | 1692/3066 [26:14<21:14,  1.08it/s, loss=0.837][A
Train step of epoch 1:  55%|█████▌    | 1693/3066 [26:15<21:13,  1.08it/s, loss=0.837][A
Train step of epoch 1:  55%|█████▌    | 1693/3066 [26:15<21:13,  1.08it/s, loss=0.754][A
Train step 

Train step of epoch 1:  58%|█████▊    | 1779/3066 [27:35<19:53,  1.08it/s, loss=0.686][A
Train step of epoch 1:  58%|█████▊    | 1780/3066 [27:36<19:52,  1.08it/s, loss=0.686][A
Train step of epoch 1:  58%|█████▊    | 1780/3066 [27:36<19:52,  1.08it/s, loss=0.424][A
Train step of epoch 1:  58%|█████▊    | 1781/3066 [27:37<19:51,  1.08it/s, loss=0.424][A
Train step of epoch 1:  58%|█████▊    | 1781/3066 [27:37<19:51,  1.08it/s, loss=0.482][A
Train step of epoch 1:  58%|█████▊    | 1782/3066 [27:38<19:49,  1.08it/s, loss=0.482][A
Train step of epoch 1:  58%|█████▊    | 1782/3066 [27:38<19:49,  1.08it/s, loss=0.439][A
Train step of epoch 1:  58%|█████▊    | 1783/3066 [27:38<19:48,  1.08it/s, loss=0.439][A
Train step of epoch 1:  58%|█████▊    | 1783/3066 [27:38<19:48,  1.08it/s, loss=0.349][A
Train step of epoch 1:  58%|█████▊    | 1784/3066 [27:39<19:48,  1.08it/s, loss=0.349][A
Train step of epoch 1:  58%|█████▊    | 1784/3066 [27:39<19:48,  1.08it/s, loss=0.239][A
Train step

Train step of epoch 1:  61%|██████    | 1870/3066 [28:59<18:24,  1.08it/s, loss=0.973][A
Train step of epoch 1:  61%|██████    | 1871/3066 [29:00<18:24,  1.08it/s, loss=0.973][A
Train step of epoch 1:  61%|██████    | 1871/3066 [29:00<18:24,  1.08it/s, loss=0.855][A
Train step of epoch 1:  61%|██████    | 1872/3066 [29:01<18:22,  1.08it/s, loss=0.855][A
Train step of epoch 1:  61%|██████    | 1872/3066 [29:01<18:22,  1.08it/s, loss=0.536][A
Train step of epoch 1:  61%|██████    | 1873/3066 [29:02<18:19,  1.08it/s, loss=0.536][A
Train step of epoch 1:  61%|██████    | 1873/3066 [29:02<18:19,  1.08it/s, loss=0.441][A
Train step of epoch 1:  61%|██████    | 1874/3066 [29:03<18:20,  1.08it/s, loss=0.441][A
Train step of epoch 1:  61%|██████    | 1874/3066 [29:03<18:20,  1.08it/s, loss=0.314][A
Train step of epoch 1:  61%|██████    | 1875/3066 [29:04<18:19,  1.08it/s, loss=0.314][A
Train step of epoch 1:  61%|██████    | 1875/3066 [29:04<18:19,  1.08it/s, loss=0.575][A
Train step

Train step of epoch 1:  64%|██████▍   | 1961/3066 [30:23<17:01,  1.08it/s, loss=0.576][A
Train step of epoch 1:  64%|██████▍   | 1962/3066 [30:24<17:02,  1.08it/s, loss=0.576][A
Train step of epoch 1:  64%|██████▍   | 1962/3066 [30:24<17:02,  1.08it/s, loss=0.989][A
Train step of epoch 1:  64%|██████▍   | 1963/3066 [30:25<17:01,  1.08it/s, loss=0.989][A
Train step of epoch 1:  64%|██████▍   | 1963/3066 [30:25<17:01,  1.08it/s, loss=0.506][A
Train step of epoch 1:  64%|██████▍   | 1964/3066 [30:26<17:01,  1.08it/s, loss=0.506][A
Train step of epoch 1:  64%|██████▍   | 1964/3066 [30:26<17:01,  1.08it/s, loss=0.352][A
Train step of epoch 1:  64%|██████▍   | 1965/3066 [30:27<17:02,  1.08it/s, loss=0.352][A
Train step of epoch 1:  64%|██████▍   | 1965/3066 [30:27<17:02,  1.08it/s, loss=0.438][A
Train step of epoch 1:  64%|██████▍   | 1966/3066 [30:28<17:02,  1.08it/s, loss=0.438][A
Train step of epoch 1:  64%|██████▍   | 1966/3066 [30:28<17:02,  1.08it/s, loss=0.493][A
Train step

Train step of epoch 1:  67%|██████▋   | 2052/3066 [31:48<15:42,  1.08it/s, loss=0.405][A
Train step of epoch 1:  67%|██████▋   | 2053/3066 [31:48<15:40,  1.08it/s, loss=0.405][A
Train step of epoch 1:  67%|██████▋   | 2053/3066 [31:48<15:40,  1.08it/s, loss=0.427][A
Train step of epoch 1:  67%|██████▋   | 2054/3066 [31:49<15:38,  1.08it/s, loss=0.427][A
Train step of epoch 1:  67%|██████▋   | 2054/3066 [31:49<15:38,  1.08it/s, loss=0.301][A
Train step of epoch 1:  67%|██████▋   | 2055/3066 [31:50<15:38,  1.08it/s, loss=0.301][A
Train step of epoch 1:  67%|██████▋   | 2055/3066 [31:50<15:38,  1.08it/s, loss=0.933][A
Train step of epoch 1:  67%|██████▋   | 2056/3066 [31:51<15:37,  1.08it/s, loss=0.933][A
Train step of epoch 1:  67%|██████▋   | 2056/3066 [31:51<15:37,  1.08it/s, loss=0.561][A
Train step of epoch 1:  67%|██████▋   | 2057/3066 [31:52<15:35,  1.08it/s, loss=0.561][A
Train step of epoch 1:  67%|██████▋   | 2057/3066 [31:52<15:35,  1.08it/s, loss=0.639][A
Train step

Train step of epoch 1:  70%|██████▉   | 2143/3066 [33:12<14:15,  1.08it/s, loss=0.541][A
Train step of epoch 1:  70%|██████▉   | 2144/3066 [33:13<14:14,  1.08it/s, loss=0.541][A
Train step of epoch 1:  70%|██████▉   | 2144/3066 [33:13<14:14,  1.08it/s, loss=0.441][A
Train step of epoch 1:  70%|██████▉   | 2145/3066 [33:14<14:11,  1.08it/s, loss=0.441][A
Train step of epoch 1:  70%|██████▉   | 2145/3066 [33:14<14:11,  1.08it/s, loss=0.41] [A
Train step of epoch 1:  70%|██████▉   | 2146/3066 [33:15<14:11,  1.08it/s, loss=0.41][A
Train step of epoch 1:  70%|██████▉   | 2146/3066 [33:15<14:11,  1.08it/s, loss=0.414][A
Train step of epoch 1:  70%|███████   | 2147/3066 [33:16<14:10,  1.08it/s, loss=0.414][A
Train step of epoch 1:  70%|███████   | 2147/3066 [33:16<14:10,  1.08it/s, loss=0.442][A
Train step of epoch 1:  70%|███████   | 2148/3066 [33:17<14:08,  1.08it/s, loss=0.442][A
Train step of epoch 1:  70%|███████   | 2148/3066 [33:17<14:08,  1.08it/s, loss=0.57] [A
Train step 

Train step of epoch 1:  73%|███████▎  | 2234/3066 [34:36<12:52,  1.08it/s, loss=0.452][A
Train step of epoch 1:  73%|███████▎  | 2235/3066 [34:37<12:51,  1.08it/s, loss=0.452][A
Train step of epoch 1:  73%|███████▎  | 2235/3066 [34:37<12:51,  1.08it/s, loss=0.717][A
Train step of epoch 1:  73%|███████▎  | 2236/3066 [34:38<12:49,  1.08it/s, loss=0.717][A
Train step of epoch 1:  73%|███████▎  | 2236/3066 [34:38<12:49,  1.08it/s, loss=0.441][A
Train step of epoch 1:  73%|███████▎  | 2237/3066 [34:39<12:48,  1.08it/s, loss=0.441][A
Train step of epoch 1:  73%|███████▎  | 2237/3066 [34:39<12:48,  1.08it/s, loss=0.512][A
Train step of epoch 1:  73%|███████▎  | 2238/3066 [34:40<12:48,  1.08it/s, loss=0.512][A
Train step of epoch 1:  73%|███████▎  | 2238/3066 [34:40<12:48,  1.08it/s, loss=0.283][A
Train step of epoch 1:  73%|███████▎  | 2239/3066 [34:41<12:46,  1.08it/s, loss=0.283][A
Train step of epoch 1:  73%|███████▎  | 2239/3066 [34:41<12:46,  1.08it/s, loss=0.0916][A
Train ste

Train step of epoch 1:  76%|███████▌  | 2325/3066 [36:01<11:29,  1.07it/s, loss=0.576][A
Train step of epoch 1:  76%|███████▌  | 2326/3066 [36:02<11:28,  1.07it/s, loss=0.576][A
Train step of epoch 1:  76%|███████▌  | 2326/3066 [36:02<11:28,  1.07it/s, loss=0.486][A
Train step of epoch 1:  76%|███████▌  | 2327/3066 [36:03<11:27,  1.08it/s, loss=0.486][A
Train step of epoch 1:  76%|███████▌  | 2327/3066 [36:03<11:27,  1.08it/s, loss=0.311][A
Train step of epoch 1:  76%|███████▌  | 2328/3066 [36:04<11:25,  1.08it/s, loss=0.311][A
Train step of epoch 1:  76%|███████▌  | 2328/3066 [36:04<11:25,  1.08it/s, loss=0.662][A
Train step of epoch 1:  76%|███████▌  | 2329/3066 [36:04<11:24,  1.08it/s, loss=0.662][A
Train step of epoch 1:  76%|███████▌  | 2329/3066 [36:04<11:24,  1.08it/s, loss=0.714][A
Train step of epoch 1:  76%|███████▌  | 2330/3066 [36:05<11:24,  1.08it/s, loss=0.714][A
Train step of epoch 1:  76%|███████▌  | 2330/3066 [36:05<11:24,  1.08it/s, loss=0.39] [A
Train step

Train step of epoch 1:  79%|███████▉  | 2416/3066 [37:25<10:05,  1.07it/s, loss=0.61] [A
Train step of epoch 1:  79%|███████▉  | 2417/3066 [37:26<10:05,  1.07it/s, loss=0.61][A
Train step of epoch 1:  79%|███████▉  | 2417/3066 [37:26<10:05,  1.07it/s, loss=0.826][A
Train step of epoch 1:  79%|███████▉  | 2418/3066 [37:27<10:04,  1.07it/s, loss=0.826][A
Train step of epoch 1:  79%|███████▉  | 2418/3066 [37:27<10:04,  1.07it/s, loss=0.332][A
Train step of epoch 1:  79%|███████▉  | 2419/3066 [37:28<10:04,  1.07it/s, loss=0.332][A
Train step of epoch 1:  79%|███████▉  | 2419/3066 [37:28<10:04,  1.07it/s, loss=0.686][A
Train step of epoch 1:  79%|███████▉  | 2420/3066 [37:29<10:02,  1.07it/s, loss=0.686][A
Train step of epoch 1:  79%|███████▉  | 2420/3066 [37:29<10:02,  1.07it/s, loss=0.624][A
Train step of epoch 1:  79%|███████▉  | 2421/3066 [37:30<10:00,  1.07it/s, loss=0.624][A
Train step of epoch 1:  79%|███████▉  | 2421/3066 [37:30<10:00,  1.07it/s, loss=0.575][A
Train step 

Train step of epoch 1:  82%|████████▏ | 2507/3066 [38:50<08:43,  1.07it/s, loss=0.439][A
Train step of epoch 1:  82%|████████▏ | 2508/3066 [38:51<08:43,  1.07it/s, loss=0.439][A
Train step of epoch 1:  82%|████████▏ | 2508/3066 [38:51<08:43,  1.07it/s, loss=0.592][A
Train step of epoch 1:  82%|████████▏ | 2509/3066 [38:52<08:42,  1.07it/s, loss=0.592][A
Train step of epoch 1:  82%|████████▏ | 2509/3066 [38:52<08:42,  1.07it/s, loss=0.492][A
Train step of epoch 1:  82%|████████▏ | 2510/3066 [38:53<08:41,  1.07it/s, loss=0.492][A
Train step of epoch 1:  82%|████████▏ | 2510/3066 [38:53<08:41,  1.07it/s, loss=0.793][A
Train step of epoch 1:  82%|████████▏ | 2511/3066 [38:54<08:41,  1.06it/s, loss=0.793][A
Train step of epoch 1:  82%|████████▏ | 2511/3066 [38:54<08:41,  1.06it/s, loss=0.52] [A
Train step of epoch 1:  82%|████████▏ | 2512/3066 [38:55<08:40,  1.07it/s, loss=0.52][A
Train step of epoch 1:  82%|████████▏ | 2512/3066 [38:55<08:40,  1.07it/s, loss=0.644][A
Train step 

Train step of epoch 1:  85%|████████▍ | 2598/3066 [40:15<07:14,  1.08it/s, loss=0.481][A
Train step of epoch 1:  85%|████████▍ | 2599/3066 [40:16<07:13,  1.08it/s, loss=0.481][A
Train step of epoch 1:  85%|████████▍ | 2599/3066 [40:16<07:13,  1.08it/s, loss=0.367][A
Train step of epoch 1:  85%|████████▍ | 2600/3066 [40:17<07:12,  1.08it/s, loss=0.367][A
Train step of epoch 1:  85%|████████▍ | 2600/3066 [40:17<07:12,  1.08it/s, loss=0.596][A
Train step of epoch 1:  85%|████████▍ | 2601/3066 [40:18<07:11,  1.08it/s, loss=0.596][A
Train step of epoch 1:  85%|████████▍ | 2601/3066 [40:18<07:11,  1.08it/s, loss=0.65] [A
Train step of epoch 1:  85%|████████▍ | 2602/3066 [40:19<07:10,  1.08it/s, loss=0.65][A
Train step of epoch 1:  85%|████████▍ | 2602/3066 [40:19<07:10,  1.08it/s, loss=0.47][A
Train step of epoch 1:  85%|████████▍ | 2603/3066 [40:20<07:09,  1.08it/s, loss=0.47][A
Train step of epoch 1:  85%|████████▍ | 2603/3066 [40:20<07:09,  1.08it/s, loss=0.714][A
Train step of

Train step of epoch 1:  88%|████████▊ | 2689/3066 [41:40<05:48,  1.08it/s, loss=0.366][A
Train step of epoch 1:  88%|████████▊ | 2690/3066 [41:40<05:47,  1.08it/s, loss=0.366][A
Train step of epoch 1:  88%|████████▊ | 2690/3066 [41:40<05:47,  1.08it/s, loss=0.478][A
Train step of epoch 1:  88%|████████▊ | 2691/3066 [41:41<05:45,  1.08it/s, loss=0.478][A
Train step of epoch 1:  88%|████████▊ | 2691/3066 [41:41<05:45,  1.08it/s, loss=0.249][A
Train step of epoch 1:  88%|████████▊ | 2692/3066 [41:42<05:44,  1.08it/s, loss=0.249][A
Train step of epoch 1:  88%|████████▊ | 2692/3066 [41:42<05:44,  1.08it/s, loss=0.49] [A
Train step of epoch 1:  88%|████████▊ | 2693/3066 [41:43<05:43,  1.08it/s, loss=0.49][A
Train step of epoch 1:  88%|████████▊ | 2693/3066 [41:43<05:43,  1.08it/s, loss=0.366][A
Train step of epoch 1:  88%|████████▊ | 2694/3066 [41:44<05:43,  1.08it/s, loss=0.366][A
Train step of epoch 1:  88%|████████▊ | 2694/3066 [41:44<05:43,  1.08it/s, loss=0.449][A
Train step 

Train step of epoch 1:  91%|█████████ | 2780/3066 [43:04<04:25,  1.08it/s, loss=0.336][A
Train step of epoch 1:  91%|█████████ | 2781/3066 [43:05<04:24,  1.08it/s, loss=0.336][A
Train step of epoch 1:  91%|█████████ | 2781/3066 [43:05<04:24,  1.08it/s, loss=0.402][A
Train step of epoch 1:  91%|█████████ | 2782/3066 [43:06<04:23,  1.08it/s, loss=0.402][A
Train step of epoch 1:  91%|█████████ | 2782/3066 [43:06<04:23,  1.08it/s, loss=0.395][A
Train step of epoch 1:  91%|█████████ | 2783/3066 [43:07<04:22,  1.08it/s, loss=0.395][A
Train step of epoch 1:  91%|█████████ | 2783/3066 [43:07<04:22,  1.08it/s, loss=0.843][A
Train step of epoch 1:  91%|█████████ | 2784/3066 [43:07<04:22,  1.08it/s, loss=0.843][A
Train step of epoch 1:  91%|█████████ | 2784/3066 [43:08<04:22,  1.08it/s, loss=0.621][A
Train step of epoch 1:  91%|█████████ | 2785/3066 [43:08<04:20,  1.08it/s, loss=0.621][A
Train step of epoch 1:  91%|█████████ | 2785/3066 [43:08<04:20,  1.08it/s, loss=0.484][A
Train step

Train step of epoch 1:  94%|█████████▎| 2871/3066 [44:28<03:01,  1.07it/s, loss=0.628][A
Train step of epoch 1:  94%|█████████▎| 2872/3066 [44:29<03:00,  1.07it/s, loss=0.628][A
Train step of epoch 1:  94%|█████████▎| 2872/3066 [44:29<03:00,  1.07it/s, loss=0.323][A
Train step of epoch 1:  94%|█████████▎| 2873/3066 [44:30<02:59,  1.07it/s, loss=0.323][A
Train step of epoch 1:  94%|█████████▎| 2873/3066 [44:30<02:59,  1.07it/s, loss=0.834][A
Train step of epoch 1:  94%|█████████▎| 2874/3066 [44:31<02:58,  1.07it/s, loss=0.834][A
Train step of epoch 1:  94%|█████████▎| 2874/3066 [44:31<02:58,  1.07it/s, loss=0.618][A
Train step of epoch 1:  94%|█████████▍| 2875/3066 [44:32<02:57,  1.07it/s, loss=0.618][A
Train step of epoch 1:  94%|█████████▍| 2875/3066 [44:32<02:57,  1.07it/s, loss=0.239][A
Train step of epoch 1:  94%|█████████▍| 2876/3066 [44:33<02:56,  1.07it/s, loss=0.239][A
Train step of epoch 1:  94%|█████████▍| 2876/3066 [44:33<02:56,  1.07it/s, loss=0.771][A
Train step

Train step of epoch 1:  97%|█████████▋| 2962/3066 [45:53<01:36,  1.08it/s, loss=0.761][A
Train step of epoch 1:  97%|█████████▋| 2963/3066 [45:54<01:35,  1.08it/s, loss=0.761][A
Train step of epoch 1:  97%|█████████▋| 2963/3066 [45:54<01:35,  1.08it/s, loss=0.523][A
Train step of epoch 1:  97%|█████████▋| 2964/3066 [45:55<01:34,  1.08it/s, loss=0.523][A
Train step of epoch 1:  97%|█████████▋| 2964/3066 [45:55<01:34,  1.08it/s, loss=0.294][A
Train step of epoch 1:  97%|█████████▋| 2965/3066 [45:56<01:33,  1.08it/s, loss=0.294][A
Train step of epoch 1:  97%|█████████▋| 2965/3066 [45:56<01:33,  1.08it/s, loss=0.512][A
Train step of epoch 1:  97%|█████████▋| 2966/3066 [45:57<01:32,  1.08it/s, loss=0.512][A
Train step of epoch 1:  97%|█████████▋| 2966/3066 [45:57<01:32,  1.08it/s, loss=0.401][A
Train step of epoch 1:  97%|█████████▋| 2967/3066 [45:58<01:32,  1.08it/s, loss=0.401][A
Train step of epoch 1:  97%|█████████▋| 2967/3066 [45:58<01:32,  1.08it/s, loss=0.323][A
Train step

Train step of epoch 1: 100%|█████████▉| 3053/3066 [47:18<00:12,  1.07it/s, loss=0.457][A
Train step of epoch 1: 100%|█████████▉| 3054/3066 [47:19<00:11,  1.07it/s, loss=0.457][A
Train step of epoch 1: 100%|█████████▉| 3054/3066 [47:19<00:11,  1.07it/s, loss=0.498][A
Train step of epoch 1: 100%|█████████▉| 3055/3066 [47:20<00:10,  1.07it/s, loss=0.498][A
Train step of epoch 1: 100%|█████████▉| 3055/3066 [47:20<00:10,  1.07it/s, loss=0.254][A
Train step of epoch 1: 100%|█████████▉| 3056/3066 [47:21<00:09,  1.07it/s, loss=0.254][A
Train step of epoch 1: 100%|█████████▉| 3056/3066 [47:21<00:09,  1.07it/s, loss=0.743][A
Train step of epoch 1: 100%|█████████▉| 3057/3066 [47:22<00:08,  1.07it/s, loss=0.743][A
Train step of epoch 1: 100%|█████████▉| 3057/3066 [47:22<00:08,  1.07it/s, loss=0.319][A
Train step of epoch 1: 100%|█████████▉| 3058/3066 [47:23<00:07,  1.07it/s, loss=0.319][A
Train step of epoch 1: 100%|█████████▉| 3058/3066 [47:23<00:07,  1.07it/s, loss=0.453][A
Train step

Train step of epoch 2:   3%|▎         | 79/3066 [01:13<46:19,  1.07it/s, loss=0.516][A
Train step of epoch 2:   3%|▎         | 79/3066 [01:13<46:19,  1.07it/s, loss=0.444][A
Train step of epoch 2:   3%|▎         | 80/3066 [01:14<46:17,  1.07it/s, loss=0.444][A
Train step of epoch 2:   3%|▎         | 80/3066 [01:14<46:17,  1.07it/s, loss=0.517][A
Train step of epoch 2:   3%|▎         | 81/3066 [01:15<46:22,  1.07it/s, loss=0.517][A
Train step of epoch 2:   3%|▎         | 81/3066 [01:15<46:22,  1.07it/s, loss=0.576][A
Train step of epoch 2:   3%|▎         | 82/3066 [01:16<46:21,  1.07it/s, loss=0.576][A
Train step of epoch 2:   3%|▎         | 82/3066 [01:16<46:21,  1.07it/s, loss=0.402][A
Train step of epoch 2:   3%|▎         | 83/3066 [01:17<46:17,  1.07it/s, loss=0.402][A
Train step of epoch 2:   3%|▎         | 83/3066 [01:17<46:17,  1.07it/s, loss=0.539][A
Train step of epoch 2:   3%|▎         | 84/3066 [01:18<46:15,  1.07it/s, loss=0.539][A
Train step of epoch 2:   3%|▎   

Train step of epoch 2:   6%|▌         | 171/3066 [02:39<45:02,  1.07it/s, loss=0.611][A
Train step of epoch 2:   6%|▌         | 171/3066 [02:39<45:02,  1.07it/s, loss=0.254][A
Train step of epoch 2:   6%|▌         | 172/3066 [02:39<45:04,  1.07it/s, loss=0.254][A
Train step of epoch 2:   6%|▌         | 172/3066 [02:40<45:04,  1.07it/s, loss=0.332][A
Train step of epoch 2:   6%|▌         | 173/3066 [02:40<45:04,  1.07it/s, loss=0.332][A
Train step of epoch 2:   6%|▌         | 173/3066 [02:40<45:04,  1.07it/s, loss=0.242][A
Train step of epoch 2:   6%|▌         | 174/3066 [02:41<45:03,  1.07it/s, loss=0.242][A
Train step of epoch 2:   6%|▌         | 174/3066 [02:41<45:03,  1.07it/s, loss=0.511][A
Train step of epoch 2:   6%|▌         | 175/3066 [02:42<45:03,  1.07it/s, loss=0.511][A
Train step of epoch 2:   6%|▌         | 175/3066 [02:42<45:03,  1.07it/s, loss=0.266][A
Train step of epoch 2:   6%|▌         | 176/3066 [02:43<45:05,  1.07it/s, loss=0.266][A
Train step of epoch 2

Train step of epoch 2:   9%|▊         | 263/3066 [04:05<43:35,  1.07it/s, loss=0.514][A
Train step of epoch 2:   9%|▊         | 263/3066 [04:05<43:35,  1.07it/s, loss=0.252][A
Train step of epoch 2:   9%|▊         | 264/3066 [04:06<43:32,  1.07it/s, loss=0.252][A
Train step of epoch 2:   9%|▊         | 264/3066 [04:06<43:32,  1.07it/s, loss=0.155][A
Train step of epoch 2:   9%|▊         | 265/3066 [04:06<43:30,  1.07it/s, loss=0.155][A
Train step of epoch 2:   9%|▊         | 265/3066 [04:07<43:30,  1.07it/s, loss=0.26] [A
Train step of epoch 2:   9%|▊         | 266/3066 [04:07<43:30,  1.07it/s, loss=0.26][A
Train step of epoch 2:   9%|▊         | 266/3066 [04:07<43:30,  1.07it/s, loss=0.233][A
Train step of epoch 2:   9%|▊         | 267/3066 [04:08<43:28,  1.07it/s, loss=0.233][A
Train step of epoch 2:   9%|▊         | 267/3066 [04:08<43:28,  1.07it/s, loss=0.718][A
Train step of epoch 2:   9%|▊         | 268/3066 [04:09<43:27,  1.07it/s, loss=0.718][A
Train step of epoch 2:

Train step of epoch 2:  12%|█▏        | 355/3066 [05:30<41:48,  1.08it/s, loss=0.402][A
Train step of epoch 2:  12%|█▏        | 355/3066 [05:30<41:48,  1.08it/s, loss=0.224][A
Train step of epoch 2:  12%|█▏        | 356/3066 [05:31<41:54,  1.08it/s, loss=0.224][A
Train step of epoch 2:  12%|█▏        | 356/3066 [05:31<41:54,  1.08it/s, loss=0.318][A
Train step of epoch 2:  12%|█▏        | 357/3066 [05:32<41:50,  1.08it/s, loss=0.318][A
Train step of epoch 2:  12%|█▏        | 357/3066 [05:32<41:50,  1.08it/s, loss=0.141][A
Train step of epoch 2:  12%|█▏        | 358/3066 [05:33<41:47,  1.08it/s, loss=0.141][A
Train step of epoch 2:  12%|█▏        | 358/3066 [05:33<41:47,  1.08it/s, loss=0.177][A
Train step of epoch 2:  12%|█▏        | 359/3066 [05:34<41:45,  1.08it/s, loss=0.177][A
Train step of epoch 2:  12%|█▏        | 359/3066 [05:34<41:45,  1.08it/s, loss=0.49] [A
Train step of epoch 2:  12%|█▏        | 360/3066 [05:35<41:46,  1.08it/s, loss=0.49][A
Train step of epoch 2:

Train step of epoch 2:  15%|█▍        | 446/3066 [06:54<40:35,  1.08it/s, loss=0.633][A
Train step of epoch 2:  15%|█▍        | 447/3066 [06:55<40:35,  1.08it/s, loss=0.633][A
Train step of epoch 2:  15%|█▍        | 447/3066 [06:55<40:35,  1.08it/s, loss=0.817][A
Train step of epoch 2:  15%|█▍        | 448/3066 [06:56<40:38,  1.07it/s, loss=0.817][A
Train step of epoch 2:  15%|█▍        | 448/3066 [06:56<40:38,  1.07it/s, loss=0.619][A
Train step of epoch 2:  15%|█▍        | 449/3066 [06:57<40:36,  1.07it/s, loss=0.619][A
Train step of epoch 2:  15%|█▍        | 449/3066 [06:57<40:36,  1.07it/s, loss=0.626][A
Train step of epoch 2:  15%|█▍        | 450/3066 [06:58<40:39,  1.07it/s, loss=0.626][A
Train step of epoch 2:  15%|█▍        | 450/3066 [06:58<40:39,  1.07it/s, loss=0.413][A
Train step of epoch 2:  15%|█▍        | 451/3066 [06:59<40:36,  1.07it/s, loss=0.413][A
Train step of epoch 2:  15%|█▍        | 451/3066 [06:59<40:36,  1.07it/s, loss=0.395][A
Train step of epoch 2

Train step of epoch 2:  18%|█▊        | 538/3066 [08:20<39:21,  1.07it/s, loss=0.198][A
Train step of epoch 2:  18%|█▊        | 539/3066 [08:21<39:20,  1.07it/s, loss=0.198][A
Train step of epoch 2:  18%|█▊        | 539/3066 [08:21<39:20,  1.07it/s, loss=0.433][A
Train step of epoch 2:  18%|█▊        | 540/3066 [08:22<39:21,  1.07it/s, loss=0.433][A
Train step of epoch 2:  18%|█▊        | 540/3066 [08:22<39:21,  1.07it/s, loss=0.165][A
Train step of epoch 2:  18%|█▊        | 541/3066 [08:23<39:25,  1.07it/s, loss=0.165][A
Train step of epoch 2:  18%|█▊        | 541/3066 [08:23<39:25,  1.07it/s, loss=0.307][A
Train step of epoch 2:  18%|█▊        | 542/3066 [08:24<39:25,  1.07it/s, loss=0.307][A
Train step of epoch 2:  18%|█▊        | 542/3066 [08:24<39:25,  1.07it/s, loss=0.754][A
Train step of epoch 2:  18%|█▊        | 543/3066 [08:25<39:24,  1.07it/s, loss=0.754][A
Train step of epoch 2:  18%|█▊        | 543/3066 [08:25<39:24,  1.07it/s, loss=0.285][A
Train step of epoch 2

Train step of epoch 2:  21%|██        | 629/3066 [09:45<38:04,  1.07it/s, loss=0.437][A
Train step of epoch 2:  21%|██        | 630/3066 [09:46<38:03,  1.07it/s, loss=0.437][A
Train step of epoch 2:  21%|██        | 630/3066 [09:46<38:03,  1.07it/s, loss=0.292][A
Train step of epoch 2:  21%|██        | 631/3066 [09:47<38:00,  1.07it/s, loss=0.292][A
Train step of epoch 2:  21%|██        | 631/3066 [09:47<38:00,  1.07it/s, loss=0.364][A
Train step of epoch 2:  21%|██        | 632/3066 [09:48<37:58,  1.07it/s, loss=0.364][A
Train step of epoch 2:  21%|██        | 632/3066 [09:48<37:58,  1.07it/s, loss=0.894][A
Train step of epoch 2:  21%|██        | 633/3066 [09:49<38:02,  1.07it/s, loss=0.894][A
Train step of epoch 2:  21%|██        | 633/3066 [09:49<38:02,  1.07it/s, loss=0.224][A
Train step of epoch 2:  21%|██        | 634/3066 [09:50<38:00,  1.07it/s, loss=0.224][A
Train step of epoch 2:  21%|██        | 634/3066 [09:50<38:00,  1.07it/s, loss=0.181][A
Train step of epoch 2

Train step of epoch 2:  24%|██▎       | 721/3066 [11:11<36:19,  1.08it/s, loss=0.474][A
Train step of epoch 2:  24%|██▎       | 722/3066 [11:12<36:18,  1.08it/s, loss=0.474][A
Train step of epoch 2:  24%|██▎       | 722/3066 [11:12<36:18,  1.08it/s, loss=0.444][A
Train step of epoch 2:  24%|██▎       | 723/3066 [11:13<36:14,  1.08it/s, loss=0.444][A
Train step of epoch 2:  24%|██▎       | 723/3066 [11:13<36:14,  1.08it/s, loss=0.403][A
Train step of epoch 2:  24%|██▎       | 724/3066 [11:14<36:12,  1.08it/s, loss=0.403][A
Train step of epoch 2:  24%|██▎       | 724/3066 [11:14<36:12,  1.08it/s, loss=0.286][A
Train step of epoch 2:  24%|██▎       | 725/3066 [11:15<36:11,  1.08it/s, loss=0.286][A
Train step of epoch 2:  24%|██▎       | 725/3066 [11:15<36:11,  1.08it/s, loss=0.0977][A
Train step of epoch 2:  24%|██▎       | 726/3066 [11:16<36:10,  1.08it/s, loss=0.0977][A
Train step of epoch 2:  24%|██▎       | 726/3066 [11:16<36:10,  1.08it/s, loss=0.166] [A
Train step of epoc

Train step of epoch 2:  27%|██▋       | 813/3066 [12:36<34:47,  1.08it/s, loss=0.287][A
Train step of epoch 2:  27%|██▋       | 813/3066 [12:36<34:47,  1.08it/s, loss=0.5]  [A
Train step of epoch 2:  27%|██▋       | 814/3066 [12:37<34:46,  1.08it/s, loss=0.5][A
Train step of epoch 2:  27%|██▋       | 814/3066 [12:37<34:46,  1.08it/s, loss=0.664][A
Train step of epoch 2:  27%|██▋       | 815/3066 [12:38<34:45,  1.08it/s, loss=0.664][A
Train step of epoch 2:  27%|██▋       | 815/3066 [12:38<34:45,  1.08it/s, loss=0.105][A
Train step of epoch 2:  27%|██▋       | 816/3066 [12:39<34:45,  1.08it/s, loss=0.105][A
Train step of epoch 2:  27%|██▋       | 816/3066 [12:39<34:45,  1.08it/s, loss=0.327][A
Train step of epoch 2:  27%|██▋       | 817/3066 [12:40<34:44,  1.08it/s, loss=0.327][A
Train step of epoch 2:  27%|██▋       | 817/3066 [12:40<34:44,  1.08it/s, loss=0.422][A
Train step of epoch 2:  27%|██▋       | 818/3066 [12:41<34:44,  1.08it/s, loss=0.422][A
Train step of epoch 2: 

Train step of epoch 2:  30%|██▉       | 905/3066 [14:02<33:29,  1.08it/s, loss=0.346][A
Train step of epoch 2:  30%|██▉       | 905/3066 [14:02<33:29,  1.08it/s, loss=0.425][A
Train step of epoch 2:  30%|██▉       | 906/3066 [14:03<33:23,  1.08it/s, loss=0.425][A
Train step of epoch 2:  30%|██▉       | 906/3066 [14:03<33:23,  1.08it/s, loss=0.604][A
Train step of epoch 2:  30%|██▉       | 907/3066 [14:04<33:25,  1.08it/s, loss=0.604][A
Train step of epoch 2:  30%|██▉       | 907/3066 [14:04<33:25,  1.08it/s, loss=0.0586][A
Train step of epoch 2:  30%|██▉       | 908/3066 [14:05<33:27,  1.08it/s, loss=0.0586][A
Train step of epoch 2:  30%|██▉       | 908/3066 [14:05<33:27,  1.08it/s, loss=0.398] [A
Train step of epoch 2:  30%|██▉       | 909/3066 [14:05<33:26,  1.08it/s, loss=0.398][A
Train step of epoch 2:  30%|██▉       | 909/3066 [14:06<33:26,  1.08it/s, loss=0.346][A
Train step of epoch 2:  30%|██▉       | 910/3066 [14:06<33:22,  1.08it/s, loss=0.346][A
Train step of epoc

Train step of epoch 2:  32%|███▏      | 996/3066 [15:27<32:12,  1.07it/s, loss=0.346][A
Train step of epoch 2:  33%|███▎      | 997/3066 [15:27<32:12,  1.07it/s, loss=0.346][A
Train step of epoch 2:  33%|███▎      | 997/3066 [15:27<32:12,  1.07it/s, loss=0.221][A
Train step of epoch 2:  33%|███▎      | 998/3066 [15:28<32:13,  1.07it/s, loss=0.221][A
Train step of epoch 2:  33%|███▎      | 998/3066 [15:28<32:13,  1.07it/s, loss=0.0722][A
Train step of epoch 2:  33%|███▎      | 999/3066 [15:29<32:13,  1.07it/s, loss=0.0722][A
Train step of epoch 2:  33%|███▎      | 999/3066 [15:29<32:13,  1.07it/s, loss=0.428] [A
Train step of epoch 2:  33%|███▎      | 1000/3066 [15:30<32:11,  1.07it/s, loss=0.428][A
Train step of epoch 2:  33%|███▎      | 1000/3066 [15:30<32:11,  1.07it/s, loss=0.0853][A
Train step of epoch 2:  33%|███▎      | 1001/3066 [15:31<32:10,  1.07it/s, loss=0.0853][A
Train step of epoch 2:  33%|███▎      | 1001/3066 [15:31<32:10,  1.07it/s, loss=0.414] [A
Train step 

Train step of epoch 2:  35%|███▌      | 1087/3066 [16:51<30:41,  1.07it/s, loss=0.332][A
Train step of epoch 2:  35%|███▌      | 1087/3066 [16:51<30:41,  1.07it/s, loss=0.0717][A
Train step of epoch 2:  35%|███▌      | 1088/3066 [16:52<30:43,  1.07it/s, loss=0.0717][A
Train step of epoch 2:  35%|███▌      | 1088/3066 [16:52<30:43,  1.07it/s, loss=0.203] [A
Train step of epoch 2:  36%|███▌      | 1089/3066 [16:53<30:41,  1.07it/s, loss=0.203][A
Train step of epoch 2:  36%|███▌      | 1089/3066 [16:53<30:41,  1.07it/s, loss=0.245][A
Train step of epoch 2:  36%|███▌      | 1090/3066 [16:54<30:39,  1.07it/s, loss=0.245][A
Train step of epoch 2:  36%|███▌      | 1090/3066 [16:54<30:39,  1.07it/s, loss=0.0565][A
Train step of epoch 2:  36%|███▌      | 1091/3066 [16:55<30:45,  1.07it/s, loss=0.0565][A
Train step of epoch 2:  36%|███▌      | 1091/3066 [16:55<30:45,  1.07it/s, loss=0.184] [A
Train step of epoch 2:  36%|███▌      | 1092/3066 [16:56<30:44,  1.07it/s, loss=0.184][A
Trai

Train step of epoch 2:  38%|███▊      | 1177/3066 [18:15<29:29,  1.07it/s, loss=0.191][A
Train step of epoch 2:  38%|███▊      | 1178/3066 [18:16<29:29,  1.07it/s, loss=0.191][A
Train step of epoch 2:  38%|███▊      | 1178/3066 [18:16<29:29,  1.07it/s, loss=0.431][A
Train step of epoch 2:  38%|███▊      | 1179/3066 [18:17<29:26,  1.07it/s, loss=0.431][A
Train step of epoch 2:  38%|███▊      | 1179/3066 [18:17<29:26,  1.07it/s, loss=0.43] [A
Train step of epoch 2:  38%|███▊      | 1180/3066 [18:18<29:26,  1.07it/s, loss=0.43][A
Train step of epoch 2:  38%|███▊      | 1180/3066 [18:18<29:26,  1.07it/s, loss=0.594][A
Train step of epoch 2:  39%|███▊      | 1181/3066 [18:19<29:23,  1.07it/s, loss=0.594][A
Train step of epoch 2:  39%|███▊      | 1181/3066 [18:19<29:23,  1.07it/s, loss=0.129][A
Train step of epoch 2:  39%|███▊      | 1182/3066 [18:20<29:22,  1.07it/s, loss=0.129][A
Train step of epoch 2:  39%|███▊      | 1182/3066 [18:20<29:22,  1.07it/s, loss=0.0346][A
Train step

Train step of epoch 2:  41%|████▏     | 1268/3066 [19:40<27:59,  1.07it/s, loss=0.284][A
Train step of epoch 2:  41%|████▏     | 1268/3066 [19:40<27:59,  1.07it/s, loss=0.236][A
Train step of epoch 2:  41%|████▏     | 1269/3066 [19:41<27:59,  1.07it/s, loss=0.236][A
Train step of epoch 2:  41%|████▏     | 1269/3066 [19:41<27:59,  1.07it/s, loss=0.195][A
Train step of epoch 2:  41%|████▏     | 1270/3066 [19:42<27:58,  1.07it/s, loss=0.195][A
Train step of epoch 2:  41%|████▏     | 1270/3066 [19:42<27:58,  1.07it/s, loss=0.256][A
Train step of epoch 2:  41%|████▏     | 1271/3066 [19:43<27:54,  1.07it/s, loss=0.256][A
Train step of epoch 2:  41%|████▏     | 1271/3066 [19:43<27:54,  1.07it/s, loss=0.36] [A
Train step of epoch 2:  41%|████▏     | 1272/3066 [19:44<27:53,  1.07it/s, loss=0.36][A
Train step of epoch 2:  41%|████▏     | 1272/3066 [19:44<27:53,  1.07it/s, loss=0.111][A
Train step of epoch 2:  42%|████▏     | 1273/3066 [19:45<27:52,  1.07it/s, loss=0.111][A
Train step 

Train step of epoch 2:  44%|████▍     | 1358/3066 [21:04<26:28,  1.07it/s, loss=0.152][A
Train step of epoch 2:  44%|████▍     | 1358/3066 [21:04<26:28,  1.07it/s, loss=0.208][A
Train step of epoch 2:  44%|████▍     | 1359/3066 [21:05<26:27,  1.08it/s, loss=0.208][A
Train step of epoch 2:  44%|████▍     | 1359/3066 [21:05<26:27,  1.08it/s, loss=0.236][A
Train step of epoch 2:  44%|████▍     | 1360/3066 [21:06<26:25,  1.08it/s, loss=0.236][A
Train step of epoch 2:  44%|████▍     | 1360/3066 [21:06<26:25,  1.08it/s, loss=0.631][A
Train step of epoch 2:  44%|████▍     | 1361/3066 [21:07<26:23,  1.08it/s, loss=0.631][A
Train step of epoch 2:  44%|████▍     | 1361/3066 [21:07<26:23,  1.08it/s, loss=0.145][A
Train step of epoch 2:  44%|████▍     | 1362/3066 [21:08<26:21,  1.08it/s, loss=0.145][A
Train step of epoch 2:  44%|████▍     | 1362/3066 [21:08<26:21,  1.08it/s, loss=0.168][A
Train step of epoch 2:  44%|████▍     | 1363/3066 [21:09<26:21,  1.08it/s, loss=0.168][A
Train step

Train step of epoch 2:  46%|████▌     | 1403/3066 [21:46<25:52,  1.07it/s, loss=0.895][A
Train step of epoch 2:  46%|████▌     | 1404/3066 [21:47<25:51,  1.07it/s, loss=0.895][A
Train step of epoch 2:  46%|████▌     | 1404/3066 [21:47<25:51,  1.07it/s, loss=0.116][A
Train step of epoch 2:  46%|████▌     | 1405/3066 [21:48<25:51,  1.07it/s, loss=0.116][A
Train step of epoch 2:  46%|████▌     | 1405/3066 [21:48<25:51,  1.07it/s, loss=0.088][A
Train step of epoch 2:  46%|████▌     | 1406/3066 [21:49<25:48,  1.07it/s, loss=0.088][A
Train step of epoch 2:  46%|████▌     | 1406/3066 [21:49<25:48,  1.07it/s, loss=0.521][A
Train step of epoch 2:  46%|████▌     | 1407/3066 [21:50<25:48,  1.07it/s, loss=0.521][A
Train step of epoch 2:  46%|████▌     | 1407/3066 [21:50<25:48,  1.07it/s, loss=0.351][A
Train step of epoch 2:  46%|████▌     | 1408/3066 [21:51<25:48,  1.07it/s, loss=0.351][A
Train step of epoch 2:  46%|████▌     | 1408/3066 [21:51<25:48,  1.07it/s, loss=0.255][A
Train step

Train step of epoch 2:  49%|████▊     | 1494/3066 [23:11<24:24,  1.07it/s, loss=0.733][A
Train step of epoch 2:  49%|████▊     | 1494/3066 [23:11<24:24,  1.07it/s, loss=0.152][A
Train step of epoch 2:  49%|████▉     | 1495/3066 [23:12<24:24,  1.07it/s, loss=0.152][A
Train step of epoch 2:  49%|████▉     | 1495/3066 [23:12<24:24,  1.07it/s, loss=0.0973][A
Train step of epoch 2:  49%|████▉     | 1496/3066 [23:13<24:22,  1.07it/s, loss=0.0973][A
Train step of epoch 2:  49%|████▉     | 1496/3066 [23:13<24:22,  1.07it/s, loss=0.337] [A
Train step of epoch 2:  49%|████▉     | 1497/3066 [23:14<24:22,  1.07it/s, loss=0.337][A
Train step of epoch 2:  49%|████▉     | 1497/3066 [23:14<24:22,  1.07it/s, loss=0.0374][A
Train step of epoch 2:  49%|████▉     | 1498/3066 [23:15<24:23,  1.07it/s, loss=0.0374][A
Train step of epoch 2:  49%|████▉     | 1498/3066 [23:15<24:23,  1.07it/s, loss=0.937] [A
Train step of epoch 2:  49%|████▉     | 1499/3066 [23:16<24:23,  1.07it/s, loss=0.937][A
Trai

Train step of epoch 2:  52%|█████▏    | 1584/3066 [24:35<23:02,  1.07it/s, loss=0.165][A
Train step of epoch 2:  52%|█████▏    | 1585/3066 [24:36<23:01,  1.07it/s, loss=0.165][A
Train step of epoch 2:  52%|█████▏    | 1585/3066 [24:36<23:01,  1.07it/s, loss=0.476][A
Train step of epoch 2:  52%|█████▏    | 1586/3066 [24:37<22:59,  1.07it/s, loss=0.476][A
Train step of epoch 2:  52%|█████▏    | 1586/3066 [24:37<22:59,  1.07it/s, loss=0.392][A
Train step of epoch 2:  52%|█████▏    | 1587/3066 [24:38<22:58,  1.07it/s, loss=0.392][A
Train step of epoch 2:  52%|█████▏    | 1587/3066 [24:38<22:58,  1.07it/s, loss=0.272][A
Train step of epoch 2:  52%|█████▏    | 1588/3066 [24:39<22:57,  1.07it/s, loss=0.272][A
Train step of epoch 2:  52%|█████▏    | 1588/3066 [24:39<22:57,  1.07it/s, loss=0.338][A
Train step of epoch 2:  52%|█████▏    | 1589/3066 [24:39<22:56,  1.07it/s, loss=0.338][A
Train step of epoch 2:  52%|█████▏    | 1589/3066 [24:39<22:56,  1.07it/s, loss=0.638][A
Train step

Train step of epoch 2:  55%|█████▍    | 1675/3066 [26:00<21:41,  1.07it/s, loss=0.22] [A
Train step of epoch 2:  55%|█████▍    | 1676/3066 [26:01<21:37,  1.07it/s, loss=0.22][A
Train step of epoch 2:  55%|█████▍    | 1676/3066 [26:01<21:37,  1.07it/s, loss=0.736][A
Train step of epoch 2:  55%|█████▍    | 1677/3066 [26:02<21:35,  1.07it/s, loss=0.736][A
Train step of epoch 2:  55%|█████▍    | 1677/3066 [26:02<21:35,  1.07it/s, loss=0.54] [A
Train step of epoch 2:  55%|█████▍    | 1678/3066 [26:02<21:34,  1.07it/s, loss=0.54][A
Train step of epoch 2:  55%|█████▍    | 1678/3066 [26:02<21:34,  1.07it/s, loss=0.21][A
Train step of epoch 2:  55%|█████▍    | 1679/3066 [26:03<21:34,  1.07it/s, loss=0.21][A
Train step of epoch 2:  55%|█████▍    | 1679/3066 [26:03<21:34,  1.07it/s, loss=0.321][A
Train step of epoch 2:  55%|█████▍    | 1680/3066 [26:04<21:32,  1.07it/s, loss=0.321][A
Train step of epoch 2:  55%|█████▍    | 1680/3066 [26:04<21:32,  1.07it/s, loss=0.146][A
Train step of 

Train step of epoch 2:  58%|█████▊    | 1766/3066 [27:24<20:12,  1.07it/s, loss=0.193] [A
Train step of epoch 2:  58%|█████▊    | 1767/3066 [27:25<20:09,  1.07it/s, loss=0.193][A
Train step of epoch 2:  58%|█████▊    | 1767/3066 [27:25<20:09,  1.07it/s, loss=0.448][A
Train step of epoch 2:  58%|█████▊    | 1768/3066 [27:26<20:08,  1.07it/s, loss=0.448][A
Train step of epoch 2:  58%|█████▊    | 1768/3066 [27:26<20:08,  1.07it/s, loss=0.37] [A
Train step of epoch 2:  58%|█████▊    | 1769/3066 [27:27<20:07,  1.07it/s, loss=0.37][A
Train step of epoch 2:  58%|█████▊    | 1769/3066 [27:27<20:07,  1.07it/s, loss=0.118][A
Train step of epoch 2:  58%|█████▊    | 1770/3066 [27:28<20:07,  1.07it/s, loss=0.118][A
Train step of epoch 2:  58%|█████▊    | 1770/3066 [27:28<20:07,  1.07it/s, loss=0.0841][A
Train step of epoch 2:  58%|█████▊    | 1771/3066 [27:29<20:09,  1.07it/s, loss=0.0841][A
Train step of epoch 2:  58%|█████▊    | 1771/3066 [27:29<20:09,  1.07it/s, loss=0.0979][A
Train s

Train step of epoch 2:  61%|██████    | 1857/3066 [28:49<18:45,  1.07it/s, loss=0.225][A
Train step of epoch 2:  61%|██████    | 1857/3066 [28:49<18:45,  1.07it/s, loss=0.164][A
Train step of epoch 2:  61%|██████    | 1858/3066 [28:50<18:47,  1.07it/s, loss=0.164][A
Train step of epoch 2:  61%|██████    | 1858/3066 [28:50<18:47,  1.07it/s, loss=0.582][A
Train step of epoch 2:  61%|██████    | 1859/3066 [28:51<18:45,  1.07it/s, loss=0.582][A
Train step of epoch 2:  61%|██████    | 1859/3066 [28:51<18:45,  1.07it/s, loss=0.211][A
Train step of epoch 2:  61%|██████    | 1860/3066 [28:52<18:43,  1.07it/s, loss=0.211][A
Train step of epoch 2:  61%|██████    | 1860/3066 [28:52<18:43,  1.07it/s, loss=0.0507][A
Train step of epoch 2:  61%|██████    | 1861/3066 [28:53<18:43,  1.07it/s, loss=0.0507][A
Train step of epoch 2:  61%|██████    | 1861/3066 [28:53<18:43,  1.07it/s, loss=0.179] [A
Train step of epoch 2:  61%|██████    | 1862/3066 [28:54<18:41,  1.07it/s, loss=0.179][A
Train s

Train step of epoch 2:  64%|██████▎   | 1947/3066 [30:13<17:21,  1.07it/s, loss=0.468][A
Train step of epoch 2:  64%|██████▎   | 1947/3066 [30:13<17:21,  1.07it/s, loss=0.404][A
Train step of epoch 2:  64%|██████▎   | 1948/3066 [30:14<17:19,  1.08it/s, loss=0.404][A
Train step of epoch 2:  64%|██████▎   | 1948/3066 [30:14<17:19,  1.08it/s, loss=0.366][A
Train step of epoch 2:  64%|██████▎   | 1949/3066 [30:15<17:18,  1.08it/s, loss=0.366][A
Train step of epoch 2:  64%|██████▎   | 1949/3066 [30:15<17:18,  1.08it/s, loss=0.4]  [A
Train step of epoch 2:  64%|██████▎   | 1950/3066 [30:16<17:17,  1.08it/s, loss=0.4][A
Train step of epoch 2:  64%|██████▎   | 1950/3066 [30:16<17:17,  1.08it/s, loss=0.456][A
Train step of epoch 2:  64%|██████▎   | 1951/3066 [30:17<17:16,  1.08it/s, loss=0.456][A
Train step of epoch 2:  64%|██████▎   | 1951/3066 [30:17<17:16,  1.08it/s, loss=0.247][A
Train step of epoch 2:  64%|██████▎   | 1952/3066 [30:17<17:15,  1.08it/s, loss=0.247][A
Train step o

Train step of epoch 2:  66%|██████▋   | 2038/3066 [31:37<15:53,  1.08it/s, loss=0.107][A
Train step of epoch 2:  66%|██████▋   | 2038/3066 [31:37<15:53,  1.08it/s, loss=0.21] [A
Train step of epoch 2:  67%|██████▋   | 2039/3066 [31:38<15:51,  1.08it/s, loss=0.21][A
Train step of epoch 2:  67%|██████▋   | 2039/3066 [31:38<15:51,  1.08it/s, loss=0.165][A
Train step of epoch 2:  67%|██████▋   | 2040/3066 [31:39<15:50,  1.08it/s, loss=0.165][A
Train step of epoch 2:  67%|██████▋   | 2040/3066 [31:39<15:50,  1.08it/s, loss=0.392][A
Train step of epoch 2:  67%|██████▋   | 2041/3066 [31:40<15:49,  1.08it/s, loss=0.392][A
Train step of epoch 2:  67%|██████▋   | 2041/3066 [31:40<15:49,  1.08it/s, loss=0.219][A
Train step of epoch 2:  67%|██████▋   | 2042/3066 [31:41<15:48,  1.08it/s, loss=0.219][A
Train step of epoch 2:  67%|██████▋   | 2042/3066 [31:41<15:48,  1.08it/s, loss=0.623][A
Train step of epoch 2:  67%|██████▋   | 2043/3066 [31:42<15:48,  1.08it/s, loss=0.623][A
Train step 

Train step of epoch 2:  69%|██████▉   | 2128/3066 [33:01<14:29,  1.08it/s, loss=0.815][A
Train step of epoch 2:  69%|██████▉   | 2129/3066 [33:02<14:29,  1.08it/s, loss=0.815][A
Train step of epoch 2:  69%|██████▉   | 2129/3066 [33:02<14:29,  1.08it/s, loss=0.539][A
Train step of epoch 2:  69%|██████▉   | 2130/3066 [33:03<14:27,  1.08it/s, loss=0.539][A
Train step of epoch 2:  69%|██████▉   | 2130/3066 [33:03<14:27,  1.08it/s, loss=0.0705][A
Train step of epoch 2:  70%|██████▉   | 2131/3066 [33:04<14:26,  1.08it/s, loss=0.0705][A
Train step of epoch 2:  70%|██████▉   | 2131/3066 [33:04<14:26,  1.08it/s, loss=0.456] [A
Train step of epoch 2:  70%|██████▉   | 2132/3066 [33:05<14:26,  1.08it/s, loss=0.456][A
Train step of epoch 2:  70%|██████▉   | 2132/3066 [33:05<14:26,  1.08it/s, loss=0.0857][A
Train step of epoch 2:  70%|██████▉   | 2133/3066 [33:06<14:25,  1.08it/s, loss=0.0857][A
Train step of epoch 2:  70%|██████▉   | 2133/3066 [33:06<14:25,  1.08it/s, loss=0.174] [A
Trai

Train step of epoch 2:  72%|███████▏  | 2219/3066 [34:25<13:09,  1.07it/s, loss=0.347][A
Train step of epoch 2:  72%|███████▏  | 2220/3066 [34:26<13:07,  1.07it/s, loss=0.347][A
Train step of epoch 2:  72%|███████▏  | 2220/3066 [34:26<13:07,  1.07it/s, loss=0.243][A
Train step of epoch 2:  72%|███████▏  | 2221/3066 [34:27<13:08,  1.07it/s, loss=0.243][A
Train step of epoch 2:  72%|███████▏  | 2221/3066 [34:27<13:08,  1.07it/s, loss=0.377][A
Train step of epoch 2:  72%|███████▏  | 2222/3066 [34:28<13:06,  1.07it/s, loss=0.377][A
Train step of epoch 2:  72%|███████▏  | 2222/3066 [34:28<13:06,  1.07it/s, loss=0.415][A
Train step of epoch 2:  73%|███████▎  | 2223/3066 [34:29<13:05,  1.07it/s, loss=0.415][A
Train step of epoch 2:  73%|███████▎  | 2223/3066 [34:29<13:05,  1.07it/s, loss=0.246][A
Train step of epoch 2:  73%|███████▎  | 2224/3066 [34:30<13:06,  1.07it/s, loss=0.246][A
Train step of epoch 2:  73%|███████▎  | 2224/3066 [34:30<13:06,  1.07it/s, loss=0.241][A
Train step

Train step of epoch 2:  75%|███████▌  | 2310/3066 [35:50<11:40,  1.08it/s, loss=0.497][A
Train step of epoch 2:  75%|███████▌  | 2311/3066 [35:51<11:38,  1.08it/s, loss=0.497][A
Train step of epoch 2:  75%|███████▌  | 2311/3066 [35:51<11:38,  1.08it/s, loss=0.394][A
Train step of epoch 2:  75%|███████▌  | 2312/3066 [35:52<11:37,  1.08it/s, loss=0.394][A
Train step of epoch 2:  75%|███████▌  | 2312/3066 [35:52<11:37,  1.08it/s, loss=0.645][A
Train step of epoch 2:  75%|███████▌  | 2313/3066 [35:53<11:38,  1.08it/s, loss=0.645][A
Train step of epoch 2:  75%|███████▌  | 2313/3066 [35:53<11:38,  1.08it/s, loss=0.586][A
Train step of epoch 2:  75%|███████▌  | 2314/3066 [35:54<11:37,  1.08it/s, loss=0.586][A
Train step of epoch 2:  75%|███████▌  | 2314/3066 [35:54<11:37,  1.08it/s, loss=0.788][A
Train step of epoch 2:  76%|███████▌  | 2315/3066 [35:55<11:36,  1.08it/s, loss=0.788][A
Train step of epoch 2:  76%|███████▌  | 2315/3066 [35:55<11:36,  1.08it/s, loss=0.463][A
Train step

Train step of epoch 2:  78%|███████▊  | 2401/3066 [37:15<10:19,  1.07it/s, loss=0.582][A
Train step of epoch 2:  78%|███████▊  | 2402/3066 [37:16<10:19,  1.07it/s, loss=0.582][A
Train step of epoch 2:  78%|███████▊  | 2402/3066 [37:16<10:19,  1.07it/s, loss=0.515][A
Train step of epoch 2:  78%|███████▊  | 2403/3066 [37:17<10:17,  1.07it/s, loss=0.515][A
Train step of epoch 2:  78%|███████▊  | 2403/3066 [37:17<10:17,  1.07it/s, loss=0.0838][A
Train step of epoch 2:  78%|███████▊  | 2404/3066 [37:18<10:16,  1.07it/s, loss=0.0838][A
Train step of epoch 2:  78%|███████▊  | 2404/3066 [37:18<10:16,  1.07it/s, loss=0.287] [A
Train step of epoch 2:  78%|███████▊  | 2405/3066 [37:19<10:15,  1.07it/s, loss=0.287][A
Train step of epoch 2:  78%|███████▊  | 2405/3066 [37:19<10:15,  1.07it/s, loss=0.325][A
Train step of epoch 2:  78%|███████▊  | 2406/3066 [37:19<10:14,  1.07it/s, loss=0.325][A
Train step of epoch 2:  78%|███████▊  | 2406/3066 [37:20<10:14,  1.07it/s, loss=0.619][A
Train s

Train step of epoch 2:  80%|███████▉  | 2447/3066 [37:57<09:33,  1.08it/s, loss=0.186][A
Train step of epoch 2:  80%|███████▉  | 2447/3066 [37:57<09:33,  1.08it/s, loss=0.243][A
Train step of epoch 2:  80%|███████▉  | 2448/3066 [37:58<09:32,  1.08it/s, loss=0.243][A
Train step of epoch 2:  80%|███████▉  | 2448/3066 [37:58<09:32,  1.08it/s, loss=0.159][A
Train step of epoch 2:  80%|███████▉  | 2449/3066 [37:59<09:32,  1.08it/s, loss=0.159][A
Train step of epoch 2:  80%|███████▉  | 2449/3066 [37:59<09:32,  1.08it/s, loss=0.157][A
Train step of epoch 2:  80%|███████▉  | 2450/3066 [38:00<09:31,  1.08it/s, loss=0.157][A
Train step of epoch 2:  80%|███████▉  | 2450/3066 [38:00<09:31,  1.08it/s, loss=0.93] [A
Train step of epoch 2:  80%|███████▉  | 2451/3066 [38:01<09:30,  1.08it/s, loss=0.93][A
Train step of epoch 2:  80%|███████▉  | 2451/3066 [38:01<09:30,  1.08it/s, loss=0.268][A
Train step of epoch 2:  80%|███████▉  | 2452/3066 [38:02<09:29,  1.08it/s, loss=0.268][A
Train step 

Train step of epoch 2:  83%|████████▎ | 2538/3066 [39:22<08:09,  1.08it/s, loss=0.383][A
Train step of epoch 2:  83%|████████▎ | 2538/3066 [39:22<08:09,  1.08it/s, loss=0.426][A
Train step of epoch 2:  83%|████████▎ | 2539/3066 [39:23<08:08,  1.08it/s, loss=0.426][A
Train step of epoch 2:  83%|████████▎ | 2539/3066 [39:23<08:08,  1.08it/s, loss=0.133][A
Train step of epoch 2:  83%|████████▎ | 2540/3066 [39:24<08:07,  1.08it/s, loss=0.133][A
Train step of epoch 2:  83%|████████▎ | 2540/3066 [39:24<08:07,  1.08it/s, loss=0.343][A
Train step of epoch 2:  83%|████████▎ | 2541/3066 [39:25<08:06,  1.08it/s, loss=0.343][A
Train step of epoch 2:  83%|████████▎ | 2541/3066 [39:25<08:06,  1.08it/s, loss=0.21] [A
Train step of epoch 2:  83%|████████▎ | 2542/3066 [39:26<08:05,  1.08it/s, loss=0.21][A
Train step of epoch 2:  83%|████████▎ | 2542/3066 [39:26<08:05,  1.08it/s, loss=0.36][A
Train step of epoch 2:  83%|████████▎ | 2543/3066 [39:26<08:04,  1.08it/s, loss=0.36][A
Train step of

Train step of epoch 2:  86%|████████▌ | 2628/3066 [40:45<06:47,  1.07it/s, loss=0.144][A
Train step of epoch 2:  86%|████████▌ | 2629/3066 [40:46<06:46,  1.07it/s, loss=0.144][A
Train step of epoch 2:  86%|████████▌ | 2629/3066 [40:46<06:46,  1.07it/s, loss=0.426][A
Train step of epoch 2:  86%|████████▌ | 2630/3066 [40:47<06:45,  1.08it/s, loss=0.426][A
Train step of epoch 2:  86%|████████▌ | 2630/3066 [40:47<06:45,  1.08it/s, loss=0.907][A
Train step of epoch 2:  86%|████████▌ | 2631/3066 [40:48<06:44,  1.08it/s, loss=0.907][A
Train step of epoch 2:  86%|████████▌ | 2631/3066 [40:48<06:44,  1.08it/s, loss=0.371][A
Train step of epoch 2:  86%|████████▌ | 2632/3066 [40:49<06:43,  1.08it/s, loss=0.371][A
Train step of epoch 2:  86%|████████▌ | 2632/3066 [40:49<06:43,  1.08it/s, loss=0.312][A
Train step of epoch 2:  86%|████████▌ | 2633/3066 [40:50<06:42,  1.08it/s, loss=0.312][A
Train step of epoch 2:  86%|████████▌ | 2633/3066 [40:50<06:42,  1.08it/s, loss=0.257][A
Train step

Train step of epoch 2:  89%|████████▊ | 2718/3066 [42:09<05:24,  1.07it/s, loss=0.726][A
Train step of epoch 2:  89%|████████▊ | 2719/3066 [42:10<05:24,  1.07it/s, loss=0.726][A
Train step of epoch 2:  89%|████████▊ | 2719/3066 [42:10<05:24,  1.07it/s, loss=0.637][A
Train step of epoch 2:  89%|████████▊ | 2720/3066 [42:11<05:23,  1.07it/s, loss=0.637][A
Train step of epoch 2:  89%|████████▊ | 2720/3066 [42:11<05:23,  1.07it/s, loss=0.615][A
Train step of epoch 2:  89%|████████▊ | 2721/3066 [42:12<05:22,  1.07it/s, loss=0.615][A
Train step of epoch 2:  89%|████████▊ | 2721/3066 [42:12<05:22,  1.07it/s, loss=0.436][A
Train step of epoch 2:  89%|████████▉ | 2722/3066 [42:13<05:21,  1.07it/s, loss=0.436][A
Train step of epoch 2:  89%|████████▉ | 2722/3066 [42:13<05:21,  1.07it/s, loss=0.486][A
Train step of epoch 2:  89%|████████▉ | 2723/3066 [42:14<05:19,  1.07it/s, loss=0.486][A
Train step of epoch 2:  89%|████████▉ | 2723/3066 [42:14<05:19,  1.07it/s, loss=0.121][A
Train step

Train step of epoch 2:  92%|█████████▏| 2809/3066 [43:34<04:00,  1.07it/s, loss=0.585][A
Train step of epoch 2:  92%|█████████▏| 2809/3066 [43:34<04:00,  1.07it/s, loss=0.0829][A
Train step of epoch 2:  92%|█████████▏| 2810/3066 [43:35<03:59,  1.07it/s, loss=0.0829][A
Train step of epoch 2:  92%|█████████▏| 2810/3066 [43:35<03:59,  1.07it/s, loss=0.956] [A
Train step of epoch 2:  92%|█████████▏| 2811/3066 [43:36<03:58,  1.07it/s, loss=0.956][A
Train step of epoch 2:  92%|█████████▏| 2811/3066 [43:36<03:58,  1.07it/s, loss=0.168][A
Train step of epoch 2:  92%|█████████▏| 2812/3066 [43:37<03:57,  1.07it/s, loss=0.168][A
Train step of epoch 2:  92%|█████████▏| 2812/3066 [43:37<03:57,  1.07it/s, loss=0.458][A
Train step of epoch 2:  92%|█████████▏| 2813/3066 [43:38<03:56,  1.07it/s, loss=0.458][A
Train step of epoch 2:  92%|█████████▏| 2813/3066 [43:38<03:56,  1.07it/s, loss=0.15] [A
Train step of epoch 2:  92%|█████████▏| 2814/3066 [43:39<03:55,  1.07it/s, loss=0.15][A
Train st

Train step of epoch 2:  95%|█████████▍| 2899/3066 [44:58<02:35,  1.08it/s, loss=0.115][A
Train step of epoch 2:  95%|█████████▍| 2900/3066 [44:59<02:34,  1.08it/s, loss=0.115][A
Train step of epoch 2:  95%|█████████▍| 2900/3066 [44:59<02:34,  1.08it/s, loss=0.295][A
Train step of epoch 2:  95%|█████████▍| 2901/3066 [45:00<02:33,  1.07it/s, loss=0.295][A
Train step of epoch 2:  95%|█████████▍| 2901/3066 [45:00<02:33,  1.07it/s, loss=0.344][A
Train step of epoch 2:  95%|█████████▍| 2902/3066 [45:01<02:32,  1.08it/s, loss=0.344][A
Train step of epoch 2:  95%|█████████▍| 2902/3066 [45:01<02:32,  1.08it/s, loss=0.166][A
Train step of epoch 2:  95%|█████████▍| 2903/3066 [45:02<02:31,  1.07it/s, loss=0.166][A
Train step of epoch 2:  95%|█████████▍| 2903/3066 [45:02<02:31,  1.07it/s, loss=0.569][A
Train step of epoch 2:  95%|█████████▍| 2904/3066 [45:03<02:30,  1.08it/s, loss=0.569][A
Train step of epoch 2:  95%|█████████▍| 2904/3066 [45:03<02:30,  1.08it/s, loss=0.241][A
Train step

Train step of epoch 2:  96%|█████████▌| 2945/3066 [45:41<01:52,  1.08it/s, loss=0.0839][A
Train step of epoch 2:  96%|█████████▌| 2945/3066 [45:41<01:52,  1.08it/s, loss=0.196] [A
Train step of epoch 2:  96%|█████████▌| 2946/3066 [45:42<01:51,  1.08it/s, loss=0.196][A
Train step of epoch 2:  96%|█████████▌| 2946/3066 [45:42<01:51,  1.08it/s, loss=0.198][A
Train step of epoch 2:  96%|█████████▌| 2947/3066 [45:43<01:50,  1.08it/s, loss=0.198][A
Train step of epoch 2:  96%|█████████▌| 2947/3066 [45:43<01:50,  1.08it/s, loss=0.618][A
Train step of epoch 2:  96%|█████████▌| 2948/3066 [45:44<01:49,  1.08it/s, loss=0.618][A
Train step of epoch 2:  96%|█████████▌| 2948/3066 [45:44<01:49,  1.08it/s, loss=0.406][A
Train step of epoch 2:  96%|█████████▌| 2949/3066 [45:45<01:48,  1.08it/s, loss=0.406][A
Train step of epoch 2:  96%|█████████▌| 2949/3066 [45:45<01:48,  1.08it/s, loss=0.227][A
Train step of epoch 2:  96%|█████████▌| 2950/3066 [45:45<01:47,  1.08it/s, loss=0.227][A
Train st

Train step of epoch 2:  99%|█████████▉| 3035/3066 [47:04<00:28,  1.08it/s, loss=0.127][A
Train step of epoch 2:  99%|█████████▉| 3036/3066 [47:05<00:27,  1.08it/s, loss=0.127][A
Train step of epoch 2:  99%|█████████▉| 3036/3066 [47:05<00:27,  1.08it/s, loss=0.414][A
Train step of epoch 2:  99%|█████████▉| 3037/3066 [47:06<00:26,  1.08it/s, loss=0.414][A
Train step of epoch 2:  99%|█████████▉| 3037/3066 [47:06<00:26,  1.08it/s, loss=0.165][A
Train step of epoch 2:  99%|█████████▉| 3038/3066 [47:07<00:25,  1.08it/s, loss=0.165][A
Train step of epoch 2:  99%|█████████▉| 3038/3066 [47:07<00:25,  1.08it/s, loss=0.314][A
Train step of epoch 2:  99%|█████████▉| 3039/3066 [47:08<00:25,  1.08it/s, loss=0.314][A
Train step of epoch 2:  99%|█████████▉| 3039/3066 [47:08<00:25,  1.08it/s, loss=0.186][A
Train step of epoch 2:  99%|█████████▉| 3040/3066 [47:09<00:24,  1.08it/s, loss=0.186][A
Train step of epoch 2:  99%|█████████▉| 3040/3066 [47:09<00:24,  1.08it/s, loss=0.536][A
Train step

In [71]:
#학습 확인
#임의의 문장을 입력 후 reward score 확인
def inference_RM(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(
        torch.cuda.current_device())
    output = model(input_ids)
    output_reward = output.cpu().detach().numpy()[0]

    print('input: %s\nreward score: %.1f'%(input_text, output_reward))

    return output_reward

input_text = '인공지능은 똥멍청이 입니다'
output_reward = inference_RM(input_text=input_text)

input: 인공지능은 똥멍청이 입니다
reward score: -3.5


In [72]:
input_text = '인공지능(AI)은 컴퓨터에서 음성 및 작성된 언어를 보고 이해하고 번역하고 데이터를 분석하고 추천하는 기능을 포함하여 다양한 고급 기능을 수행할 수 있는 일련의 기술입니다.'

output_reward = inference_RM(input_text=input_text)

input: 인공지능(AI)은 컴퓨터에서 음성 및 작성된 언어를 보고 이해하고 번역하고 데이터를 분석하고 추천하는 기능을 포함하여 다양한 고급 기능을 수행할 수 있는 일련의 기술입니다.
reward score: -1.7


reward score 해석 방식
일반적으로
- 높은 점수(1.0) :응답이 매우 유익, 적절
- 낮은 점수(-1.0):응답이 부적절하거나 무의미, 해로움
- 0근처 : 애매하거나 무난한 응답
- reward score는 상대적으로 해석



| 입력 문장                                  | Reward Score | 해석                                                 |
| -------------------------------------- | ------------ | -------------------------------------------------- |
| 인공지능은 똥멍청이 입니다                         | -3.5         | **비하, 무의미, 유해 발언**으로 인식되어 매우 낮은 점수를 받은 것.          |
| 인공지능(AI)은 … 고급 기능을 수행할 수 있는 일련의 기술입니다. | -1.7         | 내용은 기술적이고 유익하지만, 완전성 부족 또는 일부 문맥 누락으로 낮은 점수를 받은 것. |


- 유해한 발언에 낮은 점수(-3.5)를 주고, 정보성 응답에 상대적 덜 낮은 점수(-1.7)을 주었다. 
- 모델이 응답 간 품질을 구별할 수 있어 보임

In [80]:
prompts = [
    "안녕하세요, 무엇을 도와드릴까요?",
    "2 더하기 2는 4입니다.",
    "씨발 이런 멍청한 시스템",
    "고양이는 하루에 평균 15시간 정도 잡니다.",
    "너는 나보다 똑똑하지 않아",
    "인공지능은 인류에게 큰 위협이 될 것이다.",
    "인공지능은 생산성을 높이고 많은 산업에서 활용되고 있습니다."
]

for prompt in prompts:
    reward = inference_RM(input_text=prompt)
    print(reward)


input: 안녕하세요, 무엇을 도와드릴까요?
reward score: -0.5
-0.51303047
input: 2 더하기 2는 4입니다.
reward score: -1.3
-1.2761163
input: 씨발 이런 멍청한 시스템
reward score: 4.2
4.152214
input: 고양이는 하루에 평균 15시간 정도 잡니다.
reward score: -0.5
-0.49493584
input: 너는 나보다 똑똑하지 않아
reward score: 0.8
0.80171657
input: 인공지능은 인류에게 큰 위협이 될 것이다.
reward score: -3.4
-3.359308
input: 인공지능은 생산성을 높이고 많은 산업에서 활용되고 있습니다.
reward score: -3.0
-2.971998


- 욕설이 있는 부적절한 발언에 높은 점수가 주어짐
- reward model이 올바른 방향으로 학습되지 않았음
- 점수 분포가 전반적으로 음수에 머물러 있음 -> 그 출력값을 0~1 사이로 리스케일링 고려

In [81]:
#메모리 초기화
import torch, gc

def reset_cuda():
    gc.collect()
    torch.cuda.empty_cache()
    print("✅ GPU memory cleared")

reset_cuda()

Train epoch:   0%|          | 0/3 [2:49:45<?, ?it/s]
Train step of epoch 0:   0%|          | 0/3066 [2:49:45<?, ?it/s]

✅ GPU memory cleared





# PPO
- actor model :sft model -> 현재 policy, 문장 생성
- critic model :rm model -> 생성된 문장 평가 모델
- initial model :sft model을 freezing 하여 사용    -> PPO에서 상대 변화량 계산 기준
- tokenizer : kogpt-2의 tokenizer

In [82]:
#메모리 초기화
import torch, gc

def reset_cuda():
    gc.collect()
    torch.cuda.empty_cache()
    print("✅ GPU memory cleared")

reset_cuda()

✅ GPU memory cleared


In [39]:
from copy import deepcopy

import torch
from torch.optim import Adam
from chatgpt.models.base import RewardModel
from chatgpt.models.gpt import GPTActor, GPTCritic
from chatgpt.trainer import PPOTrainer
from chatgpt.trainer.strategies import NaiveStrategy
from transformers import AutoTokenizer

In [84]:
with NaiveStrategy().model_init_context():
    actor = GPTActor(pretrained='./results/output_1_SFT', lora_rank=0).to(torch.cuda.current_device())
    critic = GPTCritic(pretrained='./results/output_2_RM', lora_rank=0).to(torch.cuda.current_device())

    tokenizer = AutoTokenizer.from_pretrained(
        'skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', unk_token='</s>', pad_token='</s>',
        padding_side="right", 
        model_max_length=512
    )

    initial_model = deepcopy(actor)
    reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).to(torch.cuda.current_device())

In [85]:
#모델 학습에 사용할 옵티마이저와 모델 준비
actor_optim = Adam(actor.parameters(), lr=5e-6)
critic_optim = Adam(critic.parameters(), lr=5e-6)

(actor, actor_optim), (critic, critic_optim), reward_model, initial_model = NaiveStrategy().prepare(
    (actor, actor_optim), (critic, critic_optim), reward_model, initial_model)

In [86]:
# 데이터 불러오기, 토크나이징
with open('./KoChatGPT/data_kochatgpt/kochatgpt_3_PPO.jsonl', "r", encoding='utf-8-sig') as json_file:
    list_data_dict = json.load(json_file)
    list_prompt = [tmp['prompt'] for tmp in list_data_dict]

def tokenize_fn(texts):
    batch = tokenizer(texts, return_tensors='pt', max_length=96, padding=True, truncation=True)
    return {k: v.cuda() for k, v in batch.items()}

print(tokenize_fn('It takes something more than intelligence to act intelligently.'))


len(list_prompt)

{'input_ids': tensor([[47311, 10448, 19008,  9792, 11780, 11308, 30190, 10929, 11849, 21663,
         44389,  9574, 13799,   458, 14308, 12778, 22469, 20938, 44696,   458,
         13799,   458, 14308, 12778, 11756, 18944,   389]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]], device='cuda:0')}


12000

In [90]:
# ppo trainer 클래스 설계, 학습
trainer = PPOTrainer(NaiveStrategy(),
                     actor,
                     critic,
                     reward_model,
                     initial_model,
                     actor_optim,
                     critic_optim,
                     max_epochs=3,  
                     train_batch_size=8, 
                     tokenizer=tokenize_fn,
                     max_length=128,
                     do_sample=True,
                     temperature=1.0,
                     top_k=50,
                     pad_token_id=tokenizer.pad_token_id,
                     eos_token_id=tokenizer.eos_token_id)


trainer.fit(list_prompt, 
            num_episodes=10,  
            max_timesteps=3,
            update_timesteps=3)

model.save_pretrained('./results/output_3_PPO')

In [92]:
# rlhf 적용된 kogpt-2 생성능력 테스트
def generation(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(
        torch.cuda.current_device())
    outputs = actor.generate(input_ids,
                             max_length=64,
                             do_sample=True,
                             top_k=50,
                             top_p=0.95,
                             num_return_sequences=1)
    output = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)[0]
    print()
    print(output)
    return output

PROMPT_DICT = {
    "prompt_input": (
        "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
    )
}

list_prompt = [
    '불고기용 고기 한우에요?', 
    '리처드 닉슨이 43대 부통령직을 수행한 년도는?', 
    '시카고 오헤어 국제공항은 어디에 있어',
    '오늘 미세먼지 어때?']

list_prompt = [PROMPT_DICT['prompt_input'].format_map({'prompt': tmp}) for tmp in list_prompt]

for input_text in list_prompt:
    output = generation(input_text)


### Instruction(명령어):
불고기용 고기 한우에요?

### Response(응답):'불고기는 일반적으로 고기와 돼지고기로 많이 조리됩니다. 하지만 한우, 송아지를 비롯한 다양한 육류 요리도 존재합니다. 또한 소고기, 돼지고

### Instruction(명령어):
리처드 닉슨이 43대 부통령직을 수행한 년도는?

### Response(응답):'리처드 닉슨이 36대 부통령직을 수행한 년도는 1952년입니다.政官)이던 닉슨은 1947년 대선 공약 중 하나로

### Instruction(명령어):
시카고 오헤어 국제공항은 어디에 있어

### Response(응답):'시카고 오헤어 국제공항은 미국 앨라배니아주의 시카고 지역에 위치해 있습니다. 미국 일리노이 주에서 발생하였습니다.奈鄕恩

### Instruction(명령어):
오늘 미세먼지 어때?

### Response(응답):'미세먼지는 공기 중의 화학물질으로, 대기 중에 유해물질이 방출될 수 있습니다. 이는 대기 질 차이로 인해 발생하는 건강 문제에도 영향을 끼치므로, 실외


- 부분적 사실 오류: 리처드 닉슨은 37대 대통령
- 비문/중복 발생 : '시카고 오헤어 국제공항은 .... 발생했습니다', '奈鄕恩'
- 정확도, 표현력 미흡 :'미세먼지는 공기 중의 화학물질..', 맥락에 맞는 응답이 아님

In [93]:
from copy import deepcopy

import torch
from torch.optim import Adam
from chatgpt.models.base import RewardModel
from chatgpt.models.gpt import GPTActor, GPTCritic
from chatgpt.trainer import PPOTrainer
from chatgpt.trainer.strategies import NaiveStrategy
from transformers import AutoTokenizer

In [95]:
# 모델과 토크나이저 불러오기
model = AutoModelForCausalLM.from_pretrained('skt/kogpt2-base-v2')
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', unk_token='</s>', pad_token='</s>',
    padding_side="right",
    model_max_length=512,
)

# 1️⃣ 모델 경로 지정
kogpt2_ckpt = "skt/kogpt2-base-v2"  # or local dir
sft_ckpt = "./results/output_1_SFT"  # fine-tuned model dir
rm_ckpt = "./results/output_2_RM"    # reward model dir

# 2️⃣ 토크나이저 불러오기 (KoGPT2 기준)
tokenizer = AutoTokenizer.from_pretrained(kogpt2_ckpt)

# 3️⃣ 모델 불러오기
kogpt2_model = AutoModelForCausalLM.from_pretrained(kogpt2_ckpt).cuda().eval()
sft_model = AutoModelForCausalLM.from_pretrained(sft_ckpt).cuda().eval()
rm_model = AutoModelForCausalLM.from_pretrained(rm_ckpt).cuda().eval()



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# KoChatGPT 업그레이드해서 custom ChatGPT 개발
- 모델 개선 전략
- 데이터셋 추가 정제(beam search, top-k sampling 실험) 후 성능 비교
- 새로운 데이터 수집, 전처리, 성능비교
- 더 적절한 학습 전략(SFT, RM,PPO) 적용, initial model 변경 후 성능 비교

## 데이터셋 추가 정제
- 최소 길이 설정, 반복 비율 설정 등 정제
- beam search
- top-k sampling

In [4]:
#모델 로딩
sft_model = AutoModelForCausalLM.from_pretrained('./results/output_1_SFT').cuda().eval()
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', padding_side="right")
rm_model = AutoModelForCausalLM.from_pretrained('./results/output_2_RM').cuda().eval()


#metric
# BLEU, ROUGE Metric
from datasets import load_metric

bleu = load_metric("bleu")
rouge = load_metric("rouge")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  bleu = load_metric("bleu")


In [105]:
# !pip install rouge_score

- 안정적이고 논리적 응답을 위해 num_beams, temperature 조정
- do_sample=False, num_beams=5~10, temperature=1.0

In [5]:
generation_args = dict(   
    num_beams=6,                     # beam width
    do_sample=False,                # sampling 비활성화 → 확정적 결과
    temperature=1.0,                # 확률 분포에 영향 없음 (do_sample=False일 땐 무시됨)
    repetition_penalty=1.2,         # 중복 방지
    no_repeat_ngram_size=3,         # 반복 n-gram 방지
    eos_token_id=375,               # "\n" 토큰 기준 응답 종료
    max_new_tokens=64,
    early_stopping=True
)



In [8]:
generator = pipeline('text-generation', model='./results/output_1_SFT', tokenizer=tokenizer)

PROMPT_TEMPLATE = "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
list_prompt = [
    '불고기용 고기 한우에요?',
    '리처드 닉슨이 43대 부통령직을 수행한 년도는?',
    '시카고 오헤어 국제공항은 어디에 있어?',
    '오늘 미세먼지 어때?'
]
list_prompt = [PROMPT_TEMPLATE.format(prompt=p) for p in list_prompt]


In [9]:
list_outputs = generator(list_prompt, **generation_args)

for prompt, result in zip(list_prompt, list_outputs):
    output_text = result[0]['generated_text']
    response_only = output_text.split("### Response(응답):")[-1].strip()
    print(f"\n🟦 Prompt:\n{prompt}\n🟩 Response:\n{response_only}")





🟦 Prompt:
### Instruction(명령어):
불고기용 고기 한우에요?

### Response(응답):
🟩 Response:
'죄송합니다, 저는 인공지능 어시스턴트이기 때문에 실제로 고기를 판매하는 가게가 아닙니다. 하지만 일반적으로 불고기용 고기는 한우, 쇠고기, 돼지고기, 소고기, 돼지고기 등 다양한 종류가 판매됩니다. 따라서 해당 가게의 메뉴판을 확인해보시는 것이 좋을 것 같습니다.

🟦 Prompt:
### Instruction(명령어):
리처드 닉슨이 43대 부통령직을 수행한 년도는?

### Response(응답):
🟩 Response:
'이 질문에 대한 답은 제공되지 않았습니다. 닉슨은 1947년 대선에서 공화당 후보로 출마하였지만, 부통령직을 수행하지 않았습니다. 따라서 정확한 답변을 제공할 수 없습니다. J.K. 롤링 (John Johnson Rolling)은 1952년 대선에서 리처드 닉슨이 41대 부통령을 수행한 것으로 알려져 있습니다. J.

🟦 Prompt:
### Instruction(명령어):
시카고 오헤어 국제공항은 어디에 있어?

### Response(응답):
🟩 Response:
'저는 인공지능 어시스턴트이기 때문에 시카고에 대한 정보를 가지고 있지 않습니다. 하지만 일반적으로 시카고는 미국 중서부에 위치한 도시입니다. 따라서 시카고에 있는 국제공항은 없습니다.辰寶(辰寶)四郎(四郎)さん)에 따르면, 시카고는 미국의 중서부에 위치해 있습니다.

🟦 Prompt:
### Instruction(명령어):
오늘 미세먼지 어때?

### Response(응답):
🟩 Response:
'저는 인공지능 어시스턴트이기 때문에 미세먼지에 대한 정보를 알 수 없습니다. 하지만 미세먼지는 건강에 매우 중요한 영향을 미치며, 건강에 유해한 영향을 미치기도 합니다. 따라서 외출 전에는 마스크를 착용하고, 실외 활동을 자제하는 것이 좋습니다. 또한, 미세먼지를 줄이기 위한 노력을 기울이는 것이 중요합니다.


✅ 불고기 질문  

전: 삼계탕, 닭고기 등 엉뚱한 재료 언급  


후: "메뉴판 확인" 등 더 일반적인 조언, 그러나 "소고기, 돼지고기" 반복됨  


🔍 표현력은 좋아졌으나, token-level 반복 방지 효과는 한계 있음  




✅ 닉슨 부통령 질문  

전: 완전한 가짜 인물 등장, 논리 무너짐  


후: "J.K. 롤링 (John Johnson Rolling)" 같은 여전히 가짜 인물 존재  

"1947년 대선 출마"도 사실 아님  


🔍 Beam search는 문장 구조를 개선하지만 사실 정확도는 개선되지 않음 → RM(Reward Model) 필요  






✅ 시카고 공항  

전: “국제도시長官…” 이상한 텍스트 반복  


후: 이상한 일본어 섞임은 사라졌지만 “시카고에 국제공항 없음”이라는 사실 오류 발생  


🔍 중복 제거와 깔끔한 응답은 됐지만, 사실성은 여전히 문제  




✅ 미세먼지 질문  

전: "실내에서 많이 사용된다" 등 의미 불명확  


후: 전반적으로 구조가 개선되고 안전한 조언이 많아짐  


✅ 이 항목은 beam search + repetition_penalty 적용 후 가장 명확하게 개선된 사례  




## 새로운 데이터 수집
- 새 데이터 (AI hub :한국어 성능이 개선된 초거대 ai 언어모델 데이터)
- SFT 재학습
- RM 재학습

In [5]:
import gc
import torch
# 모델, 데이터 등을 명시적으로 삭제
# del sft_model
# del rm_model
# del ppo_model

# Python 가비지 컬렉터 실행
gc.collect()

# CUDA 메모리 해제
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("✅ GPU memory cleared")


✅ GPU memory cleared


In [7]:
# 이전 pretrained model 로드, 새 데이터 로드
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM

# =========================
# 1. 기존 모델 로딩
# =========================

# SFT 모델 (output_1_SFT)
sft_model = AutoModelForCausalLM.from_pretrained('./results/output_1_SFT')

# RM 모델 (output_2_RM)
rm_model = AutoModelForCausalLM.from_pretrained('./results/output_2_RM')

# PPO 모델 (output_3_PPO)
ppo_model = AutoModelForCausalLM.from_pretrained('./results/output_3_PPO')

# 토크나이저 로딩 (기반 모델: KoGPT2)
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', unk_token='</s>', pad_token='</s>',
    padding_side="right",
    model_max_length=512,
)



In [8]:
# =========================
# 2. 새 데이터 로딩
# =========================

#원본경로
input_path = './data/SFTlabel.json'
output_path = './data/SFTlabel.jsonl'
# 파일 로드
with open(input_path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# 변환 및 저장
with open(output_path, "w", encoding="utf-8") as f_out:
    for item in raw_data["data_info"]:
        prompt = item.get("question", "").strip()
        completion = item.get("answer", {}).get("contents", "").strip()
        if prompt and completion:
            f_out.write(json.dumps({"prompt": prompt, "completion": completion}, ensure_ascii=False) + "\n")

print(f"✅ 변환 완료: {output_path}")


def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line.strip()) for line in f]

new_data_path = './data/SFTlabel.jsonl'  # 새 데이터 경로
new_data = load_jsonl(new_data_path)

print(f"✅ 새 데이터 {len(new_data)}개 로드 완료")
print("예시:", new_data[0])

from torch.utils.data import Dataset

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class SFTDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.input_ids = []
        self.labels = []
        
        # 데이터에서 prompt와 completion을 처리하여 input_ids와 labels 생성
        for item in data:
            prompt = f"### Instruction(명령어):\n{item['prompt']}\n\n### Response(응답):"
            completion = item['completion'] + tokenizer.eos_token  # eos_token은 completion 끝에만 추가

            # 전체 텍스트 인코딩 (패딩 및 자르기)
            enc_full = tokenizer(
                prompt + completion,               # 프롬프트와 응답 결합
                truncation=True,                   # 자르기
                padding='max_length',              # max_length로 패딩
                max_length=max_length,            # 최대 길이 설정
                return_tensors="pt"                # 텐서 형식으로 반환
            )
            
            # 프롬프트 인코딩 (길이 계산을 위한)
            enc_prompt = tokenizer(
                prompt,                            # 프롬프트만 사용
                truncation=True,                   # 자르기
                padding='max_length',              # max_length로 패딩
                max_length=max_length,            # 최대 길이 설정
                return_tensors="pt"                # 텐서 형식으로 반환
            )
            
            # 프롬프트의 길이를 계산
            prompt_len = sum([1 for token_id in enc_prompt['input_ids'][0] if token_id != tokenizer.pad_token_id])

            # input_ids 생성 (전체 텍스트)
            input_ids = enc_full['input_ids'].squeeze(0)  # (batch_size, seq_len)에서 seq_len만 추출
            # labels 생성 (input_ids의 복사본)
            labels = input_ids.clone()
            
            # 프롬프트 부분은 -100으로 마스킹
            labels[:prompt_len] = -100

            # input_ids와 labels 리스트에 추가
            self.input_ids.append(input_ids)
            self.labels.append(labels)

    def __len__(self):
        return len(self.input_ids)  # 데이터셋의 크기 반환

    def __getitem__(self, idx):
        # idx에 해당하는 input_ids와 labels 반환
        return {
            "input_ids": self.input_ids[idx],
            "labels": self.labels[idx]
        }


# 데이터셋 생성
new_dataset = SFTDataset(new_data, tokenizer)


✅ 새 데이터 10580개 로드 완료
예시: {'prompt': '화학공학이 다른 공학 분야와 어떻게 다른가요?', 'completion': '화학공학은 화학과 공학의 합성으로 이루어진 학문 분야입니다. 다른 일반적인 공학 분야와 비교하여 화학공학은 화학적인 원리와 반응을 토대로 공학적인 설계, 개발, 생산, 운영 등을 수행합니다. 다른 공학 분야와의 차이점은 다음과 같습니다:\n\n1. 전공 과목: 화학공학은 화학, 열역학, 반응공학 등 화학적 원리와 공학적인 계산, 모델링에 대한 교육을 받습니다. 다른 공학 분야는 기계, 전기, 토목 등과 같은 분야별 전공 과목을 강조합니다.\n\n2. 연구 대상: 화학공학은 다양한 종류의 물질과 화학 반응, 원리에 대한 연구를 수행합니다. 다른 공학 분야는 일반적으로 물리적인 시스템, 기계, 구조 등에 관련된 연구를 수행합니다.\n\n3. 산업 분야: 화학공학은 화학 프로세스, 제약, 에너지, 환경, 식품 등 다양한 산업 분야에서 활동합니다. 다른 공학 분야는 해당 분야의 기술과 지식을 적용하여 자동차, 건축, 통신 등 특정한 분야에 전문화됩니다.\n\n4. 실험 및 연구 방법: 화학공학은 실험실에서 독립적이고 정량적인 연구를 수행하는 경우가 많습니다. 다른 공학 분야는 설계, 모델링, 시뮬레이션, 시스템 개발 및 구축과 같은 다양한 방법을 활용하여 연구를 수행합니다.\n\n5. 안전 및 환경 고려: 화학공학은 화학 물질의 특성과 안전에 대한 이해가 요구됩니다. 안전한 프로세스 설계와 환경 보호에 중점을 둡니다. 다른 공학 분야도 안전 및 환경 요구사항을 고려하지만, 화학물질의 다양성과 특성에 대한 이해는 필요하지 않을 수 있습니다.\n\n이러한 차이점들로 인해 화학공학은 다른 공학 분야와는 독자적인 측면이 있으며, 활용 분야, 산업, 연구 방법 등에서 그 차이를 나타냅니다.'}


In [9]:
# 데이터셋에서 첫 번째 샘플 확인
sample = new_dataset[0]
print(sample['input_ids'])
print(sample['labels'])

tensor([  739,   378,   378,   378, 14659, 13394, 37091, 10651,   383, 25841,
         8006, 14914,   375, 12805, 13605,  8146,  9306, 30634, 10274,  8066,
        11649,  9306,  6824, 13675,   375,   378,   378,   378, 41951,   454,
         9549, 20549,   383,  8142,  7192, 14914, 12805, 13605,  8135,  9192,
        10487,  9067, 10264,  9318, 12041, 10841, 12435, 10274, 21154,  9306,
        12718, 30634, 10274,  8066, 26503, 10953, 13605,  8135, 10953,  9090,
        25024, 15568, 15265, 33011, 11493,   387, 22830, 30432, 10001,  9276,
        10028, 37194,  9306, 30634, 10274, 11486, 15106, 13793, 10615,  9144,
        16913,  7182,   401,   375,   375, 25294, 14160, 37348,   401, 10953,
        13605,  8135, 10953,   387,  9356,  8031, 14785, 10367, 13605,  9030,
        16881, 25024, 33011,  9142, 11982, 14652,  7485,  8022,  9167, 12902,
         9165, 16691,  9306, 30634, 33610, 22478,  9034, 10163, 19278,  9997,
         9239, 10274,  7644, 14160, 50621, 10314, 11387,   375])

In [15]:
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

In [20]:
# SFT 재학습
# train_sft(
#     merged_data_path="./data/merged_sft.jsonl",
#     base_model_path="./results/output_1_SFT",       # 기존 SFT
#     save_path="./results/output_sft_updated"        # 재학습된 SFT 저장
# )

sft_model.gradient_checkpointing_enable()

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./KoChatGPT/test",
    overwrite_output_dir=True,
    num_train_epochs=1,  # 에폭 수를 줄여 빠르게
    per_device_train_batch_size=8,  # GPU 여유 있다면 batch size 증가
    gradient_accumulation_steps=3,   # 가능한 한 작게 설정
    learning_rate=5e-5,              # 빠른 수렴을 위해 약간 높게
    warmup_steps=0,                  # 전체가 작다면 warmup 생략
    logging_steps=10,                # 빠른 로깅
    save_steps=1000,                  # 저장 빈도 줄이기 (적절히 조절)
    evaluation_strategy="no",        # 평가 생략
    prediction_loss_only=True,
    fp16=True
)
trainer = Trainer(
    model=sft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=new_dataset
)

# 훈련 시작
trainer.train()

# 모델 저장
sft_model.save_pretrained('./results/output_1_SFT_newdata')


Step,Training Loss
10,2.0491
20,2.0532
30,2.1127
40,2.2371
50,2.2201
60,2.1505
70,2.4574
80,2.6358
90,2.889
100,2.7469


In [22]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM

# 1에폭 훈련 후 저장된 모델 로드
sft_model = AutoModelForCausalLM.from_pretrained('./results/output_1_SFT_newdata')

# 재학습 설정
training_args = TrainingArguments(
    output_dir="./KoChatGPT/test",
    overwrite_output_dir=True,
    num_train_epochs=2,  # 2에폭 훈련을 추가로 진행
    per_device_train_batch_size=8,
    gradient_accumulation_steps=3,  
    learning_rate=5e-5,
    warmup_steps=0,
    logging_steps=100,
    save_steps=1000,
    evaluation_strategy="no",
    prediction_loss_only=True,
    fp16=True
)

# Trainer 초기화
trainer = Trainer(
    model=sft_model,
    args=training_args,
    data_collator=data_collator,  # 필요시 데이터 로더 정의
    train_dataset=new_dataset     # 훈련 데이터셋
)

# 훈련 시작
trainer.train()

# 훈련 후 모델 저장
sft_model.save_pretrained('./results/output_2_SFT_newdata')


Step,Training Loss
100,1.7534
200,2.2532
300,2.3915
400,2.5182
500,2.292
600,2.1223
700,2.0977
800,2.0992


In [33]:
generator = pipeline('text-generation', model='./results/output_2_SFT_newdata/', tokenizer=tokenizer)

generation_args = dict(   
    num_beams=6,                     # beam width
    do_sample=False,                # sampling 비활성화 → 확정적 결과
    temperature=1.0,                # 확률 분포에 영향 없음 (do_sample=False일 땐 무시됨)
    repetition_penalty=1.2,         # 중복 방지
    no_repeat_ngram_size=3,         # 반복 n-gram 방지
    eos_token_id=375,               # "\n" 토큰 기준 응답 종료
    max_new_tokens=128,
    early_stopping=True
)


PROMPT_DICT = {
    "prompt_input": (
        "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
    )
}

list_prompt = ['불고기용 고기 한우에요?',
               '리처드 닉슨이 43대 부통령직을 수행한 년도는?',
               '시카고 오헤어 국제공항은 어디에 있어?',
               '오늘 미세먼지 어때?']

list_prompt = [PROMPT_DICT['prompt_input'].format_map({'prompt' : tmp}) for tmp in list_prompt]

list_result = generator(list_prompt, **generation_args)   
for prompt, result in zip(list_prompt, list_result):
    print()
    print((result[0]['generated_text']))


### Instruction(명령어):
불고기용 고기 한우에요?

### Response(응답):네, 불고기용 고기는 다양한 종류의 고기 요리에 사용됩니다. 일부 주요 고기 요리는 다음과 같습니다:


### Instruction(명령어):
리처드 닉슨이 43대 부통령직을 수행한 년도는?

### Response(응답):<NAME>의 41대 부통령직 수행 년도는 정확히 알려져 있지 않습니다. 그러나 그는 1950년대 후반부터 1960년대 초반까지 재임한 경력이 있습니다. 그의 재임 기간은 다음과 같습니다:


### Instruction(명령어):
시카고 오헤어 국제공항은 어디에 있어?

### Response(응답):시카고 시티는 미국 동부 지역에 위치한 도시로, 미국 동부 지역의 주요 도시 중 하나입니다. 시카고 시티는 세계에서 가장 큰 도시 중 하나로, 많은 관광객들이 방문하고 있습니다. 시카고 시티의 주요 관광지는 다음과 같습니다:


### Instruction(명령어):
오늘 미세먼지 어때?

### Response(응답):미세먼지 때문에 걱정이 많이 되네요. 미세먼지는 우리 건강에 매우 해로운 영향을 미칠 수 있기 때문에 조심해야 합니다. 




- 이전 sft_model 은 주어진 질문에 대해 관련없는 정보나 부정확한 사실을 포함하는 경우가 많았다. 
- 재학습된 모델은 답변이 간결하고 주제에 맞게 개선되었다. 
- 한자, 일본어 현상이 보이지 않음
- 하지만 여전히 정보 제공에 부족한 부분이 있다. 
    - 미세먼지에 대한 실제 데이터를 제공하지 않음
    - 시카고 오헤어 국제공항의 정보를 제공하는 것이 아닌 시카고 시티에 대한 응답을 함.

## RM 재학습

In [176]:
import json

import json

import json

# JSON 파일 읽기
with open('./data/RMlabel.json', "r", encoding='utf-8-sig') as json_file:
    list_data_dict = json.load(json_file)

# JSON을 jsonl 형식으로 변환하여 저장
with open("./data/RMlabel.jsonl", "w", encoding="utf-8") as f:
    # list_data_dict의 data_info 항목에 접근하여 각 항목을 jsonl 포맷으로 저장
    for entry in list_data_dict.get('data_info', []):
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print("✅ JSONL 파일로 변환 완료!")



    
    

✅ JSONL 파일로 변환 완료!


In [192]:
# JSONL 파일 읽기

# JSONL 파일 읽기
with open('./data/RMlabel.jsonl', "r", encoding='utf-8-sig') as json_file:
    list_data_dict = [json.loads(line) for line in json_file]

# 데이터 확인
print(f"데이터 개수: {len(list_data_dict)}")
print(f"예시 데이터: {list_data_dict[0]}")


데이터 개수: 26408
예시 데이터: {'data_id': '0827c2bf-592d-4048-8f95-3c5fe336c1d1', 'data_category': {'main': '문어체', 'middle': '자연'}, 'question_type': '생성', 'question_count': 8, 'question': '동물에 대한 인식을 높이고 교육하는 데 관심 있어?', 'answer01': {'answer_count': 42, 'ranking': 1, 'contents': '네, 저는 동물에 대한 인식을 높이고 교육하는 데 관심이 있습니다. 어떤 종류의 동물에 대해 알고 싶으신가요? 저는 동물의 특징, 서식지, 행동 등에 대한 정보를 제공할 수 있습니다. 또한 동물에 대한 적절한 보호 및 돌봄 방법에 대해서도 알려드릴 수 있습니다. 어떤 도움이 필요하신가요?'}, 'answer02': {'answer_count': 35, 'ranking': 2, 'contents': '네, 저는 동물에 대한 인식을 높이고 교육하는 데 관심이 있습니다. 어떤 도움이 필요하신가요? 동물의 행동, 생태, 보호 등에 대해 알고 싶으신가요? 아니면 동물에 대한 교육 프로그램을 개발하고 싶으신가요? 좀 더 구체적으로 어떤 도움을 원하시는지 말씀해 주세요.'}, 'answer03': {'answer_count': 27, 'ranking': 3, 'contents': '네, 저는 동물에 대한 인식과 교육을 증진하는 것에 관심이 있습니다. 동물에 대한 지식을 공유하고 동물의 복지와 보호에 대한 인식을 높이는 데 도움을 드리고 싶습니다. 어떤 도움이 필요하신가요?'}, 'answer04': {'answer_count': 13, 'ranking': 4, 'contents': '네, 저는 동물에 대한 인식을 높이고 교육하는 데 관심이 있습니다. 어떤 도움이 필요하신가요?'}, 'answer05': {'answer_count': 13, 'ranking': 5, 'contents': '네, 저

In [196]:
import json
from torch.utils.data import Dataset
import torch
from transformers import AutoTokenizer

# 데이터 변환
# 변환할 데이터 리스트
converted_data = []

# 각 데이터 항목을 변환
for item in list_data_dict:
    prompt = item['question']
    completions = [
        item['answer01']['contents'],
        item['answer02']['contents'],
        item['answer03']['contents'],
        item['answer04']['contents'],
        item['answer05']['contents']
    ]
    
    # ranking을 기반으로 sorting (순위 1, 2, 3을 0, 1, 2로 매핑)
    ranking = [item['answer01']['ranking'] - 1, item['answer02']['ranking'] - 1, item['answer03']['ranking'] - 1]
    
    # 데이터 포맷 맞추기
    data = {
        'prompt': prompt,
        'completion_0': completions[ranking.index(0)],  # ranking이 0인 completion을 completion_0으로
        'completion_1': completions[ranking.index(1)],  # ranking이 1인 completion을 completion_1으로
        'completion_2': completions[ranking.index(2)],  # ranking이 2인 completion을 completion_2으로
        'ranking': ranking
    }
    
    converted_data.append(data)







[{'prompt': '동물에 대한 인식을 높이고 교육하는 데 관심 있어?', 'completion_0': '네, 저는 동물에 대한 인식을 높이고 교육하는 데 관심이 있습니다. 어떤 종류의 동물에 대해 알고 싶으신가요? 저는 동물의 특징, 서식지, 행동 등에 대한 정보를 제공할 수 있습니다. 또한 동물에 대한 적절한 보호 및 돌봄 방법에 대해서도 알려드릴 수 있습니다. 어떤 도움이 필요하신가요?', 'completion_1': '네, 저는 동물에 대한 인식을 높이고 교육하는 데 관심이 있습니다. 어떤 도움이 필요하신가요? 동물의 행동, 생태, 보호 등에 대해 알고 싶으신가요? 아니면 동물에 대한 교육 프로그램을 개발하고 싶으신가요? 좀 더 구체적으로 어떤 도움을 원하시는지 말씀해 주세요.', 'completion_2': '네, 저는 동물에 대한 인식과 교육을 증진하는 것에 관심이 있습니다. 동물에 대한 지식을 공유하고 동물의 복지와 보호에 대한 인식을 높이는 데 도움을 드리고 싶습니다. 어떤 도움이 필요하신가요?', 'ranking': [0, 1, 2]}, {'prompt': '언론이 사회에 미치는 영향은 어때?', 'completion_0': '언론은 사회에 큰 영향을 미칩니다. 언론은 사회의 다양한 이슈를 보도하고 정보를 제공함으로써 사람들이 사회에 대해 더 잘 이해할 수 있도록 도와줍니다. 또한, 언론은 사회 문제를 논의하고 이에 대한 인식을 형성하는 데에도 영향을 미칩니다.\n\n언론은 사회의 문제점을 보도하고 이를 대중에게 알리는 역할을 수행합니다. 이를 통해 언론은 사회적 변화를 촉진하고 개선을 이끌어내는 역할을 합니다. 언론은 또한 정치적인 결정에도 영향을 미치는데, 언론의 보도는 정치적인 논의와 의사결정에 영향을 주고, 정치인들의 행동을 감시하고 평가하는 역할을 합니다.\n\n하지만, 언론의 영향력은 무조건적이지 않을 수 있습니다. 언론은 보도하는 주체의 이해관계나 편향성 등에 따라서 다양한 의견이 제시될 수 있습니다. 따라서, 언론의 다양성과 

In [198]:
# 변환된 데이터 리스트
converted_data_raw = converted_data
converted_data = []

# 각 데이터 항목을 변환
for item in converted_data_raw:  # converted_data_raw는 기존에 변환된 데이터 리스트를 의미
    prompt = item['prompt']
    
    # 변환된 데이터를 chosen과 rejected로 나누기
    chosen = item['completion_0']  # ranking이 0인 completion을 chosen으로
    rejected = item['completion_1']  # ranking이 1인 completion을 rejected로
    
    # 데이터 포맷 맞추기
    data = {
        'prompt': prompt,
        'chosen': chosen,
        'rejected': rejected
    }
    
    converted_data.append(data)

# 변환된 데이터 예시 출력
print(converted_data[:2])  # 데이터의 첫 두 항목 확인


[{'prompt': '동물에 대한 인식을 높이고 교육하는 데 관심 있어?', 'chosen': '네, 저는 동물에 대한 인식을 높이고 교육하는 데 관심이 있습니다. 어떤 종류의 동물에 대해 알고 싶으신가요? 저는 동물의 특징, 서식지, 행동 등에 대한 정보를 제공할 수 있습니다. 또한 동물에 대한 적절한 보호 및 돌봄 방법에 대해서도 알려드릴 수 있습니다. 어떤 도움이 필요하신가요?', 'rejected': '네, 저는 동물에 대한 인식을 높이고 교육하는 데 관심이 있습니다. 어떤 도움이 필요하신가요? 동물의 행동, 생태, 보호 등에 대해 알고 싶으신가요? 아니면 동물에 대한 교육 프로그램을 개발하고 싶으신가요? 좀 더 구체적으로 어떤 도움을 원하시는지 말씀해 주세요.'}, {'prompt': '언론이 사회에 미치는 영향은 어때?', 'chosen': '언론은 사회에 큰 영향을 미칩니다. 언론은 사회의 다양한 이슈를 보도하고 정보를 제공함으로써 사람들이 사회에 대해 더 잘 이해할 수 있도록 도와줍니다. 또한, 언론은 사회 문제를 논의하고 이에 대한 인식을 형성하는 데에도 영향을 미칩니다.\n\n언론은 사회의 문제점을 보도하고 이를 대중에게 알리는 역할을 수행합니다. 이를 통해 언론은 사회적 변화를 촉진하고 개선을 이끌어내는 역할을 합니다. 언론은 또한 정치적인 결정에도 영향을 미치는데, 언론의 보도는 정치적인 논의와 의사결정에 영향을 주고, 정치인들의 행동을 감시하고 평가하는 역할을 합니다.\n\n하지만, 언론의 영향력은 무조건적이지 않을 수 있습니다. 언론은 보도하는 주체의 이해관계나 편향성 등에 따라서 다양한 의견이 제시될 수 있습니다. 따라서, 언론의 다양성과 중립성은 매우 중요하며, 사회적 문제를 다각도로 바라보고 다양한 의견을 제공하는 것이 중요합니다.\n\n또한, 언론의 영향력은 개인의 미디어 소비 습관에 따라 다를 수 있습니다. 사람들은 언론의 정보를 어떻게 이용하고 해석하느냐에 따라서 다른 결과를 얻을 수 있습니다. 따라서, 사람들은 언론의 정

In [199]:
#split
split_ratio = 0.8
split_index = int(len(converted_data)* split_ratio)


train_data = converted_data[:split_index] 
eval_data = converted_data[split_index:]

print(f"Train size: {len(train_data)}, Eval size: {len(eval_data)}")


#dataset 생성
train_dataset = RewardDataset(train_data, tokenizer, 256)
eval_dataset = RewardDataset(eval_data, tokenizer, 256)


Train size: 21126, Eval size: 5282










  0%|          | 0/21126 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







  0%|          | 43/21126 [00:00<00:50, 415.92it/s][A[A[A[A[A[A[A[A







  0%|          | 86/21126 [00:00<00:49, 422.96it/s][A[A[A[A[A[A[A[A







  1%|          | 129/21126 [00:00<00:50, 414.28it/s][A[A[A[A[A[A[A[A







  1%|          | 174/21126 [00:00<00:49, 426.81it/s][A[A[A[A[A[A[A[A







  1%|          | 220/21126 [00:00<00:47, 435.78it/s][A[A[A[A[A[A[A[A







  1%|          | 264/21126 [00:00<00:48, 433.37it/s][A[A[A[A[A[A[A[A







  1%|▏         | 308/21126 [00:00<00:48, 431.46it/s][A[A[A[A[A[A[A[A







  2%|▏         | 354/21126 [00:00<00:47, 438.87it/s][A[A[A[A[A[A[A[A







  2%|▏         | 398/21126 [00:00<00:48, 425.76it/s][A[A[A[A[A[A[A[A







  2%|▏         | 442/21126 [00:01<00:48, 429.54it/s][A[A[A[A[A[A[A[A







  2%|▏         | 486/21126 [00:01<00:48, 429.87it/s][A[A[A[A[A[A[A[A






 21%|██        | 4421/21126 [00:09<00:37, 449.92it/s][A[A[A[A[A[A[A[A







 21%|██        | 4467/21126 [00:10<00:37, 440.34it/s][A[A[A[A[A[A[A[A







 21%|██▏       | 4516/21126 [00:10<00:36, 452.83it/s][A[A[A[A[A[A[A[A







 22%|██▏       | 4562/21126 [00:10<00:36, 450.76it/s][A[A[A[A[A[A[A[A







 22%|██▏       | 4608/21126 [00:10<00:36, 452.86it/s][A[A[A[A[A[A[A[A







 22%|██▏       | 4654/21126 [00:10<00:36, 448.10it/s][A[A[A[A[A[A[A[A







 22%|██▏       | 4702/21126 [00:10<00:35, 456.82it/s][A[A[A[A[A[A[A[A







 22%|██▏       | 4749/21126 [00:10<00:35, 457.00it/s][A[A[A[A[A[A[A[A







 23%|██▎       | 4800/21126 [00:10<00:34, 469.64it/s][A[A[A[A[A[A[A[A







 23%|██▎       | 4847/21126 [00:10<00:35, 459.73it/s][A[A[A[A[A[A[A[A







 23%|██▎       | 4895/21126 [00:10<00:34, 464.28it/s][A[A[A[A[A[A[A[A







 23%|██▎       | 4942/21126 [00:11<00:35, 453.84it/s][A[A[A[A

 43%|████▎     | 9088/21126 [00:19<00:24, 486.63it/s][A[A[A[A[A[A[A[A







 43%|████▎     | 9137/21126 [00:19<00:24, 481.32it/s][A[A[A[A[A[A[A[A







 43%|████▎     | 9186/21126 [00:20<00:24, 480.30it/s][A[A[A[A[A[A[A[A







 44%|████▎     | 9235/21126 [00:20<00:24, 477.70it/s][A[A[A[A[A[A[A[A







 44%|████▍     | 9286/21126 [00:20<00:24, 486.08it/s][A[A[A[A[A[A[A[A







 44%|████▍     | 9337/21126 [00:20<00:24, 490.71it/s][A[A[A[A[A[A[A[A







 44%|████▍     | 9387/21126 [00:20<00:23, 491.17it/s][A[A[A[A[A[A[A[A







 45%|████▍     | 9437/21126 [00:20<00:23, 489.25it/s][A[A[A[A[A[A[A[A







 45%|████▍     | 9486/21126 [00:20<00:24, 484.96it/s][A[A[A[A[A[A[A[A







 45%|████▌     | 9535/21126 [00:20<00:24, 482.07it/s][A[A[A[A[A[A[A[A







 45%|████▌     | 9584/21126 [00:20<00:23, 483.20it/s][A[A[A[A[A[A[A[A







 46%|████▌     | 9635/21126 [00:21<00:23, 490.41it/s][A[A[A[A

 66%|██████▋   | 14011/21126 [00:29<00:13, 543.51it/s][A[A[A[A[A[A[A[A







 67%|██████▋   | 14066/21126 [00:29<00:12, 543.29it/s][A[A[A[A[A[A[A[A







 67%|██████▋   | 14122/21126 [00:29<00:12, 547.92it/s][A[A[A[A[A[A[A[A







 67%|██████▋   | 14177/21126 [00:30<00:12, 544.37it/s][A[A[A[A[A[A[A[A







 67%|██████▋   | 14232/21126 [00:30<00:12, 543.99it/s][A[A[A[A[A[A[A[A







 68%|██████▊   | 14287/21126 [00:30<00:13, 524.30it/s][A[A[A[A[A[A[A[A







 68%|██████▊   | 14340/21126 [00:30<00:13, 520.61it/s][A[A[A[A[A[A[A[A







 68%|██████▊   | 14393/21126 [00:30<00:12, 519.78it/s][A[A[A[A[A[A[A[A







 68%|██████▊   | 14450/21126 [00:30<00:12, 531.39it/s][A[A[A[A[A[A[A[A







 69%|██████▊   | 14505/21126 [00:30<00:12, 534.24it/s][A[A[A[A[A[A[A[A







 69%|██████▉   | 14559/21126 [00:30<00:12, 523.49it/s][A[A[A[A[A[A[A[A







 69%|██████▉   | 14614/21126 [00:30<00:12, 528.96it/s]

 91%|█████████ | 19226/21126 [00:39<00:03, 547.17it/s][A[A[A[A[A[A[A[A







 91%|█████████▏| 19282/21126 [00:39<00:03, 548.82it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 19339/21126 [00:39<00:03, 552.81it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 19395/21126 [00:39<00:03, 543.53it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 19450/21126 [00:39<00:03, 523.19it/s][A[A[A[A[A[A[A[A







 92%|█████████▏| 19503/21126 [00:40<00:03, 508.70it/s][A[A[A[A[A[A[A[A







 93%|█████████▎| 19557/21126 [00:40<00:03, 515.88it/s][A[A[A[A[A[A[A[A







 93%|█████████▎| 19616/21126 [00:40<00:02, 535.72it/s][A[A[A[A[A[A[A[A







 93%|█████████▎| 19670/21126 [00:40<00:02, 533.15it/s][A[A[A[A[A[A[A[A







 93%|█████████▎| 19724/21126 [00:40<00:02, 532.58it/s][A[A[A[A[A[A[A[A







 94%|█████████▎| 19780/21126 [00:40<00:02, 539.19it/s][A[A[A[A[A[A[A[A







 94%|█████████▍| 19840/21126 [00:40<00:02, 554.11it/s]

 67%|██████▋   | 3538/5282 [00:06<00:03, 548.51it/s][A[A[A[A[A[A[A[A







 68%|██████▊   | 3593/5282 [00:06<00:03, 536.02it/s][A[A[A[A[A[A[A[A







 69%|██████▉   | 3647/5282 [00:06<00:03, 531.86it/s][A[A[A[A[A[A[A[A







 70%|███████   | 3704/5282 [00:06<00:02, 542.29it/s][A[A[A[A[A[A[A[A







 71%|███████   | 3759/5282 [00:06<00:02, 537.80it/s][A[A[A[A[A[A[A[A







 72%|███████▏  | 3813/5282 [00:07<00:02, 534.78it/s][A[A[A[A[A[A[A[A







 73%|███████▎  | 3868/5282 [00:07<00:02, 539.09it/s][A[A[A[A[A[A[A[A







 74%|███████▍  | 3924/5282 [00:07<00:02, 544.33it/s][A[A[A[A[A[A[A[A







 75%|███████▌  | 3979/5282 [00:07<00:02, 538.51it/s][A[A[A[A[A[A[A[A







 76%|███████▋  | 4033/5282 [00:07<00:02, 528.43it/s][A[A[A[A[A[A[A[A







 77%|███████▋  | 4086/5282 [00:07<00:02, 524.89it/s][A[A[A[A[A[A[A[A







 78%|███████▊  | 4141/5282 [00:07<00:02, 530.15it/s][A[A[A[A[A[A[A[A

In [200]:
#데이터셋 확인
idx = 0
print('#'*70)
print('## prompt ##')
print(train_data[idx]['prompt'])
print('#'*70)
print('## chosen ##')
print(train_data[idx]['chosen'])
print('#'*70)
print('## rejected ##')
print(train_data[idx]['rejected'])


######################################################################
## prompt ##
동물에 대한 인식을 높이고 교육하는 데 관심 있어?
######################################################################
## chosen ##
네, 저는 동물에 대한 인식을 높이고 교육하는 데 관심이 있습니다. 어떤 종류의 동물에 대해 알고 싶으신가요? 저는 동물의 특징, 서식지, 행동 등에 대한 정보를 제공할 수 있습니다. 또한 동물에 대한 적절한 보호 및 돌봄 방법에 대해서도 알려드릴 수 있습니다. 어떤 도움이 필요하신가요?
######################################################################
## rejected ##
네, 저는 동물에 대한 인식을 높이고 교육하는 데 관심이 있습니다. 어떤 도움이 필요하신가요? 동물의 행동, 생태, 보호 등에 대해 알고 싶으신가요? 아니면 동물에 대한 교육 프로그램을 개발하고 싶으신가요? 좀 더 구체적으로 어떤 도움을 원하시는지 말씀해 주세요.


In [210]:
#모델과 토크나이저 불러오기
model = AutoModelForCausalLM.from_pretrained('./results/output_2_RM')
tokenizer = AutoTokenizer.from_pretrained(
    'skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', unk_token='</s>', pad_token='</s>',
    padding_side="right",
    model_max_length=512,
)

with NaiveStrategy().model_init_context():
        model = GPTRM_custom(pretrained='skt/kogpt2-base-v2', lora_rank=0, tokenizer=tokenizer).cuda()



Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [220]:
#메모리 초기화
import torch, gc

def reset_cuda():
    gc.collect()
    torch.cuda.empty_cache()
    print("✅ GPU memory cleared")

reset_cuda()

Train epoch:   0%|          | 0/3 [06:45<?, ?it/s]
Train step of epoch 0:  12%|█▏        | 324/2641 [06:45<48:19,  1.25s/it, loss=0.677]
Train epoch:   0%|          | 0/3 [01:34<?, ?it/s]
Train step of epoch 0:   1%|▏         | 36/2641 [01:34<1:53:23,  2.61s/it, loss=0.625]
Train epoch:   0%|          | 0/3 [00:56<?, ?it/s]
Train step of epoch 0:   0%|          | 0/331 [00:56<?, ?it/s]
Train epoch:   0%|          | 0/3 [00:50<?, ?it/s]
Train step of epoch 0:   0%|          | 0/661 [00:50<?, ?it/s]
Train epoch:   0%|          | 0/3 [00:45<?, ?it/s]
Train step of epoch 0:   0%|          | 0/1321 [00:45<?, ?it/s]
Train epoch:   0%|          | 0/3 [00:11<?, ?it/s]
Train step of epoch 0:   0%|          | 0/2641 [00:11<?, ?it/s]


✅ GPU memory cleared


In [221]:
# Ensure all tensors are on the same device (either CPU or CUDA)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the selected device
model.to(device)

# RM 학습
# model.gradient_checkpointing_enable()
trainer = RewardModelTrainer(model=model,
                             strategy=NaiveStrategy(),
                             optim=Adam(model.parameters(), lr=5e-5),
                             train_dataset=train_dataset,
                             eval_dataset=eval_dataset,
                             batch_size=8,
                             max_epochs=3, 
                             )


trainer.fit(use_lora=0)  

model.save_pretrained('./results/output_2_RM_new_data')



Train epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Train step of epoch 0:   0%|          | 0/2641 [00:00<?, ?it/s][A
Train step of epoch 0:   0%|          | 1/2641 [00:00<37:36,  1.17it/s][A
Train step of epoch 0:   0%|          | 1/2641 [00:00<37:36,  1.17it/s, loss=0.628][A
Train step of epoch 0:   0%|          | 2/2641 [00:01<38:29,  1.14it/s, loss=0.628][A
Train step of epoch 0:   0%|          | 2/2641 [00:01<38:29,  1.14it/s, loss=0.55] [A
Train step of epoch 0:   0%|          | 3/2641 [00:02<38:51,  1.13it/s, loss=0.55][A
Train step of epoch 0:   0%|          | 3/2641 [00:02<38:51,  1.13it/s, loss=0.592][A
Train step of epoch 0:   0%|          | 4/2641 [00:03<39:09,  1.12it/s, loss=0.592][A
Train step of epoch 0:   0%|          | 4/2641 [00:03<39:09,  1.12it/s, loss=0.521][A
Train step of epoch 0:   0%|          | 5/2641 [00:04<39:20,  1.12it/s, loss=0.521][A
Train step of epoch 0:   0%|          | 5/2641 [00:04<39:20,  1.12it/s, loss=0.396][A
Train step of epoch 0:   

Train step of epoch 0:   2%|▏         | 47/2641 [00:44<41:55,  1.03it/s, loss=0.669][A
Train step of epoch 0:   2%|▏         | 47/2641 [00:44<41:55,  1.03it/s, loss=0.598][A
Train step of epoch 0:   2%|▏         | 48/2641 [00:45<41:42,  1.04it/s, loss=0.598][A
Train step of epoch 0:   2%|▏         | 48/2641 [00:45<41:42,  1.04it/s, loss=0.57] [A
Train step of epoch 0:   2%|▏         | 49/2641 [00:46<41:34,  1.04it/s, loss=0.57][A
Train step of epoch 0:   2%|▏         | 49/2641 [00:46<41:34,  1.04it/s, loss=0.669][A
Train step of epoch 0:   2%|▏         | 50/2641 [00:47<41:28,  1.04it/s, loss=0.669][A
Train step of epoch 0:   2%|▏         | 50/2641 [00:47<41:28,  1.04it/s, loss=0.469][A
Train step of epoch 0:   2%|▏         | 51/2641 [00:48<41:19,  1.04it/s, loss=0.469][A
Train step of epoch 0:   2%|▏         | 51/2641 [00:48<41:19,  1.04it/s, loss=0.601][A
Train step of epoch 0:   2%|▏         | 52/2641 [00:49<41:10,  1.05it/s, loss=0.601][A
Train step of epoch 0:   2%|▏    

Train step of epoch 0:   4%|▎         | 93/2641 [01:27<39:09,  1.08it/s, loss=0.749][A
Train step of epoch 0:   4%|▎         | 94/2641 [01:28<39:11,  1.08it/s, loss=0.749][A
Train step of epoch 0:   4%|▎         | 94/2641 [01:28<39:11,  1.08it/s, loss=0.708][A
Train step of epoch 0:   4%|▎         | 95/2641 [01:29<39:09,  1.08it/s, loss=0.708][A
Train step of epoch 0:   4%|▎         | 95/2641 [01:29<39:09,  1.08it/s, loss=0.598][A
Train step of epoch 0:   4%|▎         | 96/2641 [01:30<39:10,  1.08it/s, loss=0.598][A
Train step of epoch 0:   4%|▎         | 96/2641 [01:30<39:10,  1.08it/s, loss=0.647][A
Train step of epoch 0:   4%|▎         | 97/2641 [01:31<39:11,  1.08it/s, loss=0.647][A
Train step of epoch 0:   4%|▎         | 97/2641 [01:31<39:11,  1.08it/s, loss=0.719][A
Train step of epoch 0:   4%|▎         | 98/2641 [01:32<39:14,  1.08it/s, loss=0.719][A
Train step of epoch 0:   4%|▎         | 98/2641 [01:32<39:14,  1.08it/s, loss=0.656][A
Train step of epoch 0:   4%|▎   

Train step of epoch 0:   5%|▌         | 139/2641 [02:10<39:18,  1.06it/s, loss=0.703][A
Train step of epoch 0:   5%|▌         | 140/2641 [02:11<39:13,  1.06it/s, loss=0.703][A
Train step of epoch 0:   5%|▌         | 140/2641 [02:11<39:13,  1.06it/s, loss=0.723][A
Train step of epoch 0:   5%|▌         | 141/2641 [02:12<39:14,  1.06it/s, loss=0.723][A
Train step of epoch 0:   5%|▌         | 141/2641 [02:12<39:14,  1.06it/s, loss=0.558][A
Train step of epoch 0:   5%|▌         | 142/2641 [02:13<39:16,  1.06it/s, loss=0.558][A
Train step of epoch 0:   5%|▌         | 142/2641 [02:13<39:16,  1.06it/s, loss=0.68] [A
Train step of epoch 0:   5%|▌         | 143/2641 [02:14<39:10,  1.06it/s, loss=0.68][A
Train step of epoch 0:   5%|▌         | 143/2641 [02:14<39:10,  1.06it/s, loss=0.55][A
Train step of epoch 0:   5%|▌         | 144/2641 [02:15<39:07,  1.06it/s, loss=0.55][A
Train step of epoch 0:   5%|▌         | 144/2641 [02:15<39:07,  1.06it/s, loss=0.734][A
Train step of epoch 0:  

Train step of epoch 0:   7%|▋         | 185/2641 [02:53<38:14,  1.07it/s, loss=0.239][A
Train step of epoch 0:   7%|▋         | 186/2641 [02:54<38:13,  1.07it/s, loss=0.239][A
Train step of epoch 0:   7%|▋         | 186/2641 [02:54<38:13,  1.07it/s, loss=0.369][A
Train step of epoch 0:   7%|▋         | 187/2641 [02:55<38:15,  1.07it/s, loss=0.369][A
Train step of epoch 0:   7%|▋         | 187/2641 [02:55<38:15,  1.07it/s, loss=0.603][A
Train step of epoch 0:   7%|▋         | 188/2641 [02:56<38:15,  1.07it/s, loss=0.603][A
Train step of epoch 0:   7%|▋         | 188/2641 [02:56<38:15,  1.07it/s, loss=0.526][A
Train step of epoch 0:   7%|▋         | 189/2641 [02:57<38:11,  1.07it/s, loss=0.526][A
Train step of epoch 0:   7%|▋         | 189/2641 [02:57<38:11,  1.07it/s, loss=0.276][A
Train step of epoch 0:   7%|▋         | 190/2641 [02:58<38:11,  1.07it/s, loss=0.276][A
Train step of epoch 0:   7%|▋         | 190/2641 [02:58<38:11,  1.07it/s, loss=0.921][A
Train step of epoch 0

Train step of epoch 0:   9%|▊         | 231/2641 [03:36<37:21,  1.08it/s, loss=0.756][A
Train step of epoch 0:   9%|▉         | 232/2641 [03:37<37:22,  1.07it/s, loss=0.756][A
Train step of epoch 0:   9%|▉         | 232/2641 [03:37<37:22,  1.07it/s, loss=0.656][A
Train step of epoch 0:   9%|▉         | 233/2641 [03:38<37:23,  1.07it/s, loss=0.656][A
Train step of epoch 0:   9%|▉         | 233/2641 [03:38<37:23,  1.07it/s, loss=0.536][A
Train step of epoch 0:   9%|▉         | 234/2641 [03:39<37:22,  1.07it/s, loss=0.536][A
Train step of epoch 0:   9%|▉         | 234/2641 [03:39<37:22,  1.07it/s, loss=0.586][A
Train step of epoch 0:   9%|▉         | 235/2641 [03:40<37:26,  1.07it/s, loss=0.586][A
Train step of epoch 0:   9%|▉         | 235/2641 [03:40<37:26,  1.07it/s, loss=0.603][A
Train step of epoch 0:   9%|▉         | 236/2641 [03:41<37:24,  1.07it/s, loss=0.603][A
Train step of epoch 0:   9%|▉         | 236/2641 [03:41<37:24,  1.07it/s, loss=0.675][A
Train step of epoch 0

Train step of epoch 0:  10%|█         | 277/2641 [04:19<36:53,  1.07it/s, loss=0.527][A
Train step of epoch 0:  11%|█         | 278/2641 [04:20<36:50,  1.07it/s, loss=0.527][A
Train step of epoch 0:  11%|█         | 278/2641 [04:20<36:50,  1.07it/s, loss=0.61] [A
Train step of epoch 0:  11%|█         | 279/2641 [04:21<36:48,  1.07it/s, loss=0.61][A
Train step of epoch 0:  11%|█         | 279/2641 [04:21<36:48,  1.07it/s, loss=0.55][A
Train step of epoch 0:  11%|█         | 280/2641 [04:22<36:48,  1.07it/s, loss=0.55][A
Train step of epoch 0:  11%|█         | 280/2641 [04:22<36:48,  1.07it/s, loss=0.477][A
Train step of epoch 0:  11%|█         | 281/2641 [04:23<36:47,  1.07it/s, loss=0.477][A
Train step of epoch 0:  11%|█         | 281/2641 [04:23<36:47,  1.07it/s, loss=0.331][A
Train step of epoch 0:  11%|█         | 282/2641 [04:24<36:48,  1.07it/s, loss=0.331][A
Train step of epoch 0:  11%|█         | 282/2641 [04:24<36:48,  1.07it/s, loss=0.841][A
Train step of epoch 0:  

Train step of epoch 0:  12%|█▏        | 323/2641 [05:02<36:10,  1.07it/s, loss=0.537][A
Train step of epoch 0:  12%|█▏        | 324/2641 [05:03<36:06,  1.07it/s, loss=0.537][A
Train step of epoch 0:  12%|█▏        | 324/2641 [05:03<36:06,  1.07it/s, loss=0.688][A
Train step of epoch 0:  12%|█▏        | 325/2641 [05:04<36:04,  1.07it/s, loss=0.688][A
Train step of epoch 0:  12%|█▏        | 325/2641 [05:04<36:04,  1.07it/s, loss=0.629][A
Train step of epoch 0:  12%|█▏        | 326/2641 [05:05<36:04,  1.07it/s, loss=0.629][A
Train step of epoch 0:  12%|█▏        | 326/2641 [05:05<36:04,  1.07it/s, loss=0.583][A
Train step of epoch 0:  12%|█▏        | 327/2641 [05:06<36:04,  1.07it/s, loss=0.583][A
Train step of epoch 0:  12%|█▏        | 327/2641 [05:06<36:04,  1.07it/s, loss=0.757][A
Train step of epoch 0:  12%|█▏        | 328/2641 [05:07<36:02,  1.07it/s, loss=0.757][A
Train step of epoch 0:  12%|█▏        | 328/2641 [05:07<36:02,  1.07it/s, loss=0.63] [A
Train step of epoch 0

Train step of epoch 0:  14%|█▍        | 369/2641 [05:45<35:29,  1.07it/s, loss=0.693][A
Train step of epoch 0:  14%|█▍        | 370/2641 [05:46<35:28,  1.07it/s, loss=0.693][A
Train step of epoch 0:  14%|█▍        | 370/2641 [05:46<35:28,  1.07it/s, loss=0.592][A
Train step of epoch 0:  14%|█▍        | 371/2641 [05:47<35:27,  1.07it/s, loss=0.592][A
Train step of epoch 0:  14%|█▍        | 371/2641 [05:47<35:27,  1.07it/s, loss=0.49] [A
Train step of epoch 0:  14%|█▍        | 372/2641 [05:48<35:25,  1.07it/s, loss=0.49][A
Train step of epoch 0:  14%|█▍        | 372/2641 [05:48<35:25,  1.07it/s, loss=0.687][A
Train step of epoch 0:  14%|█▍        | 373/2641 [05:49<35:21,  1.07it/s, loss=0.687][A
Train step of epoch 0:  14%|█▍        | 373/2641 [05:49<35:21,  1.07it/s, loss=0.576][A
Train step of epoch 0:  14%|█▍        | 374/2641 [05:50<35:22,  1.07it/s, loss=0.576][A
Train step of epoch 0:  14%|█▍        | 374/2641 [05:50<35:22,  1.07it/s, loss=0.65] [A
Train step of epoch 0:

Train step of epoch 0:  16%|█▌        | 415/2641 [06:29<34:47,  1.07it/s, loss=0.709][A
Train step of epoch 0:  16%|█▌        | 416/2641 [06:29<34:47,  1.07it/s, loss=0.709][A
Train step of epoch 0:  16%|█▌        | 416/2641 [06:29<34:47,  1.07it/s, loss=0.593][A
Train step of epoch 0:  16%|█▌        | 417/2641 [06:30<34:47,  1.07it/s, loss=0.593][A
Train step of epoch 0:  16%|█▌        | 417/2641 [06:30<34:47,  1.07it/s, loss=0.682][A
Train step of epoch 0:  16%|█▌        | 418/2641 [06:31<34:46,  1.07it/s, loss=0.682][A
Train step of epoch 0:  16%|█▌        | 418/2641 [06:31<34:46,  1.07it/s, loss=0.612][A
Train step of epoch 0:  16%|█▌        | 419/2641 [06:32<34:42,  1.07it/s, loss=0.612][A
Train step of epoch 0:  16%|█▌        | 419/2641 [06:32<34:42,  1.07it/s, loss=0.63] [A
Train step of epoch 0:  16%|█▌        | 420/2641 [06:33<34:40,  1.07it/s, loss=0.63][A
Train step of epoch 0:  16%|█▌        | 420/2641 [06:33<34:40,  1.07it/s, loss=0.626][A
Train step of epoch 0:

Train step of epoch 0:  17%|█▋        | 461/2641 [07:12<34:03,  1.07it/s, loss=0.504][A
Train step of epoch 0:  17%|█▋        | 462/2641 [07:13<34:01,  1.07it/s, loss=0.504][A
Train step of epoch 0:  17%|█▋        | 462/2641 [07:13<34:01,  1.07it/s, loss=0.701][A
Train step of epoch 0:  18%|█▊        | 463/2641 [07:13<33:59,  1.07it/s, loss=0.701][A
Train step of epoch 0:  18%|█▊        | 463/2641 [07:13<33:59,  1.07it/s, loss=0.655][A
Train step of epoch 0:  18%|█▊        | 464/2641 [07:14<33:58,  1.07it/s, loss=0.655][A
Train step of epoch 0:  18%|█▊        | 464/2641 [07:14<33:58,  1.07it/s, loss=0.727][A
Train step of epoch 0:  18%|█▊        | 465/2641 [07:15<33:56,  1.07it/s, loss=0.727][A
Train step of epoch 0:  18%|█▊        | 465/2641 [07:15<33:56,  1.07it/s, loss=0.596][A
Train step of epoch 0:  18%|█▊        | 466/2641 [07:16<33:55,  1.07it/s, loss=0.596][A
Train step of epoch 0:  18%|█▊        | 466/2641 [07:16<33:55,  1.07it/s, loss=0.708][A
Train step of epoch 0

Train step of epoch 0:  19%|█▉        | 507/2641 [07:55<33:18,  1.07it/s, loss=0.733][A
Train step of epoch 0:  19%|█▉        | 508/2641 [07:56<33:16,  1.07it/s, loss=0.733][A
Train step of epoch 0:  19%|█▉        | 508/2641 [07:56<33:16,  1.07it/s, loss=0.583][A
Train step of epoch 0:  19%|█▉        | 509/2641 [07:57<33:12,  1.07it/s, loss=0.583][A
Train step of epoch 0:  19%|█▉        | 509/2641 [07:57<33:12,  1.07it/s, loss=0.603][A
Train step of epoch 0:  19%|█▉        | 510/2641 [07:57<33:11,  1.07it/s, loss=0.603][A
Train step of epoch 0:  19%|█▉        | 510/2641 [07:58<33:11,  1.07it/s, loss=0.614][A
Train step of epoch 0:  19%|█▉        | 511/2641 [07:58<33:10,  1.07it/s, loss=0.614][A
Train step of epoch 0:  19%|█▉        | 511/2641 [07:58<33:10,  1.07it/s, loss=0.721][A
Train step of epoch 0:  19%|█▉        | 512/2641 [07:59<33:14,  1.07it/s, loss=0.721][A
Train step of epoch 0:  19%|█▉        | 512/2641 [07:59<33:14,  1.07it/s, loss=0.687][A
Train step of epoch 0

Train step of epoch 0:  21%|██        | 553/2641 [08:38<32:36,  1.07it/s, loss=0.596][A
Train step of epoch 0:  21%|██        | 554/2641 [08:39<32:33,  1.07it/s, loss=0.596][A
Train step of epoch 0:  21%|██        | 554/2641 [08:39<32:33,  1.07it/s, loss=0.691][A
Train step of epoch 0:  21%|██        | 555/2641 [08:40<32:33,  1.07it/s, loss=0.691][A
Train step of epoch 0:  21%|██        | 555/2641 [08:40<32:33,  1.07it/s, loss=0.708][A
Train step of epoch 0:  21%|██        | 556/2641 [08:41<32:29,  1.07it/s, loss=0.708][A
Train step of epoch 0:  21%|██        | 556/2641 [08:41<32:29,  1.07it/s, loss=0.575][A
Train step of epoch 0:  21%|██        | 557/2641 [08:41<32:30,  1.07it/s, loss=0.575][A
Train step of epoch 0:  21%|██        | 557/2641 [08:42<32:30,  1.07it/s, loss=0.509][A
Train step of epoch 0:  21%|██        | 558/2641 [08:42<32:31,  1.07it/s, loss=0.509][A
Train step of epoch 0:  21%|██        | 558/2641 [08:42<32:31,  1.07it/s, loss=0.719][A
Train step of epoch 0

Train step of epoch 0:  23%|██▎       | 599/2641 [09:21<31:51,  1.07it/s, loss=0.664][A
Train step of epoch 0:  23%|██▎       | 600/2641 [09:22<31:52,  1.07it/s, loss=0.664][A
Train step of epoch 0:  23%|██▎       | 600/2641 [09:22<31:52,  1.07it/s, loss=0.664][A
Train step of epoch 0:  23%|██▎       | 601/2641 [09:23<31:52,  1.07it/s, loss=0.664][A
Train step of epoch 0:  23%|██▎       | 601/2641 [09:23<31:52,  1.07it/s, loss=0.656][A
Train step of epoch 0:  23%|██▎       | 602/2641 [09:24<31:50,  1.07it/s, loss=0.656][A
Train step of epoch 0:  23%|██▎       | 602/2641 [09:24<31:50,  1.07it/s, loss=0.483][A
Train step of epoch 0:  23%|██▎       | 603/2641 [09:25<31:51,  1.07it/s, loss=0.483][A
Train step of epoch 0:  23%|██▎       | 603/2641 [09:25<31:51,  1.07it/s, loss=0.747][A
Train step of epoch 0:  23%|██▎       | 604/2641 [09:26<31:48,  1.07it/s, loss=0.747][A
Train step of epoch 0:  23%|██▎       | 604/2641 [09:26<31:48,  1.07it/s, loss=0.478][A
Train step of epoch 0

Train step of epoch 0:  24%|██▍       | 645/2641 [10:04<31:09,  1.07it/s, loss=0.667][A
Train step of epoch 0:  24%|██▍       | 646/2641 [10:05<31:10,  1.07it/s, loss=0.667][A
Train step of epoch 0:  24%|██▍       | 646/2641 [10:05<31:10,  1.07it/s, loss=0.696][A
Train step of epoch 0:  24%|██▍       | 647/2641 [10:06<31:08,  1.07it/s, loss=0.696][A
Train step of epoch 0:  24%|██▍       | 647/2641 [10:06<31:08,  1.07it/s, loss=0.677][A
Train step of epoch 0:  25%|██▍       | 648/2641 [10:07<31:08,  1.07it/s, loss=0.677][A
Train step of epoch 0:  25%|██▍       | 648/2641 [10:07<31:08,  1.07it/s, loss=0.598][A
Train step of epoch 0:  25%|██▍       | 649/2641 [10:08<31:05,  1.07it/s, loss=0.598][A
Train step of epoch 0:  25%|██▍       | 649/2641 [10:08<31:05,  1.07it/s, loss=0.753][A
Train step of epoch 0:  25%|██▍       | 650/2641 [10:09<31:03,  1.07it/s, loss=0.753][A
Train step of epoch 0:  25%|██▍       | 650/2641 [10:09<31:03,  1.07it/s, loss=0.427][A
Train step of epoch 0

Train step of epoch 0:  26%|██▌       | 691/2641 [10:47<30:23,  1.07it/s, loss=0.701][A
Train step of epoch 0:  26%|██▌       | 692/2641 [10:48<30:26,  1.07it/s, loss=0.701][A
Train step of epoch 0:  26%|██▌       | 692/2641 [10:48<30:26,  1.07it/s, loss=0.722][A
Train step of epoch 0:  26%|██▌       | 693/2641 [10:49<30:22,  1.07it/s, loss=0.722][A
Train step of epoch 0:  26%|██▌       | 693/2641 [10:49<30:22,  1.07it/s, loss=0.684][A
Train step of epoch 0:  26%|██▋       | 694/2641 [10:50<30:20,  1.07it/s, loss=0.684][A
Train step of epoch 0:  26%|██▋       | 694/2641 [10:50<30:20,  1.07it/s, loss=0.7]  [A
Train step of epoch 0:  26%|██▋       | 695/2641 [10:51<30:20,  1.07it/s, loss=0.7][A
Train step of epoch 0:  26%|██▋       | 695/2641 [10:51<30:20,  1.07it/s, loss=0.529][A
Train step of epoch 0:  26%|██▋       | 696/2641 [10:52<30:20,  1.07it/s, loss=0.529][A
Train step of epoch 0:  26%|██▋       | 696/2641 [10:52<30:20,  1.07it/s, loss=0.696][A
Train step of epoch 0: 

Train step of epoch 0:  28%|██▊       | 737/2641 [11:30<29:47,  1.07it/s, loss=0.698][A
Train step of epoch 0:  28%|██▊       | 738/2641 [11:31<29:45,  1.07it/s, loss=0.698][A
Train step of epoch 0:  28%|██▊       | 738/2641 [11:31<29:45,  1.07it/s, loss=0.595][A
Train step of epoch 0:  28%|██▊       | 739/2641 [11:32<29:45,  1.07it/s, loss=0.595][A
Train step of epoch 0:  28%|██▊       | 739/2641 [11:32<29:45,  1.07it/s, loss=0.599][A
Train step of epoch 0:  28%|██▊       | 740/2641 [11:33<29:42,  1.07it/s, loss=0.599][A
Train step of epoch 0:  28%|██▊       | 740/2641 [11:33<29:42,  1.07it/s, loss=0.607][A
Train step of epoch 0:  28%|██▊       | 741/2641 [11:34<29:40,  1.07it/s, loss=0.607][A
Train step of epoch 0:  28%|██▊       | 741/2641 [11:34<29:40,  1.07it/s, loss=0.705][A
Train step of epoch 0:  28%|██▊       | 742/2641 [11:35<29:38,  1.07it/s, loss=0.705][A
Train step of epoch 0:  28%|██▊       | 742/2641 [11:35<29:38,  1.07it/s, loss=0.628][A
Train step of epoch 0

Train step of epoch 0:  30%|██▉       | 783/2641 [12:13<28:56,  1.07it/s, loss=0.612][A
Train step of epoch 0:  30%|██▉       | 784/2641 [12:14<28:58,  1.07it/s, loss=0.612][A
Train step of epoch 0:  30%|██▉       | 784/2641 [12:14<28:58,  1.07it/s, loss=0.603][A
Train step of epoch 0:  30%|██▉       | 785/2641 [12:15<28:57,  1.07it/s, loss=0.603][A
Train step of epoch 0:  30%|██▉       | 785/2641 [12:15<28:57,  1.07it/s, loss=0.562][A
Train step of epoch 0:  30%|██▉       | 786/2641 [12:16<28:57,  1.07it/s, loss=0.562][A
Train step of epoch 0:  30%|██▉       | 786/2641 [12:16<28:57,  1.07it/s, loss=0.729][A
Train step of epoch 0:  30%|██▉       | 787/2641 [12:17<28:57,  1.07it/s, loss=0.729][A
Train step of epoch 0:  30%|██▉       | 787/2641 [12:17<28:57,  1.07it/s, loss=0.595][A
Train step of epoch 0:  30%|██▉       | 788/2641 [12:18<28:56,  1.07it/s, loss=0.595][A
Train step of epoch 0:  30%|██▉       | 788/2641 [12:18<28:56,  1.07it/s, loss=0.697][A
Train step of epoch 0

Train step of epoch 0:  31%|███▏      | 829/2641 [12:56<28:20,  1.07it/s, loss=0.711][A
Train step of epoch 0:  31%|███▏      | 830/2641 [12:57<28:19,  1.07it/s, loss=0.711][A
Train step of epoch 0:  31%|███▏      | 830/2641 [12:57<28:19,  1.07it/s, loss=0.584][A
Train step of epoch 0:  31%|███▏      | 831/2641 [12:58<28:18,  1.07it/s, loss=0.584][A
Train step of epoch 0:  31%|███▏      | 831/2641 [12:58<28:18,  1.07it/s, loss=0.702][A
Train step of epoch 0:  32%|███▏      | 832/2641 [12:59<28:15,  1.07it/s, loss=0.702][A
Train step of epoch 0:  32%|███▏      | 832/2641 [12:59<28:15,  1.07it/s, loss=0.602][A
Train step of epoch 0:  32%|███▏      | 833/2641 [13:00<28:14,  1.07it/s, loss=0.602][A
Train step of epoch 0:  32%|███▏      | 833/2641 [13:00<28:14,  1.07it/s, loss=0.684][A
Train step of epoch 0:  32%|███▏      | 834/2641 [13:01<28:13,  1.07it/s, loss=0.684][A
Train step of epoch 0:  32%|███▏      | 834/2641 [13:01<28:13,  1.07it/s, loss=0.706][A
Train step of epoch 0

Train step of epoch 0:  33%|███▎      | 875/2641 [13:39<27:31,  1.07it/s, loss=0.724][A
Train step of epoch 0:  33%|███▎      | 876/2641 [13:40<27:31,  1.07it/s, loss=0.724][A
Train step of epoch 0:  33%|███▎      | 876/2641 [13:40<27:31,  1.07it/s, loss=0.706][A
Train step of epoch 0:  33%|███▎      | 877/2641 [13:41<27:28,  1.07it/s, loss=0.706][A
Train step of epoch 0:  33%|███▎      | 877/2641 [13:41<27:28,  1.07it/s, loss=0.603][A
Train step of epoch 0:  33%|███▎      | 878/2641 [13:42<27:26,  1.07it/s, loss=0.603][A
Train step of epoch 0:  33%|███▎      | 878/2641 [13:42<27:26,  1.07it/s, loss=0.662][A
Train step of epoch 0:  33%|███▎      | 879/2641 [13:43<27:25,  1.07it/s, loss=0.662][A
Train step of epoch 0:  33%|███▎      | 879/2641 [13:43<27:25,  1.07it/s, loss=0.684][A
Train step of epoch 0:  33%|███▎      | 880/2641 [13:44<27:23,  1.07it/s, loss=0.684][A
Train step of epoch 0:  33%|███▎      | 880/2641 [13:44<27:23,  1.07it/s, loss=0.593][A
Train step of epoch 0

Train step of epoch 0:  35%|███▍      | 921/2641 [14:22<26:49,  1.07it/s, loss=0.667][A
Train step of epoch 0:  35%|███▍      | 922/2641 [14:23<26:53,  1.07it/s, loss=0.667][A
Train step of epoch 0:  35%|███▍      | 922/2641 [14:23<26:53,  1.07it/s, loss=0.745][A
Train step of epoch 0:  35%|███▍      | 923/2641 [14:24<26:50,  1.07it/s, loss=0.745][A
Train step of epoch 0:  35%|███▍      | 923/2641 [14:24<26:50,  1.07it/s, loss=0.941][A
Train step of epoch 0:  35%|███▍      | 924/2641 [14:25<26:49,  1.07it/s, loss=0.941][A
Train step of epoch 0:  35%|███▍      | 924/2641 [14:25<26:49,  1.07it/s, loss=0.682][A
Train step of epoch 0:  35%|███▌      | 925/2641 [14:26<26:45,  1.07it/s, loss=0.682][A
Train step of epoch 0:  35%|███▌      | 925/2641 [14:26<26:45,  1.07it/s, loss=0.507][A
Train step of epoch 0:  35%|███▌      | 926/2641 [14:27<26:43,  1.07it/s, loss=0.507][A
Train step of epoch 0:  35%|███▌      | 926/2641 [14:27<26:43,  1.07it/s, loss=0.847][A
Train step of epoch 0

Train step of epoch 0:  37%|███▋      | 967/2641 [15:05<26:05,  1.07it/s, loss=0.59] [A
Train step of epoch 0:  37%|███▋      | 968/2641 [15:06<26:05,  1.07it/s, loss=0.59][A
Train step of epoch 0:  37%|███▋      | 968/2641 [15:06<26:05,  1.07it/s, loss=0.654][A
Train step of epoch 0:  37%|███▋      | 969/2641 [15:07<26:02,  1.07it/s, loss=0.654][A
Train step of epoch 0:  37%|███▋      | 969/2641 [15:07<26:02,  1.07it/s, loss=0.712][A
Train step of epoch 0:  37%|███▋      | 970/2641 [15:08<25:59,  1.07it/s, loss=0.712][A
Train step of epoch 0:  37%|███▋      | 970/2641 [15:08<25:59,  1.07it/s, loss=0.694][A
Train step of epoch 0:  37%|███▋      | 971/2641 [15:09<26:01,  1.07it/s, loss=0.694][A
Train step of epoch 0:  37%|███▋      | 971/2641 [15:09<26:01,  1.07it/s, loss=0.612][A
Train step of epoch 0:  37%|███▋      | 972/2641 [15:10<25:58,  1.07it/s, loss=0.612][A
Train step of epoch 0:  37%|███▋      | 972/2641 [15:10<25:58,  1.07it/s, loss=0.598][A
Train step of epoch 0:

Train step of epoch 0:  38%|███▊      | 1013/2641 [15:48<25:23,  1.07it/s, loss=0.707][A
Train step of epoch 0:  38%|███▊      | 1013/2641 [15:48<25:23,  1.07it/s, loss=0.679][A
Train step of epoch 0:  38%|███▊      | 1014/2641 [15:49<25:24,  1.07it/s, loss=0.679][A
Train step of epoch 0:  38%|███▊      | 1014/2641 [15:49<25:24,  1.07it/s, loss=0.621][A
Train step of epoch 0:  38%|███▊      | 1015/2641 [15:50<25:22,  1.07it/s, loss=0.621][A
Train step of epoch 0:  38%|███▊      | 1015/2641 [15:50<25:22,  1.07it/s, loss=0.419][A
Train step of epoch 0:  38%|███▊      | 1016/2641 [15:51<25:19,  1.07it/s, loss=0.419][A
Train step of epoch 0:  38%|███▊      | 1016/2641 [15:51<25:19,  1.07it/s, loss=0.726][A
Train step of epoch 0:  39%|███▊      | 1017/2641 [15:52<25:18,  1.07it/s, loss=0.726][A
Train step of epoch 0:  39%|███▊      | 1017/2641 [15:52<25:18,  1.07it/s, loss=0.695][A
Train step of epoch 0:  39%|███▊      | 1018/2641 [15:53<25:18,  1.07it/s, loss=0.695][A
Train step

Train step of epoch 0:  40%|████      | 1058/2641 [16:31<24:41,  1.07it/s, loss=0.513][A
Train step of epoch 0:  40%|████      | 1059/2641 [16:31<24:37,  1.07it/s, loss=0.513][A
Train step of epoch 0:  40%|████      | 1059/2641 [16:31<24:37,  1.07it/s, loss=0.724][A
Train step of epoch 0:  40%|████      | 1060/2641 [16:32<24:37,  1.07it/s, loss=0.724][A
Train step of epoch 0:  40%|████      | 1060/2641 [16:32<24:37,  1.07it/s, loss=0.573][A
Train step of epoch 0:  40%|████      | 1061/2641 [16:33<24:39,  1.07it/s, loss=0.573][A
Train step of epoch 0:  40%|████      | 1061/2641 [16:33<24:39,  1.07it/s, loss=0.509][A
Train step of epoch 0:  40%|████      | 1062/2641 [16:34<24:40,  1.07it/s, loss=0.509][A
Train step of epoch 0:  40%|████      | 1062/2641 [16:34<24:40,  1.07it/s, loss=0.572][A
Train step of epoch 0:  40%|████      | 1063/2641 [16:35<24:40,  1.07it/s, loss=0.572][A
Train step of epoch 0:  40%|████      | 1063/2641 [16:35<24:40,  1.07it/s, loss=0.779][A
Train step

Train step of epoch 0:  42%|████▏     | 1104/2641 [17:14<23:58,  1.07it/s, loss=0.597][A
Train step of epoch 0:  42%|████▏     | 1104/2641 [17:14<23:58,  1.07it/s, loss=0.726][A
Train step of epoch 0:  42%|████▏     | 1105/2641 [17:14<23:55,  1.07it/s, loss=0.726][A
Train step of epoch 0:  42%|████▏     | 1105/2641 [17:14<23:55,  1.07it/s, loss=0.578][A
Train step of epoch 0:  42%|████▏     | 1106/2641 [17:15<23:54,  1.07it/s, loss=0.578][A
Train step of epoch 0:  42%|████▏     | 1106/2641 [17:15<23:54,  1.07it/s, loss=0.65] [A
Train step of epoch 0:  42%|████▏     | 1107/2641 [17:16<23:51,  1.07it/s, loss=0.65][A
Train step of epoch 0:  42%|████▏     | 1107/2641 [17:16<23:51,  1.07it/s, loss=0.523][A
Train step of epoch 0:  42%|████▏     | 1108/2641 [17:17<23:50,  1.07it/s, loss=0.523][A
Train step of epoch 0:  42%|████▏     | 1108/2641 [17:17<23:50,  1.07it/s, loss=0.64] [A
Train step of epoch 0:  42%|████▏     | 1109/2641 [17:18<23:49,  1.07it/s, loss=0.64][A
Train step o

Train step of epoch 0:  44%|████▎     | 1149/2641 [17:56<23:16,  1.07it/s, loss=0.544][A
Train step of epoch 0:  44%|████▎     | 1150/2641 [17:57<23:15,  1.07it/s, loss=0.544][A
Train step of epoch 0:  44%|████▎     | 1150/2641 [17:57<23:15,  1.07it/s, loss=0.704][A
Train step of epoch 0:  44%|████▎     | 1151/2641 [17:57<23:13,  1.07it/s, loss=0.704][A
Train step of epoch 0:  44%|████▎     | 1151/2641 [17:57<23:13,  1.07it/s, loss=0.676][A
Train step of epoch 0:  44%|████▎     | 1152/2641 [17:58<23:13,  1.07it/s, loss=0.676][A
Train step of epoch 0:  44%|████▎     | 1152/2641 [17:58<23:13,  1.07it/s, loss=0.512][A
Train step of epoch 0:  44%|████▎     | 1153/2641 [17:59<23:13,  1.07it/s, loss=0.512][A
Train step of epoch 0:  44%|████▎     | 1153/2641 [17:59<23:13,  1.07it/s, loss=0.624][A
Train step of epoch 0:  44%|████▎     | 1154/2641 [18:00<23:12,  1.07it/s, loss=0.624][A
Train step of epoch 0:  44%|████▎     | 1154/2641 [18:00<23:12,  1.07it/s, loss=0.647][A
Train step

Train step of epoch 0:  45%|████▌     | 1195/2641 [18:39<22:29,  1.07it/s, loss=0.727][A
Train step of epoch 0:  45%|████▌     | 1195/2641 [18:39<22:29,  1.07it/s, loss=0.518][A
Train step of epoch 0:  45%|████▌     | 1196/2641 [18:40<22:29,  1.07it/s, loss=0.518][A
Train step of epoch 0:  45%|████▌     | 1196/2641 [18:40<22:29,  1.07it/s, loss=0.693][A
Train step of epoch 0:  45%|████▌     | 1197/2641 [18:40<22:30,  1.07it/s, loss=0.693][A
Train step of epoch 0:  45%|████▌     | 1197/2641 [18:40<22:30,  1.07it/s, loss=0.52] [A
Train step of epoch 0:  45%|████▌     | 1198/2641 [18:41<22:30,  1.07it/s, loss=0.52][A
Train step of epoch 0:  45%|████▌     | 1198/2641 [18:41<22:30,  1.07it/s, loss=0.684][A
Train step of epoch 0:  45%|████▌     | 1199/2641 [18:42<22:29,  1.07it/s, loss=0.684][A
Train step of epoch 0:  45%|████▌     | 1199/2641 [18:42<22:29,  1.07it/s, loss=0.666][A
Train step of epoch 0:  45%|████▌     | 1200/2641 [18:43<22:29,  1.07it/s, loss=0.666][A
Train step 

Train step of epoch 0:  47%|████▋     | 1240/2641 [19:21<21:47,  1.07it/s, loss=0.592][A
Train step of epoch 0:  47%|████▋     | 1241/2641 [19:22<21:46,  1.07it/s, loss=0.592][A
Train step of epoch 0:  47%|████▋     | 1241/2641 [19:22<21:46,  1.07it/s, loss=0.506][A
Train step of epoch 0:  47%|████▋     | 1242/2641 [19:23<21:46,  1.07it/s, loss=0.506][A
Train step of epoch 0:  47%|████▋     | 1242/2641 [19:23<21:46,  1.07it/s, loss=0.734][A
Train step of epoch 0:  47%|████▋     | 1243/2641 [19:23<21:46,  1.07it/s, loss=0.734][A
Train step of epoch 0:  47%|████▋     | 1243/2641 [19:23<21:46,  1.07it/s, loss=0.687][A
Train step of epoch 0:  47%|████▋     | 1244/2641 [19:24<21:45,  1.07it/s, loss=0.687][A
Train step of epoch 0:  47%|████▋     | 1244/2641 [19:24<21:45,  1.07it/s, loss=0.59] [A
Train step of epoch 0:  47%|████▋     | 1245/2641 [19:25<21:43,  1.07it/s, loss=0.59][A
Train step of epoch 0:  47%|████▋     | 1245/2641 [19:25<21:43,  1.07it/s, loss=0.729][A
Train step 

Train step of epoch 0:  49%|████▊     | 1286/2641 [20:04<21:09,  1.07it/s, loss=0.583][A
Train step of epoch 0:  49%|████▊     | 1286/2641 [20:04<21:09,  1.07it/s, loss=0.663][A
Train step of epoch 0:  49%|████▊     | 1287/2641 [20:05<21:09,  1.07it/s, loss=0.663][A
Train step of epoch 0:  49%|████▊     | 1287/2641 [20:05<21:09,  1.07it/s, loss=0.75] [A
Train step of epoch 0:  49%|████▉     | 1288/2641 [20:06<21:05,  1.07it/s, loss=0.75][A
Train step of epoch 0:  49%|████▉     | 1288/2641 [20:06<21:05,  1.07it/s, loss=0.582][A
Train step of epoch 0:  49%|████▉     | 1289/2641 [20:06<21:04,  1.07it/s, loss=0.582][A
Train step of epoch 0:  49%|████▉     | 1289/2641 [20:07<21:04,  1.07it/s, loss=0.621][A
Train step of epoch 0:  49%|████▉     | 1290/2641 [20:07<21:03,  1.07it/s, loss=0.621][A
Train step of epoch 0:  49%|████▉     | 1290/2641 [20:07<21:03,  1.07it/s, loss=0.626][A
Train step of epoch 0:  49%|████▉     | 1291/2641 [20:08<21:03,  1.07it/s, loss=0.626][A
Train step 

Train step of epoch 0:  50%|█████     | 1331/2641 [20:46<20:22,  1.07it/s, loss=0.661][A
Train step of epoch 0:  50%|█████     | 1332/2641 [20:47<20:21,  1.07it/s, loss=0.661][A
Train step of epoch 0:  50%|█████     | 1332/2641 [20:47<20:21,  1.07it/s, loss=0.744][A
Train step of epoch 0:  50%|█████     | 1333/2641 [20:48<20:23,  1.07it/s, loss=0.744][A
Train step of epoch 0:  50%|█████     | 1333/2641 [20:48<20:23,  1.07it/s, loss=0.638][A

In [None]:
#학습 확인
#임의의 문장을 입력 후 reward score 확인
def inference_RM(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(
        torch.cuda.current_device())
    output = model(input_ids)
    output_reward = output.cpu().detach().numpy()[0]

    print('input: %s\nreward score: %.1f'%(input_text, output_reward))

    return output_reward

input_text = '인공지능은 똥멍청이 입니다'
output_reward = inference_RM(input_text=input_text)

In [None]:
input_text = '인공지능(AI)은 컴퓨터에서 음성 및 작성된 언어를 보고 이해하고 번역하고 데이터를 분석하고 추천하는 기능을 포함하여 다양한 고급 기능을 수행할 수 있는 일련의 기술입니다.'

output_reward = inference_RM(input_text=input_text)

# 회고

- generate 단계에서 top-k와 beam search를 좀 더 조정해주었더니 생성하는 문장들이 개선되었으나, 반복되는 단어나, 한자어, 일본어가 나오는 문제를 해결하지는 못했다. 
- 새로운 데이터를 가져와 basemodel sft, rm 에 다시 학습시켰다. 
- sft를 재학습 시켰을 때 이전보다 조금더 관련있는 답변을 하는 것이 보였다. 한자어도 보이지 않고 조금더 관련있는 답변을 하는 모습을 보임
- 더 많은 데이터를 학습함으로서 일반화 성능, 예측력이 향상된 것으로 보임
- RM은 시간 관계상 학습을 시키지 못함 
- 메모리 비워주는 작업을 잊지 않고 해야겠다. 
- 리소스 관리의 중요성을 계속 느끼고 있다. 