<a href="https://colab.research.google.com/github/skyshine460/Systematic_Review/blob/main/Pytorch_BERT_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorch と BERT を使ったベースライン（titleのみ）

In [1]:
import os
import sys

# Mount Google Drive at /gdrive so the files stored there become reachable.
from google.colab import drive
drive.mount('/gdrive')

# Copy the train / test / sample-submission CSVs from Drive into the current directory.
!cp /gdrive/MyDrive/Datasets/signate-471/train.csv .
!cp /gdrive/MyDrive/Datasets/signate-471/test.csv .
!cp /gdrive/MyDrive/Datasets/signate-471/sample_submit.csv .

Mounted at /gdrive


In [2]:
# Install the transformers package (imported below as T); -q keeps pip output quiet.
!pip install -q transformers

In [3]:
import math
import random
import time
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers as T
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [4]:
# Directory the input CSVs are read from, and directory logs/checkpoints/CSVs are written to.
DATA_DIR = "./"
OUTPUT_DIR = "./"

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [5]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Build the notebook-wide logger that writes to both the console and a log file.
def init_logger(log_file=OUTPUT_DIR + "train.log"):
    """Create (or reconfigure) the module logger with console and file output.

    Args:
        log_file: Path of the log file. Defaults to ``train.log`` inside
            ``OUTPUT_DIR``.

    Returns:
        logging.Logger: The logger named after ``__name__``, set to INFO level,
        carrying exactly one stream handler and one file handler. Both handlers
        emit the bare message text.
    """
    from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

    logger = getLogger(__name__)
    logger.setLevel(INFO)  # INFO and above are emitted

    # getLogger() hands back the same object on every call, so re-running this
    # cell would otherwise stack another pair of handlers and print each
    # message multiple times. Drop (and close) whatever was attached before.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    # Keep records away from the root logger: when the root logger has its own
    # handler, every message is echoed a second time as "INFO:__main__:...".
    logger.propagate = False

    # Console handler first, then file handler; both print the message only.
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger

# Initialize the logger used throughout the notebook.
LOGGER = init_logger()

In [7]:
# Fix every random number generator the notebook relies on.
def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    random.seed(seed)                          # Python standard library RNG
    # Exported for interpreters started after this point; the hash seed of the
    # already-running interpreter is fixed at start-up and is not changed here.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)                       # NumPy global RNG
    torch.manual_seed(seed)                    # PyTorch (CPU)
    torch.cuda.manual_seed(seed)               # PyTorch (current GPU)
    torch.cuda.manual_seed_all(seed)           # PyTorch (all GPUs)
    torch.backends.cudnn.deterministic = True  # restrict cuDNN to deterministic algorithms
    # Disable the cuDNN auto-tuner explicitly so that algorithm selection does
    # not depend on whatever value this flag was left at elsewhere.
    torch.backends.cudnn.benchmark = False

# Seed value: applied to the RNGs here and read as a global by get_train_data's fold split.
seed = 471
seed_torch(seed)


In [8]:
# Load the datasets.
train = pd.read_csv(DATA_DIR + "train.csv")
test = pd.read_csv(DATA_DIR + "test.csv")
# The sample submission is read without a header row, so its two columns are named explicitly.
sub = pd.read_csv(DATA_DIR + "sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]

In [9]:
# Fraction of positive rows (judgement == 1) in the training data.
# The model outputs are later binarized into 0 / 1 at this value.
# It is only a reference value; a better threshold should be investigated.
border = len(train[train["judgement"] == 1]) / len(train["judgement"])
print(border)

0.023282372444280715


In [10]:
# Attach a cross-validation fold number to every training row.
def get_train_data(train):
    """Add a ``fold`` column holding stratified 5-fold assignments.

    The split is stratified on ``judgement`` and shuffled with the module-level
    ``seed``. The frame is modified in place and also returned.

    Args:
        train: DataFrame with a ``judgement`` column.

    Returns:
        The same DataFrame with an added ``fold`` column of dtype ``uint8``
        (values 0-4); each row's value is the fold in which it is used for
        validation.
    """
    Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    # split() yields *positional* indices. Collect the assignments in a
    # positional array instead of writing through label-based .loc, which is
    # only correct when the frame carries a default 0..n-1 index. This also
    # avoids the intermediate float/NaN column.
    fold_ids = np.zeros(len(train), dtype=np.uint8)
    for n, (_, val_index) in enumerate(Fold.split(train, train["judgement"])):
        fold_ids[val_index] = n  # rows validated in fold n

    train["fold"] = fold_ids

    return train

In [11]:
# Pass-through for the test data: no preprocessing is applied.
def get_test_data(test):
    """Return the test data untouched.

    Args:
        test: The test data object.

    Returns:
        The very same object that was passed in.
    """
    unchanged = test
    return unchanged

# Assign cross-validation fold numbers to the training data.
train = get_train_data(train)

In [12]:
# Custom PyTorch Dataset that tokenizes the ``title`` column for BERT and
# serves the encoded rows for training or inference.
class BaseDataset(Dataset):
    """Dataset over the ``title`` text of a DataFrame, tokenized once up front.

    Args:
        df: DataFrame with a ``title`` column, plus a ``judgement`` column when
            ``include_labels`` is true.
        model_name: Name passed to ``BertTokenizer.from_pretrained``.
        include_labels: When true, items also carry the ``judgement`` label.
        max_length: Token length every title is padded / truncated to.
            Defaults to 72, the value that used to be hard-coded.
    """

    def __init__(self, df, model_name, include_labels=True, max_length=72):
        # Load the tokenizer that belongs to the requested BERT model.
        tokenizer = T.BertTokenizer.from_pretrained(model_name)

        self.df = df
        self.include_labels = include_labels
        self.max_length = max_length

        # Titles as a plain Python list for batch tokenization.
        self.title = df["title"].tolist()

        # Encode every title in one go.
        self.encoded = tokenizer.batch_encode_plus(
            self.title,
            padding='max_length',             # pad every sequence to max_length
            max_length=max_length,
            truncation=True,                  # cut sequences longer than max_length
            return_attention_mask=True
        )

        if self.include_labels:
            self.labels = df["judgement"].values

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            ``(input_ids, attention_mask, label)`` when labels are included,
            otherwise ``(input_ids, attention_mask)``. The label is a float
            tensor.
        """
        input_ids = torch.tensor(self.encoded['input_ids'][idx])
        attention_mask = torch.tensor(self.encoded['attention_mask'][idx])

        if self.include_labels:
            label = torch.tensor(self.labels[idx]).float()
            return input_ids, attention_mask, label

        return input_ids, attention_mask

In [13]:
# Custom PyTorch model built on top of a pretrained BERT classifier.
class BaseModel(nn.Module):
    """BERT sequence classifier with a single output passed through a sigmoid.

    Args:
        model_name: Name passed to
            ``BertForSequenceClassification.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()

        # num_labels=1: a single output per sample.
        self.model = T.BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=1
        )

        # Applied to the output in forward().
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid-activated value per sample.

        Args:
            input_ids: Token id tensor for the batch.
            attention_mask: Attention mask tensor for the batch.

        Returns:
            Tensor with the trailing size-1 dimension of the logits removed.
        """
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Drop only the trailing dimension. A bare squeeze() would also remove
        # the batch dimension for a single-sample batch, yielding a 0-d tensor
        # that no longer matches the label shape and cannot be concatenated.
        return self.sigmoid(out.logits).squeeze(-1)


In [14]:
# Keeps the most recent value and the running weighted average of a metric.
class AverageMeter:
    """Running statistics: last value, weighted sum, total weight and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean.

        Args:
            val: The new value.
            n: Weight of the value (e.g. the batch size). Defaults to 1.
        """
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

# Format a duration given in seconds as "<minutes>m <seconds>s".
def asMinutes(s):
    """Render ``s`` seconds as a ``"Xm Ys"`` string.

    Args:
        s: Duration in seconds.

    Returns:
        str: Whole minutes and the remaining seconds (formatted with ``%d``).
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)

# Report elapsed time and the estimated time remaining as "elapsed (remain rest)".
def timeSince(since, percent):
    """Format the time elapsed since ``since`` together with a remaining-time estimate.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work already completed; the total duration is
            estimated as elapsed / percent.

    Returns:
        str: ``"<elapsed> (remain <remaining>)"`` with both parts formatted by
        ``asMinutes``.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed  # estimated total minus elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))

In [15]:
# Train the model for a single epoch.
def train_fn(train_loader, model, criterion, optimizer, epoch, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        train_loader (DataLoader): Yields ``(input_ids, attention_mask, labels)`` batches.
        model (nn.Module): Model being trained; called as ``model(input_ids, attention_mask)``.
        criterion (nn.Module): Loss module, called as ``criterion(predictions, labels)``.
        optimizer (torch.optim.Optimizer): Optimizer that updates the model parameters.
        epoch (int): Zero-based epoch number (shown one-based in the progress line).
        device (torch.device): Device the batch tensors are moved to.

    Returns:
        float: Batch-size-weighted average loss over the epoch.
    """
    start = time.time()
    losses = AverageMeter()

    # Training mode for the whole epoch.
    model.train()

    for step, batch in enumerate(train_loader):
        # Move the three batch tensors to the target device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step, then forward pass and loss.
        optimizer.zero_grad()
        y_preds = model(input_ids, attention_mask)
        loss = criterion(y_preds, labels)

        # Weight the running average by the number of samples in this batch.
        losses.update(loss.item(), labels.size(0))

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Progress line every 100 steps and on the final step.
        final_step = len(train_loader) - 1
        if not (step % 100 != 0 and step != final_step):
            print(
                f"Epoch: [{epoch + 1}][{step}/{len(train_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(train_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )

    return losses.avg

In [16]:
# Evaluate the model on the validation data.
def valid_fn(valid_loader, model, criterion, device):
    """Compute the validation loss and collect the model's predictions.

    Args:
        valid_loader (DataLoader): Yields ``(input_ids, attention_mask, labels)`` batches.
        model (nn.Module): Model being evaluated; called as ``model(input_ids, attention_mask)``.
        criterion (nn.Module): Loss module, called as ``criterion(predictions, labels)``.
        device (torch.device): Device the batch tensors are moved to.

    Returns:
        tuple: ``(average loss as float, predictions for all batches as one
        NumPy array in loader order)``.
    """
    start = time.time()
    losses = AverageMeter()

    # Evaluation mode for the whole pass.
    model.eval()
    preds = []

    for step, (input_ids, attention_mask, labels) in enumerate(valid_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        # Neither the forward pass nor the loss needs gradients here.
        with torch.no_grad():
            y_preds = model(input_ids, attention_mask)

            # A final batch holding a single sample can come back as a 0-d
            # tensor; restore the batch dimension so it matches the labels and
            # can be concatenated below.
            if y_preds.dim() == 0:
                y_preds = y_preds.unsqueeze(0)

            loss = criterion(y_preds, labels)

        losses.update(loss.item(), batch_size)

        # Keep the predictions on the CPU as NumPy arrays.
        preds.append(y_preds.to("cpu").numpy())

        # Progress line every 100 steps and on the final step.
        if step % 100 == 0 or step == (len(valid_loader) - 1):
            print(
                f"EVAL: [{step}/{len(valid_loader)}] "
                f"Elapsed {timeSince(start, float(step + 1) / len(valid_loader)):s} "
                f"Loss: {losses.avg:.4f} "
            )

    # One flat array covering the whole validation set.
    predictions = np.concatenate(preds)

    return losses.avg, predictions


In [17]:
# Predict on the test data with the saved fold models.
def inference():
    """Average the test predictions of the five saved fold checkpoints.

    Reads the module-level ``test`` frame, ``device``, ``OUTPUT_DIR`` and
    ``LOGGER``. For each fold the checkpoint
    ``bert-base-uncased_fold{fold}_best.pth`` is loaded from ``OUTPUT_DIR``.

    Returns:
        numpy.ndarray: Mean over folds of the per-sample model outputs, in
        test-set order.
    """
    predictions = []

    # Test dataset without labels, iterated in its original order.
    test_dataset = BaseDataset(test, "bert-base-uncased", include_labels=False)
    test_loader = DataLoader(
        test_dataset,
        batch_size=16,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )

    for fold in range(5):
        LOGGER.info(f"========== model: bert-base-uncased fold: {fold} inference ==========")
        model = BaseModel("bert-base-uncased")
        model.to(device)

        # map_location lets a checkpoint written on a GPU machine be restored on
        # whatever device is in use now (e.g. CPU-only).
        # NOTE(review): the checkpoint also stores a NumPy array under "preds";
        # on PyTorch versions where torch.load defaults to weights_only=True this
        # call may need weights_only=False — confirm against the installed version.
        checkpoint = torch.load(
            OUTPUT_DIR + f"bert-base-uncased_fold{fold}_best.pth",
            map_location=device,
        )
        model.load_state_dict(checkpoint["model"])
        model.eval()

        preds = []

        for input_ids, attention_mask in tqdm(test_loader, total=len(test_loader)):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            with torch.no_grad():
                y_preds = model(input_ids, attention_mask)

            # A final batch holding a single sample can come back as a 0-d
            # tensor, which np.concatenate rejects; restore the batch dimension.
            if y_preds.dim() == 0:
                y_preds = y_preds.unsqueeze(0)

            preds.append(y_preds.to("cpu").numpy())

        # All batches of this fold as one array.
        predictions.append(np.concatenate(preds))

    # Ensemble: element-wise mean over the fold models.
    predictions = np.mean(predictions, axis=0)

    return predictions

In [18]:
# Train and evaluate the model for one cross-validation fold.
def train_loop(train, fold):
    """Fine-tune BERT on every fold except ``fold`` and validate on ``fold``.

    Runs three epochs. After each epoch the validation outputs are binarized
    at the module-level ``border`` and scored with F-beta (beta=7). Whenever
    the score improves, the model weights and validation outputs are saved to
    ``OUTPUT_DIR`` as ``bert-base-uncased_fold{fold}_best.pth``.

    Args:
        train: DataFrame with ``title``, ``judgement`` and ``fold`` columns.
        fold: Fold number held out for validation.

    Returns:
        pandas.DataFrame: The validation rows (index reset) with an added
        ``preds`` column holding the outputs of the best-scoring epoch.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # Data Loader
    # ====================================================
    trn_idx = train[train["fold"] != fold].index  # every other fold -> training
    val_idx = train[train["fold"] == fold].index  # this fold -> validation

    train_folds = train.loc[trn_idx].reset_index(drop=True)
    valid_folds = train.loc[val_idx].reset_index(drop=True)

    train_dataset = BaseDataset(train_folds, "bert-base-uncased")
    valid_dataset = BaseDataset(valid_folds, "bert-base-uncased")

    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        drop_last=True,   # discard the incomplete final batch while training
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=16,
        shuffle=False,    # keep row order so predictions line up with valid_folds
        num_workers=4,
        pin_memory=True,
        drop_last=False,  # every validation row must receive a prediction
    )

    # ====================================================
    # Model
    # ====================================================
    model = BaseModel("bert-base-uncased")
    model.to(device)

    # transformers.AdamW is deprecated by Hugging Face in favour of
    # torch.optim.AdamW (and absent from newer releases). eps and weight_decay
    # are set to the defaults that implementation used, so the hyperparameters
    # stay the same.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
    criterion = nn.BCELoss()

    # ====================================================
    # Loop
    # ====================================================
    best_score = -1
    best_preds = None
    checkpoint_path = OUTPUT_DIR + f"bert-base-uncased_fold{fold}_best.pth"

    for epoch in range(3):
        start_time = time.time()

        # Train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, device)

        # Validate
        avg_val_loss, preds = valid_fn(valid_loader, model, criterion, device)
        valid_labels = valid_folds["judgement"].values

        # Score the outputs binarized at `border`.
        score = fbeta_score(valid_labels, np.where(preds < border, 0, 1), beta=7.0)

        elapsed = time.time() - start_time
        LOGGER.info(
            f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s"
        )
        LOGGER.info(f"Epoch {epoch+1} - Score: {score}")

        # Keep the best epoch: weights go to disk (read later by inference()),
        # the matching predictions are also held in memory.
        if score > best_score:
            best_score = score
            best_preds = preds
            LOGGER.info(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(
                {"model": model.state_dict(), "preds": preds},
                checkpoint_path
            )

    # Use the in-memory copy of the best predictions; it is the same array that
    # was written to the checkpoint, so reloading the file is unnecessary.
    valid_folds["preds"] = best_preds

    return valid_folds


In [19]:
# Compute the F-beta score of a result frame and write it to the log.
def get_result(result_df):
    """Log the F-beta (beta=7) score of ``result_df``.

    The ``preds`` column is binarized at the module-level ``border`` (values
    below it become 0, all others 1) and compared with ``judgement``.

    Args:
        result_df: DataFrame with ``preds`` and ``judgement`` columns.
    """
    truth = result_df["judgement"].values
    outputs = result_df["preds"].values

    # Threshold the outputs into hard 0 / 1 labels.
    hard_labels = np.where(outputs < border, 0, 1)
    score = fbeta_score(truth, hard_labels, beta=7.0)

    # Logged with five decimal places.
    LOGGER.info(f"Score: {score:<.5f}")

In [20]:
# Entry point: training, cross-validation scoring, inference and result files.
def main():
    """Run 5-fold training, log the CV scores, and write the OOF and submission files.

    Writes ``oof_df.csv`` and ``submission.csv`` into ``OUTPUT_DIR`` and
    overwrites the ``judgement`` column of the module-level ``sub`` frame.
    """
    # Training (5-fold cross-validation)
    fold_results = []  # out-of-fold frames, one per fold
    for fold in range(5):
        _oof_df = train_loop(train, fold)
        fold_results.append(_oof_df)

        LOGGER.info(f"========== fold: {fold} result ==========")
        get_result(_oof_df)

    # Concatenate once after the loop instead of growing a frame inside it;
    # this also avoids concatenating with an initially empty DataFrame.
    oof_df = pd.concat(fold_results)

    # CV result over all out-of-fold predictions
    LOGGER.info("========== CV ==========")
    get_result(oof_df)

    # Save OOF result
    oof_df.to_csv(OUTPUT_DIR + "oof_df.csv", index=False)

    # Inference on the test data, binarized at `border`
    predictions = inference()
    predictions = np.where(predictions < border, 0, 1)

    # Submission file (no header row, no index)
    sub["judgement"] = predictions
    sub.to_csv(OUTPUT_DIR + "submission.csv", index=False, header=False)


In [21]:
# Run the full pipeline (training, CV scoring, inference, submission file).
if __name__ == "__main__":
    main()



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: [1][0/1357] Elapsed 0m 1s (remain 36m 16s) Loss: 0.6762 
Epoch: [1][100/1357] Elapsed 0m 6s (remain 1m 25s) Loss: 0.1395 
Epoch: [1][200/1357] Elapsed 0m 12s (remain 1m 9s) Loss: 0.1216 
Epoch: [1][300/1357] Elapsed 0m 17s (remain 1m 0s) Loss: 0.1137 
Epoch: [1][400/1357] Elapsed 0m 22s (remain 0m 53s) Loss: 0.1035 
Epoch: [1][500/1357] Elapsed 0m 27s (remain 0m 47s) Loss: 0.1014 
Epoch: [1][600/1357] Elapsed 0m 32s (remain 0m 41s) Loss: 0.0958 
Epoch: [1][700/1357] Elapsed 0m 37s (remain 0m 35s) Loss: 0.0940 
Epoch: [1][800/1357] Elapsed 0m 43s (remain 0m 29s) Loss: 0.0882 
Epoch: [1][900/1357] Elapsed 0m 48s (remain 0m 24s) Loss: 0.0864 
Epoch: [1][1000/1357] Elapsed 0m 53s (remain 0m 18s) Loss: 0.0854 
Epoch: [1][1100/1357] Elapsed 0m 58s (remain 0m 13s) Loss: 0.0843 
Epoch: [1][1200/1357] Elapsed 1m 3s (remain 0m 8s) Loss: 0.0824 
Epoch: [1][1300/1357] Elapsed 1m 9s (remain 0m 2s) Loss: 0.0801 
Epoch: [1][1356/1357] Elapsed 1m 11s (remain 0m 0s) Loss: 0.0788 
EVAL: [0/340] E

Epoch 1 - avg_train_loss: 0.0788  avg_val_loss: 0.0520  time: 77s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0788  avg_val_loss: 0.0520  time: 77s
Epoch 1 - Score: 0.8134407784704273
INFO:__main__:Epoch 1 - Score: 0.8134407784704273
Epoch 1 - Save Best Score: 0.8134 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.8134 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0520 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 3m 43s) Loss: 0.0020 
Epoch: [2][100/1357] Elapsed 0m 5s (remain 1m 7s) Loss: 0.0485 
Epoch: [2][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.0442 
Epoch: [2][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0446 
Epoch: [2][400/1357] Elapsed 0m 21s (remain 0m 50s) Loss: 0.0427 
Epoch: [2][500/1357] Elapsed 0m 26s (remain 0m 45s) Loss: 0.0444 
Epoch: [2][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0438 
Epoch: [2][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0439 
Epoch: [2][800/1357] Elapsed 0m 41s (remain 0m 29s) Loss: 0.0442 
Epoch: [2][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0443 
Epoch: [2][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0443 
Epoch: [2][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0440 
Epoch: [2][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0439 
Epoch: [2][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0445 
Epoch: [2][1356/1357] E

Epoch 2 - avg_train_loss: 0.0440  avg_val_loss: 0.0607  time: 76s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0440  avg_val_loss: 0.0607  time: 76s
Epoch 2 - Score: 0.7374862702024164
INFO:__main__:Epoch 2 - Score: 0.7374862702024164


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0607 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 3m 57s) Loss: 0.0006 
Epoch: [3][100/1357] Elapsed 0m 5s (remain 1m 7s) Loss: 0.0242 
Epoch: [3][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.0267 
Epoch: [3][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0275 
Epoch: [3][400/1357] Elapsed 0m 21s (remain 0m 50s) Loss: 0.0282 
Epoch: [3][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.0280 
Epoch: [3][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0265 
Epoch: [3][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0262 
Epoch: [3][800/1357] Elapsed 0m 42s (remain 0m 29s) Loss: 0.0248 
Epoch: [3][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0250 
Epoch: [3][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0254 
Epoch: [3][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0267 
Epoch: [3][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0268 
Epoch: [3][1300/1357] Elapsed 1m 8s (remain 0m 2s) Loss: 0.0275 
Epoch: [3][1356/1357] E

Epoch 3 - avg_train_loss: 0.0280  avg_val_loss: 0.0524  time: 76s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0280  avg_val_loss: 0.0524  time: 76s
Epoch 3 - Score: 0.8385744234800838
INFO:__main__:Epoch 3 - Score: 0.8385744234800838
Epoch 3 - Save Best Score: 0.8386 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8386 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0524 


Score: 0.83857
INFO:__main__:Score: 0.83857
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: [1][0/1357] Elapsed 0m 0s (remain 4m 5s) Loss: 0.7679 
Epoch: [1][100/1357] Elapsed 0m 5s (remain 1m 7s) Loss: 0.1669 
Epoch: [1][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.1363 
Epoch: [1][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.1197 
Epoch: [1][400/1357] Elapsed 0m 21s (remain 0m 50s) Loss: 0.1132 
Epoch: [1][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.1079 
Epoch: [1][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.1018 
Epoch: [1][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0988 
Epoch: [1][800/1357] Elapsed 0m 41s (remain 0m 29s) Loss: 0.0954 
Epoch: [1][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0921 
Epoch: [1][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0898 
Epoch: [1][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0887 
Epoch: [1][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0882 
Epoch: [1][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0865 
Epoch: [1][1356/1357] Elapsed 1m 10s (remain 0m 0s) Loss: 0.0868 
EVAL: [0/340] Ela

Epoch 1 - avg_train_loss: 0.0868  avg_val_loss: 0.0708  time: 76s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0868  avg_val_loss: 0.0708  time: 76s
Epoch 1 - Score: 0.787510361978447
INFO:__main__:Epoch 1 - Score: 0.787510361978447
Epoch 1 - Save Best Score: 0.7875 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7875 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0708 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 3m 55s) Loss: 0.0557 
Epoch: [2][100/1357] Elapsed 0m 5s (remain 1m 7s) Loss: 0.0536 
Epoch: [2][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.0495 
Epoch: [2][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0490 
Epoch: [2][400/1357] Elapsed 0m 20s (remain 0m 50s) Loss: 0.0464 
Epoch: [2][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.0462 
Epoch: [2][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0468 
Epoch: [2][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0473 
Epoch: [2][800/1357] Elapsed 0m 41s (remain 0m 28s) Loss: 0.0466 
Epoch: [2][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0459 
Epoch: [2][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0446 
Epoch: [2][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0446 
Epoch: [2][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0448 
Epoch: [2][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0456 
Epoch: [2][1356/1357] E

Epoch 2 - avg_train_loss: 0.0449  avg_val_loss: 0.0676  time: 76s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0449  avg_val_loss: 0.0676  time: 76s
Epoch 2 - Score: 0.6827813530058076
INFO:__main__:Epoch 2 - Score: 0.6827813530058076


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0676 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 4m 2s) Loss: 0.2705 
Epoch: [3][100/1357] Elapsed 0m 5s (remain 1m 7s) Loss: 0.0313 
Epoch: [3][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.0319 
Epoch: [3][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0264 
Epoch: [3][400/1357] Elapsed 0m 21s (remain 0m 50s) Loss: 0.0275 
Epoch: [3][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.0275 
Epoch: [3][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0276 
Epoch: [3][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0275 
Epoch: [3][800/1357] Elapsed 0m 41s (remain 0m 29s) Loss: 0.0271 
Epoch: [3][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0276 
Epoch: [3][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0281 
Epoch: [3][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0277 
Epoch: [3][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0283 
Epoch: [3][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0279 
Epoch: [3][1356/1357] El

Epoch 3 - avg_train_loss: 0.0276  avg_val_loss: 0.0737  time: 76s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0276  avg_val_loss: 0.0737  time: 76s
Epoch 3 - Score: 0.8468225419664268
INFO:__main__:Epoch 3 - Score: 0.8468225419664268
Epoch 3 - Save Best Score: 0.8468 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8468 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0737 


Score: 0.84682
INFO:__main__:Score: 0.84682
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: [1][0/1357] Elapsed 0m 0s (remain 4m 7s) Loss: 0.8517 
Epoch: [1][100/1357] Elapsed 0m 5s (remain 1m 7s) Loss: 0.1621 
Epoch: [1][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.1345 
Epoch: [1][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.1156 
Epoch: [1][400/1357] Elapsed 0m 21s (remain 0m 50s) Loss: 0.1086 
Epoch: [1][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.1051 
Epoch: [1][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.1028 
Epoch: [1][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0976 
Epoch: [1][800/1357] Elapsed 0m 41s (remain 0m 29s) Loss: 0.0945 
Epoch: [1][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0924 
Epoch: [1][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0900 
Epoch: [1][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0870 
Epoch: [1][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0851 
Epoch: [1][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0822 
Epoch: [1][1356/1357] Elapsed 1m 10s (remain 0m 0s) Loss: 0.0825 
EVAL: [0/340] Ela

Epoch 1 - avg_train_loss: 0.0825  avg_val_loss: 0.0567  time: 76s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0825  avg_val_loss: 0.0567  time: 76s
Epoch 1 - Score: 0.8089133089133089
INFO:__main__:Epoch 1 - Score: 0.8089133089133089
Epoch 1 - Save Best Score: 0.8089 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.8089 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0567 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 3m 53s) Loss: 0.1989 
Epoch: [2][100/1357] Elapsed 0m 5s (remain 1m 6s) Loss: 0.0455 
Epoch: [2][200/1357] Elapsed 0m 10s (remain 1m 0s) Loss: 0.0436 
Epoch: [2][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0453 
Epoch: [2][400/1357] Elapsed 0m 20s (remain 0m 50s) Loss: 0.0460 
Epoch: [2][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.0495 
Epoch: [2][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0503 
Epoch: [2][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0490 
Epoch: [2][800/1357] Elapsed 0m 41s (remain 0m 29s) Loss: 0.0482 
Epoch: [2][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0495 
Epoch: [2][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0482 
Epoch: [2][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0480 
Epoch: [2][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0476 
Epoch: [2][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0467 
Epoch: [2][1356/1357] E

Epoch 2 - avg_train_loss: 0.0464  avg_val_loss: 0.0537  time: 76s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0464  avg_val_loss: 0.0537  time: 76s
Epoch 2 - Score: 0.7853810264385692
INFO:__main__:Epoch 2 - Score: 0.7853810264385692


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0537 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 3m 55s) Loss: 0.0014 
Epoch: [3][100/1357] Elapsed 0m 5s (remain 1m 7s) Loss: 0.0295 
Epoch: [3][200/1357] Elapsed 0m 10s (remain 1m 0s) Loss: 0.0281 
Epoch: [3][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0296 
Epoch: [3][400/1357] Elapsed 0m 20s (remain 0m 50s) Loss: 0.0302 
Epoch: [3][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.0295 
Epoch: [3][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0292 
Epoch: [3][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0288 
Epoch: [3][800/1357] Elapsed 0m 41s (remain 0m 29s) Loss: 0.0291 
Epoch: [3][900/1357] Elapsed 0m 46s (remain 0m 23s) Loss: 0.0275 
Epoch: [3][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0281 
Epoch: [3][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0290 
Epoch: [3][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0293 
Epoch: [3][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0297 
Epoch: [3][1356/1357] E

Epoch 3 - avg_train_loss: 0.0295  avg_val_loss: 0.0489  time: 76s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0295  avg_val_loss: 0.0489  time: 76s
Epoch 3 - Score: 0.8425290784372204
INFO:__main__:Epoch 3 - Score: 0.8425290784372204
Epoch 3 - Save Best Score: 0.8425 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8425 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0489 


Score: 0.84253
INFO:__main__:Score: 0.84253
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: [1][0/1357] Elapsed 0m 0s (remain 4m 17s) Loss: 0.6078 
Epoch: [1][100/1357] Elapsed 0m 5s (remain 1m 8s) Loss: 0.1656 
Epoch: [1][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.1319 
Epoch: [1][300/1357] Elapsed 0m 15s (remain 0m 56s) Loss: 0.1213 
Epoch: [1][400/1357] Elapsed 0m 21s (remain 0m 50s) Loss: 0.1129 
Epoch: [1][500/1357] Elapsed 0m 26s (remain 0m 45s) Loss: 0.1047 
Epoch: [1][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.1023 
Epoch: [1][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0986 
Epoch: [1][800/1357] Elapsed 0m 42s (remain 0m 29s) Loss: 0.0954 
Epoch: [1][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0937 
Epoch: [1][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0910 
Epoch: [1][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0886 
Epoch: [1][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0854 
Epoch: [1][1300/1357] Elapsed 1m 8s (remain 0m 2s) Loss: 0.0840 
Epoch: [1][1356/1357] Elapsed 1m 11s (remain 0m 0s) Loss: 0.0830 
EVAL: [0/340] El

Epoch 1 - avg_train_loss: 0.0830  avg_val_loss: 0.0734  time: 76s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0830  avg_val_loss: 0.0734  time: 76s
Epoch 1 - Score: 0.6431117309778398
INFO:__main__:Epoch 1 - Score: 0.6431117309778398
Epoch 1 - Save Best Score: 0.6431 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6431 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0734 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 3m 54s) Loss: 0.0697 
Epoch: [2][100/1357] Elapsed 0m 5s (remain 1m 6s) Loss: 0.0650 
Epoch: [2][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.0568 
Epoch: [2][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0484 
Epoch: [2][400/1357] Elapsed 0m 21s (remain 0m 50s) Loss: 0.0507 
Epoch: [2][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.0499 
Epoch: [2][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0486 
Epoch: [2][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0482 
Epoch: [2][800/1357] Elapsed 0m 41s (remain 0m 29s) Loss: 0.0477 
Epoch: [2][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0497 
Epoch: [2][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0502 
Epoch: [2][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0496 
Epoch: [2][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0490 
Epoch: [2][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0485 
Epoch: [2][1356/1357] E

Epoch 2 - avg_train_loss: 0.0481  avg_val_loss: 0.0711  time: 76s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0481  avg_val_loss: 0.0711  time: 76s
Epoch 2 - Score: 0.6228589224540642
INFO:__main__:Epoch 2 - Score: 0.6228589224540642


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0711 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 3m 57s) Loss: 0.0011 
Epoch: [3][100/1357] Elapsed 0m 5s (remain 1m 7s) Loss: 0.0298 
Epoch: [3][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.0298 
Epoch: [3][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0318 
Epoch: [3][400/1357] Elapsed 0m 21s (remain 0m 50s) Loss: 0.0355 
Epoch: [3][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.0341 
Epoch: [3][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0363 
Epoch: [3][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0345 
Epoch: [3][800/1357] Elapsed 0m 41s (remain 0m 29s) Loss: 0.0343 
Epoch: [3][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0344 
Epoch: [3][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0331 
Epoch: [3][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0332 
Epoch: [3][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0333 
Epoch: [3][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0339 
Epoch: [3][1356/1357] E

Epoch 3 - avg_train_loss: 0.0343  avg_val_loss: 0.0617  time: 76s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0343  avg_val_loss: 0.0617  time: 76s
Epoch 3 - Score: 0.8149010477299186
INFO:__main__:Epoch 3 - Score: 0.8149010477299186
Epoch 3 - Save Best Score: 0.8149 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8149 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0617 


Score: 0.81490
INFO:__main__:Score: 0.81490
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: [1][0/1357] Elapsed 0m 0s (remain 4m 16s) Loss: 0.6451 
Epoch: [1][100/1357] Elapsed 0m 5s (remain 1m 8s) Loss: 0.1454 
Epoch: [1][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.1399 
Epoch: [1][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.1247 
Epoch: [1][400/1357] Elapsed 0m 21s (remain 0m 50s) Loss: 0.1167 
Epoch: [1][500/1357] Elapsed 0m 26s (remain 0m 45s) Loss: 0.1128 
Epoch: [1][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.1057 
Epoch: [1][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.1023 
Epoch: [1][800/1357] Elapsed 0m 42s (remain 0m 29s) Loss: 0.1004 
Epoch: [1][900/1357] Elapsed 0m 47s (remain 0m 23s) Loss: 0.0977 
Epoch: [1][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0945 
Epoch: [1][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0914 
Epoch: [1][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0897 
Epoch: [1][1300/1357] Elapsed 1m 8s (remain 0m 2s) Loss: 0.0869 
Epoch: [1][1356/1357] Elapsed 1m 11s (remain 0m 0s) Loss: 0.0853 
EVAL: [0/340] El

Epoch 1 - avg_train_loss: 0.0853  avg_val_loss: 0.0546  time: 76s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0853  avg_val_loss: 0.0546  time: 76s
Epoch 1 - Score: 0.8173365326934613
INFO:__main__:Epoch 1 - Score: 0.8173365326934613
Epoch 1 - Save Best Score: 0.8173 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.8173 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0546 
Epoch: [2][0/1357] Elapsed 0m 0s (remain 3m 50s) Loss: 0.0078 
Epoch: [2][100/1357] Elapsed 0m 5s (remain 1m 7s) Loss: 0.0497 
Epoch: [2][200/1357] Elapsed 0m 10s (remain 1m 1s) Loss: 0.0433 
Epoch: [2][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0476 
Epoch: [2][400/1357] Elapsed 0m 20s (remain 0m 50s) Loss: 0.0503 
Epoch: [2][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.0485 
Epoch: [2][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0502 
Epoch: [2][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0516 
Epoch: [2][800/1357] Elapsed 0m 41s (remain 0m 28s) Loss: 0.0517 
Epoch: [2][900/1357] Elapsed 0m 46s (remain 0m 23s) Loss: 0.0500 
Epoch: [2][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0499 
Epoch: [2][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0492 
Epoch: [2][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0488 
Epoch: [2][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0480 
Epoch: [2][1356/1357] E

Epoch 2 - avg_train_loss: 0.0485  avg_val_loss: 0.0502  time: 76s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0485  avg_val_loss: 0.0502  time: 76s
Epoch 2 - Score: 0.8376094174464231
INFO:__main__:Epoch 2 - Score: 0.8376094174464231
Epoch 2 - Save Best Score: 0.8376 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.8376 Model


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0502 
Epoch: [3][0/1357] Elapsed 0m 0s (remain 3m 52s) Loss: 0.0022 
Epoch: [3][100/1357] Elapsed 0m 5s (remain 1m 6s) Loss: 0.0240 
Epoch: [3][200/1357] Elapsed 0m 10s (remain 1m 0s) Loss: 0.0261 
Epoch: [3][300/1357] Elapsed 0m 15s (remain 0m 55s) Loss: 0.0263 
Epoch: [3][400/1357] Elapsed 0m 20s (remain 0m 49s) Loss: 0.0298 
Epoch: [3][500/1357] Elapsed 0m 26s (remain 0m 44s) Loss: 0.0303 
Epoch: [3][600/1357] Elapsed 0m 31s (remain 0m 39s) Loss: 0.0291 
Epoch: [3][700/1357] Elapsed 0m 36s (remain 0m 34s) Loss: 0.0291 
Epoch: [3][800/1357] Elapsed 0m 41s (remain 0m 28s) Loss: 0.0294 
Epoch: [3][900/1357] Elapsed 0m 46s (remain 0m 23s) Loss: 0.0290 
Epoch: [3][1000/1357] Elapsed 0m 52s (remain 0m 18s) Loss: 0.0292 
Epoch: [3][1100/1357] Elapsed 0m 57s (remain 0m 13s) Loss: 0.0297 
Epoch: [3][1200/1357] Elapsed 1m 2s (remain 0m 8s) Loss: 0.0292 
Epoch: [3][1300/1357] Elapsed 1m 7s (remain 0m 2s) Loss: 0.0286 
Epoch: [3][1356/1357] E

Epoch 3 - avg_train_loss: 0.0295  avg_val_loss: 0.0599  time: 76s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0295  avg_val_loss: 0.0599  time: 76s
Epoch 3 - Score: 0.7750736319950395
INFO:__main__:Epoch 3 - Score: 0.7750736319950395


EVAL: [339/340] Elapsed 0m 5s (remain 0m 0s) Loss: 0.0599 


Score: 0.83761
INFO:__main__:Score: 0.83761
Score: 0.83597
INFO:__main__:Score: 0.83597
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2553 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2553 [00:00<?, ?it/s]