好的！LoRA（Low-Rank Adaptation） 是一種輕量微調（Fine-tuning）方法，特別適合 低 VRAM 設備，因為它 只調整 Transformer 的部分參數，而不微調整個模型，從而大幅降低顯存需求。

In [1]:
import pandas as pd
data_path = "../data/open_source_8380.csv"  # 替換為你的檔案路徑
df = pd.read_csv(data_path)
df

Unnamed: 0,Inbound Message,Label
0,verified user details employee manager name ch...,0
1,hello team meetings skype meetings etc appeari...,0
2,cannot log vpn best,0
3,unable access tool page,0
4,skype error,0
...,...,...
8375,good afternoon receiving emails sent mail plea...,22
8376,telephony software issue,0
8377,vip windows password reset tifpdchb pedxruyf,0
8378,unable access machine utilities finish drawers...,44


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import pandas as pd
import numpy as np

# 設定 GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# 讀取 CSV 數據
file_path = "/mnt/data/open_source_8380.csv"
df = pd.read_csv(file_path)

# 定義 Few-Shot 訓練數據（每類取 5～10 筆）
few_shot_samples = df.groupby("Label").apply(lambda x: x.sample(n=min(10, len(x)))).reset_index(drop=True)

# 轉換成 Hugging Face Dataset 格式
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        return {key: val.squeeze(0) for key, val in encoding.items()}, torch.tensor(label)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 建立訓練數據集
train_dataset = CustomDataset(
    texts=few_shot_samples["Inbound Message"].tolist(),
    labels=few_shot_samples["Label"].tolist(),
    tokenizer=tokenizer
)

# LoRA 配置
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # 序列分類
    r=8,  # 低秩矩陣的維度
    lora_alpha=32,  # LoRA alpha 參數
    lora_dropout=0.1,  # Dropout
    target_modules=["query", "value"]  # 只調整注意力機制的 query 和 value 層
)

# 載入 BERT 模型
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(df["Label"].unique()))
model = get_peft_model(model, lora_config)  # 加入 LoRA

# 訓練參數
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,  # Batch size 設小，避免爆 VRAM
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    report_to="none"  # 不傳送到 Hugging Face Hub
)

# Trainer 物件
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# 開始訓練
trainer.train()

# 十折驗證法

In [14]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}") 
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
print(f"Current GPU Memory Allocated: {torch.cuda.memory_allocated() / 1e9} GB")
print(f"Current GPU Memory Cached: {torch.cuda.memory_reserved() / 1e9} GB")

Using device: cuda
CUDA Available: True
CUDA Device: NVIDIA GeForce RTX 3050 6GB Laptop GPU
Current GPU Memory Allocated: 1.802484224 GB
Current GPU Memory Cached: 3.401580544 GB


In [10]:
from transformers import BertTokenizer
from sklearn.model_selection import KFold
# 設定參數
MAX_LENGTH = 128
BATCH_SIZE = 16
EPOCHS = 20

# 初始化 BERT 的 tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 初始化數據
messages = df['Inbound Message'].tolist()
labels = df['Label'].tolist()

# KFold 初始化
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [4]:
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    def __init__(self, messages, labels, tokenizer, max_length):
        self.messages = messages
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.messages)

    def __getitem__(self, idx):
        message = str(self.messages[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            message,
            max_length=self.max_length,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
# 定義訓練與評估函數
def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
    
    return total_loss / len(dataloader)

In [12]:
def eval_model(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            total_loss += loss.item()

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
    
    accuracy = correct_predictions.double() / len(dataloader.dataset)
    return total_loss / len(dataloader), accuracy

In [13]:
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np

# 進行 10 折交叉驗證
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(messages)):
    print(f"Fold {fold + 1}")

    # 折分數據
    train_messages = [messages[i] for i in train_idx]
    train_labels = [labels[i] for i in train_idx]
    val_messages = [messages[i] for i in val_idx]
    val_labels = [labels[i] for i in val_idx]

    # 建立 Dataset 和 DataLoader
    train_dataset = CustomDataset(
        messages=train_messages,
        labels=train_labels,
        tokenizer=tokenizer,
        max_length=MAX_LENGTH
    )
    val_dataset = CustomDataset(
        messages=val_messages,
        labels=val_labels,
        tokenizer=tokenizer,
        max_length=MAX_LENGTH
    )
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # 初始化模型
    NUM_LABELS = len(set(labels))
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_LABELS, ignore_mismatched_sizes=True)
    model.to(device)

    # 初始化優化器和損失函數
    optimizer = AdamW(model.parameters(), lr=5e-5)
    criterion = CrossEntropyLoss()

    # Early stopping 參數
    patience = 3  # 連續幾個 epoch 沒有提升就停止
    best_val_loss = float('inf')
    epochs_without_improvement = 0
    
    # 訓練模型
    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        train_loss = train_epoch(model, train_dataloader, optimizer, criterion)
        val_loss, val_accuracy = eval_model(model, val_dataloader, criterion)
        print(f"Train loss: {train_loss} | Val loss: {val_loss} | Val accuracy: {val_accuracy}")

        # Early Stopping 判斷
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0  # 重置計數
        else:
            epochs_without_improvement += 1  # 增加計數
        
        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered at epoch {epoch + 1}")
            break  # 終止訓練

    # 保存每折的結果
    fold_results.append({"fold": fold + 1, "val_loss": val_loss, "val_accuracy": val_accuracy.item()})

# 計算總體平均結果
avg_val_loss = np.mean([result["val_loss"] for result in fold_results])
avg_val_accuracy = np.mean([result["val_accuracy"] for result in fold_results])

print(f"10-fold Cross-Validation Results:")
print(f"Average Validation Loss: {avg_val_loss}")
print(f"Average Validation Accuracy: {avg_val_accuracy}")

Fold 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:21<00:00,  2.35it/s]


Train loss: 1.9684618836489773 | Val loss: 1.6066787822066613 | Val accuracy: 0.6050119331742244
Epoch 2/20


100%|██████████| 472/472 [03:19<00:00,  2.37it/s]


Train loss: 1.44244602638281 | Val loss: 1.4014293298406422 | Val accuracy: 0.6396181384248211
Epoch 3/20


100%|██████████| 472/472 [03:19<00:00,  2.36it/s]


Train loss: 1.1143376033248034 | Val loss: 1.3731268405352 | Val accuracy: 0.6539379474940334
Epoch 4/20


100%|██████████| 472/472 [03:20<00:00,  2.36it/s]


Train loss: 0.8531525028964221 | Val loss: 1.4201320825882677 | Val accuracy: 0.6599045346062052
Epoch 5/20


100%|██████████| 472/472 [03:20<00:00,  2.35it/s]


Train loss: 0.644195708240998 | Val loss: 1.4609643156236072 | Val accuracy: 0.6229116945107399
Epoch 6/20


100%|██████████| 472/472 [03:19<00:00,  2.36it/s]


Train loss: 0.5100190417793722 | Val loss: 1.5433505767921232 | Val accuracy: 0.649164677804296
Early stopping triggered at epoch 6
Fold 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:19<00:00,  2.37it/s]


Train loss: 1.9742553517601247 | Val loss: 1.6349444085696958 | Val accuracy: 0.6181384248210025
Epoch 2/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.459527789915012 | Val loss: 1.4593111062949558 | Val accuracy: 0.6217183770883055
Epoch 3/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.1555060867725286 | Val loss: 1.414582686041886 | Val accuracy: 0.6527446300715991
Epoch 4/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.8950734835560039 | Val loss: 1.4402786780078456 | Val accuracy: 0.6551312649164678
Epoch 5/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.6916218278061409 | Val loss: 1.4958188989252414 | Val accuracy: 0.6563245823389022
Epoch 6/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.5314731493402841 | Val loss: 1.5821373271492292 | Val accuracy: 0.6288782816229117
Early stopping triggered at epoch 6
Fold 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:19<00:00,  2.37it/s]


Train loss: 1.9695514984807725 | Val loss: 1.6699603301174235 | Val accuracy: 0.594272076372315
Epoch 2/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 1.4756728250975326 | Val loss: 1.5143477793009776 | Val accuracy: 0.6085918854415274
Epoch 3/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 1.1502575636920283 | Val loss: 1.4315804704180304 | Val accuracy: 0.649164677804296
Epoch 4/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 0.8971369572425798 | Val loss: 1.46087062415087 | Val accuracy: 0.6408114558472554
Epoch 5/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.6717480150193481 | Val loss: 1.5188826994513565 | Val accuracy: 0.6503579952267303
Epoch 6/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.5156461417801299 | Val loss: 1.587437224275661 | Val accuracy: 0.6527446300715991
Early stopping triggered at epoch 6
Fold 4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.9836862865140883 | Val loss: 1.629950371553313 | Val accuracy: 0.5894988066825776
Epoch 2/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.483702512742099 | Val loss: 1.4355329173915792 | Val accuracy: 0.6229116945107399
Epoch 3/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 1.1787530099499528 | Val loss: 1.349069855685504 | Val accuracy: 0.6503579952267303
Epoch 4/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.905929312709782 | Val loss: 1.4067130347467818 | Val accuracy: 0.6563245823389022
Epoch 5/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.6819319546822522 | Val loss: 1.4810608861581334 | Val accuracy: 0.6348448687350836
Epoch 6/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.540361968247946 | Val loss: 1.4726181525104451 | Val accuracy: 0.6563245823389022
Early stopping triggered at epoch 6
Fold 5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 1.986720365487923 | Val loss: 1.6567273674146183 | Val accuracy: 0.5883054892601433
Epoch 2/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.4844833919936318 | Val loss: 1.5319658349145133 | Val accuracy: 0.6193317422434368
Epoch 3/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.2277932709430235 | Val loss: 1.446274825986826 | Val accuracy: 0.613365155131265
Epoch 4/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.9734150495190742 | Val loss: 1.4137413726662689 | Val accuracy: 0.613365155131265
Epoch 5/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.7562833770209829 | Val loss: 1.5481858776425415 | Val accuracy: 0.6467780429594272
Epoch 6/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.594547966337305 | Val loss: 1.5313587649813238 | Val accuracy: 0.6229116945107399
Epoch 7/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 0.4812797046013934 | Val loss: 1.6609298783653188 | Val accuracy: 0.6420047732696897
Early stopping triggered at epoch 7
Fold 6


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.9976675687199932 | Val loss: 1.8340930331428096 | Val accuracy: 0.5644391408114559
Epoch 2/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.5258256359878233 | Val loss: 1.58941898255978 | Val accuracy: 0.5871121718377088
Epoch 3/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.2041444061166149 | Val loss: 1.4948001762606062 | Val accuracy: 0.6229116945107399
Epoch 4/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.9370533030448576 | Val loss: 1.498134711441004 | Val accuracy: 0.6276849642004774
Epoch 5/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 0.7122204017083523 | Val loss: 1.5377196872009422 | Val accuracy: 0.616945107398568
Epoch 6/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 0.5437752527453131 | Val loss: 1.5675279959192816 | Val accuracy: 0.630071599045346
Early stopping triggered at epoch 6
Fold 7


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 1.9399234734601893 | Val loss: 1.4968295153581872 | Val accuracy: 0.6276849642004774
Epoch 2/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 1.4697138184713106 | Val loss: 1.4144841578771483 | Val accuracy: 0.6288782816229117
Epoch 3/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 1.1585763915110443 | Val loss: 1.3633243486566364 | Val accuracy: 0.6431980906921241
Epoch 4/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 0.9024665769410588 | Val loss: 1.3750107108422045 | Val accuracy: 0.6384248210023866
Epoch 5/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 0.6979694666056815 | Val loss: 1.3572308668550455 | Val accuracy: 0.6575178997613366
Epoch 6/20


100%|██████████| 472/472 [03:19<00:00,  2.37it/s]


Train loss: 0.533882397866287 | Val loss: 1.3939152196893152 | Val accuracy: 0.662291169451074
Epoch 7/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 0.4339661336882766 | Val loss: 1.4704520848562133 | Val accuracy: 0.6599045346062052
Epoch 8/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 0.35407408355277475 | Val loss: 1.617097261941658 | Val accuracy: 0.6551312649164678
Early stopping triggered at epoch 8
Fold 8


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.9992907503146236 | Val loss: 1.7482929859521255 | Val accuracy: 0.5871121718377088
Epoch 2/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.5431400010267557 | Val loss: 1.5232337701995418 | Val accuracy: 0.6217183770883055
Epoch 3/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.2572280691336777 | Val loss: 1.4365240920264766 | Val accuracy: 0.6396181384248211
Epoch 4/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 0.9934179958555911 | Val loss: 1.547347144135889 | Val accuracy: 0.6372315035799523
Epoch 5/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.8067623854504299 | Val loss: 1.506786120387743 | Val accuracy: 0.6396181384248211
Epoch 6/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.6264893585437182 | Val loss: 1.6267741094782668 | Val accuracy: 0.6479713603818616
Early stopping triggered at epoch 6
Fold 9


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:18<00:00,  2.37it/s]


Train loss: 1.9557508168331648 | Val loss: 1.7815079227933344 | Val accuracy: 0.5286396181384249
Epoch 2/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.489527616776147 | Val loss: 1.5605523507550079 | Val accuracy: 0.5847255369928401
Epoch 3/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.202445075673572 | Val loss: 1.3488499792116992 | Val accuracy: 0.6384248210023866
Epoch 4/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.9306567496288631 | Val loss: 1.3175628933141816 | Val accuracy: 0.6599045346062052
Epoch 5/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.7124215619070298 | Val loss: 1.4329317583228058 | Val accuracy: 0.6563245823389022
Epoch 6/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.5341724085353189 | Val loss: 1.5097187785607464 | Val accuracy: 0.6431980906921241
Epoch 7/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.4495799266733229 | Val loss: 1.487479492740811 | Val accuracy: 0.6694510739856803
Early stopping triggered at epoch 7
Fold 10


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.9657773641332732 | Val loss: 1.6069705722466954 | Val accuracy: 0.6193317422434368
Epoch 2/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 1.489341743479846 | Val loss: 1.4354533316954128 | Val accuracy: 0.6336515513126492
Epoch 3/20


100%|██████████| 472/472 [03:19<00:00,  2.37it/s]


Train loss: 1.1749951461235346 | Val loss: 1.3831188847433846 | Val accuracy: 0.6587112171837709
Epoch 4/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.9051675206618541 | Val loss: 1.3945068806972143 | Val accuracy: 0.6515513126491647
Epoch 5/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.6925637411521905 | Val loss: 1.352493039841922 | Val accuracy: 0.6730310262529833
Epoch 6/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.5591060591356481 | Val loss: 1.404189462931651 | Val accuracy: 0.6813842482100239
Epoch 7/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.4383112332325871 | Val loss: 1.451171434150552 | Val accuracy: 0.662291169451074
Epoch 8/20


100%|██████████| 472/472 [03:18<00:00,  2.38it/s]


Train loss: 0.3638418332093506 | Val loss: 1.574734388657336 | Val accuracy: 0.6479713603818616
Early stopping triggered at epoch 8
10-fold Cross-Validation Results:
Average Validation Loss: 1.5720086407830132
Average Validation Accuracy: 0.6479713603818615
