# 02. Model Training

**Target Metric: F1 Score**

**Reason: Imbalanced Label Distribution**

## 0. Imports & Setup

In [1]:
import platform, subprocess, psutil, torch, transformers, os, logging, warnings, json, yaml

import tensorflow as tf
import pandas as pd
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments,              #type: ignore
    Trainer,                        #type: ignore
    EarlyStoppingCallback,          #type: ignore
    get_linear_schedule_with_warmup #type: ignore
)

# ----------------------------------- SETUP ---------------------------------- #

# Load configuration
with open('../config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# matplotlib inline setup
%matplotlib inline

2025-09-01 16:07:13.786725: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756739233.797885   45037 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756739233.801379   45037 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756739233.811145   45037 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756739233.811154   45037 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756739233.811155   45037 computation_placer.cc:177] computation placer alr

## 1. System Constraints

In [2]:
def check_system_constraints():
    """Check system resources and constraints for model training"""
    
    # Basic system info
    print(f"Platform: {platform.system()} {platform.release()}")
    print(f"Python version: {platform.python_version()}")
    print(f"Architecture: {platform.machine()}")
    
    # Check available memory
    try:
        memory = psutil.virtual_memory()
        print(f"Total RAM: {memory.total / (1024**3):.1f} GB")
        print(f"Available RAM: {memory.available / (1024**3):.1f} GB")
        print(f"RAM Usage: {memory.percent}%")
        
        # Disk space
        disk = psutil.disk_usage('/')
        print(f"Total Disk: {disk.total / (1024**3):.1f} GB")
        print(f"Free Disk: {disk.free / (1024**3):.1f} GB")
    except ImportError:
        print("psutil not installed - install with: pip install psutil")
    
    # Check CUDA/GPU availability
    print("GPU CHECK")
    
    # Check NVIDIA GPU
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
        if result.returncode == 0:
            print("✓ NVIDIA GPU detected")
            # Extract basic GPU info
            lines = result.stdout.split('\n')
            for line in lines:
                if 'Tesla' in line or 'GeForce' in line or 'Quadro' in line or 'RTX' in line or 'GTX' in line:
                    print(f"GPU: {line.strip()}")
        else:
            print("✗ NVIDIA GPU not detected or nvidia-smi not available")
    except FileNotFoundError:
        print("✗ nvidia-smi not found")
    
    # Check PyTorch GPU support
    try:
        print(f"PyTorch version: {torch.__version__}")
        if torch.cuda.is_available():
            print(f"✓ CUDA available: {torch.version.cuda}")   #type: ignore
            print(f"✓ GPU count: {torch.cuda.device_count()}")
            for i in range(torch.cuda.device_count()):
                props = torch.cuda.get_device_properties(i)
                print(f"  GPU {i}: {props.name} ({props.total_memory / 1024**3:.1f} GB)")
        else:
            print("✗ CUDA not available in PyTorch")
    except ImportError:
        print("PyTorch not installed")
    
    # Check TensorFlow GPU support
    try:
        print(f"TensorFlow version: {tf.__version__}")
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            print(f"✓ TensorFlow GPU support: {len(gpus)} GPU(s)")
            for i, gpu in enumerate(gpus):
                print(f"  GPU {i}: {gpu.name}")
        else:
            print("✗ No GPU support in TensorFlow")
    except ImportError:
        print("TensorFlow not installed")
    
    # Check transformers library
    try:
        print(f"Transformers version: {transformers.__version__}")
    except ImportError:
        print("Transformers library not installed")
        
    print("\nRECOMMENDATIONS")
    
    # Memory recommendations
    try:
        if memory.available / (1024**3) < 8:    #type: ignore
            print("WARNING: Less than 8GB RAM available")
            print("   Consider closing other applications or using smaller batch sizes")
        
        if disk.free / (1024**3) < 10:          #type: ignore
            print("WARNING: Less than 10GB disk space available")
            print("   Model checkpoints and datasets may require significant space")
    except:
        pass
    
    # GPU recommendations
    try:
        if not torch.cuda.is_available():
            print("TIP: Training will use CPU only - consider using Google Colab or cloud GPU")
        elif torch.cuda.device_count() == 1:
            props = torch.cuda.get_device_properties(0)
            if props.total_memory < 8 * 1024**3:  # Less than 8GB VRAM
                print("TIP: GPU has limited memory - use smaller batch sizes and gradient accumulation")
    except:
        pass
    
    print("\nSystem constraints check complete")
    
    # Return device information for use in training
    return {
        'cuda_available': torch.cuda.is_available(),
        'gpu_count': torch.cuda.device_count() if torch.cuda.is_available() else 0,
        'memory_gb': memory.available / (1024**3) if 'memory' in locals() else None #type: ignore
    }

# Run the check and get system info
system_info = check_system_constraints()

# ------------------------------ GPU ACTIVATION ------------------------------ #
print("\nGPU ACTIVATION")

# Set up device for PyTorch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU activated: {torch.cuda.get_device_name(0)}")
    print(f"  Device: {device}")
    print(f"  Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(f"  Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
    
    # Set memory allocation strategy (optional)
    torch.cuda.empty_cache()  # Clear cache
    print("  GPU cache cleared")
else:
    device = torch.device("cpu")
    print("Using CPU for training")
    print(f"  Device: {device}")

print(f"Training device set to: {device}")

Platform: Linux 6.8.0-65-generic
Python version: 3.10.12
Architecture: x86_64
Total RAM: 31.2 GB
Available RAM: 21.7 GB
RAM Usage: 30.5%
Total Disk: 419.7 GB
Free Disk: 316.1 GB
GPU CHECK
✓ NVIDIA GPU detected
GPU: |   0  NVIDIA GeForce RTX 3070 Ti     On  |   00000000:07:00.0  On |                  N/A |
PyTorch version: 2.7.1+cu126
✓ CUDA available: 12.6
✓ GPU count: 1
  GPU 0: NVIDIA GeForce RTX 3070 Ti (7.7 GB)
TensorFlow version: 2.19.0
✓ TensorFlow GPU support: 1 GPU(s)
  GPU 0: /physical_device:GPU:0
Transformers version: 4.54.1

RECOMMENDATIONS
TIP: GPU has limited memory - use smaller batch sizes and gradient accumulation

System constraints check complete

GPU ACTIVATION
GPU activated: NVIDIA GeForce RTX 3070 Ti
  Device: cuda
  Memory allocated: 0.00 GB
  Memory reserved: 0.00 GB
  GPU cache cleared
Training device set to: cuda


## 2. Load Training, Validation, and Test Data

In [3]:
train, val, test = config['data']['train_data'], config['data']['val_data'], config['data']['test_data']

train, val, test = pd.read_csv(train), pd.read_csv(val), pd.read_csv(test)

print(f"Train set size: {len(train)}, Val set size: {len(val)}, Test set size: {len(test)}")
print("")
print(f"Train \n {train.head()}")
print("")
print(f"Validation \n {val.head()}")
print("")
print(f"Test \n {test.head()}")

Train set size: 3232, Val set size: 693, Test set size: 693

Train 
                                                 text  label
0  The contract covers new energy-efficient AC dr...      1
1  The cranes would be installed onboard two frei...      1
2  Inca Contract Manufacturing will carry out the...      1
3  Finnish metal components supplier Component AY...      2
4  ` Low energy consumption and flexible loading ...      1

Validation 
                                                 text  label
0  Concord would focus on the development, manufa...      1
1  Why put up costly cell phone towers in thinly ...      1
2  Forum needs a clear signal of commitment from ...      1
3  TomTom has given assurances that it will conti...      2
4  The company said it observed a current stabili...      2

Test 
                                                 text  label
0  The Swedish player became majority owner of Ce...      1
1  Ruukki's order book at the end of 2010 was 30%...      2
2  Operat

## 3. Load Metadata

### 3.1 Load Metadata

In [4]:
# ------------------------------ LOAD METADATA ------------------------------ #
metadata = config['data']['metadata']
with open(metadata, 'r') as f:
    metadata = json.load(f)
print("Metadata loaded successfully.")

Metadata loaded successfully.


### 3.2 Display Dataset Information

In [5]:
# ------------------------ DISPLAY DATASET INFORMATION ----------------------- #
print(f"\nDataset Summary:")
print(f"   Total samples: {metadata['dataset_info']['total_samples']:,}")
print(f"   Number of classes: {metadata['dataset_info']['num_classes']}")
print(f"   Class names: {metadata['dataset_info']['class_names']}")
print(f"   Processing date: {metadata['dataset_info']['processing_date']}")


Dataset Summary:
   Total samples: 4,618
   Number of classes: 3
   Class names: ['negative', 'positive', 'neutral']
   Processing date: 2025-08-09T17:27:36.071920


### 3.3 Display Data Splits

In [6]:
# ---------------------------- DISPLAY DATA SPLITS --------------------------- #
print(f"\nData Splits:")
for split in ['train', 'val', 'test']:
    if split in metadata['splits']:
        size = metadata['splits'][split]['size']
        pct = metadata['splits'][split]['percentage']
        print(f"   {split.capitalize()}: {size:,} samples ({pct:.1f}%)")


Data Splits:
   Train: 3,232 samples (70.0%)
   Val: 693 samples (15.0%)
   Test: 693 samples (15.0%)


### 3.4 Display Label Distribution

In [7]:
# ------------------------ DISPLAY LABEL DISTRIBUTION ------------------------ #
print(f"\nLabel Distribution:")
for split in ['train', 'val', 'test']:
    if split in metadata['label_distribution']:
        print(f"   {split.capitalize()}:")
        for label, count in metadata['label_distribution'][split].items():
            print(f"      {label}: {count}")


Label Distribution:
   Train:
      neutral: 1890
      positive: 932
      negative: 410
   Val:
      neutral: 405
      positive: 200
      negative: 88
   Test:
      neutral: 405
      positive: 200
      negative: 88


### 3.5 Display Model Configuration

In [8]:
# ------------------------ DISPLAY MODEL CONFIGURATION ----------------------- #
if 'model_info' in metadata:
    print(f"\nModel Configuration:")
    print(f"   Tokenizer: {metadata['model_info']['tokenizer']}")
    print(f"   Max length: {metadata['model_info']['max_length']}")
    print(f"   Vocab size: {metadata['model_info']['vocab_size']:,}")


Model Configuration:
   Tokenizer: ProsusAI/finbert
   Max length: 512
   Vocab size: 30,522


### 3.6 Display Text Statistics

In [9]:
# -------------------------- DISPLAY TEXT STATISTICS ------------------------- #
if 'text_statistics' in metadata:
    print(f"\nText Statistics:")
    print(f"   Avg characters: {metadata['text_statistics']['avg_char_length']:.1f}")
    print(f"   Avg words: {metadata['text_statistics']['avg_word_length']:.1f}")
    print(f"   Word range: {metadata['text_statistics']['min_word_length']}-{metadata['text_statistics']['max_word_length']} words")


Text Statistics:
   Avg characters: 129.7
   Avg words: 21.1
   Word range: 7-52 words


### 3.7 Display Validation Status

In [10]:
# ------------------------- DISPLAY VALIDATION STATUS ------------------------ #
if 'validation_results' in metadata:
    status = metadata['validation_results']['overall_status']
    print(f"\nData Validation Status: {status}")
    if status != 'PASS':
        print("   Some validation issues detected - check preprocessing notebook")


Data Validation Status: PASS


## 4. Load Label Encoder Artifacts

In [11]:
artifact_path = config['data'].get('label_encoder')
label_to_id = {}
id_to_label = {}
class_weights = None

# Try to load label-encoder artifact (preferred source)
if artifact_path and Path(artifact_path).exists():
    with open(artifact_path, 'r') as f:
        le_data = json.load(f)

    label_to_id = le_data.get('label_to_id', {})  # label name -> id
    raw_id_to_label = le_data.get('id_to_label', {})  # keys may be strings
    id_to_label = {int(k): v for k, v in raw_id_to_label.items()} if raw_id_to_label else {}

    # Normalize class_weights from artifact: keys may be string ids or label names
    raw_class_weights = le_data.get('class_weights') or {}
    cw = {}
    for k, v in raw_class_weights.items():
        try:
            key_int = int(k)
            cw[key_int] = float(v)
        except ValueError:
            # key is label name -> map via label_to_id if possible
            if label_to_id and k in label_to_id:
                cw[label_to_id[k]] = float(v)
    if cw:
        class_weights = cw

# If no usable class_weights from artifact, try metadata
if class_weights is None:
    ld = metadata.get('label_distribution', {})
    if 'class_weights' in ld:
        raw = ld['class_weights']
        cw = {}
        for k, v in raw.items():
            try:
                key_int = int(k)
                cw[key_int] = float(v)
            except ValueError:
                if label_to_id and k in label_to_id:
                    cw[label_to_id[k]] = float(v)
        if cw:
            class_weights = cw
    elif 'overall' in ld:
        overall_dist = ld['overall']
        print(f"   Overall label distribution: {overall_dist}")
        total_samples = sum(overall_dist.values())
        num_labels = len(overall_dist)

        # mapping: prefer label_to_id (artifact), otherwise use metadata class_names order
        if label_to_id:
            mapping = label_to_id
        else:
            mapping = {name: i for i, name in enumerate(metadata['dataset_info']['class_names'])}

        cw = {}
        for label, count in overall_dist.items():
            class_id = mapping.get(label)
            if class_id is None:
                try:
                    class_id = int(label)
                except Exception:
                    continue
            cw[class_id] = total_samples / (num_labels * max(1, count))
        class_weights = cw

# Fallback to equal weights
if class_weights is None:
    num_classes = metadata['dataset_info']['num_classes']
    class_weights = {i: 1.0 for i in range(num_classes)}

print(f"   Class weights: {class_weights}")

# Also extract other useful parameters
num_classes = metadata['dataset_info']['num_classes']
class_names = metadata['dataset_info']['class_names']
max_length = metadata['model_info']['max_length'] if 'model_info' in metadata else 512
tokenizer_name = metadata['model_info']['tokenizer'] if 'model_info' in metadata else 'ProsusAI/finbert'

print(f"   Number of classes: {num_classes}")
print(f"   Class names: {class_names}")
print(f"   Max sequence length: {max_length}")
print(f"   Tokenizer: {tokenizer_name}")

   Class weights: {0: 2.626848691695108, 1: 0.5701234567901234, 2: 1.1556556556556556}
   Number of classes: 3
   Class names: ['negative', 'positive', 'neutral']
   Max sequence length: 512
   Tokenizer: ProsusAI/finbert


## 5. Load FinBERT Tokenizer

In [12]:
try:
    cfg_tokenizer_path = Path(config['data']['tokenizer'])
    tokenizer_dir = cfg_tokenizer_path.parent if cfg_tokenizer_path.is_file() else cfg_tokenizer_path

    print("Using tokenizer directory:", tokenizer_dir)
    if tokenizer_dir.exists():
        finbert_tokenizer = AutoTokenizer.from_pretrained(str(tokenizer_dir), use_fast=True)
        print(f"FinBERT tokenizer loaded from local dir: {tokenizer_dir}")
    else:
        finbert_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
        print(f"FinBERT tokenizer loaded from hub: {tokenizer_name}")

    try:
        print(f"  - Vocab size: {len(finbert_tokenizer):,}")
    except Exception:
        pass
except Exception as e:
    print("Error loading FinBERT tokenizer:", e)

Using tokenizer directory: ../data/processed/artifacts/finbert_tokenizer
FinBERT tokenizer loaded from local dir: ../data/processed/artifacts/finbert_tokenizer
  - Vocab size: 30,522


## 6. Assign Class Weights

In [13]:
try:
    # Ensure required variables exist
    assert 'class_weights' in globals() and class_weights is not None

    # Build id -> label name mapping (prefer artifact id_to_label, then invert label_to_id, else use class_names)
    if id_to_label:
        id_to_name = id_to_label
    elif label_to_id:
        id_to_name = {int(v): k for k, v in label_to_id.items()}
    elif class_names:
        id_to_name = {i: name for i, name in enumerate(class_names)}
    else:
        id_to_name = {}

    # Map numeric class_weights -> label names
    mapped_class_weights = {}
    for cid in range(num_classes):
        w = float(class_weights.get(cid, 1.0))
        name = id_to_name.get(cid, str(cid))
        mapped_class_weights[name] = w

    # Prepare tensor for PyTorch loss (ordered by class id 0..num_classes-1)
    import torch
    class_weight_tensor = torch.tensor([float(class_weights.get(i, 1.0)) for i in range(num_classes)], dtype=torch.float)

    # Expose variables for downstream cells
    print("Class weights mapped to labels:")
    for name, w in mapped_class_weights.items():
        print(f"  {name}: {w:.4f}")
    print(f"PyTorch class weight tensor: {class_weight_tensor}")
except Exception as e:
    print("Error assigning class weights to labels:", e)

Class weights mapped to labels:
  negative: 2.6268
  positive: 0.5701
  neutral: 1.1557
PyTorch class weight tensor: tensor([2.6268, 0.5701, 1.1557])


## 7. Prepare Model

### 7.1 Load Model

In [14]:
model_name = "ProsusAI/finbert"

# Load model with proper configuration
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type="single_label_classification",
    # Add attention and hidden dropout for better regularization
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1
)

# Set label mappings for human-readable output
model.config.id2label = {i: name for i, name in enumerate(class_names)}
model.config.label2id = {name: i for i, name in enumerate(class_names)}

# Enhanced dropout configuration
if hasattr(model, 'classifier'):
    # Add dropout to the classification head if it exists
    if hasattr(model.classifier, 'dropout'):
        model.classifier.dropout = nn.Dropout(0.15)  # Slightly higher for classification layer
    
    # If classifier doesn't have dropout, add it before the final layer
    elif hasattr(model.classifier, 'dense'):
        original_classifier = model.classifier
        model.classifier = nn.Sequential(
            nn.Dropout(0.15),
            original_classifier
        )

# Move model to device
model.to(device)

# Create weighted loss function using precomputed class weights
class_weight_tensor = class_weight_tensor.to(device)  # Move weights to same device as model

# Create custom weighted CrossEntropyLoss
weighted_loss_fn = nn.CrossEntropyLoss(weight=class_weight_tensor, label_smoothing=0.1)

print(f"Class weights applied:")
for i, (name, weight) in enumerate(zip(class_names, class_weight_tensor.cpu())):
    print(f" {name} (id={i}): {weight:.4f}")

# Model summary
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen_params = total_params - trainable_params

print(f"\nModel Summary:")
print(f" Model: {model_name}")
print(f" Classes: {num_classes} ({', '.join(class_names)})")
print(f" Total parameters: {total_params:,}")
print(f" Trainable parameters: {trainable_params:,}")
print(f" Frozen parameters: {frozen_params:,}")
print(f" Model size: ~{total_params * 4 / 1024**2:.1f} MB")
print(f" Device: {device}")

Class weights applied:
 negative (id=0): 2.6268
 positive (id=1): 0.5701
 neutral (id=2): 1.1557

Model Summary:
 Model: ProsusAI/finbert
 Classes: 3 (negative, positive, neutral)
 Total parameters: 109,484,547
 Trainable parameters: 109,484,547
 Frozen parameters: 0
 Model size: ~417.7 MB
 Device: cuda


### 7.2 Custom Trainer

In [15]:
class WeightedTrainer(Trainer):
    """Custom trainer with weighted loss for class imbalance"""
    
    def compute_loss(self, model, inputs, return_outputs=False):    #type: ignore
        labels = inputs.get("labels")
        outputs = model(**inputs)
        
        if hasattr(outputs, 'logits'):
            logits = outputs.logits
        else:
            logits = outputs.get("logits") if isinstance(outputs, dict) else outputs[1]
        
        # Use weighted loss function
        loss = weighted_loss_fn(logits, labels)
        
        return (loss, outputs) if return_outputs else loss

In [16]:
def compute_metrics(eval_pred):
    """
    Comprehensive metrics computation:
    - Macro-averaged F1, weighted F1, per-class F1
    - Precision and recall for each class
    - Accuracy for baseline comparison
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    try:
        # Core metrics
        accuracy = accuracy_score(labels, predictions)
        f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)
        f1_weighted = f1_score(labels, predictions, average='weighted', zero_division=0)
        f1_micro = f1_score(labels, predictions, average='micro', zero_division=0)
        f1_per_class = f1_score(labels, predictions, average=None, zero_division=0)
        
        # Detailed classification report
        report = classification_report(
            labels, predictions, 
            target_names=class_names, 
            output_dict=True, 
            zero_division=0
        )
        
        # Build comprehensive metrics dictionary
        metrics = {
            'accuracy': float(accuracy),
            'f1_macro': float(f1_macro),        # Primary metric for imbalanced data
            'f1_weighted': float(f1_weighted),   # Weighted by support
            'f1_micro': float(f1_micro),        # Overall accuracy equivalent
        }
        
        # Add per-class metrics
        for i, class_name in enumerate(class_names):
            if i < len(f1_per_class):                                                                           #type: ignore
                metrics[f'f1_{class_name}'] = float(f1_per_class[i])                                            #type: ignore
                metrics[f'precision_{class_name}'] = float(report.get(class_name, {}).get('precision', 0.0))    #type: ignore
                metrics[f'recall_{class_name}'] = float(report.get(class_name, {}).get('recall', 0.0))          #type: ignore
        
        return metrics
        
    except Exception as e:
        print(f"Error computing metrics: {e}")
        return {'accuracy': 0.0, 'f1_macro': 0.0}

In [17]:
def plot_confusion_matrix(predictions, labels, class_names, title="Confusion Matrix"):
    """Plot confusion matrix for detailed analysis"""
    cm = confusion_matrix(labels, predictions)
    
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=class_names, yticklabels=class_names)
    plt.title(title)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()
    
    return cm

### 7.3 Training Configuration

In [18]:
gpu_optimized_batch_size = 32
if system_info.get('memory_gb', 0) < 16:
    gpu_optimized_batch_size = 24

# Gradient accumulation for smaller batch sizes
gradient_accumulation_steps = 1
if gpu_optimized_batch_size < 32:
    gradient_accumulation_steps = 2  # Effective batch size = 24*2 = 48

eval_steps = 200
save_steps = 600  # Multiple of eval_steps for compatibility

# Create optimized training arguments
training_args = TrainingArguments(
    output_dir="../models/finbert_sentiment",
    
    num_train_epochs=4,  # Conservative for stability
    
    # Batch sizes: 16-32 range, optimized for GPU
    per_device_train_batch_size=gpu_optimized_batch_size,
    per_device_eval_batch_size=gpu_optimized_batch_size * 2,  # Larger for eval
    
    # Learning rate: 1e-5 to 5e-5 range for BERT
    learning_rate=2e-5,  # Sweet spot for FinBERT
    
    # AdamW optimizer with weight decay (automatically used)
    weight_decay=0.01,
    
    # Gradient accumulation for effective larger batch size
    gradient_accumulation_steps=gradient_accumulation_steps,
    max_grad_norm=1.0,  # Gradient clipping
    
    # Mixed precision training (fp16) for RTX 3070 Ti
    fp16=True,
    
    # Data loading optimization
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
    
    # Evaluation strategy: steps-based for frequent monitoring
    eval_strategy="steps",
    eval_steps=eval_steps,
    
    # Save strategy aligned with evaluation
    save_strategy="steps", 
    save_steps=save_steps,
    save_total_limit=3,  # Keep only best 3 checkpoints
    
    # Load best model at end based on F1-score
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",  # F1-macro for imbalanced data
    greater_is_better=True,
    
    # Monitoring and logging
    logging_steps=50,
    logging_strategy="steps",
    
    # Disable external services
    report_to=None,
    push_to_hub=False,
)

# The Trainer will use get_linear_schedule_with_warmup by default

# Early stopping monitoring F1-score
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.001
)

print(f"\nTraining Configuration Summary:")
print(f" Batch size: {gpu_optimized_batch_size} (train) / {gpu_optimized_batch_size * 2} (eval)")
print(f" Learning rate: {training_args.learning_rate} (BERT fine-tuning range)")
print(f" Epochs: {training_args.num_train_epochs} (BERT recommended)")
print(f" FP16: {training_args.fp16}")
print(f" Gradient accumulation: {gradient_accumulation_steps} steps")
print(f" Primary metric: F1-macro (imbalanced data focus)")
print(f" Eval/Save steps: {eval_steps}/{save_steps}")
print(f" Early stopping: 3 patience on F1-macro")


Training Configuration Summary:
 Batch size: 32 (train) / 64 (eval)
 Learning rate: 2e-05 (BERT fine-tuning range)
 Epochs: 4 (BERT recommended)
 FP16: True
 Gradient accumulation: 1 steps
 Primary metric: F1-macro (imbalanced data focus)
 Eval/Save steps: 200/600
 Early stopping: 3 patience on F1-macro


### 7.4 Wrap DataFrame in DataSet

In [19]:
class FinSentDataset(torch.utils.data.Dataset):
    """Lightweight Dataset wrapping a pandas DataFrame for Trainer"""
    def __init__(self, df, tokenizer, max_length=512, text_col='text', label_col='label'):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_col = text_col
        self.label_col = label_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = str(row[self.text_col])
        label = int(row[self.label_col])
        enc = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Create dataset instances expected by Trainer
train_dataset = FinSentDataset(train, finbert_tokenizer, max_length=max_length, text_col='text', label_col='label')
val_dataset   = FinSentDataset(val, finbert_tokenizer,   max_length=max_length, text_col='text', label_col='label')
test_dataset  = FinSentDataset(test, finbert_tokenizer,  max_length=max_length, text_col='text', label_col='label')

print(f"Datasets: train={len(train_dataset)}, val={len(val_dataset)}, test={len(test_dataset)}")

Datasets: train=3232, val=693, test=693


### 7.5 Initialise Trainer

In [20]:
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=finbert_tokenizer,
    compute_metrics=compute_metrics,  # F1-focused metrics
    callbacks=[early_stopping],      # F1-based early stopping
)