# 02. Model Training

**Target Metric: F1 Score**

**Reason: Imbalanced Label Distribution**

## 0. Imports & Setup

In [1]:
import platform, subprocess, psutil, torch, transformers, os, logging, warnings, json, yaml

import tensorflow as tf
import pandas as pd
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments,              #type: ignore
    Trainer,                        #type: ignore
    EarlyStoppingCallback,          #type: ignore
    get_linear_schedule_with_warmup #type: ignore
)

# ----------------------------------- SETUP ---------------------------------- #

# Load the project configuration (YAML) into `config`.
# The path is relative, so it resolves against the notebook's current
# working directory. `safe_load` is used, which only builds plain Python
# objects from the YAML document.
with open('../config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output.
%matplotlib inline

2025-08-12 17:00:55.906130: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755014455.917668   16260 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755014455.921105   16260 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755014455.931308   16260 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755014455.931319   16260 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755014455.931320   16260 computation_placer.cc:177] computation placer alr

## 1. System Constraints

In [2]:
def check_system_constraints():
    """Print a report of host resources and GPU support for model training.

    The report covers platform details, RAM and disk usage (via psutil),
    the output of ``nvidia-smi``, and the GPU visibility of PyTorch and
    TensorFlow, followed by a few sizing recommendations.

    Every probe is best-effort: a probe that fails prints a short message
    and the remaining probes still run.

    Returns:
        dict: Device information for use in training, with the keys

        - ``'cuda_available'``: result of ``torch.cuda.is_available()``.
        - ``'gpu_count'``: ``torch.cuda.device_count()`` when CUDA is
          available, otherwise ``0``.
        - ``'memory_gb'``: available RAM in GiB, or ``None`` when the
          memory statistics could not be read.
    """
    gib = 1024**3  # bytes per GiB; all sizes below are reported in GiB

    # Basic system info
    print(f"Platform: {platform.system()} {platform.release()}")
    print(f"Python version: {platform.python_version()}")
    print(f"Architecture: {platform.machine()}")

    # Check available memory and disk space.
    # Both names are pre-bound so later sections can test for None instead of
    # relying on NameError / locals() to detect a failed probe.
    memory = None
    disk = None
    try:
        memory = psutil.virtual_memory()
        print(f"Total RAM: {memory.total / gib:.1f} GB")
        print(f"Available RAM: {memory.available / gib:.1f} GB")
        print(f"RAM Usage: {memory.percent}%")

        # Disk space
        disk = psutil.disk_usage('/')
        print(f"Total Disk: {disk.total / gib:.1f} GB")
        print(f"Free Disk: {disk.free / gib:.1f} GB")
    except (OSError, psutil.Error) as exc:
        # psutil is imported at module level, so ImportError cannot occur
        # here; the realistic failures are OS-level query errors.
        print(f"Could not read memory/disk statistics: {exc}")

    # Check CUDA/GPU availability
    print("GPU CHECK")

    # Check NVIDIA GPU by running the driver's command-line tool
    gpu_brands = ('Tesla', 'GeForce', 'Quadro', 'RTX', 'GTX')
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    except FileNotFoundError:
        print("✗ nvidia-smi not found")
    except OSError as exc:
        print(f"✗ nvidia-smi could not be run: {exc}")
    else:
        if result.returncode == 0:
            print("✓ NVIDIA GPU detected")
            # Extract basic GPU info: echo the table rows naming a known brand
            for line in result.stdout.split('\n'):
                if any(brand in line for brand in gpu_brands):
                    print(f"GPU: {line.strip()}")
        else:
            print("✗ NVIDIA GPU not detected or nvidia-smi not available")

    # Check PyTorch GPU support
    print(f"PyTorch version: {torch.__version__}")
    cuda_available = torch.cuda.is_available()
    gpu_count = torch.cuda.device_count() if cuda_available else 0
    gpu_props = []  # per-device properties, reused by the recommendations
    if cuda_available:
        print(f"✓ CUDA available: {torch.version.cuda}")   #type: ignore
        print(f"✓ GPU count: {gpu_count}")
        try:
            for i in range(gpu_count):
                props = torch.cuda.get_device_properties(i)
                gpu_props.append(props)
                print(f"  GPU {i}: {props.name} ({props.total_memory / gib:.1f} GB)")
        except (RuntimeError, AssertionError) as exc:
            print(f"  Could not query GPU properties: {exc}")
    else:
        print("✗ CUDA not available in PyTorch")

    # Check TensorFlow GPU support
    print(f"TensorFlow version: {tf.__version__}")
    try:
        gpus = tf.config.experimental.list_physical_devices('GPU')
    except RuntimeError as exc:
        print(f"✗ Could not query TensorFlow devices: {exc}")
    else:
        if gpus:
            print(f"✓ TensorFlow GPU support: {len(gpus)} GPU(s)")
            for i, gpu in enumerate(gpus):
                print(f"  GPU {i}: {gpu.name}")
        else:
            print("✗ No GPU support in TensorFlow")

    # Check transformers library
    print(f"Transformers version: {transformers.__version__}")

    print("\nRECOMMENDATIONS")

    # Memory recommendations (skipped when the statistics were unavailable)
    if memory is not None and memory.available / gib < 8:
        print("WARNING: Less than 8GB RAM available")
        print("   Consider closing other applications or using smaller batch sizes")

    if disk is not None and disk.free / gib < 10:
        print("WARNING: Less than 10GB disk space available")
        print("   Model checkpoints and datasets may require significant space")

    # GPU recommendations
    if not cuda_available:
        print("TIP: Training will use CPU only - consider using Google Colab or cloud GPU")
    elif gpu_count == 1 and gpu_props:
        if gpu_props[0].total_memory < 8 * gib:  # Less than 8GB VRAM
            print("TIP: GPU has limited memory - use smaller batch sizes and gradient accumulation")

    print("\nSystem constraints check complete")

    # Return device information for use in training
    return {
        'cuda_available': cuda_available,
        'gpu_count': gpu_count,
        'memory_gb': memory.available / gib if memory is not None else None,
    }

# Run the system report once and keep its summary dict
system_info = check_system_constraints()

# ------------------------------ GPU ACTIVATION ------------------------------ #
print("\nGPU ACTIVATION")

# Choose the PyTorch device: CUDA when available, CPU otherwise
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if not use_cuda:
    print("Using CPU for training")
    print(f"  Device: {device}")
else:
    print(f"GPU activated: {torch.cuda.get_device_name(0)}")
    print(f"  Device: {device}")
    # Memory figures for device 0, converted from bytes to GiB
    allocated_gb = torch.cuda.memory_allocated(0) / 1024**3
    reserved_gb = torch.cuda.memory_reserved(0) / 1024**3
    print(f"  Memory allocated: {allocated_gb:.2f} GB")
    print(f"  Memory reserved: {reserved_gb:.2f} GB")

    # Release cached blocks held by PyTorch's allocator (optional)
    torch.cuda.empty_cache()
    print("  GPU cache cleared")

print(f"Training device set to: {device}")

Platform: Linux 6.8.0-65-generic
Python version: 3.10.12
Architecture: x86_64
Total RAM: 31.2 GB
Available RAM: 21.6 GB
RAM Usage: 30.8%
Total Disk: 419.7 GB
Free Disk: 338.8 GB
GPU CHECK
✓ NVIDIA GPU detected
GPU: |   0  NVIDIA GeForce RTX 3070 Ti     On  |   00000000:07:00.0  On |                  N/A |
PyTorch version: 2.7.1+cu126
✓ CUDA available: 12.6
✓ GPU count: 1
  GPU 0: NVIDIA GeForce RTX 3070 Ti (7.7 GB)
TensorFlow version: 2.19.0
✓ TensorFlow GPU support: 1 GPU(s)
  GPU 0: /physical_device:GPU:0
Transformers version: 4.54.1

RECOMMENDATIONS
TIP: GPU has limited memory - use smaller batch sizes and gradient accumulation

System constraints check complete

GPU ACTIVATION
GPU activated: NVIDIA GeForce RTX 3070 Ti
  Device: cuda
  Memory allocated: 0.00 GB
  Memory reserved: 0.00 GB
  GPU cache cleared
Training device set to: cuda


## 2. Load Training, Validation, and Test Data

In [3]:
# Resolve all three CSV paths from the config first, then read them
split_paths = [config['data'][key] for key in ('train_data', 'val_data', 'test_data')]
train, val, test = (pd.read_csv(path) for path in split_paths)

print(f"Train set size: {len(train)}, Val set size: {len(val)}, Test set size: {len(test)}")

# Preview the first rows of each split, separated by a blank line
for title, frame in (("Train", train), ("Validation", val), ("Test", test)):
    print("")
    print(f"{title} \n {frame.head()}")

Train set size: 3232, Val set size: 693, Test set size: 693

Train 
                                                 text  label
0  The contract covers new energy-efficient AC dr...      1
1  The cranes would be installed onboard two frei...      1
2  Inca Contract Manufacturing will carry out the...      1
3  Finnish metal components supplier Component AY...      2
4  ` Low energy consumption and flexible loading ...      1

Validation 
                                                 text  label
0  Concord would focus on the development, manufa...      1
1  Why put up costly cell phone towers in thinly ...      1
2  Forum needs a clear signal of commitment from ...      1
3  TomTom has given assurances that it will conti...      2
4  The company said it observed a current stabili...      2

Test 
                                                 text  label
0  The Swedish player became majority owner of Ce...      1
1  Ruukki's order book at the end of 2010 was 30%...      2
2  Operat

## 3. Load Metadata

### 3.1 Load Metadata

In [4]:
# ------------------------------ LOAD METADATA ------------------------------ #
# The metadata JSON path comes from the config; the parsed content is bound
# to `metadata`.
with open(config['data']['metadata'], 'r') as metadata_file:
    metadata = json.load(metadata_file)
print("Metadata loaded successfully.")

Metadata loaded successfully.


### 3.2 Display Dataset Information

In [5]:
# ------------------------ DISPLAY DATASET INFORMATION ----------------------- #
print("\nDataset Summary:")
dataset_info = metadata['dataset_info']  # shorthand for the repeated lookup
print(f"   Total samples: {dataset_info['total_samples']:,}")
print(f"   Number of classes: {dataset_info['num_classes']}")
print(f"   Class names: {dataset_info['class_names']}")
print(f"   Processing date: {dataset_info['processing_date']}")


Dataset Summary:
   Total samples: 4,618
   Number of classes: 3
   Class names: ['negative', 'positive', 'neutral']
   Processing date: 2025-08-09T17:27:36.071920


### 3.3 Display Data Splits

In [6]:
# ---------------------------- DISPLAY DATA SPLITS --------------------------- #
print("\nData Splits:")
for split in ('train', 'val', 'test'):
    # Splits missing from the metadata are skipped silently
    if split not in metadata['splits']:
        continue
    split_entry = metadata['splits'][split]
    print(f"   {split.capitalize()}: {split_entry['size']:,} samples ({split_entry['percentage']:.1f}%)")


Data Splits:
   Train: 3,232 samples (70.0%)
   Val: 693 samples (15.0%)
   Test: 693 samples (15.0%)


### 3.4 Display Label Distribution

In [7]:
# ------------------------ DISPLAY LABEL DISTRIBUTION ------------------------ #
print("\nLabel Distribution:")
for split in ('train', 'val', 'test'):
    # Splits missing from the metadata are skipped silently
    if split not in metadata['label_distribution']:
        continue
    print(f"   {split.capitalize()}:")
    per_label_counts = metadata['label_distribution'][split]
    for label, count in per_label_counts.items():
        print(f"      {label}: {count}")


Label Distribution:
   Train:
      neutral: 1890
      positive: 932
      negative: 410
   Val:
      neutral: 405
      positive: 200
      negative: 88
   Test:
      neutral: 405
      positive: 200
      negative: 88


### 3.5 Display Model Configuration

In [8]:
# ------------------------ DISPLAY MODEL CONFIGURATION ----------------------- #
# The section is optional: nothing is printed when 'model_info' is absent
if 'model_info' in metadata:
    print("\nModel Configuration:")
    model_info = metadata['model_info']
    print(f"   Tokenizer: {model_info['tokenizer']}")
    print(f"   Max length: {model_info['max_length']}")
    print(f"   Vocab size: {model_info['vocab_size']:,}")


Model Configuration:
   Tokenizer: ProsusAI/finbert
   Max length: 512
   Vocab size: 30,522


### 3.6 Display Text Statistics

In [9]:
# -------------------------- DISPLAY TEXT STATISTICS ------------------------- #
# The section is optional: nothing is printed when 'text_statistics' is absent
if 'text_statistics' in metadata:
    print("\nText Statistics:")
    text_stats = metadata['text_statistics']
    print(f"   Avg characters: {text_stats['avg_char_length']:.1f}")
    print(f"   Avg words: {text_stats['avg_word_length']:.1f}")
    print(f"   Word range: {text_stats['min_word_length']}-{text_stats['max_word_length']} words")


Text Statistics:
   Avg characters: 129.7
   Avg words: 21.1
   Word range: 7-52 words


### 3.7 Display Validation Status

In [10]:
# ------------------------- DISPLAY VALIDATION STATUS ------------------------ #
# The section is optional: nothing is printed when 'validation_results' is absent
if 'validation_results' in metadata:
    status = metadata['validation_results']['overall_status']
    validation_passed = status == 'PASS'
    print(f"\nData Validation Status: {status}")
    if not validation_passed:
        print("   Some validation issues detected - check preprocessing notebook")


Data Validation Status: PASS


### 3.8 Extract Important Training Parameters

In [11]:
# ------------------- EXTRACT IMPORTANT TRAINING PARAMETERS ------------------ #
print("\nTraining Parameters:")

# Class weights come from one of three places, in order of preference:
#   1. pre-computed weights stored under label_distribution['class_weights']
#   2. inverse-frequency weights derived from label_distribution['overall']
#   3. equal weights for every class
label_distribution = metadata['label_distribution']
if 'class_weights' in label_distribution:
    class_weights_raw = label_distribution['class_weights']
    class_weights = {int(k): float(v) for k, v in class_weights_raw.items()}
elif 'overall' in label_distribution:
    overall_dist = label_distribution['overall']
    print(f"   Overall label distribution: {overall_dist}")

    # Inverse-frequency weighting: weight = total / (num_classes * count)
    total_samples = sum(overall_dist.values())
    num_classes = len(overall_dist)

    # Map label names to IDs (assuming order: negative=0, neutral=1, positive=2).
    # NOTE(review): metadata 'class_names' lists the classes in a different
    # order than this mapping - confirm which order the label encoder used.
    label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}

    # Labels missing from the mapping all fall back to id len(label_to_id)
    class_weights = {
        label_to_id.get(label, len(label_to_id)): total_samples / (num_classes * count)
        for label, count in overall_dist.items()
    }
else:
    print("   Class weights not found in metadata, using equal weights")
    num_classes = metadata['dataset_info']['num_classes']
    class_weights = {i: 1.0 for i in range(num_classes)}

print(f"   Class weights: {class_weights}")

# Remaining parameters; the tokenizer settings fall back to FinBERT defaults
# when the metadata has no 'model_info' section
num_classes = metadata['dataset_info']['num_classes']
class_names = metadata['dataset_info']['class_names']
if 'model_info' in metadata:
    max_length = metadata['model_info']['max_length']
    tokenizer_name = metadata['model_info']['tokenizer']
else:
    max_length = 512
    tokenizer_name = 'ProsusAI/finbert'

print(f"   Number of classes: {num_classes}")
print(f"   Class names: {class_names}")
print(f"   Max sequence length: {max_length}")
print(f"   Tokenizer: {tokenizer_name}")

print("\nMetadata analysis complete!")


Training Parameters:
   Overall label distribution: {'neutral': 2700, 'positive': 1332, 'negative': 586}
   Class weights: {1: 0.5701234567901234, 2: 1.1556556556556556, 0: 2.626848691695108}
   Number of classes: 3
   Class names: ['negative', 'positive', 'neutral']
   Max sequence length: 512
   Tokenizer: ProsusAI/finbert

Metadata analysis complete!


## 4. Load Label Encoder Artifacts

## 5. Load FinBERT Tokenizer

## 6. Assign Class Weights

## 7. Prepare Model

Load the pre-trained "ProsusAI/finbert" model with a classification head

Decide on fine-tuning strategy (freeze early layers vs. full fine-tuning)

Consider adding dropout layers for regularization

Set up the model to output logits for your 3 sentiment classes

Move the model to your available device (GPU/CPU)

Batch size: Start with 16 or 32, adjust based on GPU memory

Learning rate: Typical range 1e-5 to 5e-5 for BERT fine-tuning

Number of epochs: Usually 3-5 epochs for BERT models

Optimizer: AdamW with weight decay

Scheduler: Linear warmup with decay

Early stopping: Monitor validation F1-score, not just loss

Use the class weights you calculated in preprocessing

Consider weighted loss function (CrossEntropyLoss with weight parameter)

Alternative: Focal Loss for hard example mining

Monitor per-class metrics, not just overall accuracy

For F1-score focus:


Implement custom compute_metrics function for Trainer

Calculate macro-averaged F1, weighted F1, and per-class F1

Include precision and recall for each class

Generate confusion matrices for detailed analysis

Output directory for model checkpoints

Evaluation strategy (steps vs. epochs)

Save strategy aligned with evaluation

Load best model at end based on F1-score

Gradient accumulation if using small batch sizes

Mixed precision training (fp16) if GPU supports it

Your model and tokenizer

Training and validation datasets

Training arguments

Custom compute_metrics function

Early stopping callback

Potentially custom loss function with class weights

Checkpoint saving strategy

Logging frequency for monitoring

Validation frequency

Memory management during training

How to handle potential GPU memory issues