In [1]:
import json
import torch
import numpy as np
from pathlib import Path
from datetime import datetime
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    TrainerCallback
)
from torch.utils.data import Dataset
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
import warnings
import sys
from time import time

warnings.filterwarnings('ignore')
import gc
torch.cuda.empty_cache()
gc.collect()

print("All imports successful!")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


sys.path.append('./model')
from label_config import label2id, id2label, ENTITY_LABELS
print(f"Loaded {len(label2id)} labels, {len(ENTITY_LABELS)} entity types")

def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
set_seed(42)
print("Random seed set")


class FinancialNERDataset(Dataset):
    def __init__(self, jsonl_file, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    self.data.append(json.loads(line.strip()))
        print(f"Loaded {len(self.data)} examples from {jsonl_file.name}")
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        labels = item['ner_tag_ids']
        encoding = self.tokenizer(
            item['text'],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        padded_labels = labels + [-100] * (self.max_length - len(labels))
        padded_labels = padded_labels[:self.max_length]
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(padded_labels, dtype=torch.long)
        }

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)
    true_labels = []
    pred_labels = []
    for pred_seq, label_seq in zip(predictions, labels):
        true_seq = []
        pred_seq_labels = []
        for pred_id, label_id in zip(pred_seq, label_seq):
            if label_id != -100:
                true_seq.append(id2label[label_id])
                pred_seq_labels.append(id2label[pred_id])
        if true_seq:
            true_labels.append(true_seq)
            pred_labels.append(pred_seq_labels)
    return {
        'precision': precision_score(true_labels, pred_labels),
        'recall': recall_score(true_labels, pred_labels),
        'f1': f1_score(true_labels, pred_labels)
    }

print("All functions defined")


All imports successful!
Loaded 53 labels, 26 entity types
Random seed set
All functions defined


In [2]:
base_dir = Path.cwd().parent
data_dir = base_dir / 'data' / 'merged'
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = base_dir / 'model_outputs' / f'finbert_ner_{timestamp}'
output_dir.mkdir(parents=True, exist_ok=True)


print("LOADING MODEL AND DATA")

model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    use_safetensors=True
)
model.to(device)
print(f"Model loaded: {model_name}")

train_dataset = FinancialNERDataset(data_dir / 'train.jsonl', tokenizer)
val_dataset = FinancialNERDataset(data_dir / 'val.jsonl', tokenizer)
test_dataset = FinancialNERDataset(data_dir / 'test.jsonl', tokenizer)

print(f"\nDatasets loaded")
print(f"  Train: {len(train_dataset)}")
print(f"  Val: {len(val_dataset)}")
print(f"  Test: {len(test_dataset)}")

batch_size = 10
gradient_accumulation = 2
num_epochs = 5
learning_rate = 3e-5


LOADING MODEL AND DATA


Some weights of BertForTokenClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([53]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([53, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: ProsusAI/finbert
Loaded 19277 examples from train.jsonl
Loaded 4060 examples from val.jsonl
Loaded 3560 examples from test.jsonl

Datasets loaded
  Train: 19277
  Val: 4060
  Test: 3560


In [3]:
print(f"\nTraining Hyperparameters:")
print(f"  Epochs:                    {num_epochs}")
print(f"  Batch Size:                {batch_size}")
print(f"  Gradient Accumulation:     {gradient_accumulation}")
print(f"  Effective Batch Size:      {batch_size * gradient_accumulation}")
print(f"  Learning Rate:             {learning_rate}")
print(f"  Weight Decay:              0.01")
print(f"  Warmup Ratio:              0.1")


Training Hyperparameters:
  Epochs:                    5
  Batch Size:                10
  Gradient Accumulation:     2
  Effective Batch Size:      20
  Learning Rate:             3e-05
  Weight Decay:              0.01
  Warmup Ratio:              0.1


In [4]:
print("STARTING TRAINING")

if torch.cuda.is_available():
    torch.cuda.empty_cache()

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir=str(output_dir),
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=3,
    learning_rate=learning_rate,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=gradient_accumulation,
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    logging_steps=10,  
    logging_strategy='steps',
    dataloader_num_workers=0,
    dataloader_pin_memory=False,
    seed=42,
    report_to='none',
    disable_tqdm=True
)

class Every10StepsCallback(TrainerCallback):
    def __init__(self):
        self.start_time = None
        self.epoch_start_time = None
        
    def on_train_begin(self, args, state, control, **kwargs):
        self.start_time = time()
        total_steps = state.max_steps
        print(f"Training for {args.num_train_epochs} epochs ({total_steps} total steps)")

    
    def on_epoch_begin(self, args, state, control, **kwargs):
        self.epoch_start_time = time()
        epoch_num = int(state.epoch) + 1
        print(f"\nEpoch {epoch_num}/{args.num_train_epochs}")
        print("─"*80)
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and state.global_step > 0:
            step = state.global_step
            max_step = state.max_steps
            progress = (step / max_step) * 100
            
         
            if 'loss' in logs:
                msg = f"Step {step:4d}/{max_step} ({progress:5.1f}%)"
                msg += f" │ Loss: {logs['loss']:.4f}"
                
                if 'learning_rate' in logs:
                    msg += f" │ LR: {logs['learning_rate']:.2e}"
                
                print(msg)
            
          
            elif 'eval_f1' in logs:
                print(f"─"*80)
                print(f"Validation │ F1: {logs['eval_f1']:.4f} │ "
                      f"Precision: {logs['eval_precision']:.4f} │ "
                      f"Recall: {logs['eval_recall']:.4f}")
    
    def on_epoch_end(self, args, state, control, **kwargs):
        if self.epoch_start_time:
            epoch_time = time() - self.epoch_start_time
            print(f"─"*80)
            print(f"✓ Epoch completed in {epoch_time:.1f}s\n")
    
    def on_train_end(self, args, state, control, **kwargs):
        if self.start_time:
            total_time = time() - self.start_time
            print("\n" + "="*80)
            print(f"TRAINING COMPLETE")
            print(f"Total time: {total_time/60:.1f} minutes ({total_time:.0f}s)")
            print("="*80 + "\n")


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        Every10StepsCallback(),
        EarlyStoppingCallback(early_stopping_patience=3)
    ]
)


train_result = trainer.train()




STARTING TRAINING
Training for 5 epochs (4820 total steps)

Epoch 1/5
────────────────────────────────────────────────────────────────────────────────
Step   10/4820 (  0.2%) │ Loss: 3.9770 │ LR: 5.60e-07
{'loss': 3.977, 'grad_norm': 7.581172466278076, 'learning_rate': 5.601659751037344e-07, 'epoch': 0.01037344398340249}
Step   20/4820 (  0.4%) │ Loss: 3.9142 │ LR: 1.18e-06
{'loss': 3.9142, 'grad_norm': 7.770468235015869, 'learning_rate': 1.1825726141078837e-06, 'epoch': 0.02074688796680498}
Step   30/4820 (  0.6%) │ Loss: 3.7855 │ LR: 1.80e-06
{'loss': 3.7855, 'grad_norm': 7.890115261077881, 'learning_rate': 1.8049792531120333e-06, 'epoch': 0.03112033195020747}
Step   40/4820 (  0.8%) │ Loss: 3.5739 │ LR: 2.43e-06
{'loss': 3.5739, 'grad_norm': 7.936939239501953, 'learning_rate': 2.4273858921161828e-06, 'epoch': 0.04149377593360996}
Step   50/4820 (  1.0%) │ Loss: 3.2876 │ LR: 3.05e-06
{'loss': 3.2876, 'grad_norm': 7.796020984649658, 'learning_rate': 3.049792531120332e-06, 'epoch': 0.0