In [1]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd
from config import DEVICE, SEED, MODEL_CONFIG, TRAINING_CONFIG, DATASET_CONFIG
from model import JointCausalModel
from utility import compute_class_weights, label_value_counts
from dataset_collator import CausalDataset, CausalDatasetCollator
from config import id2label_cls, id2label_bio, id2label_rel
from evaluate_joint_causal_model import evaluate_model, print_eval_report
from trainer import train_model
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input CSVs for this run. Training reads the file under pseudo_annotate_data;
# validation and test read the expert_multi_task_data split files.
# Alternative training file (expert split), kept for reference:
#   /home/rnorouzini/JointLearning/datasets/expert_multi_task_data/train.csv
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them.
train_data_path, val_data_path, test_data_path = (
    "/home/rnorouzini/JointLearning/datasets/pseudo_annotate_data/llama3_8b_processed.csv",
    "/home/rnorouzini/JointLearning/datasets/expert_multi_task_data/val.csv",
    "/home/rnorouzini/JointLearning/datasets/expert_multi_task_data/test.csv",
)

# Load the three files in train / val / test order.
train_df, val_df, test_df = map(
    pd.read_csv, (train_data_path, val_data_path, test_data_path)
)

In [3]:
# Every split is wrapped with the same tokenizer name and the same max length,
# both taken from the project configuration.
_shared_dataset_kwargs = dict(
    tokenizer_name=MODEL_CONFIG["encoder_name"],
    max_length=DATASET_CONFIG["max_length"],
)

train_dataset = CausalDataset(train_df, **_shared_dataset_kwargs)
val_dataset = CausalDataset(val_df, **_shared_dataset_kwargs)
test_dataset = CausalDataset(test_df, **_shared_dataset_kwargs)

In [4]:
# Per-task label collections for the training split, as returned by
# label_value_counts (presumably this call also prints the value counts that
# appear in this cell's output - confirm in utility.py).
labels_flat = label_value_counts(train_dataset)
cls_label_flat, bio_label_flat, rel_label_flat = (
    labels_flat["cls_labels_flat"],
    labels_flat["bio_labels_flat"],
    labels_flat["rel_labels_flat"],
)


def _ens_class_weights(labels, num_classes):
    """Call compute_class_weights with technique="ens" and ignore_index=-100."""
    return compute_class_weights(
        labels_list=labels,
        num_classes=num_classes,
        technique="ens",
        ignore_index=-100,
    )


# One weight vector per task head; the class counts come from MODEL_CONFIG.
# NOTE(review): in the recorded output every weight prints as 0.0010, and the
# training log later reports "CLS/BIO/REL Class Weights: None", so these
# tensors do not appear to reach train_model - confirm whether that is intended.
cls_weights = _ens_class_weights(cls_label_flat, MODEL_CONFIG["num_cls_labels"])
bio_weights = _ens_class_weights(bio_label_flat, MODEL_CONFIG["num_bio_labels"])
rel_weights = _ens_class_weights(rel_label_flat, MODEL_CONFIG["num_rel_labels"])

print(f"CLS Weights: {cls_weights}")
print(f"BIO Weights: {bio_weights}")
print(f"REL Weights: {rel_weights}")

cls_labels_value_counts:
 1    75746
0    24254
Name: count, dtype: int64
bio_labels_value_counts:
  6      1965606
 3       603618
 1       556437
-100     200000
 2        95248
 0        95057
 5        72190
 4        11638
Name: count, dtype: int64
rel_labels_value_counts:
 0    223800
1    115514
Name: count, dtype: int64
CLS Weights: tensor([0.0010, 0.0010])
BIO Weights: tensor([0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010])
REL Weights: tensor([0.0010, 0.0010])


In [9]:
# The collator is built around the tokenizer held by the training dataset.
collator = CausalDatasetCollator(tokenizer=train_dataset.tokenizer)

# Optional smoke test: uncomment to train/validate on 20 random examples each.
# train_dataset = torch.utils.data.Subset(train_dataset, random.sample(range(len(train_dataset)), 20))
# val_dataset = torch.utils.data.Subset(val_dataset, random.sample(range(len(val_dataset)), 20))

# Settings common to both loaders.
_loader_kwargs = {
    "batch_size": TRAINING_CONFIG["batch_size"],
    "collate_fn": collator,
}

# Only the training loader shuffles; validation keeps dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

In [10]:
# Seed Python's and PyTorch's RNGs *before* the model is built, so that any
# parameters JointCausalModel initialises randomly are the same after every
# kernel restart. train_model also receives seed=SEED, but that call happens
# only after the model object already exists.
random.seed(SEED)
torch.manual_seed(SEED)

# Joint model: encoder name, the three label-space sizes, and dropout all come
# from MODEL_CONFIG.
model = JointCausalModel(
    encoder_name=MODEL_CONFIG["encoder_name"],
    num_cls_labels=MODEL_CONFIG["num_cls_labels"],
    num_bio_labels=MODEL_CONFIG["num_bio_labels"],
    num_rel_labels=MODEL_CONFIG["num_rel_labels"],
    dropout=MODEL_CONFIG["dropout"]
)


In [11]:
# AdamW over all model parameters; learning rate and weight decay are read
# from TRAINING_CONFIG.
_adamw_hparams = {
    "lr": TRAINING_CONFIG["learning_rate"],
    "weight_decay": TRAINING_CONFIG["weight_decay"],
}
optimizer = optim.AdamW(model.parameters(), **_adamw_hparams)

# Plateau scheduler: multiplies the learning rate by 0.1 once the monitored
# value has gone more than 2 consecutive scheduler steps without decreasing.
# NOTE(review): mode="min" is only right if train_model steps the scheduler on
# validation loss. Checkpointing in the training log tracks "Overall Avg F1"
# (higher is better); if that value is what gets passed to scheduler.step, this
# must be mode="max" - confirm in trainer.py.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=2
)

In [12]:
# Checkpoint file location; passed to train_model as model_save_path.
# NOTE(review): absolute, machine-specific path.
model_save_path = (
    "/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/"
    "expert_bert_GCE_weakSP_model.pt"
)

In [13]:
# Arguments for the training run, grouped by concern and then unpacked into
# train_model in a single call.
_train_kwargs = dict(
    # What is trained, and on which batches.
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    # Optimisation.
    optimizer=optimizer,
    scheduler=scheduler,
    max_grad_norm=TRAINING_CONFIG["gradient_clip_val"],
    # Run length. patience_epochs is set to the epoch count itself.
    # NOTE(review): with patience equal to num_epochs, early stopping is not
    # expected to fire before the last epoch - confirm this is intended.
    num_epochs=TRAINING_CONFIG["num_epochs"],
    patience_epochs=TRAINING_CONFIG["num_epochs"],
    # Environment and reproducibility.
    device=DEVICE,
    seed=SEED,
    # Label-id -> name mappings for the three tasks.
    id2label_cls=id2label_cls,
    id2label_bio=id2label_bio,
    id2label_rel=id2label_rel,
    # Checkpoint destination.
    model_save_path=model_save_path,
    # Evaluation and reporting callbacks.
    eval_fn_metrics=evaluate_model,
    print_report_fn=print_eval_report,
    # The run's log reports "Mode: Silver Data Training (GCE)" for this flag.
    is_silver_training=True,
)

trained_model, training_history = train_model(**_train_kwargs)

--- Training Configuration ---
Device: cuda
Number of Epochs: 10
Seed: 8642
Optimizer: AdamW (LR: 1e-05, Weight Decay: 0.01)
Scheduler: ReduceLROnPlateau
Gradient Clipping: Disabled (Max Norm: N/A)
Early Stopping Patience: 10
Model Save Path: /home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/expert_bert_GCE_weakSP_model.pt
Mode: Silver Data Training (GCE)
GCE q value: 0.7
Task loss weights not provided, using default: {'cls': 1.0, 'bio': 1.0, 'rel': 1.0}
CLS Class Weights: None
BIO Class Weights: None
REL Class Weights: None
----------------------------


Epoch 1/10 [Training]:   0%|          | 0/6250 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                                                                                                 


Epoch 1/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.7838
  Average Validation Loss:         0.8575
  Overall Validation Avg F1 (Macro): 0.6645
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7048
    Macro Precision: 0.7881
    Macro Recall:    0.7243
    Accuracy:        0.7200
    Per-class details:
      non-causal  : F1=0.6379 (P=0.9328, R=0.4847, Support=229.0)
      causal      : F1=0.7717 (P=0.6435, R=0.9638, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4028
    Macro Precision: 0.3742
    Macro Recall:    0.4860
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.340 (P=0.285, R=0.422, S=263.0)
      I-C       : F1=0.534 (P=0.430, R=0.703, S=1451.0)
      B-E       : F1=0.193 (P=0.268, R=0.151

                                                                                                                                 


Epoch 2/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.5903
  Average Validation Loss:         0.8662
  Overall Validation Avg F1 (Macro): 0.6730
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.6948
    Macro Precision: 0.7799
    Macro Recall:    0.7154
    Accuracy:        0.7111
    Per-class details:
      non-causal  : F1=0.6243 (P=0.9231, R=0.4716, Support=229.0)
      causal      : F1=0.7653 (P=0.6366, R=0.9593, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4247
    Macro Precision: 0.3816
    Macro Recall:    0.5200
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.358 (P=0.307, R=0.430, S=263.0)
      I-C       : F1=0.536 (P=0.439, R=0.689, S=1451.0)
      B-E       : F1=0.369 (P=0.321, R=0.435

                                                                                                                                 


Epoch 3/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.5097
  Average Validation Loss:         0.8110
  Overall Validation Avg F1 (Macro): 0.6937
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7315
    Macro Precision: 0.7979
    Macro Recall:    0.7460
    Accuracy:        0.7422
    Per-class details:
      non-causal  : F1=0.6778 (P=0.9313, R=0.5328, Support=229.0)
      causal      : F1=0.7852 (P=0.6646, R=0.9593, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4391
    Macro Precision: 0.4021
    Macro Recall:    0.5226
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.368 (P=0.310, R=0.452, S=263.0)
      I-C       : F1=0.542 (P=0.430, R=0.735, S=1451.0)
      B-E       : F1=0.362 (P=0.328, R=0.402

                                                                                                                                 


Epoch 4/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.4508
  Average Validation Loss:         0.8106
  Overall Validation Avg F1 (Macro): 0.6991
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7495
    Macro Precision: 0.7898
    Macro Recall:    0.7586
    Accuracy:        0.7556
    Per-class details:
      non-causal  : F1=0.7105 (P=0.8940, R=0.5895, Support=229.0)
      causal      : F1=0.7885 (P=0.6856, R=0.9276, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4378
    Macro Precision: 0.3969
    Macro Recall:    0.5293
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.388 (P=0.338, R=0.456, S=263.0)
      I-C       : F1=0.543 (P=0.456, R=0.673, S=1451.0)
      B-E       : F1=0.357 (P=0.326, R=0.395

                                                                                                                                 


Epoch 5/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3799
  Average Validation Loss:         0.7804
  Overall Validation Avg F1 (Macro): 0.7048
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7596
    Macro Precision: 0.7940
    Macro Recall:    0.7672
    Accuracy:        0.7644
    Per-class details:
      non-causal  : F1=0.7254 (P=0.8917, R=0.6114, Support=229.0)
      causal      : F1=0.7938 (P=0.6962, R=0.9231, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4429
    Macro Precision: 0.4011
    Macro Recall:    0.5309
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.378 (P=0.327, R=0.449, S=263.0)
      I-C       : F1=0.551 (P=0.461, R=0.686, S=1451.0)
      B-E       : F1=0.365 (P=0.329, R=0.410

                                                                                                                                 


Epoch 6/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3641
  Average Validation Loss:         0.7783
  Overall Validation Avg F1 (Macro): 0.7069
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7641
    Macro Precision: 0.7989
    Macro Recall:    0.7717
    Accuracy:        0.7689
    Per-class details:
      non-causal  : F1=0.7306 (P=0.8981, R=0.6157, Support=229.0)
      causal      : F1=0.7977 (P=0.6997, R=0.9276, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4452
    Macro Precision: 0.4035
    Macro Recall:    0.5329
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.390 (P=0.341, R=0.456, S=263.0)
      I-C       : F1=0.556 (P=0.469, R=0.682, S=1451.0)
      B-E       : F1=0.364 (P=0.327, R=0.410

                                                                                                                                 


Epoch 7/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3543
  Average Validation Loss:         0.7863
  Overall Validation Avg F1 (Macro): 0.7058
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7550
    Macro Precision: 0.7891
    Macro Recall:    0.7628
    Accuracy:        0.7600
    Per-class details:
      non-causal  : F1=0.7202 (P=0.8854, R=0.6070, Support=229.0)
      causal      : F1=0.7899 (P=0.6928, R=0.9186, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4463
    Macro Precision: 0.4040
    Macro Recall:    0.5386
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.379 (P=0.331, R=0.445, S=263.0)
      I-C       : F1=0.554 (P=0.464, R=0.689, S=1451.0)
      B-E       : F1=0.366 (P=0.336, R=0.402

                                                                                                                                 


Epoch 8/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3437
  Average Validation Loss:         0.7877
  Overall Validation Avg F1 (Macro): 0.7045
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7572
    Macro Precision: 0.7925
    Macro Recall:    0.7650
    Accuracy:        0.7622
    Per-class details:
      non-causal  : F1=0.7221 (P=0.8910, R=0.6070, Support=229.0)
      causal      : F1=0.7922 (P=0.6939, R=0.9231, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4436
    Macro Precision: 0.4011
    Macro Recall:    0.5350
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.383 (P=0.331, R=0.452, S=263.0)
      I-C       : F1=0.558 (P=0.466, R=0.693, S=1451.0)
      B-E       : F1=0.375 (P=0.336, R=0.424

                                                                                                                                 


Epoch 9/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3414
  Average Validation Loss:         0.7777
  Overall Validation Avg F1 (Macro): 0.7045
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7596
    Macro Precision: 0.7940
    Macro Recall:    0.7672
    Accuracy:        0.7644
    Per-class details:
      non-causal  : F1=0.7254 (P=0.8917, R=0.6114, Support=229.0)
      causal      : F1=0.7938 (P=0.6962, R=0.9231, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4434
    Macro Precision: 0.4011
    Macro Recall:    0.5341
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.382 (P=0.331, R=0.452, S=263.0)
      I-C       : F1=0.557 (P=0.467, R=0.692, S=1451.0)
      B-E       : F1=0.374 (P=0.336, R=0.421

                                                                                                                                  


Epoch 10/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3432
  Average Validation Loss:         0.7881
  Overall Validation Avg F1 (Macro): 0.7067
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7614
    Macro Precision: 0.7993
    Macro Recall:    0.7696
    Accuracy:        0.7667
    Per-class details:
      non-causal  : F1=0.7258 (P=0.9026, R=0.6070, Support=229.0)
      causal      : F1=0.7969 (P=0.6959, R=0.9321, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4492
    Macro Precision: 0.4049
    Macro Recall:    0.5440
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.378 (P=0.326, R=0.449, S=263.0)
      I-C       : F1=0.556 (P=0.464, R=0.694, S=1451.0)
      B-E       : F1=0.373 (P=0.335, R=0.42



```
--- Training Configuration ---
Device: cuda
Number of Epochs: 10
Seed: 8642
Optimizer: AdamW (LR: 1e-05, Weight Decay: 0.01)
Scheduler: ReduceLROnPlateau
Gradient Clipping: Disabled (Max Norm: N/A)
Early Stopping Patience: 10
Model Save Path: /home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/expert_bert_GCE_weakSP_model.pt
Mode: Silver Data Training (GCE)
GCE q value: 0.7
Task loss weights not provided, using default: {'cls': 1.0, 'bio': 1.0, 'rel': 1.0}
CLS Class Weights: None
BIO Class Weights: None
REL Class Weights: None
----------------------------
Epoch 1/10 [Training]:   0%|          | 0/6250 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                                                                                                 
================================================================================
Epoch 1/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.7838
  Average Validation Loss:         0.8575
  Overall Validation Avg F1 (Macro): 0.6645
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7048
    Macro Precision: 0.7881
    Macro Recall:    0.7243
    Accuracy:        0.7200
    Per-class details:
      non-causal  : F1=0.6379 (P=0.9328, R=0.4847, Support=229.0)
      causal      : F1=0.7717 (P=0.6435, R=0.9638, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4028
    Macro Precision: 0.3742
    Macro Recall:    0.4860
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.340 (P=0.285, R=0.422, S=263.0)
      I-C       : F1=0.534 (P=0.430, R=0.703, S=1451.0)
      B-E       : F1=0.193 (P=0.268, R=0.151, S=271.0)
      I-E       : F1=0.577 (P=0.488, R=0.704, S=1846.0)
      B-CE      : F1=0.000 (P=0.000, R=0.000, S=15.0)
      I-CE      : F1=0.336 (P=0.226, R=0.649, S=77.0)
      O         : F1=0.841 (P=0.922, R=0.772, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.8859
    Macro Precision: 0.8842
    Macro Recall:    0.8877
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.922 (P=0.927, R=0.917, S=605.0)
      Rel_CE      : F1=0.850 (P=0.842, R=0.858, S=310.0)
--------------------------------------------------------------------------------
Status: 🎉 New best model saved! Overall Avg F1: 0.6645
================================================================================

                                                                                                                                 
================================================================================
Epoch 2/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.5903
  Average Validation Loss:         0.8662
  Overall Validation Avg F1 (Macro): 0.6730
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.6948
    Macro Precision: 0.7799
    Macro Recall:    0.7154
    Accuracy:        0.7111
    Per-class details:
      non-causal  : F1=0.6243 (P=0.9231, R=0.4716, Support=229.0)
      causal      : F1=0.7653 (P=0.6366, R=0.9593, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4247
    Macro Precision: 0.3816
    Macro Recall:    0.5200
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.358 (P=0.307, R=0.430, S=263.0)
      I-C       : F1=0.536 (P=0.439, R=0.689, S=1451.0)
      B-E       : F1=0.369 (P=0.321, R=0.435, S=271.0)
      I-E       : F1=0.578 (P=0.476, R=0.733, S=1846.0)
      B-CE      : F1=0.000 (P=0.000, R=0.000, S=15.0)
      I-CE      : F1=0.298 (P=0.198, R=0.597, S=77.0)
      O         : F1=0.834 (P=0.930, R=0.755, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.8995
    Macro Precision: 0.8969
    Macro Recall:    0.9023
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.931 (P=0.938, R=0.924, S=605.0)
      Rel_CE      : F1=0.868 (P=0.856, R=0.881, S=310.0)
--------------------------------------------------------------------------------
Status: 🎉 New best model saved! Overall Avg F1: 0.6730
================================================================================

                                                                                                                                 
================================================================================
Epoch 3/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.5097
  Average Validation Loss:         0.8110
  Overall Validation Avg F1 (Macro): 0.6937
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7315
    Macro Precision: 0.7979
    Macro Recall:    0.7460
    Accuracy:        0.7422
    Per-class details:
      non-causal  : F1=0.6778 (P=0.9313, R=0.5328, Support=229.0)
      causal      : F1=0.7852 (P=0.6646, R=0.9593, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4391
    Macro Precision: 0.4021
    Macro Recall:    0.5226
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.368 (P=0.310, R=0.452, S=263.0)
      I-C       : F1=0.542 (P=0.430, R=0.735, S=1451.0)
      B-E       : F1=0.362 (P=0.328, R=0.402, S=271.0)
      I-E       : F1=0.597 (P=0.510, R=0.722, S=1846.0)
      B-CE      : F1=0.083 (P=0.111, R=0.067, S=15.0)
      I-CE      : F1=0.284 (P=0.195, R=0.519, S=77.0)
      O         : F1=0.838 (P=0.931, R=0.761, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.9106
    Macro Precision: 0.9115
    Macro Recall:    0.9096
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.939 (P=0.937, R=0.942, S=601.0)
      Rel_CE      : F1=0.882 (P=0.886, R=0.877, S=310.0)
--------------------------------------------------------------------------------
Status: 🎉 New best model saved! Overall Avg F1: 0.6937
================================================================================

                                                                                                                                 
================================================================================
Epoch 4/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.4508
  Average Validation Loss:         0.8106
  Overall Validation Avg F1 (Macro): 0.6991
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7495
    Macro Precision: 0.7898
    Macro Recall:    0.7586
    Accuracy:        0.7556
    Per-class details:
      non-causal  : F1=0.7105 (P=0.8940, R=0.5895, Support=229.0)
      causal      : F1=0.7885 (P=0.6856, R=0.9276, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4378
    Macro Precision: 0.3969
    Macro Recall:    0.5293
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.388 (P=0.338, R=0.456, S=263.0)
      I-C       : F1=0.543 (P=0.456, R=0.673, S=1451.0)
      B-E       : F1=0.357 (P=0.326, R=0.395, S=271.0)
      I-E       : F1=0.583 (P=0.502, R=0.696, S=1846.0)
      B-CE      : F1=0.093 (P=0.071, R=0.133, S=15.0)
      I-CE      : F1=0.254 (P=0.164, R=0.571, S=77.0)
      O         : F1=0.845 (P=0.922, R=0.781, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.9099
    Macro Precision: 0.9162
    Macro Recall:    0.9044
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.941 (P=0.928, R=0.954, S=608.0)
      Rel_CE      : F1=0.879 (P=0.904, R=0.855, S=310.0)
--------------------------------------------------------------------------------
Status: 🎉 New best model saved! Overall Avg F1: 0.6991
================================================================================

                                                                                                                                 
================================================================================
Epoch 5/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3799
  Average Validation Loss:         0.7804
  Overall Validation Avg F1 (Macro): 0.7048
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7596
    Macro Precision: 0.7940
    Macro Recall:    0.7672
    Accuracy:        0.7644
    Per-class details:
      non-causal  : F1=0.7254 (P=0.8917, R=0.6114, Support=229.0)
      causal      : F1=0.7938 (P=0.6962, R=0.9231, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4429
    Macro Precision: 0.4011
    Macro Recall:    0.5309
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.378 (P=0.327, R=0.449, S=263.0)
      I-C       : F1=0.551 (P=0.461, R=0.686, S=1451.0)
      B-E       : F1=0.365 (P=0.329, R=0.410, S=271.0)
      I-E       : F1=0.592 (P=0.515, R=0.696, S=1846.0)
      B-CE      : F1=0.100 (P=0.080, R=0.133, S=15.0)
      I-CE      : F1=0.266 (P=0.175, R=0.558, S=77.0)
      O         : F1=0.847 (P=0.920, R=0.785, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.9119
    Macro Precision: 0.9147
    Macro Recall:    0.9093
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.941 (P=0.935, R=0.948, S=610.0)
      Rel_CE      : F1=0.882 (P=0.894, R=0.871, S=310.0)
--------------------------------------------------------------------------------
Status: 🎉 New best model saved! Overall Avg F1: 0.7048
================================================================================

                                                                                                                                 
================================================================================
Epoch 6/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3641
  Average Validation Loss:         0.7783
  Overall Validation Avg F1 (Macro): 0.7069
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7641
    Macro Precision: 0.7989
    Macro Recall:    0.7717
    Accuracy:        0.7689
    Per-class details:
      non-causal  : F1=0.7306 (P=0.8981, R=0.6157, Support=229.0)
      causal      : F1=0.7977 (P=0.6997, R=0.9276, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4452
    Macro Precision: 0.4035
    Macro Recall:    0.5329
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.390 (P=0.341, R=0.456, S=263.0)
      I-C       : F1=0.556 (P=0.469, R=0.682, S=1451.0)
      B-E       : F1=0.364 (P=0.327, R=0.410, S=271.0)
      I-E       : F1=0.585 (P=0.506, R=0.692, S=1846.0)
      B-CE      : F1=0.103 (P=0.083, R=0.133, S=15.0)
      I-CE      : F1=0.272 (P=0.178, R=0.571, S=77.0)
      O         : F1=0.847 (P=0.919, R=0.786, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.9115
    Macro Precision: 0.9143
    Macro Recall:    0.9089
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.941 (P=0.935, R=0.947, S=603.0)
      Rel_CE      : F1=0.882 (P=0.894, R=0.871, S=310.0)
--------------------------------------------------------------------------------
Status: 🎉 New best model saved! Overall Avg F1: 0.7069
================================================================================

                                                                                                                                 
================================================================================
Epoch 7/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3543
  Average Validation Loss:         0.7863
  Overall Validation Avg F1 (Macro): 0.7058
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7550
    Macro Precision: 0.7891
    Macro Recall:    0.7628
    Accuracy:        0.7600
    Per-class details:
      non-causal  : F1=0.7202 (P=0.8854, R=0.6070, Support=229.0)
      causal      : F1=0.7899 (P=0.6928, R=0.9186, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4463
    Macro Precision: 0.4040
    Macro Recall:    0.5386
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.379 (P=0.331, R=0.445, S=263.0)
      I-C       : F1=0.554 (P=0.464, R=0.689, S=1451.0)
      B-E       : F1=0.366 (P=0.336, R=0.402, S=271.0)
      I-E       : F1=0.579 (P=0.505, R=0.679, S=1846.0)
      B-CE      : F1=0.136 (P=0.103, R=0.200, S=15.0)
      I-CE      : F1=0.263 (P=0.171, R=0.571, S=77.0)
      O         : F1=0.845 (P=0.918, R=0.784, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.9160
    Macro Precision: 0.9220
    Macro Recall:    0.9108
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.945 (P=0.932, R=0.957, S=606.0)
      Rel_CE      : F1=0.887 (P=0.912, R=0.865, S=310.0)
--------------------------------------------------------------------------------
Status: Overall Avg F1 did not improve. Best: 0.7069. Patience: 1/10
================================================================================

                                                                                                                                 
================================================================================
Epoch 8/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3437
  Average Validation Loss:         0.7877
  Overall Validation Avg F1 (Macro): 0.7045
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7572
    Macro Precision: 0.7925
    Macro Recall:    0.7650
    Accuracy:        0.7622
    Per-class details:
      non-causal  : F1=0.7221 (P=0.8910, R=0.6070, Support=229.0)
      causal      : F1=0.7922 (P=0.6939, R=0.9231, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4436
    Macro Precision: 0.4011
    Macro Recall:    0.5350
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.383 (P=0.331, R=0.452, S=263.0)
      I-C       : F1=0.558 (P=0.466, R=0.693, S=1451.0)
      B-E       : F1=0.375 (P=0.336, R=0.424, S=271.0)
      I-E       : F1=0.583 (P=0.506, R=0.688, S=1846.0)
      B-CE      : F1=0.095 (P=0.074, R=0.133, S=15.0)
      I-CE      : F1=0.267 (P=0.174, R=0.571, S=77.0)
      O         : F1=0.845 (P=0.920, R=0.782, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.9127
    Macro Precision: 0.9158
    Macro Recall:    0.9098
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.942 (P=0.935, R=0.949, S=603.0)
      Rel_CE      : F1=0.884 (P=0.897, R=0.871, S=310.0)
--------------------------------------------------------------------------------
Status: Overall Avg F1 did not improve. Best: 0.7069. Patience: 2/10
================================================================================

                                                                                                                                 
================================================================================
Epoch 9/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3414
  Average Validation Loss:         0.7777
  Overall Validation Avg F1 (Macro): 0.7045
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7596
    Macro Precision: 0.7940
    Macro Recall:    0.7672
    Accuracy:        0.7644
    Per-class details:
      non-causal  : F1=0.7254 (P=0.8917, R=0.6114, Support=229.0)
      causal      : F1=0.7938 (P=0.6962, R=0.9231, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4434
    Macro Precision: 0.4011
    Macro Recall:    0.5341
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.382 (P=0.331, R=0.452, S=263.0)
      I-C       : F1=0.557 (P=0.467, R=0.692, S=1451.0)
      B-E       : F1=0.374 (P=0.336, R=0.421, S=271.0)
      I-E       : F1=0.582 (P=0.505, R=0.687, S=1846.0)
      B-CE      : F1=0.098 (P=0.077, R=0.133, S=15.0)
      I-CE      : F1=0.266 (P=0.173, R=0.571, S=77.0)
      O         : F1=0.845 (P=0.919, R=0.782, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.9105
    Macro Precision: 0.9129
    Macro Recall:    0.9082
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.940 (P=0.935, R=0.945, S=605.0)
      Rel_CE      : F1=0.881 (P=0.891, R=0.871, S=310.0)
--------------------------------------------------------------------------------
Status: Overall Avg F1 did not improve. Best: 0.7069. Patience: 3/10
================================================================================

                                                                                                                                  
================================================================================
Epoch 10/10 Summary
--------------------------------------------------------------------------------
  Average Training Loss:           0.3432
  Average Validation Loss:         0.7881
  Overall Validation Avg F1 (Macro): 0.7067
--------------------------------------------------------------------------------
Task-Specific Validation Performance:

  [Task 1: Sentence Classification]
    Macro F1-Score:  0.7614
    Macro Precision: 0.7993
    Macro Recall:    0.7696
    Accuracy:        0.7667
    Per-class details:
      non-causal  : F1=0.7258 (P=0.9026, R=0.6070, Support=229.0)
      causal      : F1=0.7969 (P=0.6959, R=0.9321, Support=221.0)

  [Task 2: BIO Prediction (Token-BIO)]
    Macro F1-Score:  0.4492
    Macro Precision: 0.4049
    Macro Recall:    0.5440
    Per-tag details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      B-C       : F1=0.378 (P=0.326, R=0.449, S=263.0)
      I-C       : F1=0.556 (P=0.464, R=0.694, S=1451.0)
      B-E       : F1=0.373 (P=0.335, R=0.421, S=271.0)
      I-E       : F1=0.584 (P=0.504, R=0.694, S=1846.0)
      B-CE      : F1=0.140 (P=0.107, R=0.200, S=15.0)
      I-CE      : F1=0.270 (P=0.177, R=0.571, S=77.0)
      O         : F1=0.844 (P=0.921, R=0.780, S=11513.0)

  [Task 3: Relation Prediction]
    Macro F1-Score:  0.9093
    Macro Precision: 0.9100
    Macro Recall:    0.9087
    Per-relation type details (P=Precision, R=Recall, F1=F1-Score, S=Support):
      Rel_None    : F1=0.938 (P=0.937, R=0.940, S=600.0)
      Rel_CE      : F1=0.880 (P=0.883, R=0.877, S=310.0)
--------------------------------------------------------------------------------
Status: Overall Avg F1 did not improve. Best: 0.7069. Patience: 4/10
================================================================================

Loading best model state (in memory) with F1: 0.7069
Training history saved to /home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/training_history.json
```

In [14]:
# Export the trained model with save_pretrained into this experiment's
# Hugging Face-style output directory.
model_export_dir = r"/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/hf_exper_bert_GCE_weakSP"
trained_model.save_pretrained(model_export_dir)

In [15]:
# Write the training dataset's tokenizer files into the same directory used
# for the exported model. The call is kept as the cell's final expression so
# the returned tuple of written file paths is displayed as the cell output.
tokenizer_export_dir = r"/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/hf_exper_bert_GCE_weakSP"
train_dataset.tokenizer.save_pretrained(tokenizer_export_dir)

('/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/hf_exper_bert_GCE_weakSP/tokenizer_config.json',
 '/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/hf_exper_bert_GCE_weakSP/special_tokens_map.json',
 '/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/hf_exper_bert_GCE_weakSP/vocab.txt',
 '/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/hf_exper_bert_GCE_weakSP/added_tokens.json',
 '/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_GCE_weakSP/hf_exper_bert_GCE_weakSP/tokenizer.json')