In [1]:
import os
import random
import traceback

import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import DataLoader

from config import DEVICE, SEED, MODEL_CONFIG, TRAINING_CONFIG, DATASET_CONFIG
from config import id2label_cls, id2label_bio, id2label_rel
from model import JointCausalModel
from utility import compute_class_weights, label_value_counts
from dataset_collator import CausalDataset, CausalDatasetCollator
from evaluate_joint_causal_model import evaluate_model, print_eval_report
from trainer import train_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the train/val/test CSV splits. The default is the
# location used when this notebook was authored; set the
# JOINTLEARNING_DATA_DIR environment variable to run on another machine
# without editing this cell.
DATA_DIR = os.environ.get(
    "JOINTLEARNING_DATA_DIR",
    "C:\\Users\\norouzin\\Desktop\\JointLearning\\datasets\\expert_multi_task_data",
)

# The path variables stay plain strings, as before.
train_data_path = os.path.join(DATA_DIR, "train.csv")
val_data_path = os.path.join(DATA_DIR, "val.csv")
test_data_path = os.path.join(DATA_DIR, "test.csv")

# One DataFrame per split.
train_df = pd.read_csv(train_data_path)
val_df = pd.read_csv(val_data_path)
test_df = pd.read_csv(test_data_path)

In [3]:
# Wrap each split in a CausalDataset. All three share the tokenizer name and
# the maximum sequence length from the project configuration, and are built
# in train -> val -> test order.
train_dataset, val_dataset, test_dataset = (
    CausalDataset(
        split_df,
        tokenizer_name=MODEL_CONFIG["encoder_name"],
        max_length=DATASET_CONFIG["max_length"],
    )
    for split_df in (train_df, val_df, test_df)
)

In [4]:
# Flattened label lists for the three tasks, taken from the training split.
labels_flat = label_value_counts(train_dataset)
cls_label_flat, bio_label_flat, rel_label_flat = (
    labels_flat[flat_key]
    for flat_key in ("cls_labels_flat", "bio_labels_flat", "rel_labels_flat")
)

# Per-task class weights. Labels equal to -100 are passed as ignore_index.
# NOTE(review): technique="ens" is presumably "effective number of samples";
# confirm against utility.compute_class_weights.
cls_weights, bio_weights, rel_weights = (
    compute_class_weights(
        labels_list=task_labels,
        num_classes=MODEL_CONFIG[num_labels_key],
        technique="ens",
        ignore_index=-100,
    )
    for task_labels, num_labels_key in (
        (cls_label_flat, "num_cls_labels"),
        (bio_label_flat, "num_bio_labels"),
        (rel_label_flat, "num_rel_labels"),
    )
)

print(
    f"CLS Weights: {cls_weights}",
    f"BIO Weights: {bio_weights}",
    f"REL Weights: {rel_weights}",
    sep="\n",
)

cls_labels_value_counts:
 0    1047
1    1035
Name: count, dtype: int64
bio_labels_value_counts:
  6      52764
 3       8717
 1       6948
-100     4164
 2       1320
 0       1179
 5        483
 4         79
Name: count, dtype: int64
rel_labels_value_counts:
 0    2887
1    1494
Name: count, dtype: int64
CLS Weights: tensor([0.0015, 0.0016])
BIO Weights: tensor([0.0014, 0.0010, 0.0014, 0.0010, 0.0132, 0.0026, 0.0010])
REL Weights: tensor([0.0011, 0.0013])


In [5]:
# Batches are assembled by the project collator, which reuses the tokenizer
# owned by the training dataset.
collator = CausalDatasetCollator(tokenizer=train_dataset.tokenizer)

# Training batches are reshuffled; validation batches keep dataset order.
# Both loaders use the batch size from TRAINING_CONFIG.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=collator,
    batch_size=TRAINING_CONFIG["batch_size"],
)
val_dataloader = DataLoader(
    val_dataset,
    shuffle=False,
    collate_fn=collator,
    batch_size=TRAINING_CONFIG["batch_size"],
)

In [6]:
# Build the joint model from MODEL_CONFIG. Every constructor keyword used here
# has the same name as its configuration key, so the values are forwarded
# directly by key.
model = JointCausalModel(
    **{
        config_key: MODEL_CONFIG[config_key]
        for config_key in (
            "encoder_name",
            "num_cls_labels",
            "num_bio_labels",
            "num_rel_labels",
            "dropout",
        )
    }
)


In [7]:
# AdamW over all model parameters; learning rate and weight decay come from
# TRAINING_CONFIG.
optimizer = optim.AdamW(
    model.parameters(),
    weight_decay=TRAINING_CONFIG["weight_decay"],
    lr=TRAINING_CONFIG["learning_rate"],
)

# Plateau scheduler: with mode="min" a lower monitored value counts as an
# improvement. After more than `patience` non-improving scheduler steps the
# learning rate is multiplied by `factor`.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.1, patience=2)

In [8]:
# Checkpoint file handed to train_model as `model_save_path`.
model_save_path = (
    r"C:\Users\norouzin\Desktop\JointLearning\src\jointlearning"
    r"\test_results\expert_bert_softmax_model.pt"
)

In [9]:
# Run training; train_model's two return values are bound to
# `trained_model` and `training_history`.
#
# NOTE(review): is_silver_training=True selects the "Silver Data Training
# (GCE)" mode reported in the run log, although the CSVs are loaded from the
# expert_multi_task_data directory. Confirm this is intended.
# NOTE(review): patience_epochs equals the epoch budget, so early stopping
# presumably cannot trigger before the last epoch. Verify in
# trainer.train_model.
trained_model, training_history = train_model(
    # Model and data
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    # Optimisation
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=TRAINING_CONFIG["num_epochs"],
    patience_epochs=TRAINING_CONFIG["num_epochs"],
    max_grad_norm=TRAINING_CONFIG["gradient_clip_val"],
    device=DEVICE,
    seed=SEED,
    # Label vocabularies for the three tasks
    id2label_cls=id2label_cls,
    id2label_bio=id2label_bio,
    id2label_rel=id2label_rel,
    # Loss configuration
    cls_class_weights=cls_weights,
    bio_class_weights=bio_weights,  # only used with the softmax BIO head
    rel_class_weights=rel_weights,
    task_loss_weights={"cls": 1.0, "bio": 4.0, "rel": 1.0},
    is_silver_training=True,
    # Evaluation and reporting callbacks
    eval_fn_metrics=evaluate_model,
    print_report_fn=print_eval_report,
    # Checkpoint destination
    model_save_path=model_save_path,
)

--- Training Configuration ---
Device: cpu
Number of Epochs: 20
Seed: 8642
Optimizer: AdamW (LR: 5e-05, Weight Decay: 1.0)
Scheduler: ReduceLROnPlateau
Max Grad Norm: 1.0 (Mode: L2 norm if enabled)
Early Stopping Patience: 20
Model Save Path: C:\Users\norouzin\Desktop\JointLearning\src\jointlearning\test_results\expert_bert_softmax_model.pt
Mode: Silver Data Training (GCE)
GCE q value: 0.7
Using task loss weights: {'cls': 1.0, 'bio': 4.0, 'rel': 1.0}
CLS Class Weights: Provided
BIO Class Weights: Provided
REL Class Weights: Provided
----------------------------


Epoch 1/20 [Training]:   0%|          | 0/131 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


                                                                                                                             

KeyboardInterrupt: 

In [11]:
# Export the trained model to the export directory via save_pretrained.
# NOTE(review): this is a Linux path, whereas the data and checkpoint cells
# use Windows paths. The cell needs `trained_model` from a completed training
# run.
trained_model.save_pretrained("/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_softmax/hf_exper_bert_softmax")

In [17]:
# Save the training dataset's tokenizer into the same directory as the
# exported model. The call is the cell's last expression, so the returned
# tuple of written file paths is displayed.
train_dataset.tokenizer.save_pretrained("/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_softmax/hf_exper_bert_softmax")

('/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_softmax/hf_exper_bert_softmax/tokenizer_config.json',
 '/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_softmax/hf_exper_bert_softmax/special_tokens_map.json',
 '/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_softmax/hf_exper_bert_softmax/vocab.txt',
 '/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_softmax/hf_exper_bert_softmax/added_tokens.json',
 '/home/rnorouzini/JointLearning/src/jointlearning/expert_bert_softmax/hf_exper_bert_softmax/tokenizer.json')

In [None]:
# Exercise JointCausalModel.predict_batch against mocked forward passes.
#
# For every entry in `test_cases`, the model's `forward` is temporarily
# replaced with a mock built from the entry's "mock_data". `predict_batch`
# then runs on the entry's text and the result is checked against the expected
# causal flag, relation count and (optionally) relation texts.
#
# NOTE(review): `AutoTokenizer`, `test_cases` and `get_mock_forward_fn` are not
# defined in any cell visible in this notebook. They must be brought into
# scope (e.g. `from transformers import AutoTokenizer`) before this cell runs.
# NOTE(review): this cell rebinds `model` to a freshly constructed model, so
# the model passed to train_model is no longer reachable under that name.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize tokenizer and model from the configured encoder.
# In a real case, load your specific fine-tuned model and tokenizer.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG["encoder_name"])
    # Pass the config explicitly if not loading a full pretrained model
    model = JointCausalModel(
        encoder_name=MODEL_CONFIG["encoder_name"],
        num_cls_labels=MODEL_CONFIG["num_cls_labels"],
        num_bio_labels=MODEL_CONFIG["num_bio_labels"],
        num_rel_labels=MODEL_CONFIG["num_rel_labels"],
        dropout=MODEL_CONFIG["dropout"]
    )
    model.to(device)
except Exception as e:
    print(f"Error initializing model/tokenizer: {e}")
    print(f"Please ensure '{MODEL_CONFIG['encoder_name']}' is available or replace with your model path.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the
    # kernel down, discarding all state, while a raised error only stops
    # this cell and keeps the traceback visible.
    raise

for tc in test_cases:
    print(f"\n--- Running Test: {tc['name']} ---")
    print(f"Text: '{tc['text']}'")
    print(f"Settings: {tc['settings']}")

    texts_batch = [tc["text"]]
    tokenized_inputs = tokenizer(
        texts_batch,
        return_tensors="pt",
        padding="max_length",  # Pad to max_length for consistent bio_token_ids length
        max_length=32,         # A small max_length for testing
        truncation=True,
        return_offsets_mapping=True
    )

    # Swap in the mocked forward pass for this test case only; the original
    # is restored in the `finally` clause below, whatever the outcome.
    original_forward = model.forward
    model.forward = get_mock_forward_fn(tc["mock_data"], device=device)

    try:
        predictions = model.predict_batch(
            texts_batch,
            tokenized_inputs,
            device=device,
            **tc["settings"]
        )
        result = predictions[0]  # We are processing one sentence at a time

        print(f"  Predicted Output: {result}")

        # Basic assertions (more detailed assertions can be added)
        assert result["causal"] == tc["expected_causal"], \
            f"Causal flag mismatch. Expected {tc['expected_causal']}, Got {result['causal']}"
        assert len(result["relations"]) == tc["expected_relations_count"], \
            f"Relations count mismatch. Expected {tc['expected_relations_count']}, Got {len(result['relations'])}"

        if "expected_relations_texts" in tc:
            # Compare as sorted (cause, effect) pairs so that the order in
            # which relations were extracted does not matter.
            extracted_rel_texts = sorted([(r["cause"], r["effect"]) for r in result["relations"]])
            expected_rel_texts = sorted(tc["expected_relations_texts"])
            assert extracted_rel_texts == expected_rel_texts, \
                f"Relation texts mismatch. Expected {expected_rel_texts}, Got {extracted_rel_texts}"

        print("  Test PASSED!")

    except AssertionError as e:
        # A failed expectation is reported and the loop moves on to the next case.
        print(f"  Test FAILED: {e}")
    except Exception as e:
        # Any other error is reported with its traceback; remaining cases still run.
        print(f"  Test ERRORED: {e}")
        traceback.print_exc()
    finally:
        model.forward = original_forward  # Restore original forward method