In [1]:
# Mount Google Drive at /content/drive (Colab only); the dataset and results
# paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install evaluate imbalanced-learn matplotlib scikit-learn transformers datasets torch

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

**RandomOverSampler with SHAP Explainability (8 epochs)**

In [None]:
import os
import numpy as np
import pandas as pd
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, load_from_disk, Features, Value, Sequence
import torch
from torch import nn
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve, auc
import matplotlib.pyplot as plt
import shap # Import SHAP

# Optional: Set environment variable for MPS memory management if needed
# os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Hugging Face access token taken from the environment (None when unset).
# NOTE(review): not referenced anywhere else in the visible cell - confirm it is still needed.
hf_token = os.getenv("HF_ACCESS_TOKEN")

# --- Configuration ---
# Directory holding the pre-tokenized dataset written by the tokenization step.
TOKENIZED_DATASET_DIR = '/content/drive/My Drive/Readmissions_Research/Data/tokenized_hf_dataset_explainability'
# Base checkpoint that is fine-tuned for sequence classification.
MODEL_CHECKPOINT = "distilbert-base-uncased"
# Checkpoints and plots for this run are written here. The suffix names the
# epoch count of the run (8); the previous "_15E" suffix pointed this 8-epoch
# run at the folder of a 15-epoch experiment.
OUTPUT_DIR = "/content/drive/My Drive/Readmissions_Research/Results/RandomOverSampler_Explainability_8E"
# Binary task: the prints in this cell treat label 1 as "Readmitted".
NUM_LABELS = 2
# Maximum sequence length in tokens.
# NOTE(review): presumably matches the length used when the dataset was tokenized - confirm.
MAX_LENGTH = 512

# --- Load Tokenized Dataset from Disk ---
# Loads the pre-tokenized dataset and verifies that it carries every column the
# rest of this cell reads. Failures are re-raised after the hint is printed:
# in a notebook kernel `exit()` does not reliably stop the running cell, so the
# statements below could otherwise run without `dataset` being defined.
print(f"Loading tokenized dataset from directory: {TOKENIZED_DATASET_DIR}...")
try:
    dataset = load_from_disk(TOKENIZED_DATASET_DIR)
except FileNotFoundError:
    print(f"Error: Tokenized dataset directory not found at {TOKENIZED_DATASET_DIR}")
    print("Ensure you have run the updated tokenizer.py (keeping 'prompt' column) successfully.")
    raise
except Exception as e:
    print(f"Error loading tokenized dataset from disk: {e}")
    raise

print(f"Loaded dataset with {len(dataset)} records.")
print("\nDataset Info:")
print(dataset)

# 'prompt' (the raw text) is required in addition to the model inputs because
# the SHAP section explains predictions on the original strings.
required_cols = ['input_ids', 'attention_mask', 'label', 'prompt']
missing_cols = [col for col in required_cols if col not in dataset.column_names]
if missing_cols:
    raise ValueError(f"Dataset loaded from disk is missing required columns for SHAP: {missing_cols}")

# --- Prepare Dataset for Training ---
# Hold out 25% of the records for evaluation, using a fixed seed for the split.
print("\nSplitting the dataset...")
split_dataset = dataset.train_test_split(test_size=0.25, seed=42)
# Both halves are kept as Hugging Face datasets.
train_dataset_original_hf, eval_dataset_hf = split_dataset["train"], split_dataset["test"]
print(
    f"Original training samples: {len(train_dataset_original_hf)}, "
    f"Evaluation samples: {len(eval_dataset_hf)}"
)

# --- Apply RandomOverSampler to the Training Data ---
# Only the training split is resampled; the evaluation split is left untouched.
print("\nApplying RandomOverSampler to the training data...")
try:
    train_df_original = train_dataset_original_hf.to_pandas()
    # Everything except the target is treated as "features" so that whole rows
    # (prompt, input_ids, attention_mask) are duplicated together.
    X_train = train_df_original.drop(columns=['label'])
    y_train = train_df_original['label']
    print(f"Class distribution before RandomOverSampler: {y_train.value_counts().to_dict()}")

    ros = RandomOverSampler(random_state=42)
    X_resampled_df, y_resampled_series = ros.fit_resample(X_train, y_train)
    train_df_resampled = X_resampled_df.copy()
    train_df_resampled['label'] = y_resampled_series
    print(f"Class distribution after RandomOverSampler: {train_df_resampled['label'].value_counts().to_dict()}")
    print(f"New training dataset size after RandomOverSampler: {len(train_df_resampled)}")

    # Convert back to a Hugging Face Dataset with the original schema.
    # preserve_index=False keeps the pandas index from ever being stored as an
    # extra column that the declared features would not describe.
    train_dataset = Dataset.from_pandas(
        train_df_resampled,
        features=train_dataset_original_hf.features,
        preserve_index=False,
    )
    print("Created new training dataset with RandomOverSampler samples.")
except Exception as e:
    # Best-effort fallback kept from the original flow. (A separate ImportError
    # handler was unreachable: imblearn is imported at the top of this cell, so
    # a missing package fails there, before this block runs.)
    print(f"Error during RandomOverSampler application: {e}")
    print("WARNING: falling back to the original training split; the training data is NOT class-balanced.")
    train_dataset = train_dataset_original_hf

# --- Initialize Model ---
# Loads the base checkpoint with a freshly initialised classification head of
# NUM_LABELS outputs. A failure is re-raised after being reported: `exit()` in
# a notebook kernel does not reliably stop the cell, and nothing below can run
# without `model`.
print(f"\nInitializing model from {MODEL_CHECKPOINT}...")
try:
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)
except Exception as e:
    print(f"Error initializing model: {e}")
    raise
print("Model initialized successfully.")

# --- Define Metrics ---
# Loads four `evaluate` metric objects into notebook-level names.
# NOTE(review): a failure here is only printed; execution continues with
# whichever of the four names were assigned before the error occurred.
print("\nLoading evaluation metrics...")
try:
    accuracy_metric_eval = evaluate.load("accuracy")
    precision_metric_eval = evaluate.load("precision")
    recall_metric_eval = evaluate.load("recall")
    f1_metric_eval = evaluate.load("f1")
    print("Metrics loaded.")
except Exception as e:
    print(f"Error loading metrics: {e}")

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    The metrics are derived directly from the confusion counts with NumPy, so
    the function depends on no externally loaded metric objects, and any error
    propagates to the caller. The previous version caught every exception and
    returned all-zero metrics, which silently corrupts model selection when
    the best checkpoint is chosen by F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            the class scores on the last axis; if it is a tuple, its first
            element is used. ``labels`` is array-like of 0/1 class ids.

    Returns:
        dict with float entries ``accuracy``, ``precision``, ``recall`` and
        ``f1``. Class 1 is the positive class. A ratio whose denominator is
        zero (e.g. precision when nothing is predicted positive) is reported
        as 0.0.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs next to the logits; scores come first.
        logits = logits[0]
    predictions = np.argmax(np.asarray(logits), axis=-1)
    labels = np.asarray(labels)

    predicted_positive = predictions == 1
    actual_positive = labels == 1
    true_pos = int(np.sum(predicted_positive & actual_positive))
    false_pos = int(np.sum(predicted_positive & ~actual_positive))
    false_neg = int(np.sum(~predicted_positive & actual_positive))

    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0
    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    f1_denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / f1_denominator if f1_denominator else 0.0
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# --- Training Setup ---
# Loads the tokenizer that the data collator uses for padding. A failure is
# re-raised after being reported: `exit()` in a notebook kernel does not
# reliably stop the cell, and the collator below cannot be built without
# `tokenizer`.
print("\nSetting up training arguments and trainer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
    if tokenizer.pad_token is None: # Essential for DataCollator and consistent padding
        tokenizer.pad_token = tokenizer.eos_token # Common practice if pad_token is missing
        model.config.pad_token_id = tokenizer.pad_token_id
        print("Set tokenizer.pad_token to tokenizer.eos_token.")
except Exception as e:
    print(f"Error initializing tokenizer for DataCollator: {e}")
    raise

# Pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Optimisation schedule.
    num_train_epochs=8,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the highest evaluation F1 when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # NOTE(review): the directory name says "epochs3" although this run uses
    # 8 epochs - looks like a leftover from an earlier experiment; confirm.
    logging_dir='./logs_oversampled_epochs3',
    logging_steps=100,
    push_to_hub=False,
)

print("Using standard Trainer as RandomOverSampler has (likely) balanced the training data.")

# Newer transformers releases take the tokenizer through `processing_class`
# and emit a deprecation warning for the `tokenizer` keyword (announced for
# removal in v5); older releases only know `tokenizer`. The install in this
# notebook is unpinned, so pick whichever keyword the installed Trainer has.
import inspect

_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset_hf,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)
print("Trainer initialized.")

# --- Train and Evaluate ---
# A training failure is re-raised after being reported: `exit()` in a notebook
# kernel does not reliably stop the cell, and evaluating, threshold-tuning and
# explaining a model whose training did not complete would produce misleading
# results.
print("\nStarting training...")
try:
    trainer.train()
except Exception as e:
    print(f"Error during training: {e}")
    raise
print("Training finished.")

# Runs trainer.evaluate on the held-out split and prints every returned entry
# with four decimals. Errors are printed and execution continues (best effort).
print("\nEvaluating the best model on the evaluation set (default threshold 0.5)...")
try:
    eval_results_default_threshold = trainer.evaluate(eval_dataset=eval_dataset_hf) # Pass eval_dataset_hf
    print("\nEvaluation Results (Default Threshold):")
    for key, value in eval_results_default_threshold.items():
        # The .4f format assumes every returned value is numeric.
        print(f"  {key}: {value:.4f}")
except Exception as e:
    print(f"Error during default threshold evaluation: {e}")

# --- Threshold Tuning Section ---
# Sweeps decision thresholds over the positive-class probability on the
# evaluation split, records precision/recall/F1 at each one, keeps the first
# threshold that reaches the highest F1, and saves a plot of the three curves.
# NOTE(review): the threshold is selected on the same split it is scored on,
# so the reported best F1 is likely optimistic - confirm this is intended.
print("\n--- Starting Threshold Tuning ---")
try:
    predictions_output = trainer.predict(eval_dataset_hf)
    logits = predictions_output.predictions
    true_labels = predictions_output.label_ids

    # Softmax over the class axis; column 1 is the positive class.
    probabilities_all_classes = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    probs_positive_class = probabilities_all_classes[:, 1]
    print(f"Successfully got probabilities for {len(probs_positive_class)} evaluation samples.")

    thresholds = np.arange(0.05, 1.0, 0.01)
    # -1 sentinels guarantee that the first threshold replaces them.
    best_f1 = best_threshold = best_precision = best_recall = -1
    precision_scores = []
    recall_scores = []
    f1_scores_thresh = []

    for threshold in thresholds:
        predicted_labels = (probs_positive_class >= threshold).astype(int)
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_labels,
            predicted_labels,
            average='binary',
            zero_division=0,
        )
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores_thresh.append(f1)

        # Strict comparison: on ties the lowest threshold wins.
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_precision = precision
            best_recall = recall

    print("\n--- Threshold Tuning Results ---")
    print(f"Best Threshold found: {best_threshold:.2f}")
    print(f"  Precision at best threshold: {best_precision:.4f}")
    print(f"  Recall at best threshold:    {best_recall:.4f}")
    print(f"  F1-Score at best threshold:  {best_f1:.4f}")

    # One figure with the three curves and a marker at the chosen threshold.
    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, precision_scores, label='Precision', marker='.')
    plt.plot(thresholds, recall_scores, label='Recall', marker='.')
    plt.plot(thresholds, f1_scores_thresh, label='F1-Score', marker='.')
    plt.axvline(
        best_threshold,
        color='r',
        linestyle='--',
        label=f'Best Threshold (F1={best_f1:.2f}) @ {best_threshold:.2f}',
    )
    plt.title('Precision, Recall, and F1-Score vs. Classification Threshold')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)

    plot_path = os.path.join(OUTPUT_DIR, "threshold_tuning_plot_epochs3.png")
    plt.savefig(plot_path)
    print(f"Threshold tuning plot saved to {plot_path}")

except Exception as e:
    print(f"Error during threshold tuning: {e}")

# --- SHAP Explainer Section ---
# Explains a few evaluation samples that the model labelled Readmitted=1.
# The whole section is best effort: any error is reported and the cell finishes.
print("\n--- Starting SHAP Explanations ---")

# Upper bound on the number of samples explained (SHAP on long prompts is slow).
SHAP_MAX_SAMPLES = 2


def _predict_readmission_proba(texts):
    """Return class probabilities, one row per text, for raw prompt strings.

    SHAP calls this with batches of (partially masked) strings, so tokenization
    has to happen here: the fine-tuned model itself only accepts tensors and
    cannot be handed to `shap.Explainer` directly.
    """
    encoded = tokenizer(
        [str(text) for text in texts],
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    ).to(model.device)
    model.eval()
    with torch.no_grad():
        batch_logits = model(**encoded).logits
    return torch.softmax(batch_logits, dim=-1).cpu().numpy()


try:
    print("Creating SHAP explainer...")
    explainer = shap.Explainer(
        _predict_readmission_proba,
        shap.maskers.Text(tokenizer),
        output_names=["Readmitted=0", "Readmitted=1"],
    )
    print("SHAP explainer created.")

    # Reuse the predictions from threshold tuning when that section produced them.
    if 'predictions_output' not in locals():
        print("Running predictions on eval_dataset for SHAP...")
        predictions_output = trainer.predict(eval_dataset_hf)

    # Always derive logits/probabilities from predictions_output here: the names
    # set by the threshold-tuning section do not exist if that section failed
    # before assigning them.
    shap_logits = predictions_output.predictions
    shap_probs_positive = torch.softmax(torch.tensor(shap_logits), dim=-1).numpy()[:, 1]

    # A DataFrame makes it easy to pick prompts together with labels and predictions.
    eval_df_for_shap = eval_dataset_hf.to_pandas()
    eval_df_for_shap['predicted_label'] = np.argmax(shap_logits, axis=-1)
    eval_df_for_shap['probability_readmitted'] = shap_probs_positive

    predicted_readmitted_samples = eval_df_for_shap[eval_df_for_shap['predicted_label'] == 1]

    if predicted_readmitted_samples.empty:
        print("No samples predicted as Readmitted=1 in the evaluation set to explain.")
    else:
        num_samples_to_explain = min(SHAP_MAX_SAMPLES, len(predicted_readmitted_samples))
        samples_to_explain_df = predicted_readmitted_samples.head(num_samples_to_explain)
        print(f"\nExplaining {num_samples_to_explain} sample(s) where model predicted Readmitted=1:")

        # The explainer works on the raw text, not on the token ids.
        raw_text_to_explain = samples_to_explain_df['prompt'].tolist()

        print("Calculating SHAP values (this might take a while)...")
        shap_values = explainer(raw_text_to_explain)
        print("SHAP values calculated.")

        os.makedirs(OUTPUT_DIR, exist_ok=True)

        for i in range(num_samples_to_explain):
            print(f"\nSHAP Explanation for Sample {i+1} (Predicted Readmitted=1):")
            print(f"True Label: {samples_to_explain_df['label'].iloc[i]}, Predicted Probability (Readmitted): {samples_to_explain_df['probability_readmitted'].iloc[i]:.4f}")

            try:
                # Attributions of sample i towards output index 1 (Readmitted).
                current_shap_values_for_output1 = shap_values[i, :, 1]

                print("Attempting to generate SHAP text plot...")
                shap.plots.text(current_shap_values_for_output1, display=True)

                # The text plot is HTML, not a matplotlib figure, so it is saved
                # as an .html file (plt.title / plt.savefig do not apply to it).
                shap_plot_path = os.path.join(OUTPUT_DIR, f"shap_text_plot_sample_{i+1}.html")
                with open(shap_plot_path, "w", encoding="utf-8") as html_file:
                    html_file.write(shap.plots.text(current_shap_values_for_output1, display=False))
                print(f"SHAP text plot for sample {i+1} saved to {shap_plot_path}")

            except Exception as plot_e:
                print(f"Could not generate SHAP text plot for sample {i+1}: {plot_e}")
                print("SHAP values for tokens (raw):")
                # Plain-text fallback: every token with its attribution to class 1.
                tokens = shap_values.data[i]
                s_values_class1 = shap_values.values[i][:, 1]
                for token, s_val in zip(tokens, s_values_class1):
                    print(f"  '{token}': {s_val:.4f}")

except Exception as e:
    print(f"Error during SHAP explanations: {e}")

print("\nScript finished.")


Loading tokenized dataset from directory: /content/drive/My Drive/Readmissions_Research/Data/tokenized_hf_dataset_explainability...
Loaded dataset with 50000 records.

Dataset Info:
Dataset({
    features: ['label', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 50000
})

Splitting the dataset...
Original training samples: 37500, Evaluation samples: 12500

Applying RandomOverSampler to the training data...
Class distribution before RandomOverSampler: {0: 31381, 1: 6119}
Class distribution after RandomOverSampler: {1: 31381, 0: 31381}
New training dataset size after RandomOverSampler: 62762


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created new training dataset with RandomOverSampler samples.

Initializing model from distilbert-base-uncased...
Model initialized successfully.

Loading evaluation metrics...
Metrics loaded.

Setting up training arguments and trainer...
Using standard Trainer as RandomOverSampler has (likely) balanced the training data.
Trainer initialized.

Starting training...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6605,0.738646,0.61864,0.252445,0.685686,0.369027
