# Passive suicidality in a repressive U.S. political context: Aim I

_WIP - NOT FOR DISTRIBUTION_

_Formats and augments annotated Reddit data with excerpted rationales. Trains and tests baseline and in-domain-adapted BERT, RoBERTa, and DistilBERT pretrained models, and baseline Llama 3.1 for sequence classification. Tunes hyperparameters, saves best-performing model for inference. Includes LIME instance-wise explainability, regex for additional encodings, temperature scaling for calibration._

> aim_i_train_tune_predict.ipynb<br>
> Simone J. Skeen (01-08-2025)

1. [Prepare](#scrollTo=8z4T1_xNZXFb)
2. [Write](#scrollTo=vA0d1kH7bOYg)
3. [Preprocess](#scrollTo=woZYUo6JgN1X)
4. [Train-Adapt-Test](#scrollTo=rxzedr7a-kuK)
5. [Tune-Regularize](#scrollTo=G614tXUn0VWw)
6. [Infer (prelim)](#scrollTo=xnP3hWY2bGlp)
7. [Explain](#scrollTo=I5JVWw7spBaF)
8. [Calibrate](#scrollTo=2nSz7vQh5xYz)

### 1. Prepare
Installs and imports requisite packages, and downloads requisite models. Organizes a directory structure consistent with reproducible analytical pipeline (RAP) conventions.
***

**Install**

In [None]:
%%capture

%pip install accelerate
%pip install bitsandbytes
%pip install brokenaxes
%pip install causalnlp
%pip install contractions
%pip install datasets
%pip install evaluate
%pip install lime
%pip install peft
#%pip install trl
%pip install unidecode
%pip install wandb

#%pip uninstall -y pyarrow datasets
#%pip install pyarrow datasets

#!python -m spacy download en_core_web_lg --user

**Import**

In [None]:
import accelerate
import ast
import bitsandbytes as bnb
import contractions
#import en_core_web_lg
import gzip
import huggingface_hub
import json
import lime
import logging
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import peft
import random
import re
import seaborn as sns
import sklearn
import spacy
import string
import torch
#import trl
import wandb.sdk
import warnings

from brokenaxes import brokenaxes
from causalnlp import Autocoder, CausalInferenceModel
from google.colab import drive
from lightgbm import LGBMClassifier
from lime.lime_text import LimeTextExplainer
from lime.submodular_pick import SubmodularPick
from matplotlib.lines import Line2D
from nltk.text import Text
from sklearn.feature_extraction import text
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import (
                                     KFold,
                                     ParameterGrid,
                                     StratifiedKFold,
                                     train_test_split,
                                     )

from sklearn.metrics import (
                             accuracy_score,
                             average_precision_score,
                             classification_report,
                             cohen_kappa_score,
                             f1_score,
                             matthews_corrcoef,
                             )
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.utils.multiclass import type_of_target
from textblob import TextBlob
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.utils.data import (
                              DataLoader,
                              Dataset,
                              TensorDataset,
                              )
from tqdm import tqdm
from transformers import (
                          AdamW,
                          BertForSequenceClassification,
                          BertTokenizer,
                          DataCollatorForLanguageModeling,
                          DistilBertForSequenceClassification,
                          DistilBertTokenizer,
                          RobertaForSequenceClassification,
                          RobertaTokenizer,
                          Trainer,
                          TrainingArguments,
                          )
from unidecode import unidecode

#spacy.cli.download('en_core_web_lg')

# Notebook-wide display and warning configuration.

from IPython.core.interactiveshell import InteractiveShell
# 'all': display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Opt in to pandas copy-on-write mode.
pd.options.mode.copy_on_write = True

# None lifts the cap on the number of columns shown when a df is displayed.
pd.set_option(
              'display.max_columns',
              None,
              )

# None lifts the cap on the number of rows shown when a df is displayed.
pd.set_option(
              'display.max_rows',
              None,
              )

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(
                      action = 'ignore',
                      category = FutureWarning,
                      )

#!python -m prodigy stats

**Set env variables**

In [None]:
# Placeholder only: fill in at runtime and never commit a real key to the notebook.
os.environ['OPENAI_API_KEY'] = ''
# Confirm the variable is set without echoing the whole environment; displaying
# os.environ would write every secret it holds into the saved cell output.
'OPENAI_API_KEY' in os.environ

**Mount gdrive**

In [None]:
# Mount Google Drive at /content/drive so the project directory is reachable from the Colab runtime.
drive.mount(
            '/content/drive',
            #force_remount = True,
            )

**Structure directories**

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality
#%cd /content/drive/My Drive/#<my_project_folder>

#%mkdir bar_policy_suicidality
#%cd bar_policy_suicidality

In [None]:
#%mkdir inputs outputs code temp

In [None]:
#%cd inputs
#%mkdir annotation archives data

In [None]:
#%cd ../outputs
#%mkdir models tables figures

In [None]:
bar_policy_suicidality/
├── inputs/
│   ├── archives
│   │   └── ### archive name TKTK
│   └── data
│       └── d_annotated.xlsx
├── outputs/
│   ├── models
│   ├── tables
│   └── figures
├── code/
└── temp/

### 2. Write
Writes requisite custom functions to .py scripts and imports them.
***

In [None]:
%cd code

#### preprocess.py

**_augment_training_data_with_rationales_**

In [None]:
%%writefile preprocess.py

import pandas as pd

def augment_training_data_with_rationales(df):
    """
    Returns a copy of df in which every row with strn == 1 is followed by a duplicate row whose 'text'
    is replaced by that row's 'rtnl' (the concatenated asp-dep-val rationale). All other rows pass
    through unchanged; each duplicate keeps the index label of its source row.
    """
    rows = []

    for _, original in df.iterrows():
        rows.append(original)

        # only pos_1 strn rows receive a rationale-based duplicate

        if original['strn'] != 1:
            continue

        rationale_row = original.copy()
        rationale_row['text'] = rationale_row['rtnl']
        rows.append(rationale_row)

    return pd.DataFrame(rows)

**_dummy_code_augmented_rows_**

In [None]:
%%writefile -a preprocess.py

import pandas as pd

def dummy_code_augmented_rows(df):
    """
    Flags rationale-augmented rows so they can be dropped prior to evaluation.

    Works on a re-indexed (0..n-1) copy of df and adds an 'aug' column: 1 where a row's 'rtnl' is not
    the '.' placeholder and equals the 'rtnl' of the row directly above it, else 0. The first row is
    always 0.
    """
    df = df.reset_index(drop = True)

    rationales = df['rtnl'].tolist()

    # a row is treated as augmented when it repeats the (non-placeholder) rationale of its predecessor

    is_augmented = [
                    bool(pos > 0 and current != '.' and current == rationales[pos - 1])
                    for pos, current in enumerate(rationales)
                    ]

    df['aug'] = 0

    if any(is_augmented):
        df.loc[is_augmented, 'aug'] = 1

    return df

**_read_and_append_jsonl_posts_**

In [None]:
%%writefile -a preprocess.py

import os
import pandas as pd

def read_and_append_jsonl_posts(directory, chunk_size = 10000):
    """
    Reads and appends JSONL posts archives from Arctic Shift archives dir.

    Parameters:
    -----------
    directory : str
        Directory scanned (non-recursively) for files whose names end in "_posts.jsonl".

    chunk_size : int
        Number of JSONL lines parsed per chunk.

    Returns:
    --------
    d_posts : pd.DataFrame
        All matching files concatenated with a fresh 0..n-1 index; an empty df if nothing matched.
    """
    chunks = []

    # sorted: os.listdir order is arbitrary, so sorting keeps row order reproducible across runs

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith("_posts.jsonl"):
            continue

        filepath = os.path.join(
                                directory,
                                filename,
                                )

        # context manager closes the underlying file handle once every chunk is consumed

        with pd.read_json(
                          filepath,
                          lines = True,
                          chunksize = chunk_size,
                          ) as reader:
            chunks.extend(reader)

    # pd.concat raises on an empty list; preserve the original empty-df result

    if not chunks:
        return pd.DataFrame()

    # single concat: appending inside the loop re-copies all accumulated rows per chunk (quadratic)

    d_posts = pd.concat(chunks, ignore_index = True)

    return d_posts

**_read_and_append_jsonl_comments_**

In [None]:
%%writefile -a preprocess.py

import os
import pandas as pd

def read_and_append_jsonl_comments(directory, chunk_size = 10000):
    """
    Reads and appends JSONL comments archives from Arctic Shift archives dir.

    Parameters:
    -----------
    directory : str
        Directory scanned (non-recursively) for files whose names end in "_comments.jsonl".

    chunk_size : int
        Number of JSONL lines parsed per chunk.

    Returns:
    --------
    d_comments : pd.DataFrame
        All matching files concatenated with a fresh 0..n-1 index; an empty df if nothing matched.
    """
    chunks = []

    # sorted: os.listdir order is arbitrary, so sorting keeps row order reproducible across runs

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith("_comments.jsonl"):
            continue

        filepath = os.path.join(
                                directory,
                                filename,
                                )

        # context manager closes the underlying file handle once every chunk is consumed

        with pd.read_json(
                          filepath,
                          lines = True,
                          chunksize = chunk_size,
                          ) as reader:
            chunks.extend(reader)

    # pd.concat raises on an empty list; preserve the original empty-df result

    if not chunks:
        return pd.DataFrame()

    # single concat: appending inside the loop re-copies all accumulated rows per chunk (quadratic)

    d_comments = pd.concat(chunks, ignore_index = True)

    return d_comments

#### redact.py

**_ner_redact_post_texts_**

In [None]:
%%writefile redact.py

import spacy
nlp = spacy.load('en_core_web_lg')

def ner_redact_post_texts(p_text):
    """
    Redacts named entities recognized by spaCy EntityRecognizer, replaces with <|PII|> pseudo-word token.

    Every occurrence of a recognized entity's surface text is replaced anywhere in the string, not
    only at the recognized span. Entities are replaced longest-first so that a shorter entity
    (e.g., "John") cannot break up a longer one (e.g., "John Smith") and leave part of it unredacted.

    Parameters:
    -----------
    p_text : str
        Post text. Non-str input is coerced with str() before parsing.

    Returns:
    --------
    str
        The coerced text with all redacted entity strings replaced by '<|PII|>'.
    """
    redacted_labels = {
                       'PERSON',   ### people, including fictional
                       'NORP',     ### nationalities or religious or political groups
                       'FAC',      ### buildings, airports, highways, bridges, etc.
                       'ORG',      ### companies, agencies, institutions, etc.
                       #'GPE',     ### countries, cities, states
                       'LOC',      ### non-GPE locations, mountain ranges, bodies of water
                       'PRODUCT',  ### objects, vehicles, foods, etc. (not services)
                       'EVENT',    ### named hurricanes, battles, wars, sports events, etc.
                       }

    # coerce once so the parsed text and the returned text are the same string

    final_string = str(p_text)
    doc = nlp(final_string)

    # de-duplicate entity strings; each str.replace already covers every occurrence

    entity_texts = {ent.text for ent in doc.ents if ent.label_ in redacted_labels}

    # longest first (ties broken alphabetically for determinism)

    for entity_text in sorted(entity_texts, key = lambda s: (-len(s), s)):
        final_string = final_string.replace(
                                            entity_text,
                                            '<|PII|>',
                                            )

    return final_string

#### bert_train.py

**_set_seed_**

In [None]:
%%writefile bert_train.py

import torch
import numpy as np
import random

def set_seed(seed):
    """
    Seeds the Python, NumPy, and PyTorch (CPU and CUDA) random number generators with the same value,
    and switches cuDNN to deterministic mode with autotuning (benchmark) disabled, for reproducibility.
    """
    seeders = (
               random.seed,
               np.random.seed,
               torch.manual_seed,
               torch.cuda.manual_seed,
               torch.cuda.manual_seed_all,
               )

    for seeder in seeders:
        seeder(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

**_train_eval_save_bl_models_**

In [None]:
%%writefile -a bert_train.py

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
                          BertForSequenceClassification,
                          RobertaForSequenceClassification,
                          DistilBertForSequenceClassification,
                          BertTokenizer,
                          RobertaTokenizer,
                          DistilBertTokenizer,
                          get_linear_schedule_with_warmup,
                          )
from sklearn.metrics import f1_score, matthews_corrcoef, average_precision_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn import CrossEntropyLoss
import numpy as np
import pandas as pd

def _build_loader(tokenizer, texts, labels, batch_size, shuffle):
    """
    Tokenizes texts (padded + truncated) and wraps input_ids, attention_mask, and labels in a DataLoader.
    """
    encoded = tokenizer(
                        texts.tolist(),
                        padding = True,
                        truncation = True,
                        return_tensors = 'pt',
                        )

    dataset = TensorDataset(
                            encoded['input_ids'],
                            encoded['attention_mask'],
                            torch.tensor(labels),
                            )

    return DataLoader(
                      dataset,
                      batch_size = batch_size,
                      shuffle = shuffle,
                      )


def _collect_predictions(model, loader, use_cuda):
    """
    Runs model over loader in eval mode without gradients.

    Returns (predicted labels, class-probability matrix of shape n_samples x n_classes, true labels).
    """
    model.eval()
    predictions, probabilities, true_labels = [], [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:

            if use_cuda:
                input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()

            logits = model(input_ids, attention_mask = attention_mask).logits
            probabilities.extend(torch.softmax(logits, dim = 1).cpu().tolist())
            predictions.extend(torch.argmax(logits, dim = 1).tolist())
            true_labels.extend(labels.tolist())

    return predictions, np.array(probabilities), true_labels


def _score_predictions(true_labels, predictions, probabilities):
    """
    Returns (macro F1, MCC, macro AUPRC) for one evaluation set.

    F1 and MCC use hard predictions. AUPRC uses predicted probabilities: average precision is a
    ranking metric, so passing argmax labels collapses the precision-recall curve to a single point.
    """
    f1_macro = f1_score(
                        true_labels,
                        predictions,
                        average = 'macro',
                        )

    mcc = matthews_corrcoef(
                            true_labels,
                            predictions,
                            )

    n_outputs = probabilities.shape[1]

    if n_outputs == 2:

        # binary: score is the probability of the positive (encoded 1) class

        auprc = average_precision_score(
                                        true_labels,
                                        probabilities[:, 1],
                                        average = 'macro',
                                        )
    else:

        # multiclass: one-vs-rest indicator matrix against the full probability matrix

        indicators = np.eye(n_outputs, dtype = int)[np.asarray(true_labels)]
        auprc = average_precision_score(
                                        indicators,
                                        probabilities,
                                        average = 'macro',
                                        )

    return f1_macro, mcc, auprc


def train_eval_save_bl_models(target_datasets, targets_and_class_weights, models, save_path, cycle, hyperparameter_grid):
    """
    Fine-tune and eval pre-trained baseline LMs on multiple targets using target-specific train and test datasets.

    Non-augmented rows (aug == 0) of d_train_{target} are split using stratified 5-fold cross-validation:
    4 folds for training and 1 fold for validation. All augmented rows (aug == 1) are added to the training
    folds; validation folds use original data only. Per target, the model with the highest mean validation
    F1 (macro) across folds is selected; its final-fold instance is saved and evaluated on the separate
    test set d_test_{target}.

    Parameters:
    -----------
    target_datasets : dict
        A dictionary where keys are target names and values are tuples containing the train and test datasets for each target.
        For example: {'asp': (d_train_asp, d_test_asp), 'dep': (d_train_dep, d_test_dep)}
        Train dfs must contain 'text', 'aug', and the target column; test dfs 'text' and the target column.

    targets_and_class_weights : dict
        A dictionary where keys are target names and values are lists of class weights corresponding to each target.

    models : dict
        A dictionary of models to evaluate. Each key is a model name, and each value is a tuple containing:
        - the model class,
        - the tokenizer class,
        - the name of the pre-trained model.

    save_path : str
        The path prefix where the best models and the performance table will be saved. Concatenated directly
        with file names, so it must end with a path separator.

    cycle : str
        The cycle identifier: 'baseline', indicating performance with prespecified default params; 'adapted', indicating performance
        with in-domain adapted params.

    hyperparameter_grid : dict
        Dictionary containing hyperparameter values: batch_size, gradient_accumulation_steps, learning_rate, num_epochs, warmup_steps, and weight_decay.
        Missing keys fall back to 4, 1, 2e-5, 2, 0, and 0.0, respectively.

    Returns:
    --------
    d_{cycle}_performance : pd.DataFrame
        A df containing performance metrics for each target and model per fold per pre-specified cycle,
        and final evaluation on test data. Also written to {save_path}d_{cycle}_performance.xlsx.
    """

    n_splits = 5

    # verify cycle

    print(f"CYCLE: {cycle}")

    # check CUDA

    use_cuda = torch.cuda.is_available()
    print("CUDA: ", use_cuda)

    # set seed

    set_seed(56)

    # unpack hyperparameters

    batch_size = hyperparameter_grid.get('batch_size', 4)
    gradient_accumulation_steps = hyperparameter_grid.get('gradient_accumulation_steps', 1)
    learning_rate = hyperparameter_grid.get('learning_rate', 2e-5)
    num_epochs = hyperparameter_grid.get('num_epochs', 2)
    warmup_steps = hyperparameter_grid.get('warmup_steps', 0)
    weight_decay = hyperparameter_grid.get('weight_decay', 0.0)

    # best target x model F1 tracking; -inf so the first model always registers, even with mean F1 of 0

    best_f1_scores = {
                      target: {'score': float('-inf'), 'model': None, 'model_instance': None}
                      for target in targets_and_class_weights
                      }
    results = []

    # training loop: target x model

    for target, class_weights in targets_and_class_weights.items():
        print("\n======================================================================================")
        print(f"Label: {target}")
        print("======================================================================================")

        # target-specific datasets

        d_train, d_test = target_datasets[target]

        # split augmented v. non-augmented data

        d_train_aug = d_train[d_train['aug'] == 1]
        d_train_no_aug = d_train[d_train['aug'] == 0]

        # encode labels: one encoder per target, fit ONCE on all training labels. Fitting separately on the
        # augmented subset would remap its labels whenever that subset lacks a class (e.g., all-positive
        # augmented rows would be encoded as 0).

        le = LabelEncoder()
        le.fit(d_train[target].values)
        num_labels = max(2, len(le.classes_))  ### never 1: a single output would be treated as regression

        # prepare fold-wise training v. validation data

        X_train_aug = d_train_aug['text'].values
        X_train_no_aug = d_train_no_aug['text'].values
        X_test = d_test['text'].values

        # astype: transform() on an empty subset returns a float array

        y_train_aug = le.transform(d_train_aug[target].values).astype(np.int64)
        y_train_no_aug = le.transform(d_train_no_aug[target].values).astype(np.int64)
        y_test = le.transform(d_test[target].values).astype(np.int64)

        # class-weighted loss, shared across models and folds for this target

        weight_tensor = torch.tensor(
                                     class_weights,
                                     dtype = torch.float,
                                     )
        if use_cuda:
            weight_tensor = weight_tensor.cuda()

        criterion = CrossEntropyLoss(weight = weight_tensor)

        # define k folds

        k_fold = StratifiedKFold(
                                 n_splits = n_splits,
                                 shuffle = True,
                                 random_state = 56,
                                 )

        for model_name, (model_class, tokenizer_class, pretrained_model_name) in models.items():
            print(f"\nFine-tuning {model_name} for {target}")
            print("--------------------------------------------------------------------------------------")

            fold_f1, fold_mcc, fold_auprc = [], [], []  ### store fold-wise performance metrics

            # initialize tokenizer

            tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)

            for fold_idx, (train_no_aug_idx, valid_idx) in enumerate(k_fold.split(X_train_no_aug, y_train_no_aug)):
                print(f"\nFold {fold_idx + 1}/{n_splits}")

                # create training set: combine aug = 1 (augmented) with fold-specific aug = 0 (non-augmented)
                # NOTE(review): all augmented rows enter every training fold, including rationales excerpted
                # from posts that sit in this fold's validation split - possible leakage; confirm intended.

                X_train_fold = np.concatenate([X_train_aug, X_train_no_aug[train_no_aug_idx]])
                y_train_fold = np.concatenate([y_train_aug, y_train_no_aug[train_no_aug_idx]])

                X_valid_fold = X_train_no_aug[valid_idx]
                y_valid_fold = y_train_no_aug[valid_idx]

                # tokenize training and validation data

                train_loader = _build_loader(tokenizer, X_train_fold, y_train_fold, batch_size, shuffle = True)
                valid_loader = _build_loader(tokenizer, X_valid_fold, y_valid_fold, batch_size, shuffle = False)

                # instantiate model

                model = model_class.from_pretrained(
                                                    pretrained_model_name,
                                                    num_labels = num_labels,
                                                    )

                # migrate to CUDA

                if use_cuda:
                    model = model.cuda()

                # set optimizer + scheduler

                optimizer = torch.optim.AdamW(
                                              model.parameters(),
                                              lr = learning_rate,
                                              weight_decay = weight_decay,
                                              )

                # scheduler counts optimizer updates, not batches: one update per accumulation group,
                # plus one for a trailing partial group each epoch (ceil division)

                n_batches = len(train_loader)
                updates_per_epoch = (n_batches + gradient_accumulation_steps - 1) // gradient_accumulation_steps
                total_steps = updates_per_epoch * num_epochs

                scheduler = get_linear_schedule_with_warmup(
                                                            optimizer,
                                                            num_warmup_steps = warmup_steps,
                                                            num_training_steps = total_steps,
                                                            )

                # fine-tune model on training folds (x4)

                model.train()

                for epoch in range(num_epochs):
                    optimizer.zero_grad()

                    for i, batch in enumerate(train_loader):
                        input_ids, attention_mask, labels = batch
                        labels = labels.long()

                        if use_cuda:
                            input_ids, attention_mask, labels = input_ids.cuda(), attention_mask.cuda(), labels.cuda()

                        outputs = model(input_ids, attention_mask = attention_mask)
                        logits = outputs.logits

                        # accumulate gradients, normalize loss

                        loss = criterion(logits, labels) / gradient_accumulation_steps
                        loss.backward()

                        # update model weights post-accumulation steps; also flush on the last batch so
                        # leftover gradients are applied rather than carried into the next epoch

                        if (i + 1) % gradient_accumulation_steps == 0 or (i + 1) == n_batches:
                            optimizer.step()
                            scheduler.step()  ### after optimizer.step(), once per update
                            optimizer.zero_grad()

                # evaluate on validation fold (x1)

                valid_predictions, valid_probabilities, valid_true_labels = _collect_predictions(
                                                                                                 model,
                                                                                                 valid_loader,
                                                                                                 use_cuda,
                                                                                                 )

                # performance metrics per validation fold

                f1_macro, mcc, auprc = _score_predictions(
                                                          valid_true_labels,
                                                          valid_predictions,
                                                          valid_probabilities,
                                                          )

                fold_f1.append(f1_macro)
                fold_mcc.append(mcc)
                fold_auprc.append(auprc)

            # mean results over folds, track best model (instance kept is the one trained on the final fold)

            mean_f1 = np.mean(fold_f1)
            if mean_f1 > best_f1_scores[target]['score']:
                best_f1_scores[target]['score'] = mean_f1
                best_f1_scores[target]['model'] = model_name
                best_f1_scores[target]['model_instance'] = model

                save_model_name = f'{target}_{model_name}_best_{cycle}_model.pt'
                torch.save(model.state_dict(), save_path + save_model_name)

            # store results for each fold

            for fold_number, (f1_macro, mcc, auprc) in enumerate(zip(fold_f1, fold_mcc, fold_auprc), start = 1):
                results.append({
                                'target': target,
                                'model': model_name,
                                'fold': fold_number,
                                'f1_macro': f1_macro,
                                'mcc': mcc,
                                'auprc': auprc
                                })

        # test on held-out d_test_{target} df

        print(f"\nTest on held-out d_test_{target} using the best {best_f1_scores[target]['model']} model")
        print("--------------------------------------------------------------------------------------")

        test_model = best_f1_scores[target]['model_instance']

        # ensure correct tokenizer for testing: look up by best model name, not the last model in the loop

        best_model_name = best_f1_scores[target]['model']
        best_pretrained_model_name = models[best_model_name][2]
        tokenizer = models[best_model_name][1].from_pretrained(best_pretrained_model_name)

        test_loader = _build_loader(tokenizer, X_test, y_test, batch_size, shuffle = False)

        all_test_predictions, all_test_probabilities, all_test_true_labels = _collect_predictions(
                                                                                                  test_model,
                                                                                                  test_loader,
                                                                                                  use_cuda,
                                                                                                  )

        # performance metrics for held-out d_test_{target} df

        test_f1_macro, test_mcc, test_auprc = _score_predictions(
                                                                 all_test_true_labels,
                                                                 all_test_predictions,
                                                                 all_test_probabilities,
                                                                 )

        # display

        print(f"Test F1 (macro) for {target}: {test_f1_macro}")
        print(f"Test MCC for {target}: {test_mcc}")
        print(f"Test AUPRC for {target}: {test_auprc}")

        # store

        results.append({
                        'target': target,
                        'model': best_f1_scores[target]['model'],
                        'fold': 'Test',
                        'f1_macro': test_f1_macro,
                        'mcc': test_mcc,
                        'auprc': test_auprc
                        })

    # summarize + return d_{cycle}_performance df

    print("\n--------------------------------------------------------------------------------------")
    print(f"Summary")
    print("--------------------------------------------------------------------------------------")

    for target, info in best_f1_scores.items():
        print(f"Best F1 (macro) for {target}: {info['score']} achieved by {info['model']}")

    d_performance = pd.DataFrame(results)
    print(f"\nd_{cycle}_performance:")
    print(d_performance.head(5))
    d_performance.to_excel(f'{save_path}d_{cycle}_performance.xlsx')

    return d_performance

**_performance_scatterplot_**

In [None]:
%%writefile -a bert_train.py

from brokenaxes import brokenaxes
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

def performance_scatterplot(df, plot_name):
    """
    Plots F1 (macro) scores as a jittered categorical scatterplot on a broken y-axis, overlaid
    with per-target, per-model means and SD error bars, then saves and displays the figure.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe containing columns 'target', 'f1_macro', 'model', and 'fold'. Rows with
        'fold' == 'Test' are drawn with the 'o' marker, all other rows with the '.' marker.
        Only models 'bert', 'roberta', and 'distilbert' are drawn; rows for any other model
        value are left out of the plot. df itself is not modified.

    plot_name : str
        Used in the y-axis label and as the filename prefix of the saved figures
        (without extension).

    Returns:
    --------
    None. Saves '{plot_name}_low_res_scatter.png' (100 dpi) and
    '{plot_name}_high_res_scatter.png' (300 dpi), then shows the figure.
    """

    # aesthetics

      ### SJS 10/1: last three colors in "Retro Metro (Default)" https://www.heavy.ai/blog/12-color-palettes-for-telling-better-stories-with-your-data

    model_colors = {
                    'bert': '#87bc45',
                    'roberta': '#27aeef',
                    'distilbert': '#b33dc6',
                    }

    # x-axis offsets: keep the three model means side by side within each target

    model_offsets = {
                     'bert': -0.3,
                     'roberta': 0.0,
                     'distilbert': 0.3,
                     }

    sns.set_style(
                  style = 'whitegrid',
                  rc = None,
                  )

    # map target: numeric position

    target_mapping = {
                      'asp': 0,
                      'dep': 2,
                      'val': 4,
                      'prg': 6,
                      'tgd': 8,
                      'age': 10,
                      'race': 12,
                      'dbty': 14
                     }

    # work on a copy so the caller's df does not gain plotting-only columns

    df = df.copy()
    df['target_numeric'] = pd.to_numeric(df['target'].map(target_mapping))

    # inject noise for jitter

    df['target_jitter'] = df['target_numeric'] + np.random.uniform(
                                                                   -0.35,
                                                                   0.35,
                                                                   size = len(df),
                                                                   )

    # initialize fig. with broken y-axis (brokenaxes is created on the figure opened here)

    plt.figure(figsize = (12, 5.5))
    bax = brokenaxes(
                     ylims = ((0, 0.1), (0.4, 1)), ### y-axis bounds
                     hspace = 0.1, ### y-axis break space
                     )

    # distinguish markers: held-out test set ('fold' = Test) v. all other folds

    is_test = df['fold'] == 'Test'

    for fold_mask, marker in ((is_test, 'o'), (~is_test, '.')):
        fold_data = df[fold_mask]

        for model, color in model_colors.items():
            model_data = fold_data[fold_data['model'] == model]
            if model_data.empty:
                continue

            bax.scatter(
                        model_data['target_jitter'],
                        model_data['f1_macro'],
                        color = color,
                        s = 40,
                        alpha = 0.6,
                        label = None,
                        marker = marker,
                        )

    # mean and SD of f1_macro for each target x model

    mean_std_df = df.groupby(['target', 'model']).agg(
                                                      mean_f1_macro = ('f1_macro', 'mean'),
                                                      std_f1_macro = ('f1_macro', 'std'),
                                                      ).reset_index()

    mean_std_df['target_numeric'] = mean_std_df['target'].map(target_mapping).astype(float)
    mean_std_df['target_offset'] = mean_std_df['target_numeric'] + mean_std_df['model'].map(model_offsets)

    # means (SDs), error bars

    errorbar_cols = ['target_offset', 'mean_f1_macro', 'std_f1_macro']

    for model, color in model_colors.items():

        # drop only the rows that cannot be drawn (e.g., SD undefined for a single observation)
        # rather than discarding every error bar for the model

        model_data = mean_std_df[mean_std_df['model'] == model].dropna(subset = errorbar_cols)
        if model_data.empty:
            continue

        # draw through bax (not plt) so the bars appear on both segments of the broken axis

        bax.errorbar(
                     model_data['target_offset'],
                     model_data['mean_f1_macro'],
                     yerr = model_data['std_f1_macro'],
                     fmt = 'D',
                     markersize = 7,
                     capsize = 0,
                     elinewidth = 1,
                     markeredgewidth = 1,
                     color = color,
                     )

    # label axes

    bax.set_xlabel(
                   'Target',
                   fontsize = 12,
                   labelpad = 30,
                   )

    bax.set_ylabel(
                   f'$F_1$ (macro): {plot_name}',
                   fontsize = 12,
                   labelpad = 30,
                   )

    # x-tick: label lower axis

    bax.axs[1].set_xticks(list(target_mapping.values()))
    bax.axs[1].set_xticklabels(list(target_mapping.keys()), rotation = 45, fontsize = 10)

    bax.grid(False)

    # custom legend: one entry per model, same colors as the markers

    legend_elements = [
                       Line2D([0], [0], marker = 'o', color = 'w', label = model, markersize = 8, markerfacecolor = color, lw = 0)
                       for model, color in model_colors.items()
                      ]

    bax.axs[0].legend(
                      handles = legend_elements,
                      loc = 'upper center',
                      bbox_to_anchor = (0.5, 1.15),
                      ncol = 4,
                      fontsize = 9,
                      frameon = False,
                      )

    # save

    plt.savefig(f'{plot_name}_low_res_scatter.png', dpi = 100)
    plt.savefig(f'{plot_name}_high_res_scatter.png', dpi = 300)

    # display

    plt.show()


**_iterative_stratified_train_test_split_with_rationales_**

In [None]:
%%writefile -a bert_train.py

import pandas as pd
from sklearn.model_selection import train_test_split

def iterative_stratified_train_test_split_with_rationales(df, targets, test_size, random_state):
    """
    Builds one target-stratified train/test partition per entry in targets.

    Only non-augmented rows (aug != 1) are split; all augmented 'rationale' rows (aug = 1) are
    then appended to the train partition, so the test partition holds non-augmented rows only.
    A target with no non-augmented rows is reported and skipped.

    Returns a dict keyed by target name; each value is a (d_train, d_test) tuple restricted to
    the 'text', 'aug', and target columns.
    """

    splits_by_target = {}

    for target in targets:

        # helper col 'targets' carries the labels used for stratification

        labelled = df.assign(targets = df[target])

        # partition: augmented rationales vs. original rows

        is_augmented = labelled['aug'] == 1
        rationales = labelled[is_augmented]
        originals = labelled[labelled['aug'] != 1]

        if originals.empty:
            print(f"No non-augmented rows for target {target}. Skipping...")
            continue

        # stratify on original rows only

        kept_for_train, held_out = train_test_split(
                                                    originals,
                                                    test_size = test_size,
                                                    stratify = originals['targets'],
                                                    random_state = random_state,
                                                    )

        keep_cols = ['text', 'aug', target]

        # train set: originals + rationales, shuffled, re-indexed

        shuffled_train = pd.concat([kept_for_train, rationales]).sample(
                                                                        frac = 1,
                                                                        random_state = random_state,
                                                                        )
        d_train = shuffled_train.reset_index(drop = True)[keep_cols]

        # test set: held-out originals, re-indexed

        d_test = held_out[keep_cols].reset_index(drop = True)

        splits_by_target[target] = (d_train, d_test)

        # inspect: 'aug' counts for both partitions first

        for split_name, frame in (('train', d_train), ('test', d_test)):
            print(f"\nVerify: d_{split_name}_{target} 'aug' count")
            print(frame['aug'].value_counts(normalize = False))

        # inspect: shape, class balance, first rows

        summaries = (
                     ('train', d_train, 'Augmented training data'),
                     ('test', d_test, 'De-augmented testing data'),
                     )

        for split_name, frame, description in summaries:
            print(f"\n--------------------------------------------------------------------------------------")
            print(f"d_{split_name}_{target}: {description} for target '{target}'")
            print(f"--------------------------------------------------------------------------------------")
            print(frame.shape)
            print(frame[target].value_counts(normalize = True))
            print(frame.head(6))

    return splits_by_target


**_tune_and_optimize_model_hyperparams_**

In [None]:
%%writefile -a bert_train.py

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import pandas as pd

def tune_and_optimize_model_hyperparams(tokenizer, model_class, pretrained_model_name, d_train, d_test, target, class_weights, save_path, hyperparameter_grid):
    """
    Tune and optimize model hyperparameters for a specific model-target combination.

    For every configuration in hyperparameter_grid, a fresh pre-trained model is loaded, trained
    on d_train with class-weighted cross-entropy loss, and scored by F1 (macro) on d_test. The
    predictions, probabilities, and weights of the best-scoring configuration are kept.

    Parameters:
    -----------

    tokenizer:
        Pre-trained tokenizer.

    model_class:
        Pre-trained model class.

    pretrained_model_name:
        Name of the pre-trained model.

    d_train : pd.DataFrame
        Training dataset; must contain 'text' and the target column.

    d_test : pd.DataFrame
        Test dataset; must contain 'text' and the target column. Modified in place: gains
        'predicted_labels' and 'predicted_probabilities' columns.

    target : str
        The target variable for classification.

    class_weights : torch.tensor
        Weights for each class.

    save_path : str
        Directory in which the best model and the .xlsx outputs are saved.

    hyperparameter_grid : dict
        Dictionary where keys are hyperparameter names and values are lists of possible values.
        Required keys: 'batch_size', 'learning_rate', 'weight_decay', 'warmup_steps',
        'num_epochs', 'gradient_accumulation_steps'.

    Returns:
    --------
    d_test : pd.DataFrame
        Test dataset with predictions and probabilities.

    d_tuned_performance : pd.DataFrame
        DataFrame with the performance metrics for each hyperparameter configuration.
    """

    # check CUDA

    use_cuda = torch.cuda.is_available()
    print("CUDA: ", use_cuda)

    # set seed

    set_seed(56)

    print("======================================================================================")
    print(f"Optimizing: {pretrained_model_name}\nTarget: {target}")
    print("======================================================================================")

    # tokenize train and test sets

    encoded_train = tokenizer(
                              d_train['text'].tolist(),
                              padding = True,
                              truncation = True,
                              return_tensors = 'pt',
                              )

    encoded_test = tokenizer(
                             d_test['text'].tolist(),
                             padding = True,
                             truncation = True,
                             return_tensors = 'pt',
                             )

    # accept dynamic target variables

    train_labels = torch.tensor(d_train[target].values)
    test_labels = torch.tensor(d_test[target].values)

    # prepare datasets

    train_dataset = TensorDataset(
                                  encoded_train['input_ids'],
                                  encoded_train['attention_mask'],
                                  train_labels,
                                  )

    test_dataset = TensorDataset(
                                 encoded_test['input_ids'],
                                 encoded_test['attention_mask'],
                                 test_labels,
                                 )

    # initialize class weights

    if use_cuda:
        class_weights = class_weights.cuda()

    # initialize tracking variables
    # best F1 starts below any attainable score so the first configuration is always recorded

    best_f1_macro = float('-inf')
    best_params = None
    best_model_state = None
    best_predictions = []
    best_probabilities = []

    f1_scores = []
    performance_data = []

    # hyperparam grid search: ParameterGrid

    for hyperparams in ParameterGrid(hyperparameter_grid):
        print(f"\nOptimizing with hyperparameters: {hyperparams}")

        train_loader = DataLoader(
                                  train_dataset,
                                  batch_size = hyperparams['batch_size'],
                                  shuffle = True
                                  )
        test_loader = DataLoader(
                                 test_dataset,
                                 batch_size = hyperparams['batch_size'],
                                 shuffle = False
                                 )

        print(f"\nTotal training rows: {len(train_dataset)}")
        print(f"Total evaluation rows: {len(test_dataset)}")
        print(f"Training batch size: {hyperparams['batch_size']}")
        print(f"Evaluation batch size: {hyperparams['batch_size']}")
        print(f"Total training batches: {len(train_loader)}")
        print(f"Total evaluation batches: {len(test_loader)}")
        print("\n")

        # initialize model

        model = model_class.from_pretrained(pretrained_model_name)
        if use_cuda:
            model.cuda()

        # initialize optimizer and lr scheduler

        optimizer = torch.optim.AdamW(
                                      model.parameters(),
                                      lr = hyperparams['learning_rate'],
                                      weight_decay = hyperparams['weight_decay']
                                      )

        # calculate total steps: the scheduler advances once per optimizer update, i.e., once
        # per group of accumulated batches (the last group of an epoch may be partial)

        accumulation_steps = hyperparams['gradient_accumulation_steps']
        batches_per_epoch = len(train_loader)
        updates_per_epoch = (batches_per_epoch + accumulation_steps - 1) // accumulation_steps
        total_steps = updates_per_epoch * hyperparams['num_epochs']

        # add scheduler with warmup steps

        scheduler = get_linear_schedule_with_warmup(
                                                    optimizer,
                                                    num_warmup_steps = hyperparams['warmup_steps'],
                                                    num_training_steps = total_steps
                                                    )

        criterion = CrossEntropyLoss(weight = class_weights)

        # training loop

        for epoch in range(hyperparams['num_epochs']):
            model.train()
            optimizer.zero_grad()
            for i, batch in enumerate(tqdm(train_loader, desc = f"Training Epoch {epoch + 1}/{hyperparams['num_epochs']}", leave=True)):
                input_ids, attention_mask, labels = batch
                if use_cuda:
                    input_ids, attention_mask, labels = input_ids.cuda(), attention_mask.cuda(), labels.cuda()
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
                loss = loss / accumulation_steps
                loss.backward()

                # also update on the final batch so trailing gradients are applied, not dropped

                is_last_batch = (i + 1) == batches_per_epoch
                if (i + 1) % accumulation_steps == 0 or is_last_batch:
                    optimizer.step()
                    scheduler.step()  ### update learning rate
                    optimizer.zero_grad()

        # eval loop

        model.eval()
        all_predictions = []
        all_true_labels = []
        all_probabilities = []
        with torch.no_grad():
            for batch in test_loader:
                input_ids, attention_mask, labels = batch
                if use_cuda:
                    input_ids, attention_mask, labels = input_ids.cuda(), attention_mask.cuda(), labels.cuda()
                outputs = model(input_ids, attention_mask=attention_mask)
                probabilities = torch.softmax(outputs.logits, dim = 1)
                predictions = torch.argmax(probabilities, dim = 1).cpu().tolist()
                all_predictions.extend(predictions)
                all_true_labels.extend(labels.cpu().tolist())
                all_probabilities.extend(probabilities.cpu().tolist())

        # calculate F1 (macro)

        current_f1_macro = f1_score(all_true_labels, all_predictions, average='macro')
        f1_scores.append(current_f1_macro)
        print(f"\nCurrent F1 macro with params {hyperparams}: {current_f1_macro}")

        # append F1 and current performance data

        performance_data.append({
                                 'pretrained_model_name': pretrained_model_name,
                                 'target': target,
                                 'f1_score': current_f1_macro,
                                 'batch_size': hyperparams['batch_size'],
                                 'weight_decay': hyperparams['weight_decay'],
                                 'learning_rate': hyperparams['learning_rate'],
                                 'warmup_steps': hyperparams['warmup_steps'],
                                 'num_epochs': hyperparams['num_epochs'],
                                 'gradient_accumulation_steps': hyperparams['gradient_accumulation_steps'],
        })

        if current_f1_macro > best_f1_macro:
            best_f1_macro = current_f1_macro
            best_params = hyperparams

            # snapshot weights on CPU: independent of the live model, so its GPU memory can be
            # released, and the saved file loads on machines without CUDA

            best_model_state = {
                                name: tensor.detach().cpu().clone()
                                for name, tensor in model.state_dict().items()
                                }
            best_predictions = all_predictions
            best_probabilities = all_probabilities

        # release this configuration's model before loading the next

        del model, optimizer, scheduler
        if use_cuda:
            torch.cuda.empty_cache()

    d_test['predicted_labels'] = best_predictions
    d_test['predicted_probabilities'] = best_probabilities

    # save d_test_{target} with pred and prob

    print("--------------------------------------------------------------------------------------")
    print(f"Summary: {target}")
    print("--------------------------------------------------------------------------------------")

    print(d_test.head(6))
    d_test.to_excel(f'{save_path}/d_test_tuned_preds_{target}.xlsx')

    if best_model_state is not None:
        model_path = f"{save_path}/{target}_{pretrained_model_name}_best_tuned_model.bin"
        torch.save(best_model_state, model_path)
        print("\nBest model saved with F1 macro:", best_f1_macro)
        print("Best hyperparameters:", best_params)

    # display F1 scores (population SD across configurations)

    f1_mean = sum(f1_scores) / len(f1_scores)
    f1_std = (sum((x - f1_mean) ** 2 for x in f1_scores) / len(f1_scores)) ** 0.5
    print(f"Mean F1 macro: {f1_mean}")
    print(f"Standard deviation of F1 macro: {f1_std}")

    # df: target-wise

    d_tuned_performance = pd.DataFrame(performance_data)
    print(d_tuned_performance.head(10))

    # save: target-wise df

    d_tuned_performance.to_excel(f'{save_path}/d_tuned_performance_{target}.xlsx')

    return d_test, d_tuned_performance


**_tune_and_optimize_model_loss_accuracy_**

In [None]:
%%writefile -a bert_train.py

import matplotlib.pyplot as plt
import os
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
import pandas as pd

def tune_and_optimize_model_loss_accuracy(tokenizer, model_class, pretrained_model_name, d_train, d_test, target, class_weights, save_path, fixed_hyperparameters, num_epochs_range, cycle):
    """
    Trains one model with fixed hyperparameters and tracks per-epoch loss, accuracy, and F1
    (macro) on the training and evaluation sets. Saves a checkpoint after every epoch, the
    per-epoch metrics as .xlsx, and loss and accuracy curves as .png.

    Parameters:
    -----------

    tokenizer:
        Pre-trained tokenizer.

    model_class:
        Pre-trained model class.

    pretrained_model_name:
        Name of the pre-trained model.

    d_train : pd.DataFrame
        Training set; must contain 'text' and the target column.

    d_test : pd.DataFrame
        Evaluation set; must contain 'text' and the target column.

    target : str
        The target variable for classification.

    class_weights : torch.tensor
        Weights for each class.

    save_path : str
        Directory for the metrics file, plots, and the 'RoBERTa_checkpoints' subdirectory.
        Checkpoint directory and filenames use a fixed 'roberta' label regardless of
        pretrained_model_name.

    fixed_hyperparameters : dict
        Required keys: 'batch_size', 'learning_rate', 'weight_decay', 'warmup_steps',
        'gradient_accumulation_steps'. Optional key 'dropout' (default 0.1) is applied to both
        attention and hidden dropout.

    num_epochs_range : iterable of int
        Only its maximum is used: the number of epochs to train.

    cycle :
        Label printed at start and embedded in the names of all saved files.

    Returns:
    --------
    d_epochal_performance : pd.DataFrame
        One row per epoch with training/evaluation loss and accuracy, F1 (macro), and the
        hyperparameters used.
    """

    # check CUDA

    use_cuda = torch.cuda.is_available()
    print("CUDA: ", use_cuda)

    # display cycle

    print(f"CYCLE: {cycle}")

    # set seed

    set_seed(56)

    print("======================================================================================")
    print(f"Optimizing: {pretrained_model_name}\nTarget: {target}")
    print("======================================================================================")

    # tokenize train and test sets

    encoded_train = tokenizer(
                              d_train['text'].tolist(),
                              padding = True,
                              truncation = True,
                              return_tensors = 'pt',
                              )

    encoded_test = tokenizer(
                             d_test['text'].tolist(),
                             padding = True,
                             truncation = True,
                             return_tensors = 'pt',
                             )

    # accept dynamic target variables

    train_labels = torch.tensor(d_train[target].values)
    test_labels = torch.tensor(d_test[target].values)

    # prep datasets

    train_dataset = TensorDataset(
                                  encoded_train['input_ids'],
                                  encoded_train['attention_mask'],
                                  train_labels,
                                  )

    test_dataset = TensorDataset(
                                 encoded_test['input_ids'],
                                 encoded_test['attention_mask'],
                                 test_labels,
                                 )

    # initialize class weights

    if use_cuda:
        class_weights = class_weights.cuda()

    # fix target-specific hyperparams

    num_epochs = max(num_epochs_range)
    print(f"\nTotal epochs to run: {num_epochs}")

    # merge fixed hyperparams, num_epochs

    hyperparams = fixed_hyperparameters.copy()
    hyperparams['num_epochs'] = num_epochs

    train_loader = DataLoader(
                              train_dataset,
                              batch_size = hyperparams['batch_size'],
                              shuffle = True,
                              )

    test_loader = DataLoader(
                             test_dataset,
                             batch_size = hyperparams['batch_size'],
                             shuffle = False,
                             )

    print(f"\nTotal training rows: {len(train_dataset)}")
    print(f"Total evaluation rows: {len(test_dataset)}")
    print(f"Training batch size: {hyperparams['batch_size']}")
    print(f"Evaluation batch size: {hyperparams['batch_size']}")
    print(f"Total training batches: {len(train_loader)}")
    print(f"Total evaluation batches: {len(test_loader)}")
    print("\n")

    # initialize model
    # NOTE(review): these dropout kwargs are BERT/RoBERTa-style config names - confirm that
    # model_class accepts them before using this function with another architecture

    model = model_class.from_pretrained(
                                        pretrained_model_name,
                                        attention_probs_dropout_prob = fixed_hyperparameters.get('dropout', 0.1), ### default = 0.1
                                        hidden_dropout_prob = fixed_hyperparameters.get('dropout', 0.1), ### default = 0.1
                                        )

    if use_cuda:
        model.cuda()

    # initialize optimizer and lr scheduler

    optimizer = torch.optim.AdamW(
                                  model.parameters(),
                                  lr = hyperparams['learning_rate'],
                                  weight_decay = hyperparams['weight_decay'],
                                  )

    # calculate total steps: the scheduler advances once per optimizer update, i.e., once per
    # group of accumulated batches (the last group of an epoch may be partial)

    accumulation_steps = hyperparams['gradient_accumulation_steps']
    batches_per_epoch = len(train_loader)
    updates_per_epoch = (batches_per_epoch + accumulation_steps - 1) // accumulation_steps
    total_steps = updates_per_epoch * num_epochs

    # add scheduler with warmup steps

    scheduler = get_linear_schedule_with_warmup(
                                                optimizer,
                                                num_warmup_steps = hyperparams['warmup_steps'],
                                                num_training_steps = total_steps,
                                                )

    criterion = CrossEntropyLoss(weight = class_weights)

    # set checkpoints subdirectory

    optimized_model_dir = os.path.join(save_path, "RoBERTa_checkpoints")
    os.makedirs(optimized_model_dir, exist_ok = True)

    # initialize tracking variables

    performance_data = []

    # training loop

    for epoch in range(hyperparams['num_epochs']):
        model.train()
        total_train_loss = 0
        correct_train_preds = 0
        total_train_samples = 0

        for i, batch in enumerate(tqdm(train_loader, desc = f"Training Epoch {epoch + 1}/{hyperparams['num_epochs']}", leave = True)):
            input_ids, attention_mask, labels = batch
            if use_cuda:
                input_ids, attention_mask, labels = input_ids.cuda(), attention_mask.cuda(), labels.cuda()
            outputs = model(input_ids, attention_mask = attention_mask)
            loss = criterion(outputs.logits, labels)

            # track the unscaled loss; only the backward pass uses the accumulation-scaled loss

            total_train_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim = 1)
            correct_train_preds += (predictions == labels).sum().item()
            total_train_samples += labels.size(0)

            loss = loss / accumulation_steps
            loss.backward()

            # also update on the final batch so trailing gradients are applied within this
            # epoch instead of leaking into the next one (or being lost after the last epoch)

            is_last_batch = (i + 1) == batches_per_epoch
            if (i + 1) % accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

        # calculate avg training loss, accuracy

        avg_train_loss = total_train_loss / len(train_loader)
        training_accuracy = correct_train_preds / total_train_samples

        # eval loop after each epoch

        model.eval()
        total_eval_loss = 0
        correct_eval_preds = 0
        total_eval_samples = 0
        all_predictions = []
        all_true_labels = []

        with torch.no_grad():
            for batch in test_loader:
                input_ids, attention_mask, labels = batch
                if use_cuda:
                    input_ids, attention_mask, labels = input_ids.cuda(), attention_mask.cuda(), labels.cuda()
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
                total_eval_loss += loss.item()

                probabilities = torch.softmax(outputs.logits, dim = 1)
                predictions = torch.argmax(probabilities, dim = 1)
                correct_eval_preds += (predictions == labels).sum().item()
                total_eval_samples += labels.size(0)

                all_predictions.extend(predictions.cpu().tolist())
                all_true_labels.extend(labels.cpu().tolist())

        # calculate accuracy, avg eval loss

        avg_eval_loss = total_eval_loss / len(test_loader)
        evaluation_accuracy = correct_eval_preds / total_eval_samples

        # calculate f1

        current_f1_macro = f1_score(
                                    all_true_labels,
                                    all_predictions,
                                    average = 'macro',
                                    )

        # display

        print(f"\nEpoch {epoch + 1}/{hyperparams['num_epochs']} Results:")
        print(f"  Training Loss: {avg_train_loss:.4f}")
        print(f"  Training Accuracy: {training_accuracy:.4f}")
        print(f"  Evaluation Loss: {avg_eval_loss:.4f}")
        print(f"  Evaluation Accuracy: {evaluation_accuracy:.4f}")
        print(f"  F1 Macro: {current_f1_macro:.4f}")

        # append performance data

        performance_data.append({
                                'pretrained_model_name': pretrained_model_name,
                                'target': target,
                                'epoch': epoch + 1,
                                'training_loss': avg_train_loss,
                                'training_accuracy': training_accuracy,
                                'evaluation_loss': avg_eval_loss,
                                'evaluation_accuracy': evaluation_accuracy,
                                'f1_score': current_f1_macro,
                                'batch_size': hyperparams['batch_size'],
                                'weight_decay': hyperparams['weight_decay'],
                                'learning_rate': hyperparams['learning_rate'],
                                'warmup_steps': hyperparams['warmup_steps'],
                                'num_epochs': hyperparams['num_epochs'],
                                'gradient_accumulation_steps': hyperparams['gradient_accumulation_steps'],
                                })

        # save model at end of epoch

        model_save_path = os.path.join(optimized_model_dir, f"{target}_roberta_optimized_epoch_{epoch + 1}_{cycle}.bin")
        torch.save(model.state_dict(), model_save_path)
        print(f"Model checkpoint saved: {model_save_path}")

    # summarize

    print("--------------------------------------------------------------------------------------")
    print(f"Summary: {target}")
    print("--------------------------------------------------------------------------------------")

    # df: target-wise

    d_epochal_performance = pd.DataFrame(performance_data)

    # save: target-wise df

    d_epochal_performance.to_excel(f'{save_path}/d_epochal_performance_{target}_{cycle}.xlsx')

    # plot epochal training and evaluation loss (y-axis fixed to [0, 1])

    plt.figure(figsize = (10, 6))
    plt.plot(
             d_epochal_performance['epoch'],
             d_epochal_performance['training_loss'],
             label = 'training_loss',
             )
    plt.plot(
             d_epochal_performance['epoch'],
             d_epochal_performance['evaluation_loss'],
             label = 'evaluation_loss',
             )
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.ylim(0, 1)
    plt.title(f'{target}: loss')
    plt.legend()
    plt.grid(False)
    loss_plot_path = os.path.join(save_path, f'{target}_loss_plot_{cycle}.png')
    plt.savefig(loss_plot_path)
    plt.show()
    plt.close()
    print(f"Loss plot saved: {loss_plot_path}")

    # plot epochal training and evaluation accuracy

    plt.figure(figsize = (10, 6))
    plt.plot(
             d_epochal_performance['epoch'],
             d_epochal_performance['training_accuracy'],
             label = 'training_accuracy',
             )
    plt.plot(
             d_epochal_performance['epoch'],
             d_epochal_performance['evaluation_accuracy'],
             label = 'evaluation_accuracy',
             )
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.ylim(0, 1)
    plt.title(f'{target}: accuracy')
    plt.legend()
    plt.grid(False)
    accuracy_plot_path = os.path.join(save_path, f'{target}_accuracy_plot_{cycle}.png')
    plt.savefig(accuracy_plot_path)
    plt.show()
    plt.close()
    print(f"Accuracy plot saved: {accuracy_plot_path}")

    return d_epochal_performance


#### bert_predict.py

**_load_model_**

In [None]:
%%writefile bert_predict.py

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
                          DistilBertTokenizer,
                          DistilBertForSequenceClassification,
                          BertTokenizer,
                          BertForSequenceClassification,
                          RobertaTokenizer,
                          RobertaForSequenceClassification,
                          )
from tqdm import tqdm
import pandas as pd

def load_model(model_path, model_class, pretrained_model_name):
    """
    Loads a pretrained, fine-tuned LM from a specified path.

    Args:
    model_path (str): path to the saved state dict to load via torch.load.
    model_class (type): class exposing from_pretrained(), used to instantiate the architecture.
    pretrained_model_name (str): name or path passed to model_class.from_pretrained().

    Returns:
    model: model_class instance with the saved state dict loaded, set to eval mode.
    """

    # instantiate pretrained architecture

    model = model_class.from_pretrained(pretrained_model_name)

    # deserialize onto CPU so that checkpoints saved from a GPU session also load on CPU-only hosts;
    # load_state_dict copies values into the model's existing parameters, wherever they live

    state_dict = torch.load(model_path, map_location = 'cpu')
    model.load_state_dict(state_dict)

    # inference mode

    model.eval()
    return model

**_preprocess_data_**

In [None]:
%%writefile -a bert_predict.py

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
                          DistilBertTokenizer,
                          DistilBertForSequenceClassification,
                          BertTokenizer,
                          BertForSequenceClassification,
                          RobertaTokenizer,
                          RobertaForSequenceClassification,
                          )
from tqdm import tqdm
import pandas as pd

def preprocess_data(tokenizer, texts):
    """
    Encodes a list of texts with the supplied LM-specific tokenizer.

    Args:
    tokenizer (callable): tokenizer, called directly on the texts.
    texts (list): texts to encode.

    Returns:
    Tokenizer output, requested with padding, truncation, and PyTorch ('pt') tensors.
    """

    # tokenizer options: pad, truncate, return PyTorch tensors

    tokenizer_options = {
                         'padding': True,
                         'truncation': True,
                         'return_tensors': 'pt',
                         }

    return tokenizer(texts, **tokenizer_options)

**_predict_**

In [None]:
%%writefile -a bert_predict.py

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
                          DistilBertTokenizer,
                          DistilBertForSequenceClassification,
                          BertTokenizer,
                          BertForSequenceClassification,
                          RobertaTokenizer,
                          RobertaForSequenceClassification,
                          )
from tqdm import tqdm
import pandas as pd

def predict(model, tokenizer, texts, batch_size = 8, use_cuda = True):
    """
    Predicts labels and probabilities for a list of texts using the specified model and tokenizer.

    Args:
    model: sequence classification model whose output exposes .logits.
    tokenizer: tokenizer passed through to preprocess_data().
    texts (list): texts to classify.
    batch_size (int): number of texts per forward pass.
    use_cuda (bool): if True, run on GPU when one is available; falls back to CPU otherwise.

    Returns:
    tuple: (all_predictions, all_probabilities) - predicted class index per text, and per-class softmax
    probabilities per text, both as Python lists in input order.
    """
    print(f"Total number of texts to predict: {len(texts)}")

    # nothing to tokenize or predict

    if len(texts) == 0:
        return [], []

    encoded_texts = preprocess_data(tokenizer, texts)
    dataset = TensorDataset(encoded_texts['input_ids'], encoded_texts['attention_mask'])
    data_loader = DataLoader(dataset, batch_size = batch_size, shuffle = False)

    print(f"Batch size: {batch_size}")
    print(f"Total number of batches: {len(data_loader)}")

    # GPU only when requested AND present: avoids crashing on CPU-only hosts

    on_gpu = use_cuda and torch.cuda.is_available()
    if use_cuda and not on_gpu:
        print("CUDA requested but unavailable: predicting on CPU")
    if on_gpu:
        model.cuda()

    # ensure eval mode: dropout must be disabled for deterministic predictions

    model.eval()

    all_predictions = []
    all_probabilities = []

    with torch.no_grad():
        for input_ids, attention_mask in tqdm(data_loader, desc = "Predicting", leave = False):
            if on_gpu:
                input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()

            outputs = model(input_ids, attention_mask = attention_mask)

            # logits -> probabilities via softmax over the class dimension

            probabilities = torch.softmax(outputs.logits, dim = 1)
            predictions = torch.argmax(probabilities, dim = 1).cpu().tolist()
            all_predictions.extend(predictions)
            all_probabilities.extend(probabilities.cpu().tolist())

    return all_predictions, all_probabilities

#### llama_train.py

**_load_llama_and_tokenizer_**

In [None]:
%%writefile llama_train.py

from peft import(
                 get_peft_model,
                 LoraConfig,
                 prepare_model_for_kbit_training,
                 )

import torch
from transformers import(
                         AutoTokenizer,
                         AutoModelForSequenceClassification,
                         BitsAndBytesConfig,
                         )

def load_llama_and_tokenizer(model_name, num_labels):
    """
    Builds a 4-bit quantized Llama sequence classifier wrapped with LoRA (Low-Rank Adaptation) adapters, plus its tokenizer.

    Args:
    model_name (str): name of pretrained Llama model.
    num_labels (int): number of labels for the classification task (binary or multiclass).

    Returns:
    model (AutoModelForSequenceClassification): Llama model configured for sequence classification.
    tokenizer (AutoTokenizer): tokenizer associated with the Llama model.
    """

    # tokenizer: EOS token reused as the padding token

    tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space = True)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

    # 4-bit NF4 quantization w/ double quantization, bfloat16 compute

    bnb_config = BitsAndBytesConfig(
                                    load_in_4bit = True,
                                    bnb_4bit_quant_type = 'nf4',
                                    bnb_4bit_use_double_quant = True,
                                    bnb_4bit_compute_dtype = torch.bfloat16,
                                    )

    base_model = AutoModelForSequenceClassification.from_pretrained(
                                                                    model_name,
                                                                    quantization_config = bnb_config,
                                                                    num_labels = num_labels,
                                                                    device_map = 'auto',
                                                                    )

    # LoRA adapters on the attention projections

    attention_projections = ['q_proj', 'k_proj', 'v_proj', 'o_proj']

    adapter_config = LoraConfig(
                                r = 16,
                                lora_alpha = 8,
                                target_modules = attention_projections,
                                lora_dropout = 0.05,
                                bias = 'none',
                                task_type = 'SEQ_CLS',
                                )

    # prepare quantized model for training, then attach adapters

    model = get_peft_model(prepare_model_for_kbit_training(base_model), adapter_config)

    # align model config with tokenizer padding; disable KV cache; single tensor-parallel slice

    for attribute, value in (
                             ('pad_token_id', tokenizer.pad_token_id),
                             ('use_cache', False),
                             ('pretraining_tp', 1),
                             ):
        setattr(model.config, attribute, value)

    return model, tokenizer

**_llama_tokenize_**

In [None]:
%%writefile -a llama_train.py

from transformers import PreTrainedTokenizer

def llama_tokenize(examples, tokenizer):
    """
    Tokenizes the 'text' entries of a batch of examples with the supplied tokenizer.

    Args:
    examples (dict): batch of examples; the key 'text' holds the input text.
    tokenizer (PreTrainedTokenizer): tokenizer applied to the text.

    Returns:
    dict: tokenizer output (input_ids, attention_mask, etc.), padded/truncated to 512 tokens.
    """

    # fixed-length encoding: pad to max_length, truncate anything longer

    encoding_options = {
                        'padding': 'max_length',
                        'truncation': True,
                        'max_length': 512,
                        }

    texts = examples['text']

    return tokenizer(texts, **encoding_options)

**_compute_llama_metrics_**

In [None]:
%%writefile -a llama_train.py

from sklearn.metrics import average_precision_score
#from datasets import load_metric
import evaluate

# load metrics: module-level so each metric is loaded once and reused by compute_llama_metrics

f1_metric = evaluate.load('f1')
mcc_metric = evaluate.load('matthews_correlation')

# earlier datasets.load_metric equivalents, retained for reference

#f1_metric = load_metric('f1')
#mcc_metric = load_metric('matthews_correlation')

def compute_llama_metrics(eval_pred):
    """
    Compute evaluation metrics for the Llama model during evaluation (binary classification).

    Args:
    eval_pred (tuple): a tuple containing predictions and labels. The predictions are logits, and the labels are the ground truth.

    Returns:
    dict: 'f1_macro' and 'mcc' hold the result dicts returned by the evaluate metrics (keys 'f1' and
    'matthews_correlation', respectively); 'auprc' holds the average precision score.
    """
    predictions, labels = eval_pred

    # guard: if predictions arrive as a tuple of outputs, assume logits are the first element

    if isinstance(predictions, tuple):
        predictions = predictions[0]

    preds = predictions.argmax(-1)
    f1 = f1_metric.compute(predictions = preds, references = labels, average = 'macro')

    # rank by logit margin (pos - neg): monotone in softmax P(pos), unlike the raw pos-class logit alone,
    # so AUPRC agrees with the argmax decision rule used for F1 and MCC

    pos_scores = predictions[:, 1] - predictions[:, 0]
    auprc = average_precision_score(labels, pos_scores)
    mcc = mcc_metric.compute(predictions = preds, references = labels)

    return {
            'f1_macro': f1,
            'auprc': auprc,
            'mcc': mcc,
            }

**_train_and_evaluate_llama_**

In [None]:
%%writefile -a llama_train.py

from accelerate import Accelerator
from datasets import Dataset
from huggingface_hub import login
import pandas as pd
from sklearn.model_selection import ParameterGrid, StratifiedKFold
import torch
from transformers import (
                          AdamW,
                          TrainingArguments,
                          Trainer,
                          )

def train_and_evaluate_llama(target_datasets, targets_and_class_weights, model_name, hyperparameter_grid, save_path):
    """
    Trains and tests Llama for multiple targets using stratified k-fold cross-validation and a held-out test set. Handles
    model loading, tokenization, and training with Hugging Face's Trainer. Computes performance metrics by target.

    Args:
    target_datasets (dict): dictionary where keys are target names and values are target-specific tuples of
    (d_train_{target}, d_test_{target}). Each df needs 'text', 'aug', and the target column.
    targets_and_class_weights (dict): dictionary of target-specific inverse-freq class weights to mitigate class imbalance.
    model_name (str): name or path of the pretrained Llama model to load.
    hyperparameter_grid (list): grid space of hyperparameter configurations (generated using ParameterGrid). A single
    configuration is trained per fold; if several are supplied, the last one is used.
    save_path (str): directory to save trained models.

    Returns:
    None. Saves the last-fold trained model and tokenizer by target under save_path; writes the tabulated performance
    metrics to 'd_llama_performance.xlsx' in the working directory.

    Raises:
    ValueError: if hyperparameter_grid holds no configurations.
    """
    import math

    # initialize accelerator

    accelerator = Accelerator()

    # HF login

    login(token = '') ### empty placeholder: supply a valid HF token before running

    # resolve hyperparameter config up front: fail fast on an empty grid, before any model is loaded

    configs = list(hyperparameter_grid)
    if not configs:
        raise ValueError("hyperparameter_grid is empty: at least one configuration is required")

    h = configs[-1] ### one config trained per fold; with several supplied, the final one is used
    if len(configs) > 1:
        print(f"hyperparameter_grid holds {len(configs)} configs; only the last is trained: {h}")

    # initialize performance df

    d_llama_performance = pd.DataFrame(columns = [
                                                  'target',
                                                  'model',
                                                  'fold',
                                                  'f1_macro',
                                                  'mcc',
                                                  'auprc',
                                                  ])

    for target, (d_train, d_test) in target_datasets.items():

        # NOTE(review): class weights are moved to device but never handed to Trainer (default unweighted loss) - confirm intent

        class_weights = torch.tensor(targets_and_class_weights[target]).to(accelerator.device)
        print("\n======================================================================================")
        print(f"Training Llama for target: {target}")
        print("======================================================================================")

        # shuffle d_train

        d_train = d_train.sample(frac = 1, random_state = 56).reset_index(drop = True)

        # define stratified k-fold

        skf = StratifiedKFold(n_splits = 5)

        # define augmentation mask

        aug_mask = d_train['aug'] == 1

        # train-validation loop

        for fold, (train_index, val_index) in enumerate(skf.split(d_train, d_train[target])):
            print(f"\nFold {fold + 1}/5")

            # split train and validation sets based on aug mask: augmented rows always train, never validate

            train_mask = aug_mask | d_train.index.isin(train_index)
            val_mask = ~aug_mask & d_train.index.isin(val_index)

            d_train_fold = d_train[train_mask].copy()
            d_val_fold = d_train[val_mask].copy()

            print(f"Fold {fold + 1} Training rows: {len(d_train_fold)}")
            print(f"Fold {fold + 1} Validation rows: {len(d_val_fold)}")

            # rename 'target' col to 'label' for HF Trainer

            d_train_fold = d_train_fold.rename(columns = {target: 'label'})
            d_val_fold = d_val_fold.rename(columns = {target: 'label'})

            # excise 'aug' col before creating HF Dataset objects

            d_train_fold = d_train_fold.drop(columns = ['aug'])
            d_val_fold = d_val_fold.drop(columns = ['aug'])

            # convert to HF Dataset

            train_dataset = Dataset.from_pandas(d_train_fold)
            val_dataset = Dataset.from_pandas(d_val_fold)

            # reinitialize model and tokenizer for each fold to avoid residual information

            model, tokenizer = load_llama_and_tokenizer(model_name, num_labels = 2)

            # tokenize

            train_dataset = train_dataset.map(lambda i: llama_tokenize(i, tokenizer), batched = True)
            val_dataset = val_dataset.map(lambda i: llama_tokenize(i, tokenizer), batched = True)

            # reformat to PyTorch tensors for HF Trainer compatibility

            train_dataset.set_format(type = 'torch', columns = [
                                                                'input_ids',
                                                                'attention_mask',
                                                                'label',
                                                                ]
                                     )

            val_dataset.set_format(type = 'torch', columns = [
                                                              'input_ids',
                                                              'attention_mask',
                                                              'label',
                                                              ]
                                   )

            # display training and validation details; ceil: the final partial batch is still processed

            train_batch_size = 4
            val_batch_size = 4
            total_train_batches = math.ceil(len(train_dataset) / train_batch_size)
            total_eval_batches = math.ceil(len(val_dataset) / val_batch_size)

            print(f"Total training rows: {len(train_dataset)}")
            print(f"Total validation rows: {len(val_dataset)}")
            print(f"Training batch size: {train_batch_size}")
            print(f"Validation batch size: {val_batch_size}")
            print(f"Total training batches: {total_train_batches}")
            print(f"Total evaluation batches: {total_eval_batches}")

            # HF TrainingArguments

            training_args = TrainingArguments(
                                              output_dir = '/content/drive/MyDrive/Colab/bar_policy_suicidality/temp/',
                                              learning_rate = h['learning_rate'],
                                              per_device_train_batch_size = train_batch_size,
                                              per_device_eval_batch_size = val_batch_size,
                                              num_train_epochs = h['num_train_epochs'],
                                              weight_decay = h['weight_decay'],
                                              gradient_accumulation_steps = h['gradient_accumulation_steps'],
                                              warmup_steps = h['warmup_steps'],
                                              evaluation_strategy = 'epoch',
                                              save_strategy = 'epoch',
                                              report_to = 'none',
                                              push_to_hub = False,
                                              remove_unused_columns = True,
                                              fp16 = True,
                                              seed = 56,
                                              )

            # custom optimizer: weight decay must be passed explicitly, since Trainer only applies
            # TrainingArguments.weight_decay to optimizers it builds itself

            optimizer = AdamW(
                              model.parameters(),
                              lr = training_args.learning_rate,
                              weight_decay = training_args.weight_decay,
                              )

            # HF Trainer setup

            trainer = Trainer(
                              model = model,
                              args = training_args,
                              train_dataset = train_dataset,
                              eval_dataset = val_dataset,
                              compute_metrics = compute_llama_metrics,
                              optimizers = (optimizer, None),
                              )

            # train

            trainer.train()

            # append fold metrics to performance dataframe

            val_metrics = trainer.evaluate(val_dataset)
            d_llama_performance.loc[len(d_llama_performance)] = [
                                                                 target,
                                                                 'llama-3.1-8b',
                                                                 fold + 1,
                                                                 val_metrics['eval_f1_macro'],
                                                                 val_metrics['eval_mcc'],
                                                                 val_metrics['eval_auprc'],
                                                                ]

        # test on held-out test set: uses trainer, model, and tokenizer from the final fold

        print("--------------------------------------------------------------------------------------")
        print(f"Testing Llama for target: {target}")
        print("--------------------------------------------------------------------------------------")

        # rename 'target' col: held-out test set

        d_test = d_test.rename(columns = {target: 'label'})

        # excise 'aug' col: held-out test set

        d_test = d_test.drop(columns = ['aug'])

        test_dataset = Dataset.from_pandas(d_test)
        test_dataset = test_dataset.map(lambda i: llama_tokenize(i, tokenizer), batched = True)
        test_dataset.set_format(type = 'torch', columns = [
                                                           'input_ids',
                                                           'attention_mask',
                                                           'label',
                                                           ]
                                )

        # display test set details

        test_batch_size = 4
        total_test_batches = math.ceil(len(test_dataset) / test_batch_size)

        print(f"Total test rows: {len(test_dataset)}")
        print(f"Test batch size: {test_batch_size}")
        print(f"Total test batches: {total_test_batches}")

        # test

        test_metrics = trainer.evaluate(test_dataset)
        d_llama_performance.loc[len(d_llama_performance)] = [
                                                             target,
                                                             'llama-3.1-8b',
                                                             'Test',
                                                             test_metrics['eval_f1_macro'],
                                                             test_metrics['eval_mcc'],
                                                             test_metrics['eval_auprc'],
                                                            ]

        # save target-wise trained models

        print(f"\nSaving baseline trained Llama for target: {target}")
        target_save_path = f'{save_path}/{target}_llama_baseline_model'
        model.save_pretrained(target_save_path)
        tokenizer.save_pretrained(target_save_path)

    # extract performance scores numeric values: evaluate metrics arrive as single-key dicts

    d_llama_performance['f1_macro'] = d_llama_performance['f1_macro'].apply(lambda i: i['f1'] if isinstance(i, dict) else i)
    d_llama_performance['mcc'] = d_llama_performance['mcc'].apply(lambda i: i['matthews_correlation'] if isinstance(i, dict) else i)
    d_llama_performance['auprc'] = d_llama_performance['auprc'].apply(lambda i: i if isinstance(i, float) else None)  # Ensure AUPRC is numeric

    # summary spans all targets: do not reference the loop variable, which only holds the last target

    print("\n--------------------------------------------------------------------------------------")
    print("Summary: Llama performance (first 6 rows)")
    print("--------------------------------------------------------------------------------------")

    print(d_llama_performance.head(6))
    d_llama_performance.to_excel('d_llama_performance.xlsx')


**_tune_and_optimize_llama_hyperparams_**

In [None]:
%%writefile -a llama_train.py

from accelerate import Accelerator
from datasets import Dataset
from huggingface_hub import login
import pandas as pd
from sklearn.model_selection import ParameterGrid
import torch
from transformers import (
                          AdamW,
                          TrainingArguments,
                          Trainer,
                          )

def tune_and_optimize_llama_hyperparams(target_datasets, targets_and_class_weights, model_name, hyperparameter_grid, save_path):
    """
    Tune and optimize hyperparameters for a Llama model using ParameterGrid search. Trains on d_train_{target} and tests on
    held-out d_test_{target} once per configuration in the pre-specified ParameterGrid. Saves best-performing model by target,
    selected on test-set F1 (macro).

    Args:
    target_datasets (dict): dictionary where keys are target names and values are target-specific tuples of
    (d_train_{target}, d_test_{target}).
    targets_and_class_weights (dict): dictionary of target-specific inverse-freq class weights.
    model_name (str): name of pretrained Llama model to load.
    hyperparameter_grid (list): grid space of hyperparameter configurations (generated using ParameterGrid).
    save_path (str): directory to save best-performing model

    Returns:
    None. Saves best-performing model and tokenizer by target under save_path; writes the tabulated performance metrics
    (one row per target x configuration) to 'd_llama_tuned_performance.xlsx' in the working directory.
    """
    import math

    # initialize accelerator

    accelerator = Accelerator()

    # HF login

    login(token = '') ### empty placeholder: supply a valid HF token before running

    # initialize performance df

    d_llama_performance = pd.DataFrame(columns = [
                                                  'target',
                                                  'model',
                                                  'f1_macro',
                                                  'mcc',
                                                  'auprc',
                                                  ]
                                       )

    for target, (d_train, d_test) in target_datasets.items():

        # NOTE(review): class weights are moved to device but never handed to Trainer (default unweighted loss) - confirm intent

        class_weights = torch.tensor(targets_and_class_weights[target]).to(accelerator.device)
        print("\n======================================================================================")
        print(f"Tuning Llama 3.1 for target: {target}")
        print("======================================================================================")

        best_f1_macro = 0 ### tracking var: best F1 (macro)
        target_save_path = f'{save_path}/{target}_llama_best_tuned_model'

        # rename 'target' col to 'label' for HF Trainer: once per target, not once per config

        d_train = d_train.rename(columns = {target: 'label'})
        d_test = d_test.rename(columns = {target: 'label'})

        for h in hyperparameter_grid:
            print("\n")
            print(f"\nTuning with hyperparam config: {h}")

            # re/initialize model and tokenizer for each hyperparameter config

            model, tokenizer = load_llama_and_tokenizer(model_name, num_labels = 2)

            # convert to HF Dataset

            train_dataset = Dataset.from_pandas(d_train)
            test_dataset = Dataset.from_pandas(d_test)

            # tokenize

            train_dataset = train_dataset.map(lambda i: llama_tokenize(i, tokenizer), batched = True)
            test_dataset = test_dataset.map(lambda i: llama_tokenize(i, tokenizer), batched = True)

            # reformat to PyTorch tensors for HF Trainer compatibility

            train_dataset.set_format(type = 'torch', columns = [
                                                                'input_ids',
                                                                'attention_mask',
                                                                'label',
                                                                ]
                                     )

            test_dataset.set_format(type = 'torch', columns = [
                                                               'input_ids',
                                                               'attention_mask',
                                                               'label',
                                                               ]
                                    )

            # display training and testing details; ceil: the final partial batch is still processed

            train_batch_size = 4
            test_batch_size = 4
            total_train_batches = math.ceil(len(train_dataset) / train_batch_size)
            total_test_batches = math.ceil(len(test_dataset) / test_batch_size)

            print(f"Total training rows: {len(train_dataset)}")
            print(f"Total test rows: {len(test_dataset)}")
            print(f"Training batch size: {train_batch_size}")
            print(f"Test batch size: {test_batch_size}")
            print(f"Total training batches: {total_train_batches}")
            print(f"Total test batches: {total_test_batches}")

            # HF TrainingArguments w/ ParameterGrid

            training_args = TrainingArguments(
                                              output_dir = '/content/drive/MyDrive/Colab/bar_policy_suicidality/temp/',
                                              learning_rate = h['learning_rate'],
                                              per_device_train_batch_size = train_batch_size,
                                              per_device_eval_batch_size = test_batch_size,
                                              num_train_epochs = h['num_train_epochs'],
                                              weight_decay = h['weight_decay'],
                                              gradient_accumulation_steps = h['gradient_accumulation_steps'],
                                              warmup_steps = h['warmup_steps'],
                                              evaluation_strategy = 'no', ### removes midstream validation
                                              save_strategy = 'epoch',
                                              report_to = 'none',
                                              push_to_hub = False,
                                              remove_unused_columns = True, ### 'aug' dropped here
                                              fp16 = True, ### mixed precision to mitigate memory overhead
                                              seed = 56,
                                              )

            # custom optimizer: weight decay must be passed explicitly, since Trainer only applies
            # TrainingArguments.weight_decay to optimizers it builds itself - otherwise the grid's
            # weight_decay values would have no effect

            optimizer = AdamW(
                              model.parameters(),
                              lr = training_args.learning_rate,
                              weight_decay = training_args.weight_decay,
                              )

            # HF Trainer setup

            trainer = Trainer(
                              model = model,
                              args = training_args,
                              train_dataset = train_dataset,
                              eval_dataset = test_dataset, ### uses d_test directly
                              compute_metrics = compute_llama_metrics,
                              optimizers = (optimizer, None),
                              )

            # train

            trainer.train()

            # test on held-out test set

            print("--------------------------------------------------------------------------------------")
            print(f"Testing Llama for target: {target}")
            print("--------------------------------------------------------------------------------------")

            test_metrics = trainer.evaluate(test_dataset)
            print(test_metrics)

            # append config metrics to performance dataframe

            d_llama_performance.loc[len(d_llama_performance)] = [
                                                                 target,
                                                                 'llama-3.1-8b',
                                                                 test_metrics['eval_f1_macro'],
                                                                 test_metrics['eval_mcc'],
                                                                 test_metrics['eval_auprc'],
                                                                 ]

            # F1 (macro) may arrive as a single-key dict from the evaluate metric, or as a bare number

            f1_macro = test_metrics['eval_f1_macro']
            if isinstance(f1_macro, dict):
                f1_macro = f1_macro['f1']

            # save on improvement, while the improving model is still in scope: saving after the loop
            # would persist whichever config ran last, not the best one

            if f1_macro > best_f1_macro:
                best_f1_macro = f1_macro
                print(f"\nUpdating best model state for target: {target} with F1 (macro): {best_f1_macro}")

                # save model (adapter-wrapped, quantized base)

                model.save_pretrained(target_save_path)

                # save tokenizer

                tokenizer.save_pretrained(target_save_path)

        # report the best model target-wise

        if best_f1_macro > 0:
            print(f"\nSaving best model for target: {target} with F1 (macro): {best_f1_macro}")

    # extract performance scores numeric values

    d_llama_performance['f1_macro'] = d_llama_performance['f1_macro'].apply(lambda i: i['f1'] if isinstance(i, dict) else i)
    d_llama_performance['mcc'] = d_llama_performance['mcc'].apply(lambda i: i['matthews_correlation'] if isinstance(i, dict) else i)
    d_llama_performance['auprc'] = d_llama_performance['auprc'].apply(lambda i: i if isinstance(i, float) else None)

    print("Llama performance summary:")
    print(d_llama_performance)
    d_llama_performance.to_excel('d_llama_tuned_performance.xlsx')


#### llama_predict.py

**_llama_load_and_predict_single_target_**

In [None]:
%%writefile llama_predict.py

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

def llama_load_and_predict_single_target(target, df, models_path, batch_size):
    """
    Function to load a single model and tokenizer for a specified target, and use them to predict
    labels and class probabilities for the text in df['text'] in batches.

    Args:
        target (str): The name of the target (e.g., 'asp', 'dep').
        df (pd.DataFrame): DataFrame containing a 'text' column with the input texts. Modified in place.
        models_path (str): Directory where the model for the target is saved (e.g., /models/).
        batch_size (int): The batch size for processing the data in smaller chunks.

    Returns:
        pd.DataFrame: The same DataFrame, with additional columns '{target}_pred' (predicted class index)
        and '{target}_prob' (array of per-class softmax probabilities).
    """

    # load target-specific best-performing tuned Llama

    model_save_path = f'{models_path}/{target}_llama_best_tuned_model'

    tokenizer = AutoTokenizer.from_pretrained(model_save_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

    # set padding token: take the model's pad id from the tokenizer so both agree on which id is padding,
    # consistent with load_llama_and_tokenizer at training time

    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    # migrate to GPU

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # ensure eval mode

    model.eval()

    predicted_labels = []
    class_probabilities = []

    # materialize texts once: batches are then sliced by position, independent of the df index

    texts = df['text'].tolist()

    # batch processing

    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]

            # tokenize

            inputs = tokenizer(
                               batch_texts,
                               return_tensors = 'pt',
                               truncation = True,
                               padding = True,
                               max_length = 512,
                               )

            inputs = {k: v.to(device) for k, v in inputs.items()}

            # get logits

            outputs = model(**inputs)
            logits = outputs.logits

            # logits -> probabilities via softmax

            probabilities = torch.softmax(logits, dim = -1).cpu().numpy()

            # get predicted labels

            predicted_labels_batch = torch.argmax(logits, dim = -1).cpu().numpy()

            # append results

            predicted_labels.extend(predicted_labels_batch)
            class_probabilities.extend(probabilities)

    # to df

    df[f'{target}_pred'] = predicted_labels
    df[f'{target}_prob'] = class_probabilities

    return df


**_llama_load_and_predict_multi_target_**

In [None]:
%%writefile -a llama_predict.py

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

def llama_load_and_predict_multi_target(targets, df, models_path, batch_size):
    """
    Score df['text'] with each target's saved Llama classifier and append the results.

    For every name in `targets`, the tokenizer and sequence-classification model stored at
    '{models_path}/{target}_llama_best_tuned_model' are loaded, the texts are run through
    the model in batches, and two columns are written back onto `df` (modified in place).

    Args:
        targets (list): List of target names (e.g., ['asp', 'dep']).
        df (pd.DataFrame): DataFrame containing a 'text' column with the input texts.
        models_path (str): Directory where models for each target are saved (e.g., /models/).
        batch_size (int): Number of texts tokenized and scored per forward pass.

    Returns:
        pd.DataFrame: The same `df`, with '{target}_pred' (argmax over the logits) and
        '{target}_prob' (softmax over the logits) added for each target.
    """

    for target in targets:

        # locate + load this target's saved tokenizer and model

        checkpoint_dir = f'{models_path}/{target}_llama_best_tuned_model'

        tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

        # reuse the EOS token for padding

        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

        # GPU when available, CPU otherwise; inference only

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)
        model.eval()

        labels_out = []
        probs_out = []

        batch_starts = range(0, len(df), batch_size)

        with torch.no_grad():
            for start in batch_starts:
                texts = df['text'][start:start + batch_size].tolist()

                # tokenize, then move every tensor to the model's device

                encoded = tokenizer(
                                    texts,
                                    return_tensors = "pt",
                                    truncation = True,
                                    padding = True,
                                    max_length = 512,
                                    )

                on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

                logits = model(**on_device).logits

                # softmax -> class probabilities; argmax -> predicted label

                probs_out.extend(torch.softmax(logits, dim = -1).cpu().numpy())
                labels_out.extend(torch.argmax(logits, dim = -1).cpu().numpy())

        # to df

        df[f'{target}_pred'] = labels_out
        df[f'{target}_prob'] = probs_out

    return df


#### Import

In [None]:
from preprocess import (
                        augment_training_data_with_rationales,
                        dummy_code_augmented_rows,
                        read_and_append_jsonl_posts,
                        read_and_append_jsonl_comments,
                        )

#from redact import (
#                    ner_redact_post_texts,
#                    )

# BERT, RoBERTa, DistilBERT modules

from bert_train import (
                        set_seed,
                        train_eval_save_bl_models,
                        performance_scatterplot,
                        iterative_stratified_train_test_split_with_rationales,
                        tune_and_optimize_model_hyperparams,
                        tune_and_optimize_model_loss_accuracy,
                        )

from bert_predict import (
                          load_model,
                          preprocess_data,
                          predict,
                          )

# Llama 3.1 modules

from llama_train import (
                         load_llama_and_tokenizer,
                         llama_tokenize,
                         compute_llama_metrics,
                         train_and_evaluate_llama,
                         tune_and_optimize_llama_hyperparams,
                         )

from llama_predict import (
                           llama_load_and_predict_single_target,
                           llama_load_and_predict_multi_target,
                           )

### 3. Preprocess
Takes $\mathcal{d}$<sub>annotated</sub>, builds independent $\mathcal{d}$<sub>calibrate</sub> and rationale-augmented $\mathcal{d}$<sub>augmented</sub> to train. Builds $\mathcal{V}$ corpus, $\mathcal{d}$<sub>adapt</sub> for domain adaptation. Merges, NER-anonymizes Aim II analytic sample $\mathcal{D}$<sub>inference</sub>.
***

#### Merge Wave I (purposive) and Wave II (random)

In [None]:
%cd ../inputs/data

    ### SJS 11/26: WIP annotation files at ../inputs/annotation

In [None]:
# recover 'sbrt' subreddit var

# three workbooks: one carrying 'sbrt', two carrying the agreed annotations
d_prp_sbrt = pd.read_excel('d_cycle999_prp_sd_single.xlsx', index_col = [0])
d_prp_01 = pd.read_excel('d_cycle999_prp_sd_ss_gpt_agreed_01.xlsx', index_col = [0])
d_prp_02 = pd.read_excel('d_cycle999_prp_sd_ss_gpt_agreed_02.xlsx', index_col = [0])

# stack the annotation workbooks; keep a single post id column named 'p_id'
d_prp = (
         pd.concat([d_prp_01, d_prp_02])
         .drop(columns = 'p_id_sd')
         .rename(columns = {'p_id_ss': 'p_id'})
         )

# lookup table: post id -> subreddit
d_prp_sbrt = d_prp_sbrt[['p_id', 'sbrt']].copy()

#d_prp.info()
#d_prp.head(1)
#d_prp_sbrt.head(1)

# inner join: only posts present in both frames are kept
d_prp = d_prp.merge(d_prp_sbrt, on = 'p_id', how = 'inner')

d_prp.head(1)

In [None]:
# Wave I IAA - post-negotiated agreement
# Cohen's kappa between the '_sd' and '_ss' annotation columns, for 'asp' and 'val'

asp_kappa, val_kappa = (
                        cohen_kappa_score(d_prp[f'{name}_sd'], d_prp[f'{name}_ss'])
                        for name in ('asp', 'val')
                        )

print("asp Kappa:", asp_kappa)
print("val Kappa:", val_kappa)

In [None]:
# import purposive - SS-SD-GPT triangulated
# operates on the d_prp frame already in memory; the re-import below is kept for reference

#d_prp_01 = pd.read_excel('d_cycle999_prp_sd_ss_gpt_agreed_01.xlsx', index_col = [0])
#d_prp_02 = pd.read_excel('d_cycle999_prp_sd_ss_gpt_agreed_02.xlsx', index_col = [0])

#d_prp = pd.concat([
#                   d_prp_01,
#                   d_prp_02,
#                   ])

# target = 1 at SS-SD agreement (both annotation columns == 1), else 0

strain_tuples = [
                    ('asp_ss', 'asp_sd', 'asp'),
                    ('dep_ss', 'dep_sd', 'dep'),
                    ('val_ss', 'val_sd', 'val'),
                    #('prg_ss', 'prg_sd', 'prg'),
                    #('tgd_ss', 'tgd_sd', 'tgd'),
                    #('age_ss', 'age_sd', 'age'),
                    #('race_ss', 'race_sd', 'race'),
                    #('dbty_ss', 'dbty_sd', 'dbty'),
                    ]

for ss, sd, target in strain_tuples:
    d_prp[target] = ((d_prp[ss] == 1) & (d_prp[sd] == 1)).astype(int)

# target = 1 at SS-SD-GPT agreement (all three columns == 1), else 0

trait_tuples = [
                    ('prg_ss', 'prg_sd', 'prg_gpt', 'prg'),
                    ('tgd_ss', 'tgd_sd', 'tgd_gpt', 'tgd'),
                    ('age_ss', 'age_sd', 'age_gpt', 'age'),
                    ('race_ss', 'race_sd', 'race_gpt', 'race'),
                    ('dbty_ss', 'dbty_sd', 'dbty_gpt', 'dbty'),
                    ]

for ss, sd, gpt, target in trait_tuples:
    d_prp[target] = ((d_prp[ss] == 1) & (d_prp[sd] == 1) & (d_prp[gpt] == 1)).astype(int)


# append rationales

def _rtnl_or_blank(value):
    """Return a rationale cell unchanged, or ' ' when the cell is empty (None, NaN, '', 0)."""
    # NaN is truthy, so a plain `value or ' '` lets it through and the string
    # concatenation below fails with TypeError; test for it explicitly.
    return ' ' if pd.isna(value) or not value else value

rationale_tuples = [
                    ('asp_rtnl_ss', 'asp_rtnl_sd', 'asp', 'asp_rtnl'),
                    ('dep_rtnl_ss', 'dep_rtnl_sd', 'dep', 'dep_rtnl'),
                    ('val_rtnl_ss', 'val_rtnl_sd', 'val', 'val_rtnl'),
                    ]

# positive rows: both annotators' rationales joined by a space; other rows: None

for ss, sd, target, rationale in rationale_tuples:
    d_prp[rationale] = d_prp.apply(
        lambda row: _rtnl_or_blank(row[ss]) + ' ' + _rtnl_or_blank(row[sd]) if row[target] == 1 else None,
        axis = 1,
    )

# trait rationales: the '_gpt' rationale columns become the target rationale columns

d_prp.rename(
             columns = {
                        'prg_rtnl_gpt': 'prg_rtnl',
                        'tgd_rtnl_gpt': 'tgd_rtnl',
                        'age_rtnl_gpt': 'age_rtnl',
                        'race_rtnl_gpt': 'race_rtnl',
                        'dbty_rtnl_gpt': 'dbty_rtnl',
                        }, inplace = True,
            )

# drop negotiated agreement artifacts

d_prp = d_prp[[
                #'p_au',
                #'p_utc',
                #'p_date',
                'p_id',
                #'n_cmnt',
                'text',
                'sbrt',
                'p_titl',
                'asp',
                'asp_rtnl',
                'dep',
                'dep_rtnl',
                'val',
                'val_rtnl',
                'prg',
                'prg_rtnl',
                'tgd',
                'tgd_rtnl',
                'age',
                'age_rtnl',
                'race',
                'race_rtnl',
                'dbty',
                'dbty_rtnl',
                ]].copy()


# inspect

d_prp.info()
d_prp.head(1)


In [None]:
# random - SS-GPT triangulated

d_rnd_01 = pd.read_excel('d_cycle999_rnd_ss_gpt_agreed_01.xlsx', index_col = [0])
d_rnd_02 = pd.read_excel('d_cycle999_rnd_ss_gpt_agreed_02.xlsx', index_col = [0])

d_rnd = pd.concat([
                   d_rnd_01,
                   d_rnd_02,
                   ])

# target = 1 at SS-GPT agreement (both columns == 1), else 0

target_tuples = [
                    ('asp_ss', 'asp_gpt', 'asp'),
                    ('dep_ss', 'dep_gpt', 'dep'),
                    ('val_ss', 'val_gpt', 'val'),
                    ('prg_ss', 'prg_gpt', 'prg'),
                    ('tgd_ss', 'tgd_gpt', 'tgd'),
                    ('age_ss', 'age_gpt', 'age'),
                    ('race_ss', 'race_gpt', 'race'),
                    ('dbty_ss', 'dbty_gpt', 'dbty'),
                    ]

for ss, gpt, target in target_tuples:
    d_rnd[target] = ((d_rnd[ss] == 1) & (d_rnd[gpt] == 1)).astype(int)

# append rationales

def _rtnl_or_blank(value):
    """Return a rationale cell unchanged, or ' ' when the cell is empty (None, NaN, '', 0)."""
    # NaN is truthy, so a plain `value or ' '` lets it through and the string
    # concatenation below fails with TypeError; test for it explicitly.
    return ' ' if pd.isna(value) or not value else value

rationale_tuples = [
                    ('asp_rtnl_ss', 'asp_rtnl_gpt', 'asp', 'asp_rtnl'),
                    ('dep_rtnl_ss', 'dep_rtnl_gpt', 'dep', 'dep_rtnl'),
                    ('val_rtnl_ss', 'val_rtnl_gpt', 'val', 'val_rtnl'),
                    ]

# positive rows: SS and GPT rationales joined by a space; other rows: None

for ss, gpt, target, rationale in rationale_tuples:
    d_rnd[rationale] = d_rnd.apply(
        lambda row: _rtnl_or_blank(row[ss]) + ' ' + _rtnl_or_blank(row[gpt]) if row[target] == 1 else None,
        axis = 1,
    )

# trait rationales: the '_gpt' rationale columns become the target rationale columns

d_rnd.rename(
             columns = {
                        'prg_rtnl_gpt': 'prg_rtnl',
                        'tgd_rtnl_gpt': 'tgd_rtnl',
                        'age_rtnl_gpt': 'age_rtnl',
                        'race_rtnl_gpt': 'race_rtnl',
                        'dbty_rtnl_gpt': 'dbty_rtnl',
                        }, inplace = True,
            )

# drop negotiated agreement artifacts

d_rnd = d_rnd[[
                #'p_au',
                #'p_utc',
                #'p_date',
                'p_id',
                #'n_cmnt',
                'text',
                'sbrt',
                'p_titl',
                'asp',
                'asp_rtnl',
                'dep',
                'dep_rtnl',
                'val',
                'val_rtnl',
                'prg',
                'prg_rtnl',
                'tgd',
                'tgd_rtnl',
                'age',
                'age_rtnl',
                'race',
                'race_rtnl',
                'dbty',
                'dbty_rtnl',
                ]].copy()

# inspect

d_rnd.info()
d_rnd.head(1)


In [None]:
# supplemental - SS annotated

d_bp = pd.read_excel('d_bp_suppl_ss.xlsx') ### r/Blackpeople, n = 100
d_db = pd.read_excel('d_db_suppl_ss.xlsx') ### r/Disability, n = 100

# stack both subreddits, discard the two columns not carried forward
d_suppl = pd.concat([d_bp, d_db]).drop(columns = ['date', 'insb'])

# blank (' ') rationale placeholders for prg / tgd / age
d_suppl = d_suppl.assign(
                         prg_rtnl = ' ',
                         tgd_rtnl = ' ',
                         age_rtnl = ' ',
                         )

# column order: identifiers, then each target followed by its rationale
# ('p_au', 'p_utc', 'p_date', 'n_cmnt' are intentionally left out)
d_suppl = d_suppl[
                  ['p_id', 'text', 'sbrt', 'p_titl'] +
                  [
                   col
                   for name in ('asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty')
                   for col in (name, f'{name}_rtnl')
                   ]
                  ].copy()

d_suppl.info()
d_suppl.head(3)

In [None]:
# concat, housekeeping

d = pd.concat([
               d_prp,
               d_rnd,
               d_suppl,
               ])

# delete index

#d = d.drop(
#           'Unnamed: 0',
#           axis = 1,
#           )

# delete empty 'text' cells
# .copy() so the column assignments below act on an independent frame, not a filtered view

d = d[d.text != ' '].copy()

# blank (' ') and missing cells -> 0

d = d.replace(' ', 0).fillna(0)

# to int

targets = [
           'asp',
           'dep',
           'val',
           'prg',
           'tgd',
           'age',
           'race',
           'dbty',
           ]

# non-numeric entries are coerced to NaN, then counted as 0

for t in targets:
    d[t] = pd.to_numeric(d[t], errors = 'coerce').fillna(0).astype('int64')

# housekeeping: del rows whose 'text' was zero-filled above (int 0 or the string '0')

d = d[d['text'].astype(str) != '0']


# shuffle, reset index

d_annotated = shuffle(
                      d,
                      random_state = 56,
                      )

d_annotated.reset_index(
                        drop = True,
                        inplace = True,
                        )

# subreddit: n/%

sbrt_cnt = d_annotated['sbrt'].value_counts()
sbrt_pct = d_annotated['sbrt'].value_counts(normalize = True) * 100

sbrts = pd.DataFrame({
                      'Count': sbrt_cnt,
                      'Percentage': sbrt_pct,
                      })

print(sbrts)

# target: n

d_annotated[targets].apply(pd.Series.value_counts)

# target / subreddit: n/%
# targets are 0/1, so the per-subreddit sum is the positive count and the mean its share

targets_cnt = d_annotated.groupby('sbrt')[targets].sum()
targets_pct = d_annotated.groupby('sbrt')[targets].mean() * 100

targets_sbrt = targets_cnt.astype(int).add_suffix('_count').join(targets_pct.round(2).add_suffix('_percent'))

print(targets_sbrt)

# inspect

#d.dtypes
#d_annotated.info()
d_annotated.head(3)

# export

#d_annotated.to_excel('d_annotated.xlsx')

$\mathcal{d}$<sub>calibrate</sub> ($n$<sub>posts</sub> = 400)

In [None]:
# first 400 shuffled rows; .copy() so the edits below do not write into a slice of d_annotated

d_calibrate = d_annotated.iloc[:400].copy()

# del pseudowords
# tokens are escaped so they match literally: as a raw regex, '<|PII|>' is the alternation
# '<' | 'PII' | '>', which strips those fragments anywhere in the text and leaves the pipes

texts = ['text']
pseudoword_tokens = ['<SPL>', '<|PII|>']

for t in texts:
    d_calibrate[t] = d_calibrate[t].replace(
                                            [re.escape(token) for token in pseudoword_tokens],
                                            ' ',
                                            regex = True,
                                            )

# inspect, export

d_calibrate.shape
d_calibrate.head(3)

d_calibrate.to_excel('d_calibrate.xlsx')

$\mathcal{d}$<sub>augmented</sub> (unique $n$<sub>posts</sub> = 2,005)

In [None]:
# rows after the first 400; reset_index returns a new frame, so the column
# assignments below do not write into a slice of d_annotated

d = d_annotated.iloc[400:].reset_index(drop = True)

# 'strn' = any pos_instance of strain

d['strn'] = ((d['asp'] == 1) | (d['dep'] == 1) | (d['val'] == 1)).astype(int)

# append rationales

rationales = [
              'asp_rtnl',
              'dep_rtnl',
              'val_rtnl',
              'prg_rtnl',
              'tgd_rtnl',
              'age_rtnl',
              'race_rtnl',
              'dbty_rtnl',
               ]

# zero-filled (empty) rationales -> '.'
# the pattern is anchored so only a cell that is exactly '0' (or '0.0') is replaced;
# digits inside a real rationale (e.g. '2020') are left intact

for r in rationales:
    d[r] = d[r].astype(str).str.replace(
                                        r'^0(?:\.0)?$',
                                        '.',
                                        regex = True,
                                        )

# single space-joined rationale string per row, in the order listed above

d['rtnl'] = d[rationales].agg(' '.join, axis = 1)

# eight empty rationales in a row collapse to a single '.'

d['rtnl'] = d['rtnl'].str.replace(
                                  r'. . . . . . . .',
                                  '.',
                                  regex = False,
                                  )

# del pseudowords
# tokens are escaped so they match literally: as a raw regex, '<|PII|>' is the alternation
# '<' | 'PII' | '>', which strips those fragments anywhere in the text and leaves the pipes

texts = ['text', 'rtnl']
pseudoword_tokens = ['<SPL>', '<|PII|>']

for t in texts:
    d[t] = d[t].replace(
                        [re.escape(token) for token in pseudoword_tokens],
                        ' ',
                        regex = True,
                        )

print("pre-augmentation")
print("--------------------------------------------------------------------------------------")
d.shape
d.head(3)

In [None]:
# augment

d_augmented = augment_training_data_with_rationales(d)
d_augmented.reset_index(drop = True, inplace = True)

# 'aug' - flag augmented rows

d_augmented = dummy_code_augmented_rows(d_augmented)


print("post-augmentation")
print("--------------------------------------------------------------------------------------")

# per-column value counts for the eight targets and the 'aug' flag

d_augmented[
            ['asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty', 'aug']
            ].apply(pd.Series.value_counts)

d_augmented.shape
d_augmented.head(3)

# export

d_augmented.to_excel('d_augmented.xlsx')

#### Append $\mathcal{V}$ corpus, derive $\mathcal{D}$<sub>inference</sub>

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

archives_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/inputs/archives/'

chunk_size = 10000 ### not passed to the reader below

# read + append every posts archive under archives_path

d_p = read_and_append_jsonl_posts(archives_path)

# inspect + save

d_p.info()
d_p.head(3)

# index is written (index = False left disabled)

d_p.to_csv('d_posts_raw.csv', encoding = 'utf-8', header = True)

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

archives_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/inputs/archives/'

chunk_size = 10000 ### not passed to the reader below

# read + append every comments archive under archives_path

d_c = read_and_append_jsonl_comments(archives_path)

d_c.shape
d_c.head(3)

# inspect + save

d_c.info()

# index is written (index = False left disabled)

d_c.to_csv('d_comments_raw.csv', encoding = 'utf-8', header = True)

**Clean, condense: posts**

In [None]:
# d_p = posts

# one row per post id

d_p = d_p.drop_duplicates(subset = 'id')

# epoch seconds -> datetime; kept both as a column and as the index

d_p['date'] = pd.to_datetime(d_p.created_utc, unit = 's')
d_p.set_index('date', drop = False, inplace = True)

# study window: 2020-12-02 (inclusive) up to, not including, 2024-06-24

d_p = d_p.loc[(d_p['date'] >= '2020-12-02') & (d_p['date'] < '2024-06-24')] ### yyyy-mm-dd

# drop posts whose body is a deletion/removal placeholder

d_p = d_p[~d_p['selftext'].isin(['[deleted]', '[removed]'])]

# housekeeping: keep analytic columns, apply the p_* naming scheme ('id' is unchanged)

d_p = d_p[[
           'author',
           'created_utc',
           'date',
           'id',
           'num_comments',
           'selftext',
           'subreddit',
           'title',
           ]].rename(
                     columns = {
                                'author': 'p_au',
                                'created_utc': 'p_utc',
                                'date': 'p_date',
                                'num_comments': 'n_cmnt',
                                'selftext': 'text',
                                'subreddit': 'p_sbrt',
                                'title': 'p_titl',
                                },
                     )

**NER anonymize: posts**

In [None]:
# cast every post body to str, then pass each one through ner_redact_post_texts
# NOTE(review): the `from redact import ner_redact_post_texts` import in the import cell is
# commented out - unless the name is defined elsewhere this raises NameError; confirm
d_p['text'] = d_p['text'].astype(str).apply(lambda i: ner_redact_post_texts(i))

**Clean, condense: comments**

In [None]:
# d_c = comments

# one row per comment id

d_c = d_c.drop_duplicates(subset = 'id')

# epoch seconds -> datetime; kept both as a column and as the index

d_c['date'] = pd.to_datetime(d_c.created_utc, unit = 's')
d_c.set_index('date', drop = False, inplace = True)

# study window: 2020-12-02 (inclusive) up to, not including, 2024-06-24

d_c = d_c.loc[(d_c['date'] >= '2020-12-02') & (d_c['date'] < '2024-06-24')] ### yyyy-mm-dd

# drop comments whose body is a deletion/removal placeholder

d_c = d_c[~d_c['body'].isin(['[deleted]', '[removed]'])]

# housekeeping: keep analytic columns, apply the c_* naming scheme; 'link_id' becomes 'id'

d_c = d_c[[
           'author',
           'body',
           'date',
           'link_id',
           'subreddit',
           ]].rename(
                     columns = {
                                'author': 'c_au',
                                'body': 'c_text',
                                'date': 'c_date',
                                'link_id': 'id',
                                'subreddit': 'c_sbrt',
                                },
                     )

# delete comment-level 'id' prefix for merge
# the prefix is removed outright (replaced with '', not ' '), so no leading blank is left
# on the key - presumably it must equal the bare post 'id' to join; verify against the merge

d_c['id'] = d_c['id'].str.replace(r'^t3_', '', regex = True)

In [None]:
# inspect cleaned posts and comments
# only the final bare expression of a notebook cell is rendered, so of the
# shape/head/tail lines below just d_c.tail(3) produces output
print("V-corpus posts/d_adapt")
print("--------------------------------------------------------------------------------------")
d_p.shape
d_p.head(3)
d_p.tail(3)

print("V-corpus comments")
print("--------------------------------------------------------------------------------------")
d_c.shape
d_c.head(3)
d_c.tail(3)

In [None]:
# for post-labeling merge
# cleaned comments -> CSV, header row included, index omitted

d_c.to_csv('d_comments.csv', encoding = 'utf-8', index = False, header = True)

$\mathcal{d}$<sub>adapt</sub>: domain adaptation set

In [None]:
# domain adaptation set = independent copy of the cleaned posts

d_adapt = d_p.copy()

# export: header row included, index omitted

d_adapt.to_csv('d_adapt_TEST.csv', encoding = 'utf-8', index = False, header = True)

$\mathcal{D}$<sub>inference</sub>: prediction set

In [None]:
# seeded random draw of up to 1,000,000 posts, without replacement
# n is capped at len(d_p): sample() raises ValueError when n exceeds the number of rows

d_inference = d_p.sample(
                         n = min(1000000, len(d_p)), ### TKTK - maybe
                         random_state = 56,
                         )

**GPE: encoding, extraction**

In [None]:
nlp = spacy.load('en_core_web_lg')

# extract, count GPEs

def extract_gpe(text):
    """Return (gpes, n) for `text`: the entity strings spaCy labels 'GPE', and how many there are."""
    gpes = []
    for ent in nlp(text).ents:
        if ent.label_ == 'GPE':
            gpes.append(ent.text)
    return gpes, len(gpes)

# one spaCy pass per post; the (list, count) pair is spread over two new columns

d_inference[[
             'gpe',
             'gpe_count',
             ]] = d_inference['text'].apply(lambda i: pd.Series(extract_gpe(i)))

total_gpe_count = d_inference['gpe_count'].sum()

print(f"Total number of GPEs recognized: {total_gpe_count}")

In [None]:
    ### SJS 8/17: validation w/ GPT-4o TKTK: is each GPE in the US?

**GPE: concordance**

In [None]:
# join single string for nltk entry

all_texts = ' '.join(d_inference['text'])

# tokenize for nltk

tokens = nltk.word_tokenize(all_texts)
nltk_text = Text(tokens)

# concordance: GPEs in context
# every extracted GPE is looked up in turn, row by row, repeats included

for gpe in (found for gpes in d_inference['gpe'] for found in gpes):
    print(f"\nConcordance for '{gpe}':")
    nltk_text.concordance(gpe)

**Explicit suicidality: encoding**

In [None]:
# word-initial 'suicid' stem (suicide, suicidal, ...)

regex = r'\bsuicid\S*'

# sui = 1 when the text contains the stem, or the post comes from r/SuicideWatch
# case = False so capitalized forms ('Suicide', 'SUICIDAL') are matched too;
# na = False so a missing text counts as no match and the int cast below cannot fail

d_inference['sui'] = d_inference['text'].str.contains(
                                                      regex,
                                                      case = False,
                                                      na = False,
                                                      regex = True,
                                                      ) | (d_inference['p_sbrt'] == 'SuicideWatch')


d_inference['sui'] = d_inference['sui'].astype(int)

d_inference.head(3)

In [None]:
    ### SJS 8/17: validation w/ GPT-4o TKTK: is sui in reference to other people?

#%pwd
%cd ../inputs/data

In [None]:
# export prediction set: header row included, index omitted

d_inference.to_csv('d_inference_TEST.csv', encoding = 'utf-8', index = False, header = True)

### 4. Train-Adapt-Test
Trains baseline BERT, RoBERTa, and DistilBERT, using rationale-augmented data $\mathcal{d}$<sub>augmented</sub>, iterating over a.) strains, b.) explicit targeting, c.) implicit vulnerabilities. Evaluates using de-augmented data. Outputs model x target _$F$_<sub>1</sub> (macro) performance scores.
***

In [None]:
#%pwd
%cd ../inputs/data

In [None]:
# reload the exported augmented set; the first sheet column is used as the index

d_augmented = pd.read_excel('d_augmented.xlsx', index_col = [0])

d_augmented.info()
d_augmented.head(3)

**Condense for model entry**

In [None]:
targets = ['asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty']

# keep only the text, the augmentation flag, and the eight targets

d_augmented = d_augmented.loc[:, ['text', 'aug', *targets]].copy()

# per-column value counts for the flag and each target

d_augmented[['aug', *targets]].apply(pd.Series.value_counts)

**Compute weights ($w$): inverse class ($c$) freq<br>
$w_c = N / (2 * n_c)$**

In [None]:
# inverse class frequency weights per target: w_c = N / (2 * n_c), rounded to 4 places
# a class with no rows gets weight 0; the count is checked before dividing,
# because N / (2 * 0) raises ZeroDivisionError and never yields inf

class_weights = {}

for t in targets:

    value_counts = d_augmented[t].value_counts()

    n_pos = value_counts.get(1, 0)
    n_neg = value_counts.get(0, 0)

    w_pos = round(len(d_augmented) / (2 * n_pos), 4) if n_pos else 0
    w_neg = round(len(d_augmented) / (2 * n_neg), 4) if n_neg else 0

    class_weights[t] = {
                        'w_pos': w_pos,
                        'w_neg': w_neg,
                        }

class_weights

#### Train and evaluate baseline models: $k$-fold cross validate

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/temp

# define target-specific aug-stratified df
# seeded split with test_size = 0.2; the result is stored as target_datasets

target_datasets = iterative_stratified_train_test_split_with_rationales(
    d_augmented,
    targets,
    test_size = 0.2,
    random_state = 56,
)

In [None]:
# list the keys of target_datasets
print(target_datasets.keys())

In [None]:
# define target-specific df
# (a hand-built {'target': (d_train, d_test)} mapping previously stood here, disabled)

# define targets + class weights
# each entry: target -> [w_neg, w_pos]

targets_and_class_weights = {
    'asp':  [0.838,  1.2397],
    'dep':  [0.5847, 3.4522],
    'val':  [0.708,  1.7017],
    'prg':  [0.5425, 6.385],
    'tgd':  [0.5619, 4.5377],
    'age':  [0.5926, 3.1996],
    'race': [0.506,  42.4412],
    'dbty': [0.5159, 16.2135],
}

**BERT, RoBERTa, DistilBERT**

In [None]:
# define models
# key -> (model class, tokenizer class, pretrained checkpoint name)

models = {
    'bert': (BertForSequenceClassification, BertTokenizer, 'bert-base-uncased'),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, 'distilbert-base-uncased'),
}

# define save path

save_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

# define hyperparameter grid
# a single fixed configuration (one value per setting)

hyperparameter_grid = dict(
    batch_size = 8,
    gradient_accumulation_steps = 2,
    learning_rate = 2e-5,
    num_epochs = 2,
    warmup_steps = 0,
    weight_decay = 0.00,
)

# set cycle

cycle = 'baseline'

In [None]:
%cd ../../outputs/tables

# 'baseline' cycle: train-test loop
# every setting is passed by keyword from the variables defined for this cycle

train_eval_save_bl_models(
    target_datasets = target_datasets,
    targets_and_class_weights = targets_and_class_weights,
    models = models,
    hyperparameter_grid = hyperparameter_grid,
    cycle = cycle,
    save_path = save_path,
)

**Llama 3.1**

In [None]:
# define save path

save_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

# define hyperparameters
# one candidate value per setting, so the expanded grid holds a single configuration

hyperparameters = dict(
    gradient_accumulation_steps = [1],
    learning_rate = [1e-4],
    num_train_epochs = [1],
    warmup_steps = [0],
    weight_decay = [0.01],
)

hyperparameter_grid = list(ParameterGrid(hyperparameters))

In [None]:
%cd /content/drive/MyDrive/Colab/bar_policy_suicidality/temp
#%cd ../../outputs/tables

# llama train-test loop
# same target splits and class weights as the baseline cycle, Llama 3.1 8B checkpoint

train_and_evaluate_llama(
    model_name = 'meta-llama/Llama-3.1-8B',
    target_datasets = target_datasets,
    targets_and_class_weights = targets_and_class_weights,
    hyperparameter_grid = hyperparameter_grid,
    save_path = save_path,
)

**Performance scatterplots: baseline, adapted**

In [None]:
%cd /content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/tables

In [None]:
# register the Arial font file with matplotlib, then make it the default family

fm.fontManager.addfont('/content/drive/MyDrive/Colab/Arial.ttf')
plt.rcParams.update({'font.family': 'Arial'})

In [None]:
%%capture

# baseline performance table; metrics rounded to 4 places
# round() returns a new frame, so the result is assigned back to d_v

d_v = pd.read_excel('d_baseline_performance.xlsx')
d_v = d_v.round({'f1_macro': 4, 'mcc': 4, 'auprc': 4})

In [None]:
%cd ../figures

    ### SJS 1/2: _NOTE_ current figure is low quality; fix TKTK, use bar_scratchpad for viz.

# scatterplot of the baseline performance table, named 'baseline'

performance_scatterplot(plot_name = 'baseline', df = d_v)

#### Adapt

In [None]:
%cd /content/drive/MyDrive/Colab/bar_policy_suicidality/inputs/data
#%cd inputs/data

In [None]:
# load the domain adaptation set

d_adapt = pd.read_csv('d_adapt.csv')

# inspect

d_adapt.info()
d_adapt.head(3)

In [None]:
# keep the post text and its subreddit only

d_adapt = d_adapt.loc[:, ['text', 'p_sbrt']].copy()

d_adapt.shape
d_adapt.head(3)

In [None]:
%cd ../../outputs/models
# absolute path of the models output directory
models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

**Domain adaptation proxy task: subreddit classification**

In [None]:
# prep dataset

class CustomDataset(Dataset):
    """Dataset view over a dataframe with 'text' and 'p_sbrt' columns.

    Each item is tokenized on access with `tokenizer.encode_plus`, padded and
    truncated to `max_length`, and returned as a dict holding 'input_ids',
    'attention_mask', and a long-typed 'label' tensor.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        text, label = row['text'], row['p_sbrt']

        # missing text is swapped for a single space so the tokenizer still gets a str
        if pd.isna(text):
            text = ' '

        tokenizer_kwargs = {
            'add_special_tokens': True,
            'max_length': self.max_length,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(text, **tokenizer_kwargs)

        # flatten the tokenizer's tensors to 1-D before returning the item
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(label, dtype = torch.long)
        return item

# adaptation-run hyperparams

epochs, batch_size = 2, 16
learning_rate = 2e-5
max_length = 512

# model registry: display name -> model class, tokenizer class, and the
# pretrained checkpoint name to start from; commented-out entries are
# simply absent from the dict, so the training loop never sees them

models_to_train = {
    #'BERT': {
    #         'model_class': BertForSequenceClassification,
    #         'tokenizer_class': BertTokenizer,
    #         'pretrained_model_name': 'bert-base-uncased',
    #         },
    #'RoBERTa': {
    #            'model_class': RobertaForSequenceClassification,
    #            'tokenizer_class': RobertaTokenizer,
    #            'pretrained_model_name': 'roberta-base',
    #            },
    'DistilBERT': dict(
        model_class = DistilBertForSequenceClassification,
        tokenizer_class = DistilBertTokenizer,
        pretrained_model_name = 'distilbert-base-uncased',
    ),
}

# encode labels

label_encoder = LabelEncoder()
d_adapt['p_sbrt'] = label_encoder.fit_transform(d_adapt['p_sbrt'])

# size the classification head from the fitted encoder instead of a hard-coded
# count, so the logit dimension always matches the label ids fed to the loss

num_labels = len(label_encoder.classes_)

# resolve the device once; it does not change between models

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# iterate over models: fine-tune each registered checkpoint on the proxy
# labels, then save model + tokenizer under '{models_path}{model_name}_adapted'

for model_name, model_info in models_to_train.items():
    print(f'\nTraining {model_name}...')
    print("--------------------------------------------------------------------------------------")

    # initialize tokenizer, model

    tokenizer = model_info['tokenizer_class'].from_pretrained(model_info['pretrained_model_name'])
    model = model_info['model_class'].from_pretrained(
                                                      model_info['pretrained_model_name'],
                                                      num_labels = num_labels, ### maps to tensor dimensions
                                                      )

    # prep dataset

    dataset = CustomDataset(
                            d_adapt,
                            tokenizer,
                            max_length = max_length,
                            )
    dataloader = DataLoader(
                            dataset,
                            batch_size = batch_size,
                            shuffle = True,
                            )

    # config optimizer

    optimizer = AdamW(
                      model.parameters(), ### ensures all model params are trained
                      lr = learning_rate,
                      )

    # training loop

    model.train()
    model.to(device)

    for epoch in range(epochs):
        print(f'Epoch {epoch+1}/{epochs}')
        for batch in tqdm(dataloader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # passing labels makes the model return its own loss on outputs.loss
            outputs = model(
                            input_ids = input_ids,
                            attention_mask = attention_mask,
                            labels = labels,
                            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # save

    save_path = f'{models_path}{model_name}_adapted'
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f'{model_name} saved to {save_path}\n')

#### Train and evaluate domain-adapted models: $k$-fold cross validate

In [None]:
# per-target class weights, each pair ordered [w_neg, w_pos]

targets_and_class_weights = {
    'asp': [0.838, 1.2397],
    'dep': [0.5847, 3.4522],
    'val': [0.708, 1.7017],
    'prg': [0.5425, 6.385],
    'tgd': [0.5619, 4.5377],
    'age': [0.5926, 3.1996],
    'race': [0.506, 42.4412],
    'dbty': [0.5159, 16.2135],
}


In [None]:
# no-op rebind; raises NameError here if targets_and_class_weights is not yet defined

targets_and_class_weights = targets_and_class_weights

# define models path + load in-domain-adapted models

models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

# (key, model class, tokenizer class, saved-adaptation folder, base checkpoint name)

_adapted_specs = (
    ('bert', BertForSequenceClassification, BertTokenizer, 'BERT_adapted', 'bert-base-uncased'),
    ('roberta', RobertaForSequenceClassification, RobertaTokenizer, 'RoBERTa_adapted', 'roberta-base'),
    ('distilbert', DistilBertForSequenceClassification, DistilBertTokenizer, 'DistilBERT_adapted', 'distilbert-base-uncased'),
)

# each entry: (adapted model, adapted tokenizer, base checkpoint name)

models = {
    key: (
        model_cls.from_pretrained(f'{models_path}{folder}'),
        tokenizer_cls.from_pretrained(f'{models_path}{folder}'),
        base_name,
    )
    for key, model_cls, tokenizer_cls, folder, base_name in _adapted_specs
}

# define hyperparameter grid: scalar values, i.e. one fixed configuration

hyperparameter_grid = dict(
    batch_size = 8,
    gradient_accumulation_steps = 2,
    learning_rate = 2e-5,
    num_epochs = 2,
    warmup_steps = 0,
    weight_decay = 0.00,
)

# define save path

save_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

# set cycle

cycle = 'adapted'

In [None]:
%pwd

In [None]:
#%cd ../../outputs/tables
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/outputs/tables

# 'adapted' cycle: train-test loop
# Runs train_eval_save_bl_models on the per-target datasets with the adapted
# models, class weights, fixed hyperparameters, save path, and cycle tag.
# NOTE(review): the helper is not defined in this section; confirm what it
# writes to the working directory and to save_path.

# superseded positional call (took d_augmented directly), kept for reference
#results = train_eval_save_bl_models(
#                                    d_augmented,
#                                    targets_and_class_weights,
#                                    models,
#                                    save_path,
#                                    cycle,
#                                    )

train_eval_save_bl_models(
                          target_datasets = target_datasets,
                          targets_and_class_weights = targets_and_class_weights,
                          models = models,
                          save_path = save_path,
                          cycle = cycle,
                          hyperparameter_grid = hyperparameter_grid,
                          )

**Adapted: viz**

In [None]:
%cd ../outputs/tables

In [None]:
%cd /content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/tables

In [None]:
%%capture

# Load the adapted-cycle performance table and keep the rounded copy.
# DataFrame.round returns a new frame; the bare call previously discarded it
# (and %%capture hides the cell display), so d_v was never actually rounded.

d_v = pd.read_excel('d_adapted_performance.xlsx')
d_v = d_v.round({'f1_macro': 4, 'mcc': 4, 'auprc': 4})

In [None]:
%cd ../figures

performance_barplot(
                    d_v,
                    'adapted_performance',
                    )

### 5. Tune-Regularize
Builds stratified train-test sets, searches hyperparam space to optimize highest-performing target x pretrained model configs. Evaluates learning rate, dropout, and self-training accuracy gain over 20 epochs.
***

In [None]:
%pwd

In [None]:
%cd /content/drive/MyDrive/Colab/bar_policy_suicidality/inputs/data
#%cd ../inputs/data

d_augmented = pd.read_excel('d_augmented.xlsx')

In [None]:
# classification targets, in the order used throughout the notebook

targets = ['asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty']

# keep text, the 'aug' column, and one label column per target

keep_cols = ['text', 'aug', *targets]
d_augmented = d_augmented.loc[:, keep_cols].copy()

d_augmented.info()
d_augmented.head(3)

In [None]:
%cd ../../temp

**Target-parsed $\mathcal{d}$<sub>train</sub>($y$): augmented | $\mathcal{d}$<sub>test</sub>($y$): de-augmented**

In [None]:
# Build the target-wise splits from d_augmented with a fixed seed (56).
# The result is indexed by target name downstream, each entry unpacking to a
# (train, test) pair.
# NOTE(review): test_size = 0.2 presumably means a 20% hold-out; confirm
# against the helper's definition.
target_datasets = iterative_stratified_train_test_split_with_rationales(
                                                                        d_augmented,
                                                                        targets,
                                                                        random_state = 56,
                                                                        test_size = 0.2,
                                                                        )

In [None]:
print(target_datasets.keys())

#### BERT, RoBERTa, DistilBERT

In [None]:
# drop 'aug' + extract target-wise train/test df

    ### SJS 12/3: old/bad way; validate new _for t in targets_ loop

# one pass over the targets: unpack each (train, test) pair and strip 'aug'

_deaug = {}
for _t in ('asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty'):
    _train, _test = target_datasets[_t]
    _deaug[_t] = (
        _train.drop(columns = 'aug'),
        _test.drop(columns = 'aug'),
    )

# strn

d_train_asp, d_test_asp = _deaug['asp']
d_train_dep, d_test_dep = _deaug['dep']
d_train_val, d_test_val = _deaug['val']

# traits

d_train_prg, d_test_prg = _deaug['prg']
d_train_tgd, d_test_tgd = _deaug['tgd']
d_train_age, d_test_age = _deaug['age']
d_train_race, d_test_race = _deaug['race']
d_train_dbty, d_test_dbty = _deaug['dbty']

#### Grid search: RoBERTa

In [None]:
%cd ../../outputs/tables

# define BERT, RoBERTa, DistilBERT hyperparam grid

hyperparameter_grid = {
    'batch_size': [8, 16],
    'gradient_accumulation_steps': [1, 2],
    'learning_rate': [2e-5, 3e-5],
    'num_epochs': [2, 3],
    'warmup_steps': [0, 500],
    'weight_decay': [0.0, 0.3],
}

models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models'

    ### SJS 9/8: no longer using dedicated subdirectories; just do save_path = models_path when calling the Fx...

# per-target tuning inputs: (train df, test df, [w_neg, w_pos])
# best baseline F1 (macro), all achieved by roberta:
#   asp 0.77 | dep 0.80 | val 0.75 | prg 0.70 | tgd 0.76 | age 0.76 | race 0.63 | dbty 0.66

tuning_inputs = {
    'asp': (d_train_asp, d_test_asp, [0.838, 1.2397]),
    'dep': (d_train_dep, d_test_dep, [0.5847, 3.4522]),
    'val': (d_train_val, d_test_val, [0.708, 1.7017]),
    'prg': (d_train_prg, d_test_prg, [0.5425, 6.385]),
    'tgd': (d_train_tgd, d_test_tgd, [0.5619, 4.5377]),
    'age': (d_train_age, d_test_age, [0.5926, 3.1996]),
    'race': (d_train_race, d_test_race, [0.506, 42.4412]),
    'dbty': (d_train_dbty, d_test_dbty, [0.5159, 16.2135]),
}

# define tuning param sets: one dict per target, all on the roberta-base backbone;
# keys mirror the keyword arguments of tune_and_optimize_model_hyperparams

params = [
    {
        'tokenizer': RobertaTokenizer.from_pretrained('roberta-base'),
        'model_class': RobertaForSequenceClassification,
        'pretrained_model_name': 'roberta-base',
        'd_train': train_df,
        'd_test': test_df,
        'target': target,
        'class_weights': torch.tensor(weights, dtype = torch.float),
        'save_path': models_path,
    }
    for target, (train_df, test_df, weights) in tuning_inputs.items()
]

# hyperparam tuning loop

all_tuned_performance = pd.DataFrame()

for p in params:
    #print(f"Inspecting parameter set: {p}")  # inspect for dict format
    d_test, d_tuned_performance = tune_and_optimize_model_hyperparams(
        hyperparameter_grid = hyperparameter_grid,
        **p,
    )

    # accumulate per target so partial results survive an interrupted run
    all_tuned_performance = pd.concat(
        [all_tuned_performance, d_tuned_performance],
        ignore_index = True,
    )

print(all_tuned_performance.head())
all_tuned_performance.to_excel('all_tuned_performance.xlsx')

In [None]:
#print(all_tuned_performance.head())
#all_tuned_performance.to_excel('all_tuned_performance.xlsx')

#### Accuracy scoring (_post hoc_)

In [None]:
# import all d_test_tuned_preds_{target}

load_path = '/content/drive/My Drive/Colab/bar_policy_suicidality/outputs/tables'

targets = ['asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty']

# create dict for all acc scores

accuracy_scores = {}

for t in targets:
    file_path = os.path.join(load_path, f'd_test_tuned_preds_{t}.xlsx')

    # guard: report and skip targets whose prediction table is absent
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    df = pd.read_excel(file_path)

    # define y_true and y_pred: annotated label column vs. model predictions

    y_true, y_pred = df[t], df['predicted_labels']

    # calculate and tabulate acc

    accuracy = accuracy_score(y_true, y_pred)
    accuracy_scores[t] = accuracy
    print(f"Accuracy for {t}: {accuracy:.4f}")


**Viz.**

In [None]:
# font = Arial

fm.fontManager.addfont('/content/drive/MyDrive/Colab/Arial.ttf')
plt.rcParams['font.family'] = 'Arial'

In [None]:
# computed acc

acc = {
    'asp': 0.8429,
    'dep': 0.9352,
    'val': 0.8778,
    'prg': 0.9601,
    'tgd': 0.9352,
    'age': 0.9027,
    'race': 0.9850,
    'dbty': 0.9576,
}

# x-axis position per target (0.6 apart); dict order fixes the bar order

target_mapping = {
    'asp': 0,
    'dep': 0.6,
    'val': 1.2,
    'prg': 1.8,
    'tgd': 2.4,
    'age': 3.0,
    'race': 3.6,
    'dbty': 4.2,
}

# extract values for barplot

x_labels = [*target_mapping]
x_positions = [*target_mapping.values()]
y_values = [acc[label] for label in x_labels]

# define broken y-axis: the (0.1, 0.4) band is cut out

fig = plt.figure(figsize = (12, 5.5))
bax = brokenaxes(ylims = ((0, 0.1), (0.4, 1)), hspace = 0.05)

# barplot

bar_style = dict(
    color = '#27aeef',
    width = 0.4,
    linewidth = 0.6,
    alpha = 0.7,
)
bax.bar(x_positions, y_values, **bar_style)

# set labels and ticks

for set_axis_label, axis_text in (
    (bax.set_xlabel, 'Target'),
    (bax.set_ylabel, 'Accuracy: tuned'),
):
    set_axis_label(axis_text, fontsize = 12, labelpad = 30)

tick_axis = bax.axs[1]
tick_axis.set_xticks(x_positions)
tick_axis.set_xticklabels(x_labels, rotation = 45, fontsize = 10)

# customize legend
# NOTE(review): legend lists three models, but every bar is drawn in the
# '#27aeef' (RoBERTa) color - confirm the extra entries are intended

legend_elements = [
    Line2D(
        [0], [0],
        marker = 's',
        color = 'w',
        label = model_label,
        markersize = 8,
        markerfacecolor = face_color,
        lw = 0,
    )
    for model_label, face_color in (
        ('BERT', '#87bc45'),
        ('RoBERTa', '#27aeef'),
        ('DistilBERT', '#b33dc6'),
    )
]

bax.axs[0].legend(
    handles = legend_elements,
    loc = 'upper center',
    bbox_to_anchor = (0.5, 1.15),
    ncol = 4,
    fontsize = 9,
    frameon = False,
)

# set y-axis tick size

for ax in bax.axs:
    ax.tick_params(axis = 'y', labelsize = 9)

# save: high-res then low-res copy of the same figure

for file_name, resolution in (
    ('tuned_high_res_bar.png', 300),
    ('tuned_low_res_bar.png', 100),
):
    plt.savefig(file_name, dpi = resolution)

# display

plt.show()

In [None]:
%pwd

#### Self-training augmentation: RoBERTa

In [None]:
# import d_inference: labeled by hyperparam-tuned RoBERTa

#%cd ../../outputs/tables
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/outputs/tables

d_inference = pd.read_csv('d_inference_pred_pilot.csv', index_col = [0])

targets = ['asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty']

# harmonize column names: '{target}_pred' -> '{target}'

d_inference.rename(
    columns = {f'{t}_pred': t for t in targets},
    inplace = True,
)

# sort by pred_proba confidence
# '{target}_prob' holds a stringified pair; parse it, then split into
# rounded negative (index 0) and positive (index 1) columns

for t in targets:
    parsed = d_inference[f'{t}_prob'].apply(ast.literal_eval)
    d_inference[f'{t}_prob'] = parsed
    d_inference[f'{t}_pos'] = parsed.apply(lambda pair: round(pair[1], 4))
    d_inference[f'{t}_neg'] = parsed.apply(lambda pair: round(pair[0], 4))

# parse self-training columns: text, labels, then all _pos, then all _neg

self_train_cols = [
    'text',
    *(f'{t}' for t in targets),
    *(f'{t}_pos' for t in targets),
    *(f'{t}_neg' for t in targets),
]

d_inference = d_inference[self_train_cols]

# inspect

d_inference.info()
d_inference.head(3)

In [None]:
targets = ['asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty']

# n d_inference rows to sample

n = 100

# create d_self_train dataframes: for every target, the n rows ranked highest
# on the positive-class column and the n rows ranked highest on the negative-class
# column, stored under '{target}_pos' and '{target}_neg'

d_self_train = {}

for t in targets:
    for polarity in ('pos', 'neg'):
        conf_col = f'{t}_{polarity}'
        ranked = d_inference.sort_values(by = conf_col, ascending = False)
        d_self_train[conf_col] = ranked.head(n)[['text', t, conf_col]].copy()

# access target-specific df: d_self_train[f'{target}_pos'] or d_self_train[f'{target}_neg']

d_self_train['asp_pos'].head() ### sense check

In [None]:
# create interleaved target-specific df: rows alternate pos, neg, pos, neg, ...

d_self_train_combined = {}

for t in targets:

    df_pos = d_self_train[f'{t}_pos'].reset_index(drop = True)
    df_neg = d_self_train[f'{t}_neg'].reset_index(drop = True)

    # interleave only over the shared length ### safeguard against IndexError

    min_length = min(len(df_pos), len(df_neg))
    pos_records = df_pos.iloc[:min_length].to_dict('records')
    neg_records = df_neg.iloc[:min_length].to_dict('records')

    interleaved_rows = []
    for pos_row, neg_row in zip(pos_records, neg_records):
        interleaved_rows.extend((pos_row, neg_row))

    # convert interleaved rows to df

    combined = pd.DataFrame(interleaved_rows)

    # append remaining rows (if lengths differ)

    if len(df_pos) > min_length:
        leftover = df_pos.iloc[min_length:]
    elif len(df_neg) > min_length:
        leftover = df_neg.iloc[min_length:]
    else:
        leftover = None

    if leftover is not None:
        combined = pd.concat([combined, leftover], ignore_index = True)

    d_self_train_combined[t] = combined

    ### SJS 12/12: note logits retained to this checkpoint for sense check; drop before save

# access target-specific df via d_self_train_combined[target]

d_self_train_combined['age'] ### age: intuitive example

In [None]:
%cd ../../temp

# drop logit columns, shuffle, save

for target, df in d_self_train_combined.items():

    # in-place drop: the frame stored in d_self_train_combined loses these columns too
    df.drop(
        columns = [f'{target}_pos', f'{target}_neg'],
        inplace = True,
    )

    # only the loop variable is rebound to the shuffle result before saving
    # NOTE(review): assumes shuffle is sklearn.utils.shuffle, which returns a copy - confirm import
    df = shuffle(df, random_state = 56)

    out_name = f'd_self_train_{target}.xlsx'
    df.to_excel(out_name, index = False)
    print(f"Saved: {out_name}")

In [None]:
# inspect: pre-append

print("d_train_{target}")
print(d_train_asp.shape)

d_train_asp.head(3)
d_train_asp.tail(3)

In [None]:
# append d_self_train_{target} self-training to d_train_{target} training data

targets = ['asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty']

    ### SJS 1/6: add 100 for cycle H

for t in targets:

    d_train_var = f'd_train_{t}'

    # access first _n_ rows of interleaved target-specific dataframes

    d_self_subset = d_self_train_combined[t].head(100) ### appending 100 pseudo-labeled rows

    # NOTE: this concatenates onto whatever d_train_{t} currently holds, so
    # re-running the cell appends the same pseudo-labeled rows again

    augmented = pd.concat(
        [globals()[d_train_var], d_self_subset],
        ignore_index = True,
    )

    # shuffle, then rebind the module-level d_train_{t} name

    globals()[d_train_var] = augmented.sample(
        frac = 1,
        random_state = 56,
    ).reset_index(drop = True)

    print(f"Updated and shuffled {d_train_var}")

# inspect: post-append

    ### SJS 12/14: asp as ex for now...

print(d_train_asp.shape)

d_train_asp.head(3)
d_train_asp.tail(3)

#### Training and validation loss: RoBERTa

In [None]:
%pwd

In [None]:
# set cycle

cycle = 'I'

# define models_path

models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models'


def _fixed_hyperparameters(batch_size, gradient_accumulation_steps, warmup_steps, weight_decay):
    """Return one fixed-hyperparameter dict; dropout and learning rate are shared by all targets."""
    return {
            'batch_size': batch_size, ### optimized by grid search: enter d_tuned_performance values
            'dropout': 0.1, ### new since grid search
            'gradient_accumulation_steps': gradient_accumulation_steps,
            'learning_rate': 1e-5,
            'warmup_steps': warmup_steps,
            'weight_decay': weight_decay,
            }


def _roberta_param_set(target, d_train, d_test, w_neg, w_pos, fixed_hyperparameters):
    """Assemble one target x roberta-base parameter set consumed by the loss/accuracy loop."""
    return {
            'tokenizer': RobertaTokenizer.from_pretrained('roberta-base'),
            'model_class': RobertaForSequenceClassification,
            'pretrained_model_name': 'roberta-base',
            'd_train': d_train,
            'd_test': d_test,
            'target': target,
            'class_weights': torch.tensor([w_neg, w_pos], dtype = torch.float),
            'cycle': cycle,
            'save_path': models_path,
            'fixed_hyperparameters': fixed_hyperparameters,
            'num_epochs_range': [1, 20],
            }


# define target x model fixed hyperparams, num_epochs range
# positional order: target, d_train, d_test, w_neg, w_pos,
# then (batch_size, gradient_accumulation_steps, warmup_steps, weight_decay)
# inactive targets stay commented out so their tuned values are not lost

params = [

          # best 'asp' tuned f1 = 0.8094

          _roberta_param_set('asp', d_train_asp, d_test_asp, 0.838, 1.2397, _fixed_hyperparameters(16, 2, 0, 0.3)),

          # best 'dep' tuned f1 = 0.8515

          #_roberta_param_set('dep', d_train_dep, d_test_dep, 0.5847, 3.4522, _fixed_hyperparameters(8, 1, 500, 0.3)),

          # best 'val' tuned f1 = 0.8179

          _roberta_param_set('val', d_train_val, d_test_val, 0.708, 1.7017, _fixed_hyperparameters(16, 2, 0, 0.3)),

          # best 'prg' tuned f1 = 0.8355

          #_roberta_param_set('prg', d_train_prg, d_test_prg, 0.5425, 6.385, _fixed_hyperparameters(16, 2, 0, 0)),

          # best 'tgd' tuned f1 = 0.8433

          #_roberta_param_set('tgd', d_train_tgd, d_test_tgd, 0.5619, 4.5377, _fixed_hyperparameters(8, 1, 0, 0.3)),

          # best 'age' tuned f1 = 0.8303

          #_roberta_param_set('age', d_train_age, d_test_age, 0.5926, 3.1996, _fixed_hyperparameters(8, 2, 0, 0.3)),

          # best 'race' tuned f1 = 0.7462

          #_roberta_param_set('race', d_train_race, d_test_race, 0.506, 42.4412, _fixed_hyperparameters(8, 2, 500, 0.3)),

          # best 'dbty' tuned f1 = xx

          #_roberta_param_set('dbty', d_train_dbty, d_test_dbty, 0.5159, 16.2135, _fixed_hyperparameters(8, 2, 0, 0)),

          ]


In [None]:
# training/validation loss and accuracy loop

# keep every target's result - d_epochal_performance alone only retains the last iteration

epochal_performance_by_target = {}

for p in params:
    d_epochal_performance = tune_and_optimize_model_loss_accuracy(
                                                                  tokenizer = p['tokenizer'],
                                                                  model_class = p['model_class'],
                                                                  pretrained_model_name = p['pretrained_model_name'],
                                                                  d_train = p['d_train'],
                                                                  d_test = p['d_test'],
                                                                  target = p['target'],
                                                                  class_weights = p['class_weights'],
                                                                  save_path = p['save_path'],
                                                                  cycle = p['cycle'],
                                                                  fixed_hyperparameters = p['fixed_hyperparameters'],
                                                                  num_epochs_range = p['num_epochs_range'],
                                                                  )

    epochal_performance_by_target[p['target']] = d_epochal_performance

    # a bare `.head()` inside a loop body is never rendered by the notebook - print explicitly

    print(f"\nEpochal performance: {p['target']}")
    print(d_epochal_performance.head(6))

#### Llama 3.1

In [None]:
# define targets + class weights as [w_neg, w_pos]
# targets left commented out are excluded from the dict for this run

targets_and_class_weights = {
#    'asp': [0.838, 1.2397],
#    'dep': [0.5847, 3.4522],
#    'val': [0.708, 1.7017],
#    'prg': [0.5425, 6.385],
#    'tgd': [0.5619, 4.5377],
    'age': [0.5926, 3.1996],
#    'race': [0.506, 42.4412],
#    'dbty': [0.5159, 16.2135],
}


# define Llama 3.1 hyperparam grid: two candidate values per hyperparameter

hyperparams = dict(
    gradient_accumulation_steps = [1, 2],
    learning_rate = [1e-4, 5e-5],
    num_train_epochs = [1, 2],
    warmup_steps = [0, 500],
    weight_decay = [0.0, 0.01],
)

models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models'

# expand the grid into a list of every hyperparameter combination

hyperparameter_grid = list(ParameterGrid(hyperparams))

In [None]:
tune_and_optimize_llama_hyperparams(
                                    target_datasets = target_datasets,
                                    targets_and_class_weights = targets_and_class_weights,
                                    model_name = 'meta-llama/Llama-3.1-8b',
                                    hyperparameter_grid = hyperparameter_grid,
                                    save_path = models_path,
                                    )

    ### SJS 12/2: target-by-target grid search, manually reset 'targets' etc param and rename deposited df


**Performance scatterplots: grid search**

In [None]:
%%capture

%cd /content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/tables

# load pooled grid-search results

d_v = pd.read_excel('all_tuned_performance.xlsx')

d_v = d_v.rename(columns = {'pretrained_model_name': 'model'})

# DataFrame.round returns a new frame - reassign, otherwise the rounding is discarded
# (keys that are not columns of d_v are ignored by round)

d_v = d_v.round({'f1_macro': 4, 'mcc': 4, 'auprc': 4})

In [None]:
# aesthetics

### SJS 10/1: last three colors in "Retro Metro (Default)" https://www.heavy.ai/blog/12-color-palettes-for-telling-better-stories-with-your-data

plot_name = 'tuned'

sns.set_style(
              style = 'whitegrid',
              rc = None,
              )

model_colors = {
                'bert': '#87bc45',
                'roberta-base': '#27aeef',
                'distilbert': '#b33dc6',
                }

# legend display names, keyed like model_colors

model_labels = {
                'bert': 'BERT',
                'roberta-base': 'RoBERTa',
                'distilbert': 'DistilBERT',
                }

# x-position of each target; spaced by 2 to leave room for jitter and per-model offsets

target_mapping = {
                  'asp': 0,
                  'dep': 2,
                  'val': 4,
                  'prg': 6,
                  'tgd': 8,
                  'age': 10,
                  'race': 12,
                  'dbty': 14
                  }

d_v['target_numeric'] = d_v['target'].map(target_mapping)
d_v['target_numeric'] = pd.to_numeric(d_v['target_numeric'])

# horizontal jitter - seeded so the saved figure is reproducible across runs

jitter_rng = np.random.default_rng(56)
d_v['target_jitter'] = d_v['target_numeric'] + jitter_rng.uniform(-0.35, 0.35, size = len(d_v))


# initialize fig. with broken y-axis
# (a single figure: an extra plt.figure() here would leave a stray empty figure in the output)

fig = plt.figure(figsize = (12, 5.5))
bax = brokenaxes(
                 ylims = ((0, 0.1), (0.4, 1)),
                 hspace = 0.1
                 )

# scatterplot: categorical x model
# loop variable is model_name, not model, so a loaded network bound to `model` is not overwritten

for model_name, color in model_colors.items():
    model_data = d_v[d_v['model'] == model_name]
    bax.scatter(
                model_data['target_jitter'],
                model_data['f1_score'],
                marker = '.',
                color = color,
                s = 40,
                alpha = 0.6,
                label=None,
                )

# mean (SD) f1 for each target x model

mean_std_df = d_v.groupby(['target', 'model']).agg(
                                                   mean_f1_score = ('f1_score', 'mean'),
                                                   std_f1_score = ('f1_score', 'std'),
                                                   ).reset_index()

mean_std_df['target_numeric'] = mean_std_df['target'].map(target_mapping).astype(float)
mean_std_df['target_offset'] = mean_std_df['target_numeric'] + mean_std_df['model'].map(
    {'bert': -0.3, 'roberta-base': 0.0, 'distilbert': 0.3}
)


# mean (SD) f1 - plot error bars
# a model is skipped entirely if any of its offset/mean/SD values is NaN
# (e.g. a model without an offset entry above)

for model_name in mean_std_df['model'].unique():
    model_data = mean_std_df[mean_std_df['model'] == model_name]

    if not model_data[['target_offset', 'mean_f1_score', 'std_f1_score']].isnull().any().any():
        bax.errorbar(
                     model_data['target_offset'],
                     model_data['mean_f1_score'],
                     yerr = model_data['std_f1_score'],
                     fmt = 'D',
                     markersize = 7,
                     capsize = 0,
                     elinewidth = 1,
                     markeredgewidth = 1,
                     color = model_colors[model_name],
                     )

bax.set_xlabel(
               'Target',
               fontsize = 12,
               labelpad = 30,
               )

bax.set_ylabel(
               f'$F_1$ (macro): {plot_name}',
               fontsize = 12,
               labelpad = 30,
               )


# x-ticks: lower subplot of broken axis

bax.axs[1].set_xticks(list(target_mapping.values()))
bax.axs[1].set_xticklabels(
                           list(target_mapping.keys()),
                           rotation = 45,
                           fontsize = 10,
                           )

#sns.despine(left = True)
bax.grid(
         #axis='x',
         False,
         )

# f1 = 0.8 marker

bax.axhline(
            y = 0.8,
            color = 'r',
            linewidth = 0.6,
            linestyle = '--',
            )

# custom legend - colors come from model_colors so legend and markers cannot drift apart

from matplotlib.lines import Line2D

legend_elements = [
                   Line2D([0], [0], marker = 'o', color = 'w', label = model_labels[name], markersize = 8, markerfacecolor = hex_color, lw = 0)
                   for name, hex_color in model_colors.items()
                   ]

bax.axs[0].legend(
                  handles = legend_elements,
                  loc = 'upper center',
                  bbox_to_anchor = (0.5, 1.15),
                  ncol = len(legend_elements),
                  fontsize = 9,
                  frameon = False,
                  )

# save

plt.savefig(f'{plot_name}_scatter.png')

# display

plt.show()


### 6. Infer (prelim)
Labels $n$<sub>posts</sub> = 10K subset of $\mathcal{d}$<sub>adapt</sub> for prediction and model explainability, preliminary human and GPT-4o cross-validation.
***

In [None]:
### SJS 12/2: pilot inference: RoBERTa

%cd /content/drive/MyDrive/Colab/bar_policy_suicidality/inputs/data
#%cd inputs/data

d_adapt = pd.read_csv('d_adapt.csv')

# delete empty/NaN 'text' cells

#d_adapt = d_adapt[d_adapt.text != ' ']

# keep a row only when 'text' is non-null and not blank after stripping whitespace

d_adapt = d_adapt[
                  d_adapt['text'].notnull()
                  & d_adapt['text'].str.strip().ne('')
                  ]

d_adapt.info()
d_adapt.head(3)

In [None]:
# draw a seeded 10,000-row sample, then drop rows with any missing value and renumber
# (rows removed by dropna are not replaced, so the result may hold fewer than 10,000 rows)

d_inference = (
               d_adapt
               .sample(n = 10000, random_state = 56)
               .dropna()
               .reset_index(drop = True)
               )

d_inference.info()
d_inference.head(3)

#### BERT, RoBERTa, DistilBERT

In [None]:
%cd ../../outputs/tables

In [None]:
models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

# define inference param sets: one fine-tuned roberta-base checkpoint per target

inference_targets = [
                     'asp',
                     'dep',
                     'val',
                     'prg',
                     'tgd',
                     'age',
                     'race',
                     'dbty',
                     ]

params = [
          {
           'target': t,
           'model_class': RobertaForSequenceClassification,
           'tokenizer_class': RobertaTokenizer,
           'pretrained_model_name': 'roberta-base',
           'model_path': f'{models_path}{t}_roberta-base_best_tuned_model.bin',
          }
          for t in inference_targets
          ]

# coerce to str

d_inference['text'] = d_inference['text'].astype(str)
texts = d_inference['text'].tolist()

# inference loop: adds '{target}_pred' and '{target}_prob' columns per target

for p in params:
    target = p['target']

    # load tokenizers, models

    tokenizer = p['tokenizer_class'].from_pretrained(p['pretrained_model_name'])
    model = load_model(
                       p['model_path'],
                       p['model_class'],
                       p['pretrained_model_name'],
                       )

    # infer predictions, probabilities

    predictions, probabilities = predict(
                                         model,
                                         tokenizer,
                                         texts,
                                         )

    d_inference[f'{target}_pred'] = predictions
    d_inference[f'{target}_prob'] = probabilities

# inspect - printed explicitly: these are not the cell's final expression,
# so as bare expressions they would never be rendered

pred_columns = [f"{p['target']}_pred" for p in params]

print(d_inference[pred_columns].apply(pd.Series.value_counts))
print(d_inference.head(3))

# save - index is written on purpose: the Explain section drops the resulting 'Unnamed: 0' column on reload

d_inference.to_csv('d_inference_pred_pilot.csv')

#### Llama 3.1

In [None]:
# clear cache: ask PyTorch to release the GPU memory its caching allocator is
# holding but no longer using, ahead of the Llama 3.1 inference cells below

torch.cuda.empty_cache()

**Model-by-model (memory safe)**

In [None]:
%cd /content/drive/MyDrive/Colab/bar_policy_suicidality/inputs/data
#%cd inputs/data

# single-target Llama inference
# NOTE(review): the cell has just changed into inputs/data, so the CSV below is written there - confirm intended

target, batch_size = 'asp', 8

models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

# helper returns the frame that replaces d_inference; arguments are positional, as in the original call

d_inference = llama_load_and_predict_single_target(target, d_inference, models_path, batch_size)

# inspect + save

d_inference.head(6)
d_inference.to_csv('d_inference_pred_llama.csv')

**Multiple models**

In [None]:
#%cd inputs/data

# multi-target Llama inference

#targets = [
#           'asp',
#           'dep',
#           'val',
#           'prg',
#           'tgd',
#           'age',
#           'race',
#           'dbty',
#           ]

#models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

#batch_size = 8

#d_inference = llama_load_and_predict_multi_target(
#                                                  targets,
#                                                  d_inference,
#                                                  models_path,
#                                                  batch_size,
#                                                  )

# inspect + save

#d_inference.head(6)
#d_inference.to_csv('d_inference_pred_llama.csv')

### 7. Explain
Generates local instance-specific and global model-representative interpretations using LIME and SP-LIME.
***

In [None]:
%pwd

In [None]:
#%cd ../../outputs/tables
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/outputs/tables

d_inference = pd.read_csv('d_inference_pred_pilot.csv')
d_inference.drop(
                 'Unnamed: 0',
                 axis = 1,
                 inplace = True,
                 )

d_inference.info()
d_inference.head(3)

#### LIME: prediction explainability

**Sort by confidence**

In [None]:
# define target

target = 'dbty'


def _parse_probabilities(value):
    """Parse a probability list stored as text; pass through values that are already parsed.

    The CSV round-trip stores each '{target}_prob' entry as a string. Guarding on
    str makes this cell safe to re-run after the column has been converted.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# extract neg proba

#d_inference[f'{target}_prob'] = d_inference[f'{target}_prob'].apply(lambda i: ast.literal_eval(i))
#d_inference[f'{target}_neg'] = d_inference[f'{target}_prob'].apply(lambda i: round(i[0], 4))

# extract pos proba (second element of each probability list)

d_inference[f'{target}_prob'] = d_inference[f'{target}_prob'].apply(_parse_probabilities)
d_inference[f'{target}_pos'] = d_inference[f'{target}_prob'].apply(lambda probs: round(probs[1], 4))

#d_inference.head(3)

# parse by 'f'{target}_neg' > 0.90

#neg_index = d_inference[d_inference[f'{target}_neg'] > 0.90]

# parse by 'f'{target}_pos' > 0.90

pos_index = d_inference[d_inference[f'{target}_pos'] > 0.90]

# shuffle

pos_index = shuffle(
                    pos_index,
                    random_state = 56,
                    )
pos_index.head(10)

In [None]:
# define instance: row label of the d_inference post to explain
# (read in the LIME cell via d_inference.loc[instance, 'text'])

instance = 6633

**LIMETextExplainer**

In [None]:
%%capture

models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

# resolve the device once - weights, model, and tokenized inputs must all agree,
# and hard-coding 'cuda' would fail on a CPU-only runtime

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load pre-trained HF tokenizer, config, architecture

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained(
                                                         'roberta-base',
                                                          num_labels = 2,
                                                          )

# load fine-tuned model weights - dynamic target handling

target_model_path = f'{models_path}{target}_roberta-base_best_tuned_model.bin'
model.load_state_dict(torch.load(target_model_path, map_location = device))
model.to(device)

# coerce eval mode

model.eval()

# targets

class_names = ['0', '1']

# tokenize Fx

def predict_proba(texts, batch_size = 8):
    """Return class probabilities for a list of texts as an (n_texts, 2) array.

    LIME calls this with every perturbed sample at once (num_samples texts),
    so inputs are processed in mini-batches to bound memory use.

    texts: list of str to classify.
    batch_size: number of texts per forward pass.
    """
    all_probs = []

    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
                           texts[start:start + batch_size],
                           padding = True,
                           truncation = True,
                           return_tensors = 'pt',
                           max_length = 512,
                           )

        # tokenized inputs - same device as model

        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(
                                  outputs.logits,
                                  dim = 1,
                                  )

        all_probs.extend(probs.cpu().numpy())

    return np.array(all_probs)

# initialize explainer

### SJS 9/27: re 'bow' and 'char_level' params: https://lime-ml.readthedocs.io/en/latest/lime.html

explainer = LimeTextExplainer(
                              class_names = class_names,
                              random_state = 56,
                              )

# d_inference selection - dynamic

text = d_inference.loc[instance, 'text']

# explain

exp = explainer.explain_instance(
                                 text,
                                 predict_proba,
                                 num_features = 8,
                                 num_samples = 1000,
                                 distance_metric = 'cosine',
                                 )


**Display explanation**

In [None]:
# echo the row label being explained, then render the LIME explanation inline
# (text = True is passed so the explained text is shown alongside the feature weights)

print(instance)
exp.show_in_notebook(text = True)
#exp.save_to_file('lime_explanation.html')

In [None]:
# move to the sibling figures directory and save the explanation as HTML;
# the filename encodes the target and the row label, with a fixed '_pos' suffix

%cd ../figures
exp.save_to_file(f'lime_explanation_{target}_{instance}_pos.html')

#### SP-LIME: model explainability

In [None]:
%%capture

# define target

target = 'dbty'

# define models dir

models_path = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

# resolve the device once - weights, model, and tokenized inputs must all agree,
# and hard-coding 'cuda' would fail on a CPU-only runtime

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load pre-trained HF tokenizer, config, architecture

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained(
                                                         'roberta-base',
                                                          num_labels = 2,
                                                          )

# load fine-tuned model weights - dynamic target handling

target_model_path = f'{models_path}{target}_roberta-base_best_tuned_model.bin'
model.load_state_dict(torch.load(target_model_path, map_location = device))
model.to(device)

# coerce eval mode

model.eval()

# targets

class_names = ['0', '1']

# tokenization function - incl. batch processing

def predict_proba(texts, batch_size = 8): ### batch processing - memory safe
    """Return class probabilities for a list of texts as an (n_texts, 2) array.

    texts: list of str to classify.
    batch_size: number of texts per forward pass.
    """
    all_probs = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(
                           batch_texts,
                           padding = True,
                           truncation = True,
                           return_tensors = 'pt',
                           max_length = 512,
                           )
        with torch.no_grad():
            inputs = {key: val.to(device) for key, val in inputs.items()} ### tokenized inputs - same device as model

            # infer

            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim = 1)

            all_probs.extend(probs.cpu().numpy()) ### probabilities to CPU

    return np.array(all_probs)

# initialize LIMETextExplainer

explainer = LimeTextExplainer(
                              class_names = class_names,
                              random_state = 56,
                              )

# select text instances dynamically for SP-LIME

selected_instances = d_inference.sample(
                                        n = 100,
                                        random_state = 56,
                                        )

# define SubmodularPick logics
# SubmodularPick explains each supplied text itself, so a separate explain_instance
# loop beforehand would only duplicate that (expensive) work. Extra keyword arguments
# are forwarded to explain_instance; method = 'full' uses every pre-sampled instance.

sp = SubmodularPick(
                    explainer,
                    selected_instances['text'].tolist(),
                    predict_proba,
                    method = 'full',
                    num_features = 10,
                    num_exps_desired = 10, ### n explanations to provide
                    num_samples = 1000,
                    distance_metric = 'cosine',
                    )

# instance-wise explanations, kept under the name the earlier manual loop used

sp_explanations = sp.explanations

In [None]:
# viz. + save

%cd ../figures

# render each picked explanation inline and save it as HTML, numbered 01, 02, ... in pick order

picked_explanations = sp.sp_explanations

for idx, exp in enumerate(picked_explanations, 1):
    out_file = f'sp_lime_explanation_{target}_{idx:02}.html' ### save full range of num_exps_desired
    exp.show_in_notebook(text = True)
    exp.save_to_file(out_file)

#for exp in sp.sp_explanations:
#    exp.show_in_notebook(text = True)

### 8. Calibrate
tk

In [None]:
# tk

> End of aim_i_train_tune_predict.ipynb