## mhp ground-up refactor / headers etc. TODO

### 1. Prepare
Installs and imports requisite packages; customizes outputs.
***
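> **Dependencies:** Install via `%pip install -r requirements.txt` from project root before running (the install cell below does the equivalent from this notebook's directory with `%pip install -r ../requirements.txt`).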

In [None]:
%%capture

%pip install -r ../requirements.txt

In [None]:
import logging
import numpy as np
import pandas as pd
import re
import spacy
import torch

from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import(
    f1_score, 
    matthews_corrcoef, 
    average_precision_score,
    ) 

from IPython.core.interactiveshell import InteractiveShell
# display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

# opt in to pandas copy-on-write behavior
pd.options.mode.copy_on_write = True

# lift display truncation for both columns and rows (None = no limit)
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    )

# `warnings` must be in scope for the filter loop below; imported here so the
# cell does not fail with NameError
import warnings

# silence FutureWarning / UserWarning messages for the rest of the session
for c in (FutureWarning, UserWarning):
    warnings.simplefilter(
        action = 'ignore',
        category = c,
        )

In [None]:
cd ..

In [None]:
# import annotated `d_train`; the first spreadsheet column becomes the row index

d = pd.read_excel("d_annotated_prelim.xlsx", index_col = [0])

# inspect schema and the first six rows
d.info()
d.head(6)

In [None]:
#!python -m spacy download en_core_web_lg --user

In [None]:
# load spaCy NER pipeline (the `en_core_web_lg` model must already be installed)

nlp = spacy.load("en_core_web_lg")

# redact named entities

PII_LABELS = {
    "PERSON", "NORP", "FAC", "ORG",
    "GPE", "LOC", "PRODUCT", "EVENT",
    }

def redact_named_entities(doc) -> str:
    '''
    Returns `doc.text` with every entity span whose label is listed in
    PII_LABELS swapped for the `<PII>` pseudoword token; entities with any
    other label are left untouched.
    '''
    flagged = [ent for ent in doc.ents if ent.label_ in PII_LABELS]

    # substitute right-to-left so the character offsets of earlier spans stay valid
    redacted = doc.text
    for ent in sorted(flagged, key = lambda e: e.start_char, reverse = True):
        redacted = redacted[: ent.start_char] + "<PII>" + redacted[ent.end_char :]
    return redacted

        ### TODO: SJS 2/3: certain synthetic prospective client names (e.g. Darius) are not automatically redacted

# cast every `text` value to str, stream the strings through the spaCy
# pipeline, and overwrite the column with the redacted result
texts = d['text'].astype(str).tolist()
d['text'] = [redact_named_entities(doc) for doc in nlp.pipe(texts)]

# strip leftover hand-entered redaction placeholders (pilot / Phase 1);
# each literal placeholder is swapped for a single space

manual_redactions = [
    "[PATIENT NAME]", "[PHONE NUMBER]",
    "[MHP NAME]", "[CITY]", "[URL]",
    ]

for placeholder in manual_redactions:
    d['text'] = d['text'].str.replace(placeholder, " ", regex = False)

# collapse every run of digits / carriage returns / newlines into one space

d['text'] = d['text'].str.replace(r"[\d\r\n]+", " ", regex = True)

# coerce every label column in the `target` varlist to int;
# missing or non-numeric entries become 0

labels = [
    "afrm", "agnt", "brdn", "dmnd", "fitt",
    "just", "prbl", "rbnd", "refl",
    ]

for label_col in labels:
    as_numeric = pd.to_numeric(d[label_col], errors = "coerce")
    d[label_col] = as_numeric.fillna(0).astype(int)

#d.info()
#d.head(6)

In [None]:
# gen `trgt` indicator: dummy code pos(1) rows
# (1 when any of afrm / agnt / fitt / refl is nonzero on the row, else 0)

d['trgt'] = (
    d[['afrm', 'agnt', 'fitt', 'refl']] != 0
    ).any(axis = 1).astype(int)

def augment(df):
    '''
    Detects pos(1) `trgt` rows that carry a usable `rtnl` string; duplicates
    each as a new row and replaces the new row's `text` with the expert
    annotator-curated `rtnl` text to augment the training instances.

    Rows whose `rtnl` is missing, non-string, or blank are not duplicated:
    there is no rationale text to train on, and copying them would create
    instances whose `text` is NaN / empty.

    Expects `df` to have `text`, `rtnl`, `trgt` columns and a numeric index
    (copies are positioned with `index + 0.5`).

    Returns a frame with a fresh 0..n-1 index in which every copy sits
    directly after its parent row, or `df` itself when nothing qualifies.
    '''
    has_rtnl = df['rtnl'].map(
        lambda r: isinstance(r, str) and bool(r.strip())
        ).astype(bool)
    parents = df[(df['trgt'] > 0) & has_rtnl]

    if parents.empty:
        return df

    aug_df = parents.copy()
    aug_df['text'] = aug_df['rtnl']

    # half-step index + stable sort slots each copy right behind its parent
    aug_df.index = aug_df.index + 0.5

    df = pd.concat([df, aug_df])
    df = df.sort_index(kind = "stable").reset_index(drop = True)
    return df

# tag every pre-augmentation row with a unique id (on a copy of `d`) so the
# copies created by `augment` can be told apart from their parents afterwards

d = augment(d.assign(_orig_id = np.arange(len(d))))

# gen `agmt` indicator: dummy code augmented rows
#
# a copy inherits its parent's `_orig_id` and is sorted after the parent, so
# the second occurrence of an id is the augmented row. Position-based tests on
# `rtnl` are not reliable here: copies keep the parent's `rtnl`, and rows that
# were never duplicated may hold a `rtnl` string too.

d['agmt'] = d['_orig_id'].duplicated(keep = 'first').astype(int)
d = d.drop(columns = '_orig_id')

# strip `rtnl` construct salience delineator pseudoword tokens (parent / Phase 2)
# and `<PII>` redaction tokens from `text`; each literal token becomes one space

salience_delineators = [
    "<PII>", "<|PII|>", "<|AFRM|>", "<|AGNT|>", "<|BRDN|>", "<|DMND|>",
    "<|FITT|>", "<|JUST|>", "<|PRBL|>", "<|RBND|>", "<|REFL|>",
    ]

for token in salience_delineators:
    d['text'] = d['text'].str.replace(token, " ", regex = False)

d.shape
d.head(6)

#### Compute weights ($w$): inverse class ($c$) freq: $w_c = N / (2 * n_c)$

In [None]:
# per-label inverse class frequency weights: N / (2 * n_c), rounded to 4 d.p.;
# a class with no rows gets weight 0
class_weights = {}

for label_col in labels:
    counts = d[label_col].value_counts()
    per_class = {}
    for weight_key, class_value in (('w_pos', 1), ('w_neg', 0)):
        n_class = counts.get(class_value, 0)
        per_class[weight_key] = round(len(d) / (2 * n_class), 4) if n_class > 0 else 0
    class_weights[label_col] = per_class

class_weights

In [None]:
        ### TODO: SJS 2/3: harmonize w/ preregistered PAP methods: https://osf.io/wgu8q/

# only surface ERROR-level records from the transformers model-loading logger
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

SEED = 56                           # RNG seed: torch / numpy, holdout split, CV folds
EPOCHS = 2                          # default number of training epochs
BATCH_SIZE = 16                     # DataLoader batch size (train and eval)
MAX_LEN = 512                       # tokenizer max_length (pad / truncate target)
MODEL_NAME = "bert-base-uncased"    # checkpoint for both tokenizer and classifier

def set_seed(seed):
    '''
    Seeds the NumPy and PyTorch global random number generators with `seed`.
    '''
    np.random.seed(seed)
    torch.manual_seed(seed)

set_seed(SEED)

# run on the MPS backend when it is available, otherwise fall back to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# tokenizer shared by every TextDataset built below
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    '''
    Dataset pairing texts with labels; each item is tokenized on access
    (padded / truncated to `max_len`) and returned as a dict of tensors
    with keys `input_ids`, `attention_mask`, `label`.
    '''
    def __init__(self, texts, labels, tokenizer, max_len = MAX_LEN):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            return_tensors = "pt",
            truncation = True,
            padding = "max_length",
            max_length = self.max_len,
            )
        # drop the leading batch dimension the tokenizer adds for a single text
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
            }
        item["label"] = torch.tensor(self.labels[idx], dtype = torch.long)
        return item

print(f"Device: {device}")

#### held-out test set split / augmented-instance mapping

In [None]:
# map augmented rows → parent originals via sequential ordering in d

orig_d_positions = d.index[d['agmt'] == 0].tolist()
augm_d_positions = d.index[d['agmt'] == 1].tolist()

d_orig = d.loc[orig_d_positions].reset_index(drop = True)
d_augm = d.loc[augm_d_positions].copy()

# for each augmented row, record parent's idx in d_orig

d_pos_to_orig_idx = {pos: i for i, pos in enumerate(orig_d_positions)}

# the parent is the nearest original positioned before the augmented row:
# with the original positions sorted, `searchsorted` gives, for each augmented
# position, how many originals precede it, so the parent sits one slot before
sorted_orig = np.sort(np.asarray(orig_d_positions))
insert_at = np.searchsorted(sorted_orig, augm_d_positions, side = "left")

if (insert_at == 0).any():
    raise ValueError("augmented row found with no preceding original row in `d`")

parent_indices = [
    d_pos_to_orig_idx[pos]
    for pos in sorted_orig[insert_at - 1].tolist()
    ]

d_augm['_parent_idx'] = parent_indices
d_augm = d_augm.reset_index(drop = True)

# stratified (on `trgt`) 85 / 15 holdout split over positions in d_orig;
# only originals (non-augmented rows) are eligible for the test set

orig_row_ids = np.arange(len(d_orig))
train_val_idx, test_idx = train_test_split(
    orig_row_ids,
    stratify = d_orig['trgt'],
    test_size = 0.15,
    random_state = SEED,
    )

d_test = d_orig.iloc[test_idx].reset_index(drop = True)

# keep only the augmented rows whose parent landed in train/val

train_val_set = set(train_val_idx)
parent_in_train_val = d_augm['_parent_idx'].isin(train_val_set)
d_augm_tv = d_augm[parent_in_train_val].reset_index(drop = True)

print(f"Train/val originals:     {len(train_val_idx)}")
print(f"Test originals:          {len(test_idx)}")
print(f"Augmented (train only):  {len(d_augm_tv)}")

#### train / evaluate helper fx

In [None]:
def train_model(train_texts, train_labels, weights, epochs = EPOCHS):
    '''
    Fine-tunes a freshly loaded `MODEL_NAME` two-class sequence classifier on
    the given texts / labels and returns the trained model.

    `weights` is the per-class weight tensor handed to the cross-entropy loss;
    `epochs` is the number of passes over the training data. The RNGs are
    re-seeded with SEED at the start of every call.
    '''
    set_seed(SEED)
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = 2)
    model = model.to(device)

    criterion = torch.nn.CrossEntropyLoss(weight = weights.to(device))
    optimizer = torch.optim.AdamW(model.parameters(), lr = 2e-5)

    loader = DataLoader(
        TextDataset(train_texts, train_labels, tokenizer),
        batch_size = BATCH_SIZE,
        shuffle = True,
        )

    model.train()
    for _ in range(epochs):
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            optimizer.zero_grad()
            logits = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                ).logits
            criterion(logits, targets).backward()
            optimizer.step()

    return model

def evaluate(model, eval_texts, eval_labels):
    '''
    Scores `model` on the given texts / labels without gradient tracking.

    Returns a dict with `f1_macro`, `mcc`, and `auprc`; `auprc` is computed
    from the class-1 softmax probability and reported as 0.0 when the
    evaluation labels contain no positives.
    '''
    loader = DataLoader(
        TextDataset(eval_texts, eval_labels, tokenizer),
        batch_size = BATCH_SIZE,
        )

    y_true, y_pred, y_score = [], [], []

    model.eval()
    with torch.no_grad():
        for batch in loader:
            logits = model(
                input_ids = batch['input_ids'].to(device),
                attention_mask = batch['attention_mask'].to(device),
                ).logits
            y_score.extend(torch.softmax(logits, dim = 1)[:, 1].cpu().numpy())
            y_pred.extend(logits.argmax(dim = 1).cpu().numpy())
            y_true.extend(batch['label'].numpy())

    f1_macro = f1_score(y_true, y_pred, average = 'macro', zero_division = 0)
    mcc = matthews_corrcoef(y_true, y_pred)
    if sum(y_true) > 0:
        auprc = average_precision_score(y_true, y_score)
    else:
        auprc = 0.0

    return {
        'f1_macro': f1_macro,
        'mcc': mcc,
        'auprc': auprc,
        }

# 5-fold stratified CV per label + held-out test

def _release_device_cache():
    '''
    Frees cached MPS memory, but only when the run is actually on the MPS
    device; `torch.mps.empty_cache` is not called on CPU runs.
    '''
    mps = getattr(torch, 'mps', None)
    if device.type == "mps" and hasattr(mps, 'empty_cache'):
        mps.empty_cache()

d_tv = d_orig.iloc[train_val_idx]
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = SEED)
results = {}

for label in labels:
    print(f"\n{'=' * 50}")
    print(f"Label: {label}")
    print(f"{'=' * 50}")

    # class weights in the [neg, pos] order expected for class ids 0 / 1
    w = class_weights[label]
    weights = torch.tensor([w['w_neg'], w['w_pos']], dtype = torch.float)

    fold_metrics = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(d_tv, d_tv[label])):

        # training: fold originals + matching augmented rows
        # (`train_idx` indexes d_tv; map back to d_orig positions to match `_parent_idx`)

        fold_train = d_tv.iloc[train_idx]
        train_orig_set = set(train_val_idx[train_idx])
        augm_fold = d_augm_tv[d_augm_tv['_parent_idx'].isin(train_orig_set)]
        fold_train = pd.concat([fold_train, augm_fold], ignore_index = True)

        # validation: fold originals only

        fold_val = d_tv.iloc[val_idx]

        model = train_model(
            fold_train['text'].tolist(),
            fold_train[label].tolist(),
            weights,
            )

        metrics = evaluate(
            model,
            fold_val['text'].tolist(),
            fold_val[label].tolist(),
            )
        fold_metrics.append(metrics)

        print(f"  Fold {fold + 1}: F1 = {metrics['f1_macro']:.4f}  MCC = {metrics['mcc']:.4f}  AUPRC = {metrics['auprc']:.4f}")

        # drop the fold model before the next one is loaded
        del model
        _release_device_cache()

    # mean CV metrics

    mean_metrics = {k: np.mean([m[k] for m in fold_metrics]) for k in fold_metrics[0]}
    print(f"  CV Mean: F1 = {mean_metrics['f1_macro']:.4f}  MCC = {mean_metrics['mcc']:.4f}  AUPRC = {mean_metrics['auprc']:.4f}")

    # final model: retrain on all train/val + augmented, evaluate on held-out test

        ### TODO: SJS 2/3: _best-performing_ model graduates to held-out test set...

    all_train = pd.concat([d_tv, d_augm_tv], ignore_index = True)
    model = train_model(
        all_train['text'].tolist(),
        all_train[label].tolist(),
        weights,
        )

    test_metrics = evaluate(
        model,
        d_test['text'].tolist(),
        d_test[label].tolist(),
        )
    print(f"  Test:    F1 = {test_metrics['f1_macro']:.4f}  MCC = {test_metrics['mcc']:.4f}  AUPRC = {test_metrics['auprc']:.4f}")

    results[label] = {'cv': mean_metrics, 'test': test_metrics}

    del model
    _release_device_cache()

# summarize: one line per label with mean-CV and held-out test metrics

print(f"\n{'=' * 50}")
print("Summary")
print(f"{'=' * 50}")

for label, res in results.items():
    cv = res['cv']
    test = res['test']
    summary_line = (
        f"{label:>6s}  "
        f"CV: F1 = {cv['f1_macro']:.4f} MCC = {cv['mcc']:.4f} AUPRC = {cv['auprc']:.4f}  |  "
        f"Test: F1 = {test['f1_macro']:.4f} MCC = {test['mcc']:.4f} AUPRC = {test['auprc']:.4f}"
        )
    print(summary_line)