# Passive suicidality in a repressive U.S. political context: Aim I(c)

_Labels primary analytic sample using tuned RoBERTa sequence classification models. Includes regex for additional explicit suicidal ideation encodings, GPT-4o-enabled disambiguation function to verify intended in-context usage, temperature scaling for calibration._

> aim_i_c_infer_calibrate.ipynb<br>
> Simone J. Skeen (03-17-2025)

1. [Prepare](#scrollTo=bIwQOmXN_1kp)<br>
2. [Write](#scrollTo=YRM3LeaUBxE-)<br>
&nbsp;&nbsp;[`roberta_predict.py`](#scrollTo=khjrYTtHKb2f)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[`load_model`](#scrollTo=2Fwh2tjnKf3G)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[`preprocess_data`](#scrollTo=XwxslZHIKoxg)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[`predict`](#scrollTo=Y-rIKdaEKw53)<br>
&nbsp;&nbsp;[`gpt_assist.py`](xx)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[`disambiguate_text_with_gpt`](#scrollTo=CFF5OAzdgMpg)<br>
3. [Infer-Calibrate](#scrollTo=h7QN7U2GCG2M)<br>
4. [Visualize](xx)


### 1. Prepare
Installs, imports, and downloads requisite models and packages. Organizes RAP-consistent directory structure.
***

##### _Install_

In [None]:
#!python -m spacy download en_core_web_lg --user

##### _Import_

In [None]:
#import en_core_web_lg
import glob
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import nltk
import numpy as np
import openai
import os
import pandas as pd
import random
import re
import seaborn as sns
#import spacy
import time
import torch
import warnings

from google.colab import drive
from matplotlib.lines import Line2D
from nltk.text import Text
from scipy.optimize import minimize
from textblob import TextBlob
from torch.utils.data import(
    DataLoader,
    TensorDataset,
    )
from tqdm import tqdm
from transformers import(
    RobertaForSequenceClassification,
    RobertaTokenizer,
    )

#nltk.download('punkt_tab')
#spacy.cli.download('en_core_web_lg')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.options.mode.copy_on_write = True

pd.set_option(
    'display.max_columns',
    None,
    )

pd.set_option(
    'display.max_rows',
    None,
    )

warnings.simplefilter(
    action = 'ignore',
    category = FutureWarning,
    )

##### _Set env variable_

In [None]:
# placeholder only: paste the key here at runtime and blank it again before saving/sharing
# NOTE(review): this assignment overwrites any OPENAI_API_KEY already present in the environment
os.environ['OPENAI_API_KEY'] = ' '
#os.environ

##### _Mount gdrive_

In [None]:
drive.mount(
    '/content/drive',
    force_remount = True,
    )

##### _Structure dir_

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality
#%cd /content/drive/My Drive/#<my_project_folder>

#%mkdir bar_policy_suicidality
#%cd bar_policy_suicidality

In [None]:
#%mkdir inputs outputs code temp

In [None]:
#%cd inputs
#%mkdir annotation archives data

In [None]:
#%cd ../outputs
#%mkdir models tables figures

In [None]:
bar_policy_suicidality/
├── inputs/
│   ├── archives
│   └── data
├── outputs/
│   ├── models
│   ├── tables
│   └── figures
├── code/
└── temp/

### 2. Write
Writes and imports requisite custom scripts in .py.
***

In [None]:
%cd code

#### `roberta_predict.py`

##### `load_model`

In [None]:
%%writefile roberta_predict.py

import torch
from torch.utils.data import(
    DataLoader,
    TensorDataset,
    )
from transformers import(
    RobertaTokenizer,
    RobertaForSequenceClassification,
    )
from tqdm import tqdm
import pandas as pd

def load_model(model_path, model_class, pretrained_model_name):
    """
    Loads a fine-tuned RoBERTa checkpoint from prespecified path model_path.

    Args:
        model_path (str): path to the saved state dict (e.g. a .bin file).
        model_class: class exposing from_pretrained(), e.g. RobertaForSequenceClassification.
        pretrained_model_name (str): base architecture to instantiate before the tuned weights are loaded.

    Returns:
        the instantiated model with the tuned weights loaded, set to eval mode.
    """
    model = model_class.from_pretrained(pretrained_model_name)

    # map_location = 'cpu' deserializes the checkpoint onto CPU, so weights saved
    # from a GPU session still load on a CPU-only runtime; callers move the model
    # to their device afterwards

    state_dict = torch.load(
        model_path,
        map_location = 'cpu',
        )
    model.load_state_dict(state_dict)

    # set model to eval mode

    model.eval()
    return model

##### `preprocess_data`

In [None]:
%%writefile -a roberta_predict.py

import torch
from torch.utils.data import(
    DataLoader,
    TensorDataset,
    )
from transformers import(
    RobertaTokenizer,
    RobertaForSequenceClassification,
    )
from tqdm import tqdm
import pandas as pd

def preprocess_data(tokenizer, texts):
    """
    Encodes a list of texts with the supplied tokenizer, requesting padding,
    truncation, and PyTorch tensors as output.
    """
    tokenizer_kwargs = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        }
    return tokenizer(texts, **tokenizer_kwargs)

##### `predict`

In [None]:
%%writefile -a roberta_predict.py

import torch
from torch.utils.data import(
    DataLoader,
    TensorDataset,
    )
from transformers import(
    RobertaTokenizer,
    RobertaForSequenceClassification,
    )
from tqdm import tqdm
import pandas as pd

def predict(model, tokenizer, texts, batch_size = 8, use_cuda = True):
    """
    Predicts labels and generates label probabilities for a list of texts using RobertaForSequenceClassification and RobertaTokenizer.

    Args:
        model: sequence classification model whose output exposes .logits.
        tokenizer: tokenizer passed through to preprocess_data.
        texts (list): texts to label.
        batch_size (int, optional): number of texts per forward pass (default = 8).
        use_cuda (bool, optional): run on GPU when one is available (default = True);
          falls back to CPU when CUDA is not available.

    Returns:
        tuple: (predictions, probabilities, logits), each a list with one entry per text:
          argmax label, softmax probabilities, and raw logits respectively.
    """
    print(f"\nTotal number of texts to predict: {len(texts)}")

    # nothing to tokenize: return empty results instead of failing downstream

    if len(texts) == 0:
        return [], [], []

    encoded_texts = preprocess_data(tokenizer, texts)
    dataset = TensorDataset(
        encoded_texts['input_ids'],
        encoded_texts['attention_mask'],
        )
    data_loader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = False, ### keep outputs aligned with input order
        )

    print(f"Batch size: {batch_size}")
    print(f"Total number of batches: {len(data_loader)}")

    # honor use_cuda only when a GPU is actually present

    use_cuda = use_cuda and torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    all_predictions = []
    all_probabilities = []
    all_logits = []

    # the context manager closes the progress bar even if a batch raises

    with torch.no_grad(), tqdm(
        total = len(data_loader),
        desc = "Predicting",
        leave = False,
        ) as progress_bar:
        for input_ids, attention_mask in data_loader:
            if use_cuda:
                input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()

            outputs = model(
                input_ids,
                attention_mask = attention_mask,
                )

            # store the raw logits before softmax: needed later for temperature scaling

            logits = outputs.logits
            all_logits.extend(logits.cpu().tolist())

            probabilities = torch.softmax(
                logits,
                dim = -1,
                )
            predictions = torch.argmax(probabilities, dim = -1).cpu().tolist()
            all_predictions.extend(predictions)
            all_probabilities.extend(probabilities.cpu().tolist())
            progress_bar.update(1)

    return all_predictions, all_probabilities, all_logits

#### `calibrate.py`

##### `temperature_scale`

In [None]:
%%writefile calibrate.py

from scipy.optimize import minimize
import torch

def temperature_scale(logits, temperature):
    """Divide the logits by the temperature and return their softmax over the last dimension."""
    tempered_logits = logits / temperature
    return tempered_logits.softmax(dim=-1)

##### `nll_criterion`

In [None]:
%%writefile -a calibrate.py

from scipy.optimize import minimize
import torch

def nll_criterion(logits, labels, temperature):
    """
    Calculate the mean negative log likelihood of the labels under
    temperature-scaled logits.

    Args:
        logits: tensor of raw logits, one row per sample.
        labels: tensor of integer class indices, one per sample.
        temperature: scalar (float or tensor) the logits are divided by.

    Returns:
        0-dim tensor holding the mean negative log likelihood.
    """
    # log_softmax stays in log space: a probability that underflows to 0 at a
    # small temperature gives a large finite penalty instead of log(0) = -inf
    log_probs = torch.log_softmax(logits / temperature, dim=-1)

    # pick each sample's log-probability at its true label
    picked = log_probs.gather(-1, labels.reshape(-1, 1).long())
    return -picked.mean()

##### `find_optimal_temperature`

In [None]:
%%writefile -a calibrate.py

from scipy.optimize import minimize
import torch

def find_optimal_temperature(logits, labels, device):
    """
    Find the optimal temperature for scaling by minimizing the negative log-likelihood
    on the given (logits, labels).

    Args:
        logits: tensor of raw logits, one row per sample.
        labels: tensor of integer class indices, one per sample.
        device: torch device the optimization runs on.

    Returns:
        float: the temperature in [0.001, 10.0] at which L-BFGS-B stopped.
    """
    # float64 throughout: SciPy estimates the gradient from very small steps in
    # the temperature, which single precision would round away
    logits = logits.to(device=device, dtype=torch.float64)
    labels = labels.to(device)

    def objective(temp):
        # SciPy passes the parameter vector as a 1-element array
        temp_tensor = torch.tensor(float(temp[0]), dtype=torch.float64, device=device)
        return nll_criterion(logits, labels, temp_tensor).item()

    # We use 'minimize' from scipy.optimize to find the temperature
    # that yields the minimal NLL.
    res = minimize(
        objective,
        x0=1.0,
        #bounds=[(0.01, 5.0)],
        bounds=[(0.001, 10.0)],
        method='L-BFGS-B'
    )

    # surface a failed search instead of silently returning its last iterate
    if not res.success:
        import warnings
        warnings.warn(
            f"Temperature optimization did not converge: {res.message}",
            RuntimeWarning,
        )

    return float(res.x[0])

##### `calibrate_probabilities`

In [None]:
%%writefile -a calibrate.py

def calibrate_probabilities(d_calibrate, target_col, device):
    """
    Fit a temperature for one target and write the rescaled probabilities
    back to the dataframe.

    Reads raw logits from d_calibrate[f'{target_col}_logit'] and ground truth
    labels from d_calibrate[target_col], finds the optimal temperature, prints
    it, and stores the temperature-scaled probability distribution in a new
    column f'{target_col}_calibrated_prob'. Returns the same dataframe.
    """
    logit_col = f"{target_col}_logit"
    calibrated_col = f"{target_col}_calibrated_prob"

    # stack the per-row logit sequences into one float tensor
    stacked_logits = np.stack(d_calibrate[logit_col].values)
    logits_tensor = torch.tensor(stacked_logits, dtype=torch.float32, device=device)

    # ground-truth labels as class indices
    labels_tensor = torch.tensor(
        d_calibrate[target_col].values,
        dtype=torch.long,
        device=device,
    )

    # fit, then report the temperature
    optimal_temp = find_optimal_temperature(logits_tensor, labels_tensor, device)
    print(f"Optimal temperature for '{target_col}': {optimal_temp:.4f}")

    # rescale and store one probability array per row
    with torch.no_grad():
        scaled_probs = temperature_scale(logits_tensor, optimal_temp).cpu().numpy()
    d_calibrate[calibrated_col] = list(scaled_probs)

    return d_calibrate

##### `calculate_ece`

In [None]:
%%writefile -a calibrate.py

import numpy as np
import matplotlib.pyplot as plt

def calculate_ece(probabilities, labels, n_bins=10, plot=True):
    """
    Calculate the Expected Calibration Error (ECE) of the positive-class
    probability for a binary classifier.
    Optionally generates a simple calibration plot (reliability diagram).

    Samples are binned by p(class_1). Within each bin, the mean predicted
    p(class_1) is compared with the observed fraction of positive labels; the
    absolute gaps are summed, weighted by each bin's share of all samples.
    (Comparing argmax accuracy against p(class_1) instead would penalize a
    well-calibrated model in every bin below 0.5.)

    Parameters:
    -----------
    probabilities : array-like
        An iterable of length N, where each element is either:
          - A float for p(class_1), OR
          - A 2-element list/array for [p(class_0), p(class_1)].
    labels : array-like
        Ground-truth labels of shape [N], each 0 or 1.
    n_bins : int, optional
        Number of equal-width bins on [0, 1]. Default is 10.
    plot : bool, optional
        Whether to generate a calibration plot (Reliability Diagram). Default is True.

    Returns:
    --------
    ece : float
        The expected calibration error (0.0 for empty input).

    Raises:
    -------
    ValueError
        If n_bins < 1, a row has an unsupported format, or the number of labels
        differs from the number of probability rows.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be a positive integer.")

    # 1) Convert each row into a 2-element [p0, p1]
    processed = []
    for i, p in enumerate(probabilities):
        # Case A: a single number => interpret as p(class_1)
        if isinstance(p, (float, int, np.number)):
            p1 = float(p)
            processed.append([1.0 - p1, p1])

        # Case B: a 2-element sequence => interpret as [p(class_0), p(class_1)]
        elif isinstance(p, (list, tuple, np.ndarray)) and len(p) == 2:
            processed.append([float(p[0]), float(p[1])])
        else:
            raise ValueError(
                f"Row {i} has invalid probability format: {p}.\n"
                "Must be either a single number or a 2-element list/tuple/array."
            )

    # reshape keeps the [N, 2] layout even when N == 0
    probabilities_2d = np.array(processed, dtype=float).reshape(-1, 2)
    positive_prob = probabilities_2d[:, 1]

    # 2) Convert labels to a NumPy array of ints
    labels = np.array(labels, dtype=int)
    if labels.shape[0] != probabilities_2d.shape[0]:
        raise ValueError("Number of labels must match number of probability rows.")
    is_positive = labels == 1

    # 3) Prepare bins
    bins = np.linspace(0, 1, n_bins + 1)
    ece = 0.0

    # For plotting: populated bins only
    bin_accuracies = []
    bin_confidences = []

    # 4) Compute ECE across bins using p(class_1)
    for idx, (bin_lower, bin_upper) in enumerate(zip(bins[:-1], bins[1:])):
        # Bins are [lower, upper), except the last one, which is closed so that
        # samples with p(class_1) == 1.0 are counted
        if idx == n_bins - 1:
            in_bin = (positive_prob >= bin_lower) & (positive_prob <= bin_upper)
        else:
            in_bin = (positive_prob >= bin_lower) & (positive_prob < bin_upper)

        if not in_bin.any():
            continue

        # Observed fraction of positives vs. mean predicted p(class_1)
        bin_accuracy = np.mean(is_positive[in_bin])
        bin_confidence = np.mean(positive_prob[in_bin])

        # Weight the gap by the proportion of all samples in this bin
        ece += abs(bin_accuracy - bin_confidence) * np.mean(in_bin)

        bin_accuracies.append(bin_accuracy)
        bin_confidences.append(bin_confidence)

    # 5) Optional: Generate a calibration plot (Reliability Diagram)
    if plot:
        plt.figure(figsize=(6, 6))
        plt.plot(bin_confidences, bin_accuracies, marker='o', label='Calibration curve')
        plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Perfect calibration')
        plt.xlabel('Confidence (Predicted Probability of Positive)')
        plt.ylabel('Accuracy (Fraction of Positives)')
        plt.title('Calibration Plot (Reliability Diagram)')
        plt.legend(loc='upper left')
        plt.grid(True)
        plt.show()

    return float(ece)

##### `apply_optimized_temperature_scaling`

In [None]:
%%writefile -a calibrate.py

def apply_optimized_temperature_scaling(d_new, targets, temp_dict, device="cpu"):
    """
    Given a dictionary of previously found optimal temperatures (temp_dict),
    apply them to the raw logits in d_new[f"{t}_logit"] for each target 't'
    to produce calibrated probabilities in d_new[f"{t}_calibrated_prob"].

    Logit cells may be numeric sequences or their string form (e.g.
    "[0.12, -0.34]"), which is what a list-valued column becomes after being
    written to and read back from CSV.

    Raises KeyError when a target has no entry in temp_dict.
    """
    import ast  # stdlib; only needed to decode logits serialized as text

    def _as_vector(cell):
        # literal_eval parses the list repr without executing arbitrary code
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    for t in targets:
        # Convert the stored logits to a tensor
        logits_np = np.stack([_as_vector(cell) for cell in d_new[f"{t}_logit"].values])
        logits_tensor = torch.tensor(logits_np, dtype=torch.float32, device=device)

        # Fetch the saved temperature for this target
        temperature = temp_dict[t]

        # Compute the scaled probabilities
        with torch.no_grad():
            scaled_probs_tensor = temperature_scale(logits_tensor, temperature)

        scaled_probs = scaled_probs_tensor.cpu().numpy()
        d_new[f"{t}_calibrated_prob"] = list(scaled_probs)

    return d_new

#### `gpt_assist.py`

##### `disambiguate_text_with_gpt`

In [None]:
%%writefile gpt_assist.py

import pandas as pd
import openai
import time

def disambiguate_text_with_gpt(df, input_column, output_column, system_prompt, prompt_template, model = 'gpt-4o'):
    """
    Transforms text data in a specified df column using GPT based on provided prompts.

    Args:
        df (pd.DataFrame): df containing the text to be transformed.
        input_column (str): name of the input column in the df that contains the text to transform.
        output_column (str): name of the output column where the transformed text will be stored.
        system_prompt (str): system prompt that sets up the assistant's behavior.
        prompt_template (str): template string describing the transformation to be applied to each entry.
          Use '{input_text}' as a placeholder for the input text.
        model (str, optional): The name of the OpenAI GPT model to use (default = 'gpt-4o').

    Returns:
        pd.DataFrame: df with new output column added, containing the transformed text.
          Missing or whitespace-only inputs yield ' ' without an API call; inputs whose
          request fails yield the unchanged input text.
    """

    # Fx to send row-wise API requests

    def call_gpt(input_text):

        # skip the API round trip for missing or whitespace-only cells
        # (a stripped string is '' when blank, never ' ')

        if pd.isnull(input_text) or not str(input_text).strip():
            return ' '

        prompt = prompt_template.format(input_text = input_text)

        try:
            response = openai.chat.completions.create(
                model = model,
                messages = [
                    {'role': 'system',
                    'content': system_prompt},
                    {'role': 'user',
                    'content': prompt},
                    ],
                #max_tokens = 500,
                #n = 1,
                #temperature = 0,
                )

            # extract text from API response

            result = response.choices[0].message.content.strip()
            return result

        except Exception as e:
            print(f"Error processing input text: {input_text}\nError: {str(e)}")
            return input_text ### returns input string in case of error

        finally:

            # impose delay between API calls

            time.sleep(1)

    df[output_column] = df[input_column].apply(call_gpt)

    return df

#### Import

In [None]:
from gpt_assist import(
    disambiguate_text_with_gpt,
    )

from roberta_predict import(
    load_model,
    preprocess_data,
    predict,
    )

from calibrate import(
    temperature_scale,
    nll_criterion,
    find_optimal_temperature,
    calibrate_probabilities,
    calculate_ece,
    apply_optimized_temperature_scaling,
    )

### 3. Infer-Calibrate
Loads fine-tuned RoBERTa by target. Optimizes temperature param $T$ on held-out $\mathcal{d}$<sub>calibrate</sub>. Labels $\mathcal{D}$<sub>inference</sub> with strain, explicit targeting, implicit vulnerability encodings.
***

#### Define inference params

In [None]:
# define helper fx

def create_params(targets, model_path_prefix, model_base = 'roberta-base'):
    """
    Builds one inference-parameter dict per target: model/tokenizer classes,
    base model name, and the path of that target's tuned checkpoint.
    """
    params_by_target = []
    for target in targets:
        checkpoint_path = f'{model_path_prefix}{target}_{model_base}_best_tuned_model.bin'
        params_by_target.append({
            'target': target,
            'model_class': RobertaForSequenceClassification,
            'tokenizer_class': RobertaTokenizer,
            'pretrained_model_name': model_base,
            'model_path': checkpoint_path,
            })
    return params_by_target

# set targets: create_params builds one checkpoint path per name

targets = ['asp', 'dep', 'val', 'prg', 'tgd', 'age', 'race', 'dbty']

# set models_path_prefix (joined to file names by plain concatenation: keep the trailing '/')

models_path_prefix = '/content/drive/MyDrive/Colab/bar_policy_suicidality/outputs/models/'

# create inference params

params = create_params(targets, models_path_prefix)

# inspect

for p in params:
    print(p)

#### Calibrate: $\mathcal{d}$<sub>calibrate</sub>

In [None]:
%cd ../inputs/data

# load d_calibrate

d_calibrate = pd.read_excel(
    'd_calibrate.xlsx',
    index_col = [0],
    )

# delete empty/NaN 'text' cells
# (compare the stripped text to '': a stripped string can never equal ' ')

has_text = d_calibrate['text'].notnull() & (d_calibrate['text'].astype(str).str.strip() != '')
d_calibrate = d_calibrate[has_text]

d_calibrate.info()
d_calibrate.head(3)
#d_calibrate.tail(3)

##### _Optimize temperature param $T$_

In [None]:
# Determine the device to use
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = model.to(device)  # Ensure model is on the right device

# coerce 'text' to str

d_calibrate['text'] = d_calibrate['text'].astype(str)
calibrate_texts = d_calibrate['text'].tolist()

# define device once: it does not change between targets

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# inference loop

for p in params:
    target = p['target']

    # load tokenizers, models

    tokenizer = p['tokenizer_class'].from_pretrained(p['pretrained_model_name'])
    model = load_model(
        p['model_path'],
        p['model_class'],
        p['pretrained_model_name'],
        )

    # model to device

    model = model.to(device)

    # infer predictions, probabilities, logits (raw)

    predictions, probabilities, logits = predict(
        model,
        tokenizer,
        calibrate_texts,
        use_cuda = device.type == 'cuda', ### keep predict() on the same device as the model
        )

    d_calibrate[f'{target}_logit'] = logits
    d_calibrate[f'{target}_pred'] = predictions
    d_calibrate[f'{target}_prob'] = probabilities

    # calibrate and store the new calibrated probabilities

    d_calibrate = calibrate_probabilities(d_calibrate, target, device)

# inspect

d_calibrate.head(3)

# export

d_calibrate.to_excel('d_calibrate_labeled.xlsx')

##### _Store optimal $T$ values_: `optimal_temps`

In [None]:
# create optimal_temps dict: one temperature per target, keyed by target name
# NOTE(review): values have the 4-decimal format printed by calibrate_probabilities -
# presumably transcribed by hand from that output; confirm after any re-calibration

optimal_temps = {
    'asp': 1.1755,
    'dep': 1.2069,
    'val': 1.6518,
    'prg': 1.2523,
    'tgd': 1.5852,
    'age': 1.2651,
    # no temperature is stored for 'dbty': looking it up in this dict raises KeyError
    #'dbty':
    'race': 0.8056,
    }

##### _$\mathcal{d}$<sub>calibrate</sub> ECE: pre-$T$-scaling_

In [None]:
for p in params:
    target = p['target']
    probabilities = d_calibrate[f'{target}_prob']
    print(probabilities.shape)
    labels = d_calibrate[target]  # ground-truth labels share the target's column name

    ece = calculate_ece(probabilities, labels, n_bins=5, plot=True)

    # name the target and stage: one unlabeled line per model is otherwise indistinguishable
    print(f"ECE (matrix input) [{target}, pre-T-scaling]:", ece)

##### _$\mathcal{d}$<sub>calibrate</sub> ECE: post-$T$-scaling_

In [None]:
for p in params:
    target = p['target']
    probabilities = d_calibrate[f'{target}_calibrated_prob']
    print(probabilities.shape)
    labels = d_calibrate[target]  # ground-truth labels share the target's column name

    ece = calculate_ece(probabilities, labels, n_bins=5, plot=True)

    # name the target and stage: one unlabeled line per model is otherwise indistinguishable
    print(f"ECE (matrix input) [{target}, post-T-scaling]:", ece)

#### Label: $\mathcal{D}$<sub>inference</sub>

##### _Import: $\mathcal{d}$<sub>posts</sub> &rarr; $\mathcal{D}$<sub>inference</sub>_

In [None]:
%cd ../inputs/data

d_inference = pd.read_csv('d_posts.csv')

# delete empty/NaN 'text' cells
# (compare the stripped text to '': a stripped string can never equal ' ')

has_text = d_inference['text'].notnull() & (d_inference['text'].astype(str).str.strip() != '')
d_inference = d_inference[has_text]

# inspect

d_inference.info()
counts = d_inference['p_sbrt'].value_counts()
print("\n")
print(counts)
print("\n")
d_inference.head(3)
d_inference.tail(3)

##### _Join: titles + texts_

In [None]:
# prepend the title to the text; rows with a missing title keep their text as-is
# (astype(str) alone would turn a NaN title into the literal prefix 'nan ')

has_title = d_inference['p_titl'].notnull()
texts = d_inference['text'].astype(str)
d_inference['text'] = texts.where(
    ~has_title,
    d_inference['p_titl'].astype(str) + ' ' + texts,
    )

d_inference.head(3)

##### _Batch: $\mathcal{D}$<sub>inference</sub> ($n$ = 1.2M) / 10 &rarr;_ `d_inf_{01,...,10}`



In [None]:
# batch d_inference / 10: split the row positions, then slice the frame by position

row_positions = np.arange(len(d_inference))
d_batches = [
    d_inference.iloc[chunk]
    for chunk in np.array_split(row_positions, 10)
    ]

# loop over batches, save d_inf_{01,...,10}

for i, batch in enumerate(d_batches, start = 1):
    filename = f'd_inf_batch_{i:02d}.csv' ### zero-pad batch number to 2 digits
    batch.to_csv(filename, index = False)
    print(f"Saved {filename} with {len(batch)} rows.")


##### _Inference loop:_ `d_inf_{01,...,10}`

In [None]:
%pwd

In [None]:
%cd ../inputs/data

# set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# loop over d_inf_{01,...,10}

for i in range(1, 11):

    # construct input .csv filename by batch

    csv_in_path = f'd_inf_batch_{i:02d}.csv'
    print(f"\n======================================================================")
    print(f"Labeling d_inference batch: {csv_in_path}")

    # read current batch

    d_batch = pd.read_csv(csv_in_path)

    # coerce 'text' to str

    d_batch['text'] = d_batch['text'].astype(str)
    inference_texts = d_batch['text'].tolist()

    # inference loop: one tuned model per target

    for p in params:
        target = p['target']

        # load tokenizer, model

        tokenizer = p['tokenizer_class'].from_pretrained(p['pretrained_model_name'])
        model = load_model(
            p['model_path'],
            p['model_class'],
            p['pretrained_model_name'],
        )

        # model to device

        model.to(device)

        # infer predictions, probabilities, (raw) logits

        predictions, probabilities, logits = predict(
            model,
            tokenizer,
            inference_texts,
            use_cuda = device.type == 'cuda', ### keep predict() on the same device as the model
        )

        # add to present d_batch

        d_batch[f'{target}_logit'] = logits
        d_batch[f'{target}_pred'] = predictions
        d_batch[f'{target}_prob'] = probabilities

    # construct output filename, save

    csv_out_path = f'd_inf_batch_{i:02d}_labeled.csv'
    d_batch.to_csv(csv_out_path, index = False)
    print(f"Saved labeled output: {csv_out_path}\n")

##### _Concatenate:_ `d_inf_{01,...,10}_labeled` &rarr; $\mathcal{D}$<sub>inf labeled</sub> ($n$ = 1.2M)

In [None]:
%cd ../inputs/data

# glob the labeled batch files; sorting the zero-padded names restores batch order

file_list = glob.glob("d_inf_batch_*_labeled.csv")
file_list.sort()

# read each batch lazily, stack into one frame with a fresh row index

d_inf_labeled = pd.concat(
    map(pd.read_csv, file_list),
    ignore_index = True,
    )

# inspect

d_inf_labeled.info()
d_inf_labeled.head(3)

# save (row index is written as the first column)

d_inf_labeled.to_csv('d_inf_labeled.csv')

##### _Apply optimized $T$ scaling post-inference_

In [None]:
%cd ../inputs/data

# reload the concatenated labeled set, using the first CSV column as the row index
# NOTE(review): list-valued columns (e.g. '{target}_logit', '{target}_prob') presumably come
# back from CSV as strings rather than lists - confirm before numeric use
d_inf_labeled = pd.read_csv(
    'd_inf_labeled.csv',
    index_col = 0,
    )

d_inf_labeled.info()
d_inf_labeled.head(3)

In [None]:
# set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# define target
# NOTE: this rebinds the notebook-level `targets` list, now without 'dbty'

targets = [
    'asp',
    'dep',
    'val',
    'prg',
    'tgd',
    'age',
    'race',
    #'dbty', ### T-scale error on dbty - resolve
    ]

# calibrate {target}_prob in inference set
# (each listed target needs a matching key in optimal_temps)

d_inf_labeled = apply_optimized_temperature_scaling(
    d_inf_labeled,
    targets,
    optimal_temps,
    device = device,
    )

d_inf_labeled.info()
d_inf_labeled.head(3)

#### RegEx: match


##### _Neurodivergences:_ `adhd_re`, `aut_re`, `bpd_re`, `ptsd_re`

In [None]:
# 'ndvg' regex patterns

patterns = {
    'adhd_re': r'\bADD\b|\bADHD\b',
    'aut_re': r'\bautism\b|\bautistic\b',
    'bpd_re': r'\bborderline\b|\bbpd\b',
    'ptsd_re': r'\bPTSD\b|\bCPTSD\b|\bC-PTSD\b'
    }

# keys matched case-sensitively: "ADD" must not match the verb "add"
# NOTE(review): this also means lowercase "adhd" is not matched - confirm intended

case_sensitive_keys = {'adhd_re'}

# encode regex match

for key, p in patterns.items():
    d_inf_labeled[key] = d_inf_labeled['text'].str.contains(
        p,
        case = key in case_sensitive_keys,
        regex = True,
        na = False, ### missing text counts as no match, so astype(int) cannot fail on NaN
        ).astype(int)

# inspect

d_inf_labeled.head(3)

##### _Explicit disclosed SI:_ `sui_re`

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

    ### SJS 2/17: redoing 'sui' - delete when done

# reload from disk: rebinding d_inf_labeled drops any column added in memory
# that was not saved to 'd_inf_labeled.csv'
d_inf_labeled = pd.read_csv(
    'd_inf_labeled.csv',
    index_col = [0],
    )

d_inf_labeled.info()
d_inf_labeled.head(3)
#d.tail(3)

In [None]:
#d_inf_labeled = d_inf_labeled.drop(
#    'sui_re',
#    axis = 1,
#    )

d_inf_labeled.info()

In [None]:
# 'sui' regex pattern: any token starting with "suicid", or "kill myself"
# (case-insensitive; any run of whitespace between the two words)

sui_re = re.compile(
    r'\bsuicid\S*|\bkill\s+myself\b',
    re.I,
    )

# pilot sui_re

#test_strings = [
#    "I might kill myself today",
#    "He was feeling suicidal",
#    "I might  hurt   myself tomorrow"  # multiple spaces
#    ]

#for t in test_strings:
#    match = sui_re.search(t)
#    if match:
#        print(f"Matched: '{match.group(0)}' in '{t}'")
#    else:
#        print(f"No match in: '{t}'")

# match pattern

d_inf_labeled['sui_re'] = d_inf_labeled['text'].str.contains(
    sui_re,
    regex = True,
    na = False, ### missing text counts as no match, so astype(int) cannot fail on NaN
    )

# encode match

d_inf_labeled['sui_re'] = d_inf_labeled['sui_re'].astype(int)

d_inf_labeled.head(3)

# reassign: d_inf_labeled_re

d_inf_labeled_re = d_inf_labeled

# save

d_inf_labeled_re.to_csv('d_inf_labeled_re.csv')

In [None]:
d_inf_labeled_re[[
    'adhd_re',
    'aut_re',
    'bpd_re',
    'ptsd_re',
    'sui_re',
    ]].sum()

In [None]:
# batch d_inf_labeled_re x num_splits

num_splits = 10
split_size = len(d_inf_labeled_re) // num_splits

dfs = {}
for i in range(num_splits):
    df_name = f'd_inf_labeled_re_{i+1:02d}' ### format idx: 01, 02, ..., 10
    start = i * split_size

    # the final split runs to the end of the frame, so leftover rows are
    # written with it in a single pass instead of re-writing the last file

    stop = None if i == num_splits - 1 else start + split_size

    dfs[df_name] = d_inf_labeled_re.iloc[start:stop]
    dfs[df_name].to_csv(
        f'{df_name}.csv',
        index = False,
        )

#### spaCy EntityRecognizer: encode

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

import spacy
import pandas as pd
import nltk
from nltk.text import Text

# load spaCy model

nlp = spacy.load('en_core_web_lg')

# define GPE-extraction fx

def extract_gpe(text):
    """
    Collect the geopolitical entities (spaCy label 'GPE') mentioned in a text.

    Parameters
    ----------
    text : str
        Text passed to the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    tuple
        ``(gpes, n)``: list of entity strings in document order, and its length.
    """
    gpes = []
    for ent in nlp(text).ents:
        if ent.label_ == 'GPE':
            gpes.append(ent.text)
    return gpes, len(gpes)

# loop over d_inf_labeled_re_{01,...,10}: read each batch, add a 'gpe' column
# holding the list of GPE strings per row, save as d_inf_labeled_re_gpe_{01,...,10}

num_splits = 10
for i in range(1, num_splits + 1):
    file_name = f"d_inf_labeled_re_{i:02d}.csv"
    df_name = f"d_inf_labeled_re_gpe_{i:02d}"

    df = pd.read_csv(file_name)

    # apply GPE-extraction function; keep the entity list only (count is discarded)
    # a Series of lists is assigned directly, which also works for an empty batch
    # (unpacking zip(*...) raises when there are no rows)

    df['gpe'] = df['text'].apply(lambda text: extract_gpe(text)[0])

    #df[[
    #    'gpe',
    #    'gpe_count',
    #    ]] = df['text'].apply(lambda i: pd.Series(extract_gpe(i)))

    # save d_inf_labeled_re_gpe_{01,...,10}

    df.to_csv(
        f"{df_name}.csv",
        index = False,
        )

    print(f"GPEs extracted, {df_name}.csv saved")

#### GPT-4o: disambiguate

##### _Prompts: disclosed U.S. GPE restriction_

In [None]:
# retrieve OpenAI API key from the environment (None if OPENAI_API_KEY is unset)

openai.api_key = os.getenv('OPENAI_API_KEY')

# define system prompt

system_prompt = '''
    You are an expert at reading social media posts and understanding when a post author is describing their location outside the U.S.
    '''

# disclosed U.S. GPE prompt; '{input_text}' is the placeholder for the post text
# NOTE(review): the closing instruction ("If the location is outside of the U.S., output a 1")
# restates condition 1 only, not condition 2 (author's own current location) - confirm intended

gpe_prompt = '''
    The following text contains a geographical place name, political entity, or other named entity:

    {input_text}

    Ensure that:
        1.) the geographical place name, political entity, or other named entity is outside the U.S. and
        2.) the post author is describing it as their own current location (e.g. "I am writing from Pakistan")

    If the location is outside of the U.S., output a 1

    Otherwise, output a 0.
'''

# set wd

input_dir = '/content/drive/MyDrive/Colab/bar_policy_suicidality/inputs/data'

# loop over GPE-encoded batches: GPT-4o flags posts whose author discloses a non-U.S. location
# NOTE(review): range(10, 11) processes batch 10 only; batches are numbered 01-10

for i in range(10, 11):
    input_filename = f"d_inf_labeled_re_gpe_{i:02d}.csv"
    output_filename = f"d_inf_labeled_re_gpe_us_{i:02d}.csv"

    input_filepath = os.path.join(
        input_dir,
        input_filename,
        )
    output_filepath = os.path.join(
        input_dir,
        output_filename,
        )

    if not os.path.exists(input_filepath):
        print(f"{input_filename} not found.")
        continue

    print(f"Disambiguating {input_filename}")

    d_inf_labeled = pd.read_csv(input_filepath)

    # replace stringified empty lists ('[]' = no GPE extracted) with 0 in 'gpe'

    d_inf_labeled['gpe'] = d_inf_labeled['gpe'].replace('[]', 0)

    # extract rows where 'gpe' != 0
    # .copy(): the GPT output column is added to an independent frame rather than
    # a slice of d_inf_labeled (matches the 'sui' disambiguation loop)

    d_parsed = d_inf_labeled[d_inf_labeled['gpe'] != 0].copy()

    d_parsed = disambiguate_text_with_gpt(
        d_parsed,
        'text',
        'not_us',
        system_prompt,
        gpe_prompt,
    )

    # merge to d_inf_labeled_re_gpe{01,...,10}
    # drop any stale 'not_us' first so the merge does not create suffixed duplicates

    d_inf_labeled = d_inf_labeled.drop(
        columns = 'not_us',
        errors = 'ignore',
        )
    d_inf_labeled = d_inf_labeled.merge(
        d_parsed[['id', 'not_us']],
        on = 'id',
        how = 'left'
    )

    # replace NaN w/ 0 (rows without a GPE were never sent to GPT-4o)

    d_inf_labeled.fillna(
        {'not_us': 0},
        inplace = True,
        )

    # save

    d_inf_labeled.to_csv(
        output_filepath,
        index = False,
        )
    print(f"Saved: {output_filename}")

##### _System prompt: indicator vars_

In [None]:
# retrieve OpenAI API key from the environment (None if OPENAI_API_KEY is unset)

openai.api_key = os.getenv('OPENAI_API_KEY')

# define system prompt shared by the indicator-variable disambiguation cells
# (rebinds the 'system_prompt' name used by the GPE cell)

system_prompt = '''
    You are an expert at reading social media posts and understanding when a post author is sincere and referring to themself.
    '''

##### _Define static prompt:_ `'sui'` _disambiguation_

In [None]:
# explicit suicidal ideation prompt; '{input_text}' is the placeholder for the post text
# expected model output: 1 = sincere and self-referential mention, 0 = otherwise

sui_prompt = '''
    The following social media post contains a mention of suicidal ideation:

    {input_text}

    Ensure that the mention of suicidal ideation is not used in dark humor ("I will kill myself if I don't win that game"), refers to the
    person writing the post (e.g. "I feel suicidal" or "I want to kill myself"). If the mention is sincere and self-referential, output a 1.

    If the mention is _not_ sincere and self-referential (e.g. "My brother is suicidal" or "My sister said 'I want to kill myself'"),
    output a 0.
    '''

# set wd

input_dir = '/content/drive/MyDrive/Colab/bar_policy_suicidality/inputs/data'
output_dir = input_dir

# loop over sui-encoded batches: GPT-4o verifies regex-positive ('sui_re' == 1) posts
# NOTE(review): range(7, 11) processes batches 07-10 only

for i in range(7, 11):
    file_name = f'd_inf_labeled_re_gpe_us_{i:02d}.csv'
    input_path = os.path.join(
        input_dir,
        file_name,
        )

    if not os.path.exists(input_path):
        print(f"File not found: {file_name}")
        continue

    print(f"Disambiguating: {file_name}")
    d_inf_labeled = pd.read_csv(input_path)

    # parse by 'sui_re' == 1

    d_parsed = d_inf_labeled[d_inf_labeled['sui_re'] == 1].copy()

    if not d_parsed.empty:

        # apply disambiguate_text_with_gpt()

        d_parsed = disambiguate_text_with_gpt(
            d_parsed,
            'text',
            'sui',
            system_prompt,
            sui_prompt,
            )

        # merge to d_inf_labeled_re_gpe_us_{01,...,10}

        d_inf_labeled = d_inf_labeled.drop(
            columns = 'sui',
            errors = 'ignore',
            )
        d_inf_labeled = d_inf_labeled.merge(
            d_parsed[[
                'id',
                'sui']],
            on = 'id',
            how = 'left',
            )

    # a batch with no regex hits skips the merge: make sure 'sui' exists
    # so the fill and save below do not raise KeyError

    if 'sui' not in d_inf_labeled.columns:
        d_inf_labeled['sui'] = 0

    # replace NaN w/ 0 in 'sui' column
    # frame-level fillna (as in the other disambiguation cells) instead of a chained
    # inplace call on the column, which does not update the frame under copy-on-write

    d_inf_labeled.fillna(
        {'sui': 0},
        inplace = True,
        )

    # save

    output_file = f'd_inf_labeled_re_gpe_us_sui_{i:02d}.csv'
    output_path = os.path.join(
        output_dir,
        output_file,
        )
    d_inf_labeled.to_csv(
        output_path,
        index = False,
        )

    print(f"Saved: {output_file}")

##### _Define static prompt:_ `'aut'` _disambiguation_

In [None]:
# IPython magic: relative to the current working directory at run time

%cd ../inputs/data

    ### SJS 3/14: _note_ - I'm doing this well after GPE and sui disambig

# first CSV column is read back as the index

d_inf_labeled = pd.read_csv(
    'd_inf_labeled_long.csv',
    index_col = [0],
    )

d_inf_labeled.info()
d_inf_labeled.head(3)

In [None]:
# tabulate 'not_us' values

counts = d_inf_labeled.loc[:, 'not_us'].value_counts()
print(counts)

    ### SJS 3/14: post-'not_us' drop, as it should be

In [None]:
# autism prompt; '{input_text}' is the placeholder for the post text
# expected model output: 1 = self-referential mention, 0 = otherwise

aut_prompt = '''
    The following social media post contains a mention of autism:

    {input_text}

    Ensure that the mention of autism refers to the person writing the post (e.g. "I have autism" or
    "I am autistic"). If the mention is self-referential, output a 1.

    If the mention is _not_ self-referential (e.g. "My brother has autism" or "My sister is autistic"),
    output a 0.
    '''

# d_parsed: 'aut_re' = 1
# .copy(): the GPT output column is added to an independent frame rather than
# a slice of d_inf_labeled (matches the 'sui' disambiguation loop)

d_parsed = d_inf_labeled[d_inf_labeled['aut_re'] == 1].copy()

d_parsed = disambiguate_text_with_gpt(
    d_parsed,
    'text',
    'aut',
    system_prompt,
    aut_prompt,
    )

# merge to d_inf_labeled
# drop any stale 'aut' first so the merge does not create suffixed duplicates

d_inf_labeled = d_inf_labeled.drop(
    columns = 'aut',
    errors = 'ignore',
    )

d_inf_labeled = d_inf_labeled.merge(
    d_parsed[[
        'id',
        'aut',
        ]],
    on = 'id',
    how = 'left',
    )

# replace NaN w/ 0 (rows without a regex hit were never sent to GPT-4o)

d_inf_labeled.fillna(
    {'aut': 0},
    inplace = True,
    )

# save

d_inf_labeled.to_csv('d_inf_labeled_long_aut.csv')

##### _Define static prompts:_ `'adhd'`, `'bpd'`, `'ptsd'` _disambiguation_

In [None]:
# IPython magic: relative to the current working directory at run time

%cd ../inputs/data

    ### SJS 3/15: _note_ - I'm doing this well after GPE and sui disambig - reorganize as RAP asap

# NOTE(review): this loads the checkpoint saved by the BPD cell ('..._aut_adhd_bpd.csv'),
# i.e. the state preceding the (C)PTSD cell; a top-to-bottom run of the AD(H)D cell would
# presumably start from 'd_inf_labeled_long_aut.csv' instead - confirm before re-running

d_inf_labeled = pd.read_csv(
    'd_inf_labeled_long_aut_adhd_bpd.csv',
    index_col = [0],
    )

d_inf_labeled.info()
d_inf_labeled.head(3)

In [None]:
#adhd_re_counts = d_inf_labeled['adhd_re'].value_counts()
#print(adhd_re_counts)

#bpd_re_counts = d_inf_labeled['bpd_re'].value_counts()
#print(bpd_re_counts)

#ptsd_re_counts = d_inf_labeled['ptsd_re'].value_counts()
#print(ptsd_re_counts)

In [None]:
# AD(H)D prompt; '{input_text}' is the placeholder for the post text
# expected model output: 1 = self-referential mention, 0 = otherwise

adhd_prompt = '''
    The following social media post contains a mention of ADD or ADHD:

    {input_text}

    Ensure that the mention of ADD or ADHD refers to the person writing the post (e.g. "I have ADHD" or
    "I am struggling with attention deficit disorder"). If the mention is self-referential, output a 1.

    If the mention is _not_ self-referential (e.g. "My brother is so ADD" or "My sister has ADHD"),
    output a 0.
    '''

# d_parsed: 'adhd_re' = 1
# .copy(): the GPT output column is added to an independent frame rather than
# a slice of d_inf_labeled (matches the 'sui' disambiguation loop)

d_parsed = d_inf_labeled[d_inf_labeled['adhd_re'] == 1].copy()

d_parsed = disambiguate_text_with_gpt(
    d_parsed,
    'text',
    'adhd',
    system_prompt,
    adhd_prompt,
    )

# merge to d_inf_labeled
# drop any stale 'adhd' first so the merge does not create suffixed duplicates

d_inf_labeled = d_inf_labeled.drop(
    columns = 'adhd',
    errors = 'ignore',
    )

d_inf_labeled = d_inf_labeled.merge(
    d_parsed[[
        'id',
        'adhd',
        ]],
    on = 'id',
    how = 'left',
    )

# replace NaN w/ 0 (rows without a regex hit were never sent to GPT-4o)

d_inf_labeled.fillna(
    {'adhd': 0},
    inplace = True,
    )

# save

d_inf_labeled.to_csv('d_inf_labeled_long_aut_adhd.csv')

In [None]:
# BPD prompt; '{input_text}' is the placeholder for the post text
# expected model output: 1 = self-referential mention, 0 = otherwise

bpd_prompt = '''
    The following social media post contains a mention of borderline personality disorder (BPD):

    {input_text}

    Ensure that the mention of borderline personality disorder or BPD refers to the person writing the post (e.g.
    "I have BPD" or "I struggle with borderline personality"). If the mention is self-referential, output a 1.

    If the mention is _not_ self-referential (e.g. "My brother has BPD" or "My sister has borderline"),
    output a 0.
    '''

# d_parsed: 'bpd_re' = 1
# .copy(): the GPT output column is added to an independent frame rather than
# a slice of d_inf_labeled (matches the 'sui' disambiguation loop)

d_parsed = d_inf_labeled[d_inf_labeled['bpd_re'] == 1].copy()

d_parsed = disambiguate_text_with_gpt(
    d_parsed,
    'text',
    'bpd',
    system_prompt,
    bpd_prompt,
    )

# merge to d_inf_labeled
# drop any stale 'bpd' first so the merge does not create suffixed duplicates

d_inf_labeled = d_inf_labeled.drop(
    columns = 'bpd',
    errors = 'ignore',
    )

d_inf_labeled = d_inf_labeled.merge(
    d_parsed[[
        'id',
        'bpd',
        ]],
    on = 'id',
    how = 'left',
    )

# replace NaN w/ 0 (rows without a regex hit were never sent to GPT-4o)

d_inf_labeled.fillna(
    {'bpd': 0},
    inplace = True,
    )

# save

d_inf_labeled.to_csv('d_inf_labeled_long_aut_adhd_bpd.csv')

In [None]:
# (C)PTSD prompt; '{input_text}' is the placeholder for the post text
# expected model output: 1 = self-referential mention, 0 = otherwise

ptsd_prompt = '''
    The following social media post contains a mention of post-traumatic stress disorder (PTSD):

    {input_text}

    Ensure that the mention of post-traumatic stress disorder or PTSD (CPTSD may also be used) refers to the person
    writing the post (e.g. "I have PTSD" or "I struggle with post-traumatic stress"). If the mention is self-referential,
    output a 1.

    If the mention is _not_ self-referential (e.g. "My brother has PTSD" or "My sister got CPTSD from abuse"),
    output a 0.
    '''

# d_parsed: 'ptsd_re' = 1
# .copy(): the GPT output column is added to an independent frame rather than
# a slice of d_inf_labeled (matches the 'sui' disambiguation loop)

d_parsed = d_inf_labeled[d_inf_labeled['ptsd_re'] == 1].copy()

d_parsed = disambiguate_text_with_gpt(
    d_parsed,
    'text',
    'ptsd',
    system_prompt,
    ptsd_prompt,
    )

# merge to d_inf_labeled
# drop any stale 'ptsd' first so the merge does not create suffixed duplicates

d_inf_labeled = d_inf_labeled.drop(
    columns = 'ptsd',
    errors = 'ignore',
    )

d_inf_labeled = d_inf_labeled.merge(
    d_parsed[[
        'id',
        'ptsd',
        ]],
    on = 'id',
    how = 'left',
    )

# replace NaN w/ 0 (rows without a regex hit were never sent to GPT-4o)

d_inf_labeled.fillna(
    {'ptsd': 0},
    inplace = True,
    )

# save

d_inf_labeled.to_csv('d_inf_labeled_long_aut_adhd_bpd_ptsd.csv')

In [None]:
# cleanup: reload the fully disambiguated frame (aut, adhd, bpd, ptsd) from disk

%cd ../inputs/data

# first CSV column is read back as the index

d_inf_labeled = pd.read_csv(
    'd_inf_labeled_long_aut_adhd_bpd_ptsd.csv',
    index_col = [0],
    )

d_inf_labeled.info()
d_inf_labeled.head(3)

In [None]:
# GPT-encoded indicator columns to coerce to integer 0 / 1

cols = [
    'aut',
    'adhd',
    'bpd',
    'ptsd',
    ]

# work on a copy so d_inf_labeled keeps the raw GPT-4o outputs

d = d_inf_labeled.copy()

# binarize: a cell becomes 1 if its string form contains the character '1', else 0
# (so '1', '1.0' -> 1; '0', '0.0', 'nan' -> 0)
# column-wise .str.contains() replaces DataFrame.applymap(), deprecated since pandas 2.1
# NOTE(review): any free-text response that happens to contain a '1' is also encoded 1 - confirm acceptable

d[cols] = d[cols].astype(str).apply(
    lambda col: col.str.contains('1', regex = False).astype(int)
    )

d.dtypes
d.head(3)

In [None]:
# save subsample to crosswalk: the first 100,000 rows of the raw GPT-encoded frame
# and of its integer-cast counterpart, row-aligned for side-by-side inspection
# (disabled random alternative: .sample(n = 100000, random_state = 56) on each frame)

d_inspect_gpt = d_inf_labeled.head(100000)
d_inspect_int = d.head(100000)

d_inspect_gpt.to_csv('d_inspect_gpt.csv')
d_inspect_int.to_csv('d_inspect_int.csv')

In [None]:
# save d (index is written as the first column, since index = False is not passed)

d.to_csv('d_inf_labeled_long_ndvg.csv')

In [None]:
####################################### LIWC-22 encoding #######################################

##### _Deprecated: dynamic-prompt disambiguation loop (superseded by the static prompts above)_

In [None]:
    ### SJS 3/15: nested f-strings impossible to balance on short notice; pulling out of loop

# DEPRECATED - retained for reference only; the static adhd / bpd / ptsd prompt cells replace it.
# Disabled with line comments rather than a ''' ... ''' wrapper: the wrapper was closed early by
# the nested f''' ... ''' prompt literal, leaving this cell with an unterminated string.
# If revived: '{{input_text}}' is doubled so the f-string emits a literal '{input_text}'
# placeholder (assumes disambiguate_text_with_gpt() fills that placeholder - TODO confirm).

# # define neurodivergences
#
# neurodivergences = [
#     'adhd',
#     'bpd',
#     'ptsd',
#     ]
#
# for n in neurodivergences:
#
#     # 1. build dynamic references for columns and prompt text
#
#     match_col = f'{n}_re'      # e.g. 'adhd_re', 'bpd_re', 'ptsd_re'
#     encode_col = n             # e.g. 'adhd', 'bpd', 'ptsd'
#
#     # 2. define the dynamic user prompt (mention uppercased, e.g. "ADHD")
#
#     user_prompt = f'''
#         The following social media post contains a mention of "{n.upper()}":
#
#        {{input_text}}
#
#         Ensure that the mention of "{n.upper()}" refers to the person writing the post (e.g. "I have {n.upper()}").
#         If the mention is self-referential, output a 1.
#
#         If the mention is _not_ self-referential (e.g. "My friend has {n.upper()}"), output a 0.
#         '''
#
#     # 3. subset rows where the regex column == 1
#
#     d_parsed = d_inf_labeled[d_inf_labeled[match_col] == 1].copy()
#
#     # 4. call the disambiguation function
#     #    signature: disambiguate_text_with_gpt(df, text_column, output_column, system_prompt, user_prompt)
#
#     d_parsed = disambiguate_text_with_gpt(
#         d_parsed,
#         'text',
#         encode_col,
#         system_prompt,
#         user_prompt,
#         )
#
#     # 5. merge the results back onto d_inf_labeled
#     #    (first drop the output column if it already exists)
#
#     d_inf_labeled = d_inf_labeled.drop(
#         columns = [encode_col],
#         errors = 'ignore',
#         )
#
#     d_inf_labeled = d_inf_labeled.merge(
#         d_parsed[['id', encode_col]],
#         on = 'id',
#         how = 'left',
#         )
#
#     # 6. replace any NaN with 0 (meaning no or non-self-referential mention)
#
#     d_inf_labeled.fillna(
#         {encode_col: 0},
#         inplace = True,
#         )
#
# # at this point d_inf_labeled contains 'adhd', 'bpd', and 'ptsd' columns (in addition
# # to any others), each 1 if the mention was self-referential, or 0 otherwise
#
# # save
#
# d_inf_labeled.to_csv('d_inf_labeled_aut_adhd_bpd_ptsd.csv')

##### _Reconcatenate disambiguation batches_

In [None]:
%cd ../inputs/data

file_pattern = 'd_inf_labeled_re_gpe_us_sui_*.csv'

csv_files = glob.glob(file_pattern)

d = pd.concat(
    [pd.read_csv(file) for file in csv_files],
    ignore_index = True,
    )

# detect mixed dtypes

mixed_dtype_cols = [col for col in d.columns if d[col].map(type).nunique() > 1]
print("Columns with mixed data types:", mixed_dtype_cols)

In [None]:
# inspect mixed dtypes: export the offending rows of each column for manual review

# rows whose 'p_sbrt' / 'p_titl' cell is not a str

non_string_psbrt = d.loc[~d['p_sbrt'].map(lambda v: isinstance(v, str))]
non_string_psbrt.to_csv('non_string_psbrt.csv')

non_string_ptitl = d.loc[~d['p_titl'].map(lambda v: isinstance(v, str))]
non_string_ptitl.to_csv('non_string_ptitl.csv')

# rows whose GPT-encoded indicator is anything other than 0 / 1

invalid_not_us = d.loc[~d['not_us'].isin([0, 1])]
invalid_not_us.to_csv('invalid_not_us.csv')

invalid_sui = d.loc[~d['sui'].isin([0, 1])]
invalid_sui.to_csv('invalid_sui.csv')

In [None]:
# drop 'p_titl' - already joined to 'text' for inference

d.drop(
    columns = ['p_titl'],
    inplace = True,
    )

# drop rows with missing 'p_sbrt'
# dropna() returns a new frame; reassign so the rows are actually removed from d

d = d.dropna(subset = ['p_sbrt'])

In [None]:
# convert GPT-4o neg explanations to 0: non-numeric responses are coerced to NaN,
# zero-filled, then cast to int

for _col in ('not_us', 'sui'):
    d[_col] = pd.to_numeric(d[_col], errors = 'coerce').fillna(0).astype(int)

# verify

unique_not_us = d['not_us'].unique()
unique_sui = d['sui'].unique()

print("Unique values in 'not_us':", unique_not_us)
print("Unique values in 'sui':", unique_sui)

In [None]:
# drop if disclosed GPE is non-U.S.: keep only rows with 'not_us' equal to 0

d = d.loc[d['not_us'].eq(0)]

d.info()
d.head(3)

##### _Aggregate 'compound strain'_ `'cpnd'` _var_

In [None]:
# IPython magic: relative to the current working directory at run time

%cd ../inputs/data

# first CSV column is read back as the index

d = pd.read_csv(
    'd_inf_labeled_long_ndvg_placebo.csv',
    index_col = [0],
    )

d.info()
d.head(3)

In [None]:
# drop '*_re' col: regex pre-screen flags for the four neurodivergence indicators

d = d.drop(columns = ['adhd_re', 'aut_re', 'bpd_re', 'ptsd_re'])

d.info()

In [None]:
# compute mdn of 'tech'

tech_mdn = d['tech'].median()

# encode >mdn: 1 if strictly above the median, else 0

d['tech_high'] = d['tech'].gt(tech_mdn).astype(int)

# display mdn

print(f"\n'tech' Mdn: {tech_mdn:.6f}")

# inspect

d.info()

In [None]:
# number of rows above the 'tech' median

count_ones = d['tech_high'].sum()
print("Count of 1s in 'tech_high':", count_ones)

# spot-check: seeded random draw of 1,000 rows from the 'tech_high' == 0 group
# NOTE(review): the variable name suggests the high group, but the filter selects 0 - confirm intended

d_sampled_tech_high = d.loc[d['tech_high'].eq(0)].sample(
    n = 1000,
    random_state = 56,
    )
d_sampled_tech_high[['tech', 'tech_high']].head(100)

In [None]:
# prevalence summary for the neurodivergence indicators

cols = ['adhd', 'aut', 'bpd', 'ptsd']

# positives and non-null row totals per column

count_ones = d[cols].sum()
total_rows = d[cols].count()

# share of positives, as a percentage

percent_ones = count_ones.div(total_rows).mul(100)

# one row per indicator, one column per statistic

d_summary = pd.concat(
    [
        count_ones.rename('Count of 1s'),
        percent_ones.rename('Percentage of 1s'),
        ],
    axis = 1,
    )

d_summary.head(10)

In [None]:
# 'cpnd_pred' = 1 only when the 'asp_pred', 'dep_pred', and 'val_pred' values sum to 3

d['cpnd_pred'] = d[['asp_pred', 'dep_pred', 'val_pred']].sum(axis = 1).eq(3).astype(int)

d.info()
d.head(3)

##### _$\mathcal{D}$<sub>inf labeled long</sub>: includes texts, logits, softmax probs, $T$-scaled probs_

In [None]:
# save (index is written as the first column; read back with index_col = [0])

d.to_csv('d_inf_labeled_long.csv')

##### _$\mathcal{D}$<sub>inf labeled long</sub> &rarr; $\mathcal{D}$<sub>inf labeled short</sub>: datetime and indicator vars only (Stata-friendly)_

In [None]:
# subset to datetime, identifiers, and indicator vars; .copy() detaches the subset from d

d_inf_labeled_short = d[[
    'p_date',
    'id',
    'n_cmnt',
    'p_sbrt',
    'asp_pred',
    'dep_pred',
    'val_pred',
    'cpnd_pred',
    'prg_pred',
    'tgd_pred',
    'age_pred',
    'race_pred',
    'dbty_pred',
    'adhd',
    'aut',
    'bpd',
    'ptsd',
    'sui',
    'tech_high',
    ]].copy()


# convert 'p_date' to Stata datetime str (M/D/YYYY, no zero-padding)
# NOTE: '%-m' / '%-d' are platform-specific strftime flags (supported on Linux / macOS,
# e.g. Colab); Windows uses '%#m' / '%#d'

d_inf_labeled_short['p_date_str'] = pd.to_datetime(d_inf_labeled_short['p_date']).dt.strftime('%-m/%-d/%Y')

# rename: strip '_pred' suffixes; 'tech_high' becomes 'tech'

d_inf_labeled_short.rename(
    columns = {
        'asp_pred': 'asp',
        'dep_pred': 'dep',
        'val_pred': 'val',
        'cpnd_pred': 'cpnd',
        'prg_pred': 'prg',
        'tgd_pred': 'tgd',
        'age_pred': 'age',
        'race_pred': 'race',
        'dbty_pred': 'dbty',
        'tech_high': 'tech',
    }, inplace = True,
    )

# reset index
# d_inf_labeled_short is an independent copy, so resetting d alone leaves the saved
# frame's index untouched; reset both so the exported index runs 0..n-1

d.reset_index(
    drop = True,
    inplace = True,
    )

d_inf_labeled_short.reset_index(
    drop = True,
    inplace = True,
    )

# verify

d_inf_labeled_short.info()
d_inf_labeled_short.head(3)

# save

d_inf_labeled_short.to_csv('d_inf_labeled_short.csv')

> End of aim_i_c_infer_calibrate.ipynb