# Passive suicidality in a repressive U.S. political context: Aim II

_Relinks $\mathcal{V}$-corpus posts and comments, encodes textual covariates of post &rarr; comment transmissions of fatalistic semantics using sparse explicit TF-IDF and dense implicit vector space representations. Includes post-estimation inspection of key token covariates._

> aim_ii_model_02.ipynb<br>
> Simone J. Skeen (03-05-2025)

1. [Prepare](#scrollTo=I77md_rMVZUf)
2. [Transform](#scrollTo=SZT0o6TeV1uR)
3. [Fit/estimate](#scrollTo=lJahbPAe5I0h)

### 1. Prepare
Installs, imports, and downloads requisite models and packages.
***

In [None]:
%%capture

%pip install causalnlp

In [None]:
import nltk
import numpy as np
import pandas as pd
import re

from causalnlp import CausalInferenceModel
from causalnlp.core.causalbert import CausalBertModel
from causalnlp.key_driver_analysis import KeyDriverAnalysis
from causalnlp.autocoder import Autocoder

from google.colab import drive

from lightgbm import LGBMClassifier

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.text import Text
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.preprocessing import LabelEncoder

from IPython.core.interactiveshell import InteractiveShell
# echo every bare expression in a cell, not only the last one

InteractiveShell.ast_node_interactivity = 'all'

# lift pandas display truncation for columns, then rows

for display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_opt, None)

In [None]:
# mount Google Drive under /content/drive so the %cd paths below resolve;
# force_remount = True asks for a fresh mount on every run

drive.mount(
    '/content/drive',
    force_remount = True,
    )

### 2. Transform
Merges $\mathcal{D}$<sub>inf labeled</sub>, $\mathcal{d}$<sub>comments</sub> &rarr; $\mathcal{D}$<sub>causal</sub>. Prepares for LIWC-22, causal-text, CausalNLP.
***

In [None]:
# D_causal merge

%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

d_inf = pd.read_csv(
    'd_inf_labeled.csv',
    index_col = [0],
    )

d_inf.info()
d_inf.head(3)
d_inf.tail(3)

d_c = pd.read_csv(
    'd_comments.csv',
    #index_col = [0],
    )

d_c.info()
d_c.head(3)
d_c.tail(3)

##### _Initial nested merge_

In [None]:
# inspect dtypes of the join key on both sides

print(d_inf['id'].dtype, d_c['id'].dtype)

# normalise the join key: cast to str, then trim surrounding whitespace

for frame in (d_inf, d_c):
    frame['id'] = frame['id'].astype(str).str.strip()

# left join: every labeled post is kept, one row per matching comment

d = d_inf.merge(d_c, on = 'id', how = 'left')

d.info()
d.head(30)

In [None]:
# pseudo-word token for repeat rows: keep the first occurrence of each
# 'text' / 'p_titl' value, overwrite later repeats with '<|RPT|>'
# NOTE(review): duplicates are detected across the whole frame, not only
# within consecutive rows - confirm identical texts from different posts
# cannot occur

for col in ('text', 'p_titl'):
    repeats = d.duplicated(
        subset = col,
        keep = 'first',
        )
    d.loc[repeats, col] = '<|RPT|>'

# create nested post-comment 'block' var

# new block: 'text' != '<|RPT|>'

d['new_block'] = d['text'] != '<|RPT|>'

# block index: running count of block-opening rows (+1, as before); rows
# inside a block inherit the count of the row that opened it, which is what
# the former replace(0, method = 'ffill') produced - that keyword is
# deprecated in recent pandas

d['block'] = (d['new_block'].cumsum() + 1).astype(int)

        ### SJS 12/30: preserve new_block var for sense-check (for now)

# drop 'new_block'

#post_comments = post_comments.drop(columns=['new_block'])

# dummy for post author replying within comment threads

d['p_au_reply'] = np.where(
    d['p_au'] == d['c_au'], 1, 0)

# was d.ino(), which raises AttributeError

d.info()
d.head(30)

In [None]:
# decouple post author and commenter comments

# rows where the post author is the one commenting

is_p_au = d['p_au_reply'] == 1

# missing comment text gets the same ' ' placeholder used below; a bare
# astype(str) would turn NaN into the literal string 'nan'

comment_text = d['c_text'].fillna(' ').astype(str)

# disaggregate about p_au_reply = 1/0: each comment lives in exactly one of
# the two columns, the other holds ' '

d['p_au_c_text'] = comment_text.where(is_p_au, ' ')
d['c_text'] = comment_text.where(~is_p_au, ' ')

#d['p_uniq'].value_counts()
d.info()
d.head(30)

# save

#d.to_csv(
#    'd_causal.csv',
#    index = True,
#    )

##### _Join_ `'c_text'` _(commenter comments in comment threads) and_ `'p_au_c_text'` _(post author comments in comment threads) by block_

In [None]:
# join 'c_text' (commenter comments in comment threads) and 'p_au_c_text' (post author comments in comment threads) by block

%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

d = pd.read_csv(
    'd_causal.csv',
    index_col = [0],
    )

# force to str

d['c_text'] = d['c_text'].astype(str)
d['p_au_c_text'] = d['p_au_c_text'].astype(str)

# concat

d['block_c_text'] = d.groupby('block')['c_text'].transform(lambda i: ' '.join(i))
d['block_p_au_c_text'] = d.groupby('block')['p_au_c_text'].transform(lambda i: ' '.join(i))

# drop 'new_block' = False

d = d[d['new_block']]

# append post title to concatenated post author comments

#d['text_covar_a'] = d.apply(lambda row: row['block_p_au_c_text'] + row['p_titl'] if row['p_titl'] != '<|RPT|>' else row['block_p_au_c_text'], axis = 1)

d.info()
d.head(30)

# save

d.to_csv(
    'd_causal.csv',
    index = True,
    )

In [None]:
# inspect joined subset

# bound to its own name so that `d` keeps every column; rebinding `d` to this
# two-column view made the later d['text'] lookup ('T' encoding) fail when
# the notebook is run top to bottom

d_joined = d[[
    'block_c_text',
    'block_p_au_c_text',
    ]].copy()

d_joined.head(3)

##### _Clean, tokenize_

In [None]:
# preprocess

texts = [
    'block_c_text',
    'block_p_au_c_text',
    ]

# raw-string patterns; the '.' after 'www' is escaped so it matches a literal dot

url_pattern = r'http\S+|www\.\S+'
non_alnum_pattern = r'[^A-Za-z0-9]+'

# remove URLs, special characters, convert to lc - applied to every column in
# `texts` (the loop previously re-cleaned 'block_p_au_c_text' on each pass).
# regex = True is explicit: pandas >= 2.0 treats the pattern as a literal by
# default and rejects case = False for literal replacement

for t in texts:
    d[t] = (
        d[t]
        .str.replace(url_pattern, ' ', case = False, regex = True)
        .str.replace(non_alnum_pattern, ' ', regex = True)
        .str.lower()
        )

d.to_csv(
    'd_causal_clean.csv',
    index = True,
    )

##### _Encode_ `'T'` _BAR policy mention_

In [None]:
# GAHC criminalization regex

# one pattern per state; each is an alternation of
#   <state> ... <full bill name>  |  <bill abbreviation>  |  <bill title>
# fixes relative to the earlier list:
#   Indiana  - abbreviation was 'SA 538' (copied from Iowa's SF 538); now SEA 480
#   Kentucky - title was missing the word 'act'
#   Wyoming  - abbreviation had a stray digit ('0999') and could not match SF0099

raw_patterns = [
    r'\bArkansas\b.*?\bHouse Bill 1570\b|\bHB\s?1570\b|\bto create the arkansas save adolescents from experimentation \(safe\) act\b',
    r'\bArizona\b.*?\bSenate Bill 1138\b|\bSB\s?1138\b|\bprohibition of irreversible gender reassignment surgery for minors\b',
    r'\bFlorida\b.*?\bCS/SB 254\b|\bSenate Bill 254\b|\ban act relating to treatments for sex reassignment\b',
    r'\bGeorgia\b.*?\bSenate Bill 140\b|\bSB\s?140\b',
    r'\bIowa\b.*?\bSenate File 538\b|\bSF\s?538\b|\ba bill for an act relating to prohibited activities regarding gender transition procedures relative to minors\b',
    r'\bIdaho\b.*?\bHouse Bill 71\b|\bHB\s?71\b|\brelating to the child protection act\b',
    r'\bIndiana\b.*?\bSenate Enrolled Act 480\b|\bSEA\s?480\b',
    r'\bKentucky\b.*?\bSenate Bill 150\b|\bSB\s?150\b|\ban act relating to children\b',
    r'\bLouisiana\b.*?\bHouse Bill 648\b|\bHB\s?648\b|\bthe stop harming our kids act\b',
    r'\bMissouri\b.*?\bSenate Bill 49\b|\bSB\s?49\b|\bMissouri save adolescents from experimentation \(safe\) act\b',
    r'\bMississippi\b.*?\bHouse Bill 1125\b|\bHB\s?1125\b|\bthe regulate experimental adolescent procedures \(reap\) act\b',
    r'\bMontana\b.*?\bSenate Bill 0099\b|\bSB\s?0?099\b|\ban act providing for a youth health protection act\b',
    r'\bNorth Carolina\b.*?\bHouse Bill 808\b|\bHB\s?808\b|\ban act to prohibit gender transition procedures for minors\b',
    r'\bNorth Dakota\b.*?\bHouse Bill 1254\b|\bHB\s?1254\b|\bthe prohibition of certain practices against a minor; to provide a penalty; and to declare an emergency\b',
    r'\bNebraska\b.*?\bLegislative Bill 574\b|\bLB\s?574\b|\badopt the let them grow act\b',
    r'\bOhio\b.*?\bHouse Bill 68\b|\bHB\s?68\b|\bsaving ohio adolescents from experimentation \(safe\) act\b',
    r'\bOklahoma\b.*?\bSenate Bill 613\b|\bSB\s?613\b|\ban act relating to health care\b',
    r'\bSouth Carolina\b.*?\bHouse Bill 4624\b|\bHB\s?4624\b|\bto prohibit the provision of gender transition procedures to a person under eighteen years of age\b',
    r'\bSouth Dakota\b.*?\bHouse Bill 1080\b|\bHB\s?1080\b|\ban act to prohibit certain medical and surgical interventions on minor patients\b',
    r'\bTennessee\b.*?\bSenate Bill 1\b|\bSB\s?1\b|\ban act to amend tennessee code annotated\b',
    r'\bTexas\b.*?\bSenate Bill 14\b|\bSB\s?14\b|\brelating to prohibitions on the provision to certain children of procedures and treatments for gender transitioning, gender reassignment, or gender dysphoria and on the use of public money or public assistance to provide those procedures and treatments\b',
    r'\bUtah\b.*?\bSenate Bill 16\b|\bSB\s?16\b|\btransgender medical treatments and procedures amendments\b',
    r'\bWest Virginia\b.*?\bHouse Bill 2007\b|\bHB\s?2007\b|\bWest Virginia Medical Practice Act\b',
    r'\bWyoming\b.*?\bSenate File 0099\b|\bSF\s?0?099\b|\bgender transitioning and reassignment procedures for children prohibited\b',
    ]

# pre-compile regex patterns with case-insensitive flag

compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in raw_patterns]

# check match Fx

def check_match(text):
    """Return 1 if any BAR-policy pattern is found in `text`, else 0.

    `text` is passed through str() first, so non-string cells (NaN, None,
    numbers) are searched as their string form rather than raising.
    """
    haystack = str(text)
    if any(pattern.search(haystack) for pattern in compiled_patterns):
        return 1
    return 0

# flag each post: 'T' = 1 when the post text mentions any listed bill

d['T'] = d['text'].map(check_match)

# tally matches

bar_n = d['T'].sum()
print(bar_n)

# save

d.to_csv('d_causal_liwc.csv', index = True)

##### _Encode_ `'C'` _subreddit (categorical) covariate_

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

    ### SJS 2/27: LIWC-22 renames 'block_c_text' to 'Text' - content is the same, crosswalk confirmed

d = pd.read_csv(
    'd_causal_liwc.csv',
    index_col = [0],
    )

In [None]:
# integer-code the subreddit covariate

le = LabelEncoder()

# fit_transform subreddit covar

d['C'] = le.fit_transform(d['p_sbrt'])

# verify mapping: original label -> assigned integer code

category_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
    }

print("Encoding:", category_mapping)

# inspect

d[['id', 'C', 'p_sbrt']].head(10)

# save

#d.to_csv(
#    'd_causal_liwc.csv',
#    index = True,
#    )

In [None]:
####################################### LIWC-22 encoding #######################################

##### _Post-LIWC-22: encode 'fatalism'_ `'Y'` _outcomes_

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

    ### SJS 2/27: LIWC-22 renames 'block_c_text' to 'Text' - content is the same, crosswalk confirmed

d = pd.read_csv(
    'd_causal_liwc.csv',
    index_col = [0],
    )

d.info()
#d.head(3)
d[['id', 'Text']].head(3)

In [None]:
# compute mdn

allnone_mdn = d['allnone'].median()
emo_neg_mdn = d['emo_neg'].median()

# encode >mdn: 1 when strictly above the column median, else 0

d['allnone_high'] = (d['allnone'] > allnone_mdn).astype(int)
d['emo_neg_high'] = (d['emo_neg'] > emo_neg_mdn).astype(int)

# display mdn (the comma between label and value was missing, a SyntaxError)

print("\n'allnone' Mdn:", allnone_mdn)
print("\n'emo_neg' Mdn:", emo_neg_mdn)

# inspect

d[['allnone', 'allnone_high', 'emo_neg', 'emo_neg_high']].head(3)

In [None]:
#print("\n'allnone' Mdn:", allnone_mdn)
#print("\n'emo_neg' Mdn:", emo_neg_mdn)

In [None]:
# 'Y' = 'allnone_high' * 'emo_neg_high' (1 only when both indicators are 1)

d['Y'] = d['allnone_high'].mul(d['emo_neg_high'])

# inspect

d[['allnone_high', 'emo_neg_high', 'Y']].head()

# save

d.to_csv('d_causal_liwc.csv', index = True)

##### _$\mathcal{D}$<sub>model 2 all blocks</sub>_: $N$ = _146K (incl empty `'W'` cells)_

In [None]:
# condense to the modelling columns

keep_cols = [
    'date',
    'p_au',
    'id',
    'n_cmnt',
    'text',
    'C',
    'asp_pred',
    'dep_pred',
    'val_pred',
    'prg_pred',
    'tgd_pred',
    'age_pred',
    'race_pred',
    'dbty_pred',
    'sui_re',
    'block_p_au_c_text',
    'allnone',
    'emo_neg',
    'emo_anx',
    'emo_anger',
    'emo_sad',
    'allnone_high',
    'emo_neg_high',
    'T',
    'Y',
    ]

d = d[keep_cols].copy()

# rename: post-author thread text becomes the text covariate 'W'
# NOTE(review): 'Text' is not among the columns kept above, so its rename
# entry has no effect here - confirm whether it was meant to be carried forward

d = d.rename(
    columns = {
        'Text': 'block_c_text',
        'block_p_au_c_text': 'W',
    },
    )

# save .csv

d.to_csv('d_model_02_all_blocks.csv', index = True)

# save .tsv

d.to_csv('d_model_02_all_blocks.tsv', sep = '\t', index = True)

##### _$\mathcal{D}$<sub>model 2 covar blocks</sub>_: $N$ = _59K (incl populated `'W'` cells)_

In [None]:
# drop NaN, empty 'W' cells
# the stripped value is compared against '' - after .str.strip() a
# whitespace-only cell is '', never ' ', so the former `!= ' '` test kept
# every row

w_populated = d['W'].fillna('').astype(str).str.strip() != ''
d = d.loc[w_populated]

# reset idx (the old index is kept as an 'index' column, as before)

d = d.reset_index()

# save .csv

d.to_csv(
    'd_model_02_covar_blocks.csv',
    index = True,
    )

# save .tsv

d.to_csv(
    'd_model_02_covar_blocks.tsv',
    sep = '\t',
    index = True,
    )

### 3. Fit/estimate
Fits meta-learner models to estimate $T_p$ &rarr; $y_c$ C/ATE.
***

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

d = pd.read_csv(
    'd_model_02_all_blocks.csv',
    index_col = [0],
    )

d.info()
d.head(3)

#### Model 2a. BAR policy salience ($T_p$) → fatalism CATE ($Y_c$)

In [None]:
# restrict to TGD Redditors: keep rows whose 'tgd_pred' is anything but 0,
# then renumber (the previous index is retained as a column)

d = d.loc[d['tgd_pred'].ne(0)]
d = d.reset_index()

d.shape
d.head(3)

##### _2a: Sparse explicit text covar_ `'W'` _representations: TF-IDF via T-Learner_

In [None]:
# columns withheld from the model; 'C' stays commented out so it remains
# available as a covariate

ignore_cols = [
    # index residue, identifiers, raw text
    'level_0', 'index', 'date', 'p_au', 'id', 'n_cmnt', 'text', 'p_titl',
    # classifier predictions
    'asp_pred', 'dep_pred', 'val_pred', 'prg_pred', 'tgd_pred',
    'age_pred', 'race_pred', 'dbty_pred', 'sui_re',
    # LIWC scores and their median splits
    'allnone', 'emo_neg', 'emo_anx', 'emo_anger', 'emo_sad',
    'allnone_high', 'emo_neg_high',
    #'C',
    ]

# verify feature input dimensions (printed for a sense-check only)

T = d['T'].values
W = d['W'].values.reshape(-1, 1) ### reshape
y = d['Y'].values

print(f"T shape: {T.shape}")
print(f"W shape: {W.shape}")
print(f"Y shape: {y.shape}")

# fit: T-learner over LightGBM, treatment 'T', outcome 'Y', text covariate 'W'

cm = CausalInferenceModel(
    d,
    method = 't-learner',
    learner = LGBMClassifier(num_leaves = 500),
    treatment_col = 'T',
    outcome_col = 'Y',
    text_col = 'W',
    ignore_cols = ignore_cols,
    #include_cols = ['C'], ### 'C' implicitly adjusted for _unless_ in ignore_cols
    ngram_range = (1,3),
    min_df = 0.01,
    stop_words = 'english',
    verbose = -1,
    )

cm.fit()

In [None]:
# average treatment effect from the fitted model; the bare name echoes it

ate = cm.estimate_ate()
ate

In [None]:
# feature-importance interpretation without plotting; keep the first 20
# entries of element [1] of the returned object

interpretation = cm.interpret(plot = False, method = 'feature_importance')
top_features = interpretation[1][:20]

In [None]:
# show the 20 retained feature-importance entries
print(top_features)

##### _2a: Dense implicit text covar_ `'W'` _representations: embeddings via CausalBERT_

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

d = pd.read_csv(
    'd_model_02_covar_blocks.tsv',
    sep = '\t',
    on_bad_lines = 'skip',
    )

d['cpnd_pred'] = (d[[
    'asp_pred',
    'dep_pred',
    'val_pred']].sum(axis = 1) == 3).astype(int)

cpnd_n = d['cpnd_pred'].sum()
print(cpnd_n)

# remap 'C'

d['C'] = d['C'].apply(lambda i: 0 if i == 5 else 1)

In [None]:
# column dtypes and non-null counts after the reload and remap
d.info()

In [None]:
# verify feature input dimensions (printed for a sense-check only)
# NOTE(review): the treatment here is 'sui_re', while the sparse 2a model uses 'T' - confirm this is intended

T = d['sui_re'].values
W = d['W'].values.reshape(-1, 1) ### reshape
y = d['Y'].values

print(f"T shape: {T.shape}")
print(f"W shape: {W.shape}")
print(f"Y shape: {y.shape}")

# initialize

cb = CausalBertModel(batch_size = 32, max_length = 128)

# train: positional order is texts, confounds, treatment, outcome

cb.train(
    d['W'],
    d['C'],
    d['sui_re'],
    d['Y'],
    learning_rate = 2e-5,
    epochs = 1,
    )

# estimate and display the ATE from confounds and texts

bert_ate = cb.estimate_ate(d['C'], d['W'])
print(bert_ate)

#### Model 2b. Strain (_$\hat{s}_{p}$_) → fatalism CATE ($Y_c$).

In [None]:
# gen compound strain var: 1 when 'asp_pred' + 'dep_pred' + 'val_pred' == 3

strain_sum = d[['asp_pred', 'dep_pred', 'val_pred']].sum(axis = 1)
d['cpnd_pred'] = strain_sum.eq(3).astype(int)

# tally compound-strain rows

cpnd_n = d['cpnd_pred'].sum()
print(cpnd_n)

In [None]:
# restrict to TGD Redditors: keep rows whose 'tgd_pred' is anything but 0
# (index intentionally left as-is here)

d = d.loc[d['tgd_pred'].ne(0)]
#d.reset_index(inplace = True)

d.shape
d.head(3)

In [None]:
# list available columns before choosing which to ignore in the next model
print(d.columns)

##### _2b: Sparse explicit text covar_ `'W'` _representations: TF-IDF via T-Learner_

In [None]:
# columns withheld from the model; 'cpnd_pred' (treatment) and 'C' stay
# commented out so they are not ignored

ignore_cols = [
    # index residue, identifiers, raw text
    'level_0', 'index', 'date', 'p_au', 'id', 'n_cmnt', 'text', 'p_titl',
    # classifier predictions
    'asp_pred', 'dep_pred', 'val_pred',
    #'cpnd_pred',
    'prg_pred', 'tgd_pred', 'age_pred', 'race_pred', 'dbty_pred', 'sui_re',
    # LIWC scores and their median splits
    'allnone', 'emo_neg', 'emo_anx', 'emo_anger', 'emo_sad',
    'allnone_high', 'emo_neg_high',
    # policy-mention flag
    'T',
    #'C',
    ]

# verify feature input dimensions (printed for a sense-check only)

T = d['cpnd_pred'].values
W = d['W'].values.reshape(-1, 1) ### reshape
y = d['Y'].values

print(f"T shape: {T.shape}")
print(f"W shape: {W.shape}")
print(f"Y shape: {y.shape}")

# fit: T-learner over LightGBM, treatment 'cpnd_pred', outcome 'Y', text covariate 'W'

cm = CausalInferenceModel(
    d,
    method = 't-learner',
    learner = LGBMClassifier(num_leaves = 500),
    treatment_col = 'cpnd_pred',
    outcome_col = 'Y',
    text_col = 'W',
    ignore_cols = ignore_cols,
    #include_cols = ['C'], ### 'C' implicitly adjusted for _unless_ in ignore_cols
    ngram_range = (1,3),
    min_df = 0.01,
    stop_words = 'english',
    verbose = -1,
    )

cm.fit()

In [None]:
# average treatment effect from the fitted model; the bare name echoes it

ate = cm.estimate_ate()
ate

In [None]:
# feature-importance interpretation without plotting; keep the first 20
# entries of element [1] of the returned object

interpretation = cm.interpret(plot = False, method = 'feature_importance')
top_features = interpretation[1][:20]

In [None]:
# show the 20 retained feature-importance entries
print(top_features)

##### _Concordances_

In [None]:
%cd /content/drive/My Drive/Colab/bar_policy_suicidality/inputs/data

d = pd.read_csv(
    'd_model_02_covar_blocks.csv',
    index_col = [0],
    )

# restrict to TGD Redditors

d = d[d['tgd_pred'] != 0]
d.reset_index(inplace = True)

d.shape
d.head(3)

In [None]:
# derived qualitatively (deductively)

#tokens = [
#    'criminal'
#    ]

# derived by feature importance (inductively)

tokens = ['thank', 'sense', 'didn', 'point', 'did']

# parse by T_p: rows with 'sui_re' == 1

d_parsed = d.loc[d['sui_re'] == 1]

# transform to nltk text object: non-missing 'W' cells -> one string -> tokens

text_col = d_parsed['W'].dropna().tolist()
#text_col = d['W'].dropna().tolist()
nltk_text = Text(nltk.word_tokenize(' '.join(text_col)))

# examine each token in context (up to 50 lines, 100 characters wide)

for token in tokens:
    nltk_text.concordance(token, lines = 50, width = 100)


> End of aim_ii_model_02.ipynb