In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from pingouin import bayesfactor_ttest
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.power import TTestIndPower
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns

## Women with ADHD vs. Others

In [2]:
# Load the LIWC-proportional dataset. NOTE: unpickling executes arbitrary code,
# so this file must come from a trusted source.
df = pd.read_pickle("../../data/adhd-beliefs-pt/adhd-beliefs-pt-liwc-proportional.pkl")

# Target group: rows whose sex is "Feminino" and whose diagnosis is
# "Sim, diagnosticado"; every other row forms the comparison group.
is_female = df['sex'].eq("Feminino")
is_diagnosed = df['adhd_diagnosis'].eq("Sim, diagnosticado")
mask_adhd = is_female & is_diagnosed
mask_others = ~mask_adhd

# The last 64 columns of the frame are used as the feature set.
feature_columns = df.columns[-64:]
features = feature_columns.tolist()

## Helper functions

In [3]:
def univariate_liwc(df, features, mask_g1, mask_g2, alpha=0.05):
    """
    Compare two groups on every feature, one feature at a time.

    For each feature the function computes:
      - Welch's t-test (unequal variances), group 1 vs. group 2
      - a Bayes factor (BF10) from the t statistic and group sizes
      - Cohen's d using the pooled standard deviation (g1 minus g2)
    and then applies Benjamini-Hochberg FDR correction across features.

    Parameters
    ----------
    df : DataFrame holding the feature columns.
    features : iterable of column names to test.
    mask_g1, mask_g2 : row selectors passed to ``df.loc`` for each group.
    alpha : threshold applied to the FDR-corrected p-value for ``signif``.

    Returns
    -------
    DataFrame with one row per feature and columns
    feature, mean_g1, sd_g1, mean_g2, sd_g2, t_stat, p_val, bf10, cohen_d,
    p_fdr, signif, abs_cohen_d, sorted by abs_cohen_d in descending order.

    Notes
    -----
    Features whose test is undefined (e.g. constant values) get a NaN p-value.
    They are excluded from the FDR correction and keep ``p_fdr = NaN`` /
    ``signif = False``; otherwise a single NaN would turn every corrected
    p-value into NaN.
    """
    columns = ['feature', 'mean_g1', 'sd_g1', 'mean_g2', 'sd_g2',
               't_stat', 'p_val', 'bf10', 'cohen_d']
    rows = []
    for feat in features:
        g1 = df.loc[mask_g1, feat].dropna()
        g2 = df.loc[mask_g2, feat].dropna()
        n1, n2 = len(g1), len(g2)

        t_stat, p_val = stats.ttest_ind(g1, g2, equal_var=False)
        # A Bayes factor is only meaningful when the t statistic is defined.
        if np.isfinite(t_stat):
            bf10 = bayesfactor_ttest(t_stat, n1, n2, paired=False)
        else:
            bf10 = np.nan

        s1, s2 = g1.std(ddof=1), g2.std(ddof=1)
        s_pool = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2)/(n1+n2-2))
        d = (g1.mean() - g2.mean())/s_pool

        rows.append({
            'feature': feat,
            'mean_g1': g1.mean(),
            'sd_g1': s1,
            'mean_g2': g2.mean(),
            'sd_g2': s2,
            't_stat': t_stat,
            'p_val': p_val,
            'bf10': bf10,
            'cohen_d': d
        })

    # Passing explicit columns keeps the schema intact for an empty feature list.
    df_res = pd.DataFrame(rows, columns=columns)

    # Correct only the finite p-values; NaN entries stay NaN.
    p_vals = df_res['p_val'].to_numpy(dtype=float)
    finite = np.isfinite(p_vals)
    p_fdr = np.full(p_vals.shape, np.nan)
    if finite.any():
        p_fdr[finite] = multipletests(p_vals[finite], method='fdr_bh')[1]

    df_res['p_fdr'] = p_fdr
    df_res['signif'] = df_res['p_fdr'] <= alpha
    df_res['abs_cohen_d'] = df_res['cohen_d'].abs()
    return df_res.sort_values('abs_cohen_d', ascending=False)

In [4]:
def pca_group_diff(df, features, mask_g1, mask_g2, n_pc=5, alpha=0.05):
    """
    Test whether two groups differ along the leading principal components.

    Missing feature values are replaced by 0, the features are standardized,
    and a PCA with ``n_pc`` components is fitted on all rows. Each component's
    scores are then compared between the groups with Welch's t-test, and the
    p-values are Benjamini-Hochberg corrected across components.

    ``mask_g1`` / ``mask_g2`` index the score array, so they must line up
    row-for-row with ``df``. ``alpha`` is accepted but not used here.

    Returns a DataFrame with columns PC, expl_var, t_stat, p_val, p_fdr.
    """
    scaled = StandardScaler().fit_transform(df[features].fillna(0).values)
    decomposition = PCA(n_components=n_pc)
    scores = decomposition.fit_transform(scaled)

    def _welch_row(k):
        # Welch test on the k-th component's scores, group 1 vs. group 2.
        scores_k = scores[:, k]
        t_value, p_value = stats.ttest_ind(
            scores_k[mask_g1], scores_k[mask_g2], equal_var=False
        )
        return {
            'PC': f'PC{k+1}',
            'expl_var': decomposition.explained_variance_ratio_[k],
            't_stat':   t_value,
            'p_val':    p_value
        }

    df_pc = pd.DataFrame([_welch_row(k) for k in range(n_pc)])
    df_pc['p_fdr'] = multipletests(df_pc['p_val'], method='fdr_bh')[1]
    return df_pc

In [5]:
def top_pc1_loadings(df, features, n=10):
    """
    Return the ``n`` features with the largest absolute loading on PC1.

    Missing values are replaced by 0 and the features standardized before a
    one-component PCA is fitted. The result is indexed by feature name and has
    two columns: ``abs_loading`` (descending) and the signed ``loading``.
    """
    scaled = StandardScaler().fit_transform(df[features].fillna(0).values)
    first_component = PCA(n_components=1).fit(scaled).components_[0]

    loadings = pd.Series(first_component, index=features)
    strongest = loadings.abs().sort_values(ascending=False).head(n)

    df_load = strongest.to_frame('abs_loading')
    df_load['loading'] = loadings.loc[df_load.index]
    return df_load

In [6]:
def _adaptive_inner_cv(y, target_splits=5):
    """
    Build the shuffled, seeded StratifiedKFold used for inner-loop tuning.

    ``target_splits`` folds are used when the smallest ``np.bincount(y)`` entry
    is at least ``target_splits``; otherwise the splitter falls back to 3 folds.
    ``y`` must be an array of non-negative integers.
    """
    smallest_class = np.bincount(y).min()
    if smallest_class >= target_splits:
        n_folds = target_splits
    else:
        n_folds = 3
    return StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

In [7]:
def _logregcv_l1(inner_cv):
    """
    Create an unfitted L1-penalized LogisticRegressionCV.

    The regularization strength is tuned over 30 log-spaced values of C between
    1e-4 and 1e4 using ``inner_cv`` and ROC-AUC scoring, with balanced class
    weights and the saga solver; the best model is refit on all data.
    """
    settings = dict(
        Cs=np.logspace(-4, 4, 30),
        cv=inner_cv,
        penalty="l1",
        solver="saga",
        scoring="roc_auc",
        class_weight="balanced",
        max_iter=5000,
        random_state=42,
        refit=True,
        n_jobs=-1,
    )
    return LogisticRegressionCV(**settings)

In [8]:
def _stratified_bootstrap(X, y, rng):
    """Per-class resample with replacement; preserves class balance."""
    Xb_list, yb_list = [], []
    for cls in np.unique(y):
        idx = np.where(y == cls)[0]
        samp = rng.choice(idx, size=len(idx), replace=True)
        Xb_list.append(X[samp])
        yb_list.append(y[samp])
    Xb = np.vstack(Xb_list)
    yb = np.concatenate(yb_list)
    perm = rng.permutation(len(yb))
    return Xb[perm], yb[perm]

In [9]:
def l1_logistic_top(df, features, mask_g1, n=10):
    """
    Fit a standardized L1-logistic model on all rows and list its strongest terms.

    Missing feature values are replaced by 0 and the target is ``mask_g1`` cast
    to 0/1. Returns a one-column DataFrame (``coef``) indexed by feature name,
    holding the ``n`` coefficients of largest magnitude in descending |coef|.
    """
    X = df[features].fillna(0).values
    y = mask_g1.astype(int).values

    model = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("clf", _logregcv_l1(_adaptive_inner_cv(y, target_splits=5)))
    ])
    model.fit(X, y)

    weights = pd.Series(model.named_steps["clf"].coef_[0], index=features)
    strongest = weights.abs().sort_values(ascending=False).head(n).index
    top_frame = pd.DataFrame({"coef": weights.loc[strongest]})
    return top_frame.sort_values("coef", key=np.abs, ascending=False)

In [10]:
def nested_auc(df, features, mask_g1):
    """
    Estimate out-of-sample ROC-AUC of the L1-logistic pipeline with nested CV.

    The outer loop is a 5-fold stratified split repeated 10 times; within each
    outer training fold the pipeline (scaler + LogisticRegressionCV) tunes C on
    its own inner folds. Missing feature values are replaced by 0.

    Returns ``(mean, std)`` of the outer-fold AUC scores.
    """
    X = df[features].fillna(0).values
    y = mask_g1.astype(int).values

    estimator = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", _logregcv_l1(_adaptive_inner_cv(y, target_splits=5)))
    ])
    outer_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

    fold_aucs = cross_val_score(
        estimator, X, y, cv=outer_cv, scoring="roc_auc", n_jobs=-1
    )
    return fold_aucs.mean(), fold_aucs.std()

In [11]:
def l1_stability(df, features, mask_g1, n_boot=100, tol=1e-6):
    """
    Bootstrap stability of L1-logistic feature selection.

    The standardized L1-logistic pipeline is refit on ``n_boot`` class-stratified
    bootstrap resamples (fixed seed 1000). A feature counts as selected in a
    resample when its absolute coefficient exceeds ``tol``.

    Parameters
    ----------
    df : DataFrame holding the feature columns (missing values become 0).
    features : list of feature column names.
    mask_g1 : boolean Series marking the positive class.
    n_boot : number of bootstrap resamples to attempt.
    tol : magnitude above which a coefficient is treated as non-zero.

    Returns
    -------
    DataFrame indexed by feature with columns
      sel_prop      - share of fitted resamples in which the feature was selected
      mean_coef     - coefficient averaged over fitted resamples
      pos_sign_prop - among selections, share with a positive coefficient
                      (NaN if never selected)
    sorted by sel_prop, then mean_coef, in descending order.

    Notes
    -----
    Resamples skipped by the single-class guard are excluded from the
    denominators. If no resample could be fitted, sel_prop and mean_coef are NaN
    instead of a misleading 0.
    """
    X = df[features].fillna(0).values
    y = mask_g1.astype(int).values

    inner_cv = _adaptive_inner_cv(y, target_splits=5)

    sel_counts = pd.Series(0.0, index=features)
    pos_counts = pd.Series(0.0, index=features)
    coef_sum   = pd.Series(0.0, index=features)

    rng = np.random.RandomState(1000)
    n_fit = 0  # resamples that were actually fitted

    for _ in range(n_boot):
        Xb, yb = _stratified_bootstrap(X, y, rng)
        if len(np.unique(yb)) < 2:
            continue  # cannot fit a classifier on a single class

        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", _logregcv_l1(inner_cv))
        ])
        pipe.fit(Xb, yb)

        coef = pd.Series(pipe.named_steps["clf"].coef_[0], index=features)
        selected = coef.abs() > tol

        sel_counts[selected] += 1
        pos_counts[selected & (coef > 0)] += 1
        coef_sum += coef
        n_fit += 1

    # Normalise by the fits performed, not by the number requested.
    denom = n_fit if n_fit > 0 else np.nan
    stability = sel_counts / denom
    sign_consistency = pos_counts / sel_counts.replace(0, np.nan)
    mean_coef = coef_sum / denom

    out = pd.DataFrame({
        "sel_prop": stability,
        "mean_coef": mean_coef,
        "pos_sign_prop": sign_consistency
    }).sort_values(["sel_prop", "mean_coef"], ascending=False)
    return out

In [12]:
def cohen_d(x, y):
    """
    Cohen's d for two independent samples, using the pooled standard deviation.

    The sign follows ``mean(x) - mean(y)``. Sample standard deviations use
    ``ddof=1``.
    """
    n_x = len(x)
    n_y = len(y)
    sd_x = np.std(x, ddof=1)
    sd_y = np.std(y, ddof=1)

    weighted_ss = (n_x - 1) * sd_x**2 + (n_y - 1) * sd_y**2
    pooled_sd = np.sqrt(weighted_ss / (n_x + n_y - 2))

    mean_gap = np.mean(x) - np.mean(y)
    return mean_gap / pooled_sd

In [13]:
def abs_cohen_d(x, y):
    """Magnitude of Cohen's d between ``x`` and ``y``, ignoring direction."""
    effect = cohen_d(x, y)
    return abs(effect)

In [14]:
def bootstrap_ci(x, y, statfunc, n_boot=1000, ci=95, random_state=None):
    """
    Percentile bootstrap confidence interval for a two-sample statistic.

    On each of ``n_boot`` iterations ``x`` and ``y`` are independently resampled
    with replacement and ``statfunc(bx, by)`` is evaluated; the interval is
    taken from the percentiles of those values.

    Parameters
    ----------
    x, y : the two samples.
    statfunc : callable taking the two resampled samples and returning a number.
    n_boot : number of bootstrap iterations.
    ci : confidence level in percent (95 gives the 2.5th and 97.5th percentiles).
    random_state : None, int or ``np.random.RandomState``. With None the
        resampling falls back to ``resample``'s default (unseeded) behaviour,
        exactly as before. An int seeds a single generator that is shared by
        all iterations, which makes the interval reproducible.

    Returns
    -------
    (lower, upper) bounds of the interval.
    """
    # One generator for the whole run: handing the same int to every resample()
    # call would reproduce the identical sample on each iteration.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    boot_stats = []
    for _ in range(n_boot):
        bx = resample(x, replace=True, random_state=rng)
        by = resample(y, replace=True, random_state=rng)
        boot_stats.append(statfunc(bx, by))

    tail = (100 - ci) / 2
    lower = np.percentile(boot_stats, tail)
    upper = np.percentile(boot_stats, 100 - tail)
    return lower, upper

In [15]:
def a_priori_power(effect_size=0.6, alpha=0.05, power=0.8):
    """
    Solve a two-sided independent-samples t-test power analysis.

    ``effect_size``, ``alpha`` and ``power`` are passed to
    ``TTestIndPower.solve_power``, which returns the remaining unspecified
    quantity. The notebook reports this value as the required N per group.
    """
    return TTestIndPower().solve_power(
        effect_size=effect_size,
        alpha=alpha,
        power=power,
        alternative='two-sided',
    )

## Analysis

In [16]:
# 1) Univariate LIWC comparison: ADHD-diagnosed women vs. everyone else
uni = univariate_liwc(df, features, mask_adhd, mask_others)

# Keep only the features whose absolute effect size exceeds 0.5.
large_effects = uni[uni['abs_cohen_d']>0.5]
print("\nLIWC dimensions |d| > 0.5:")
print(large_effects.to_markdown(index=False, floatfmt=".3f"))


LIWC dimensions |d| > 0.5:
| feature   |   mean_g1 |   sd_g1 |   mean_g2 |   sd_g2 |   t_stat |   p_val |   bf10 |   cohen_d |   p_fdr | signif   |   abs_cohen_d |
|:----------|----------:|--------:|----------:|--------:|---------:|--------:|-------:|----------:|--------:|:---------|--------------:|
| cogmech   |     0.288 |   0.114 |     0.351 |   0.074 |   -2.541 |   0.018 |  3.792 |    -0.773 |   0.131 | False    |         0.773 |
| excl      |     0.033 |   0.024 |     0.054 |   0.028 |   -3.692 |   0.001 | 80.826 |    -0.732 |   0.050 | False    |         0.732 |
| funct     |     0.372 |   0.122 |     0.433 |   0.078 |   -2.331 |   0.028 |  2.439 |    -0.715 |   0.150 | False    |         0.715 |
| ipron     |     0.088 |   0.046 |     0.115 |   0.042 |   -2.674 |   0.012 |  5.099 |    -0.639 |   0.131 | False    |         0.639 |
| pronoun   |     0.135 |   0.063 |     0.170 |   0.053 |   -2.494 |   0.019 |  3.421 |    -0.639 |   0.131 | False    |         0.639 |
| relativ   |

In [17]:
# 2) Bootstrap CIs for features with |d| > 0.5
# Seed NumPy's global RNG, which the unseeded resampling draws from, so the
# intervals are the same on every Restart & Run All.
np.random.seed(42)

top_feats = uni[uni['abs_cohen_d'] > 0.5]['feature']
ci_list = []
for feat in top_feats:
    x = df.loc[mask_adhd, feat].dropna().values
    y = df.loc[mask_others, feat].dropna().values
    d_obs = cohen_d(x, y)
    lo, hi = bootstrap_ci(x, y, cohen_d, n_boot=2000, ci=95)
    ci_list.append({'feature': feat, 'd': d_obs, 'ci_lower': lo, 'ci_upper': hi})
ci_df = pd.DataFrame(ci_list)
print("\nBootstrap 95% CIs for Cohen's d (|d| > 0.5):")
print(ci_df.to_markdown(index=False, floatfmt=".3f"))


Bootstrap 95% CIs for Cohen's d (|d| > 0.5):
| feature   |      d |   ci_lower |   ci_upper |
|:----------|-------:|-----------:|-----------:|
| cogmech   | -0.773 |     -1.390 |     -0.254 |
| excl      | -0.732 |     -1.122 |     -0.373 |
| funct     | -0.715 |     -1.301 |     -0.190 |
| ipron     | -0.639 |     -1.134 |     -0.182 |
| pronoun   | -0.639 |     -1.166 |     -0.188 |
| relativ   | -0.620 |     -1.103 |     -0.162 |
| tentat    | -0.620 |     -1.135 |     -0.110 |
| space     | -0.597 |     -1.074 |     -0.213 |
| past      | -0.575 |     -0.948 |     -0.239 |
| discrep   | -0.530 |     -0.925 |     -0.157 |
| preps     | -0.525 |     -1.119 |      0.019 |
| negate    |  0.519 |     -0.087 |      1.124 |


In [18]:
# 3) A priori power: sample size for a target effect of d = 0.6
req_n = a_priori_power(effect_size=0.6)
power_message = f"\nRequired N per group for d=0.6, α=0.05, 80% power: {req_n:.1f}"
print(power_message)


Required N per group for d=0.6, α=0.05, 80% power: 44.6


In [22]:
# 3.5) Required N per group implied by each observed effect size
uni = univariate_liwc(df, features, mask_adhd, mask_others)

# Compute required N for each effect size; rows where |d| is not > 0 stay NaN
uni['N'] = uni['abs_cohen_d'].apply(lambda d: a_priori_power(effect_size=d) if d > 0 else np.nan)

print("\nLIWC dimensions |d| > 0.5:")
subset = uni[uni['abs_cohen_d']>0.5]
print(subset.to_markdown(index=False, floatfmt=".3f"))

# Summary statistics for N across all features.
# The header names the scope so this summary is not confused with the next one.
all_n = uni['N'].dropna()
if len(all_n) > 0:
    print("\nSummary of required N per group (all features):")
    print(f"Highest N: {all_n.max():.1f} (feature: {uni.loc[uni['N'].idxmax(), 'feature']})")
    print(f"Lowest N: {all_n.min():.1f} (feature: {uni.loc[uni['N'].idxmin(), 'feature']})")
    print(f"Mean N: {all_n.mean():.1f}")
    print(f"Median N: {all_n.median():.1f}")
else:
    print("\nNo valid N values found")

# Summary statistics for N restricted to features with |d| > 0.5
valid_n = subset['N'].dropna()
if len(valid_n) > 0:
    print("\nSummary of required N per group (|d| > 0.5):")
    print(f"Highest N: {valid_n.max():.1f} (feature: {subset.loc[subset['N'].idxmax(), 'feature']})")
    print(f"Mean N: {valid_n.mean():.1f}")
    print(f"Median N: {valid_n.median():.1f}")
else:
    print("\nNo valid N values found for features with |d| > 0.5")


LIWC dimensions |d| > 0.5:
| feature   |   mean_g1 |   sd_g1 |   mean_g2 |   sd_g2 |   t_stat |   p_val |   bf10 |   cohen_d |   p_fdr | signif   |   abs_cohen_d |      N |
|:----------|----------:|--------:|----------:|--------:|---------:|--------:|-------:|----------:|--------:|:---------|--------------:|-------:|
| cogmech   |     0.288 |   0.114 |     0.351 |   0.074 |   -2.541 |   0.018 |  3.792 |    -0.773 |   0.131 | False    |         0.773 | 27.283 |
| excl      |     0.033 |   0.024 |     0.054 |   0.028 |   -3.692 |   0.001 | 80.826 |    -0.732 |   0.050 | False    |         0.732 | 30.254 |
| funct     |     0.372 |   0.122 |     0.433 |   0.078 |   -2.331 |   0.028 |  2.439 |    -0.715 |   0.150 | False    |         0.715 | 31.690 |
| ipron     |     0.088 |   0.046 |     0.115 |   0.042 |   -2.674 |   0.012 |  5.099 |    -0.639 |   0.131 | False    |         0.639 | 39.384 |
| pronoun   |     0.135 |   0.063 |     0.170 |   0.053 |   -2.494 |   0.019 |  3.421 |    -0.63

In [19]:
# 4) Group differences along the leading principal components
pc_res = pca_group_diff(df, features, mask_adhd, mask_others)
pc_table = pc_res.to_markdown(index=False, floatfmt=".3f")
print("\nPCA group differences:")
print(pc_table)


PCA group differences:
| PC   |   expl_var |   t_stat |   p_val |   p_fdr |
|:-----|-----------:|---------:|--------:|--------:|
| PC1  |      0.149 |   -2.725 |   0.011 |   0.055 |
| PC2  |      0.081 |    1.095 |   0.282 |   0.705 |
| PC3  |      0.069 |   -0.237 |   0.815 |   0.815 |
| PC4  |      0.062 |    0.264 |   0.794 |   0.815 |
| PC5  |      0.053 |    0.448 |   0.658 |   0.815 |


In [20]:
# 5) Features with the largest absolute loadings on PC1
pc1_ld = top_pc1_loadings(df, features, n=10)
signed_loadings = pc1_ld[['loading']]
print("\nTop PC1 loadings:")
print(signed_loadings.to_markdown(floatfmt=".3f"))


Top PC1 loadings:
|         |   loading |
|:--------|----------:|
| ipron   |     0.275 |
| funct   |     0.274 |
| pronoun |     0.274 |
| nonfl   |     0.241 |
| shehe   |     0.238 |
| article |     0.237 |
| social  |     0.230 |
| you     |     0.224 |
| ppron   |     0.221 |
| cogmech |     0.220 |


In [21]:
# mask_others is defined as ~mask_adhd, so the union of the two masks selects every row.
df_sub = df.loc[mask_adhd | mask_others]

# Rebuild the target mask on the subset so it shares df_sub's index.
sub_is_female = df_sub['sex'].eq("Feminino")
sub_is_diagnosed = df_sub['adhd_diagnosis'].eq("Sim, diagnosticado")
new_mask_adhd = sub_is_female & sub_is_diagnosed

print(df_sub.shape)
print(new_mask_adhd.sum())

(160, 127)
23


In [22]:
# Ten largest-magnitude coefficients of the L1-logistic model fitted on all rows.
top_coef = l1_logistic_top(df_sub, features, new_mask_adhd, n=10)
top_coef_table = top_coef.to_markdown(floatfmt=".3f")
print(top_coef_table)

|         |   coef |
|:--------|-------:|
| excl    | -0.268 |
| negate  |  0.243 |
| past    | -0.150 |
| pronoun | -0.147 |
| space   | -0.123 |
| cogmech | -0.100 |
| relativ | -0.099 |
| achieve | -0.057 |
| humans  |  0.029 |
| anx     |  0.027 |


In [23]:
# Nested cross-validated ROC-AUC of the L1-logistic pipeline.
auc_summary = nested_auc(df_sub, features, new_mask_adhd)
mean_auc, sd_auc = auc_summary
print(f"Nested CV AUC: mean={mean_auc:.3f}, SD={sd_auc:.3f}")

Nested CV AUC: mean=0.590, SD=0.126


In [24]:
# Bootstrap selection stability, shown for the features ranked in top_coef.
stab = l1_stability(df_sub, features, new_mask_adhd, n_boot=100)
stab_top = stab.loc[top_coef.index]
print(stab_top.to_markdown(floatfmt=".3f"))

|         |   sel_prop |   mean_coef |   pos_sign_prop |
|:--------|-----------:|------------:|----------------:|
| excl    |      0.690 |      -0.537 |           0.174 |
| negate  |      0.900 |       0.812 |           0.978 |
| past    |      0.660 |      -0.272 |           0.227 |
| pronoun |      0.740 |      -0.710 |           0.000 |
| space   |      0.580 |      -0.295 |           0.172 |
| cogmech |      0.840 |      -1.510 |           0.000 |
| relativ |      0.770 |      -0.509 |           0.091 |
| achieve |      0.840 |      -1.352 |           0.012 |
| humans  |      0.730 |       0.828 |           0.918 |
| anx     |      0.600 |      -0.044 |           0.483 |
