# llm_survey_eval — Demo: Full Pipeline (Tiers 1–4)

This notebook constructs **toy human** and **toy LLM** datasets and runs Tier‑1 (marginals), Tier‑2 (associations), Tier‑3 (joint metrics), and Tier‑4 (inferential equivalence of fitted choice models).

Along the way, we add visual sanity checks: distribution plots for ordinal and nominal variables, a triptych of association matrices, and a 2‑D projection of the joint embedding.

> Tip: This repository is GitHub‑first at this stage. Install locally with `pip install -e .`.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA

from llm_survey_eval.tier1 import run_tier1_comparisons
from llm_survey_eval.tier2 import tier2_structural, plot_three
from llm_survey_eval.tier3 import compute_global_metrics, _embed
from llm_survey_eval.tier4 import evaluate_tier4, summarize_tier4

%matplotlib inline
plt.rcParams['figure.dpi'] = 140

# Ensure a local data folder exists
DATA_DIR = Path('data'); DATA_DIR.mkdir(exist_ok=True)

# Single seeded generator: every random draw in the notebook comes from it,
# so the draw order below determines the exact toy data.
rng = np.random.default_rng(0)
N = 500  # sample size per dataset

ordered_features = [
    'shopping_frequency', 'leisure_frequency', 'service_frequency',
    'shopping_satisfaction', 'leisure_satisfaction', 'service_satisfaction',
]
nominal_features = ['shopping_mode', 'leisure_mode', 'service_mode']

# Toy HUMAN data: ordinal items uniform on 1..6, nominal items drawn from base_p over 1..7.
cats = np.arange(1, 8)
base_p = np.array([0.15, 0.15, 0.20, 0.15, 0.10, 0.10, 0.15])
human = pd.DataFrame({
    'agent_id': np.arange(N),
    **{name: rng.integers(1, 7, size=N) for name in ordered_features},
    **{name: rng.choice(cats, size=N, p=base_p) for name in nominal_features},
})

# Toy LLM data: ordinal answers are the human answers plus small Gaussian jitter,
# rounded and clipped back onto the 1..6 scale; nominal answers use a tilted bias_p.
bias_p = np.array([0.10, 0.14, 0.22, 0.18, 0.14, 0.10, 0.12])
llm_columns = {'agent_id': np.arange(N)}
for name in ordered_features:
    jittered = human[name].astype(float) + 0.25 * rng.normal(size=N)
    llm_columns[name] = np.rint(jittered).clip(1, 6).astype(int)
for name in nominal_features:
    llm_columns[name] = rng.choice(cats, size=N, p=bias_p)
llm = pd.DataFrame(llm_columns)
# Persist both toy datasets: the Tier‑1/Tier‑2 calls below are given CSV paths, not frames.
human_path = DATA_DIR / 'sampled_data.csv'
llm_path = DATA_DIR / 'dsv3.csv'
human.to_csv(human_path, index=False)
llm.to_csv(llm_path, index=False)

# Preview both frames. display() renders each one as a rich table, whereas a bare
# `human.head(), llm.head()` tuple falls back to a plain-text repr of the tuple.
display(human.head(), llm.head())


## Visual sanity checks — Ordinal distributions

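For each ordinal variable, we overlay the empirical distributions of Human vs LLM. Perfect overlap is unlikely in synthetic data: the toy LLM responses are the human responses plus small zero‑mean noise, rounded and clipped to the 1–6 scale, so we expect small differences rather than large or one‑directional shifts.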


In [None]:
# Overlay Human vs LLM histograms, one panel per ordinal item.
levels = np.arange(1, 7)                             # response scale shared by all ordinal items
bins = np.arange(levels[0] - 0.5, levels[-1] + 1.0)  # unit-width bins centred on each level

ncols = 3
nrows = int(np.ceil(len(ordered_features) / ncols))
# squeeze=False keeps `axes` two-dimensional for any grid size, so ravel() always works.
fig, axes = plt.subplots(nrows, ncols, figsize=(11, 7), sharey=True, squeeze=False)
axes = axes.ravel()

for i, col in enumerate(ordered_features):
    ax = axes[i]
    ax.hist(human[col], bins=bins, alpha=0.6, label='Human', density=True)
    ax.hist(llm[col], bins=bins, alpha=0.6, label='LLM', density=True)
    ax.set_title(col)
    ax.set_xticks(levels)
    if i % ncols == 0:
        # y-axis is shared, so only the first panel of each row is labelled
        ax.set_ylabel('Density')
axes[0].legend(frameon=False, loc='upper right')

# Blank the unused panels. Slicing by the feature count avoids depending on the
# loop index surviving the loop above.
for spare_ax in axes[len(ordered_features):]:
    spare_ax.axis('off')

fig.suptitle('Ordinal distributions: Human vs LLM', y=1.02)
fig.tight_layout()
# plt.show() renders the figure once; a trailing `fig` expression would be rendered
# again by the inline backend when the cell finishes.
plt.show()


## Visual sanity checks — Nominal composition

We compare category proportions for each nominal variable; alignment uses the shared category set 1–7.


In [None]:
# Side-by-side bars of category shares, one panel per nominal item.
cats = np.arange(1, 8)  # shared category set, so both series align bar-for-bar
ncols = len(nominal_features)  # one panel per feature instead of a hard-coded 3
nrows = 1
# squeeze=False keeps `axes` an array even when there is a single nominal feature.
fig, axes = plt.subplots(nrows, ncols, figsize=(11, 3.5), sharey=True, squeeze=False)
axes = axes.ravel()

x = np.arange(len(cats))  # bar group positions
w = 0.4                   # width of each bar within a group
for ax, col in zip(axes, nominal_features):
    # reindex on `cats` gives a category that never occurs a share of 0 instead of dropping it
    ph = human[col].value_counts(normalize=True).reindex(cats, fill_value=0).values
    pl = llm[col].value_counts(normalize=True).reindex(cats, fill_value=0).values
    ax.bar(x - w/2, ph, width=w, label='Human')
    ax.bar(x + w/2, pl, width=w, label='LLM')
    ax.set_xticks(x)
    ax.set_xticklabels([str(c) for c in cats])
    ax.set_title(col)

axes[0].set_ylabel('Proportion')
axes[0].legend(frameon=False, loc='upper right')
fig.suptitle('Nominal category proportions', y=1.03)
fig.tight_layout()
# plt.show() renders the figure once; a trailing `fig` expression would be rendered
# again by the inline backend when the cell finishes.
plt.show()


## Tier‑1: Descriptive similarity (marginals)


In [None]:
# Tier‑1 inputs: the two CSVs written above plus the feature lists by measurement type.
# No continuous features exist in the toy data, hence the empty list.
tier1_kwargs = {
    'survey_csv': human_path,
    'llm_csv': llm_path,
    'ordered_features': ordered_features,
    'multinomial_features': nominal_features,
    'continuous_features': [],
    'id_col': 'agent_id',
}
tier1_out = run_tier1_comparisons(**tier1_kwargs)

# Show only the first rows of the result.
display(tier1_out.head(12))


## Tier‑2: Behavioural association consistency (pairwise structure)

We compare the mixed‑type association matrices (Spearman/η/Cramér's V). The right panel shows LLM − Human differences.


In [None]:
# Tier‑2 reads the same two CSVs and returns a mapping; the three entries used
# here are the human matrix, the LLM matrix and their difference.
res2 = tier2_structural(
    human_path,
    llm_path,
    ordered_cols=ordered_features,
    nominal_cols=nominal_features,
    id_col='agent_id',
)
assoc_human, assoc_llm, assoc_delta = (
    res2[key] for key in ('assoc_h', 'assoc_l', 'assoc_diff')
)

# Triptych of the three matrices, displayed as the cell's output.
fig = plot_three(assoc_human, assoc_llm, assoc_delta)
fig.tight_layout()
fig


### Absolute difference heatmap

A quick view of |LLM − Human| helps identify the largest deviations.


In [None]:
# Heatmap of |LLM − Human| association differences.
diff_abs = res2['assoc_diff'].abs()

fig, ax = plt.subplots(figsize=(6, 5))
# Colour scale is pinned to [0, 1] rather than fitted to the data.
im = ax.imshow(diff_abs.values, vmin=0, vmax=1, cmap='magma')

# Label both axes with the variable names carried by the matrix.
ax.set_xticks(range(len(diff_abs.columns)))
ax.set_xticklabels(diff_abs.columns, rotation=90)
ax.set_yticks(range(len(diff_abs.index)))
ax.set_yticklabels(diff_abs.index)

fig.colorbar(im, ax=ax, fraction=0.025, pad=0.02, label='|Δ association|')
fig.suptitle('Absolute differences in associations (|LLM − Human|)')
fig.tight_layout()
# plt.show() renders the figure once; a trailing `fig` expression would be rendered
# again by the inline backend when the cell finishes.
plt.show()


## Tier‑3: Multivariate behavioural fidelity (joint shape)

We embed (ordered → [0,1], nominal → one‑hot with fixed categories) and compute Energy distance (√ED²), Gaussian‑kernel MMD, and C2ST AUC.


In [None]:
# Every nominal feature gets its own copy of the same fixed category list 1..7.
nominal_categories = {name: list(range(1, 8)) for name in nominal_features}

# Tier‑3 works on the in-memory frames (not the CSVs); the seed is fixed at 42.
tier3_kwargs = {
    'ordered_features': ordered_features,
    'nominal_features': nominal_features,
    'nominal_categories': nominal_categories,
    'seed': 42,
    'verbose': False,
}
res3 = compute_global_metrics(human, llm, **tier3_kwargs)
res3


### 2‑D projection of the embedding (PCA)

A rough visual check: if Human and LLM clouds strongly separate in 2‑D, expect high C2ST AUC; if they overlap, AUC ~ 0.5.


In [None]:
# Use the package's internal _embed for a consistent map (ordered scaled, nominal one‑hot),
# applied with the same feature lists and categories to both datasets.
Xh = _embed(human, ordered_features, nominal_features, nominal_categories)
Xl = _embed(llm,   ordered_features, nominal_features, nominal_categories)

# Stack both embeddings and keep a source label per row: 0 = Human, 1 = LLM.
Z = np.vstack([Xh, Xl])
y = np.r_[np.zeros(len(Xh)), np.ones(len(Xl))]

# Fit PCA on the pooled rows so both groups share the same two axes.
pca = PCA(n_components=2, random_state=42).fit(Z)
Z2 = pca.transform(Z)

fig, ax = plt.subplots(figsize=(6, 5))
for source, label in ((0, 'Human'), (1, 'LLM')):
    rows = y == source
    ax.scatter(Z2[rows, 0], Z2[rows, 1], s=10, alpha=0.6, label=label)
ax.set_title('PCA projection of joint embedding')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.legend(frameon=False)
fig.tight_layout()
# plt.show() renders the figure once; a trailing `fig` expression would be rendered
# again by the inline backend when the cell finishes.
plt.show()


## Tier‑4: Inferential equivalence (Ordered / Multinomial Logit)
We compare coefficients between Human and LLM models using DCR (directional consistency) and SMR (significance matching) under a unified mixed‑type feature schema.

In [None]:
# 4.1 Predictor schema. This demo uses one binary, one continuous and one
# nominal predictor; the nominal one lists its categories explicitly.
feature_schema = {
    'gender': {'type': 'binary'},
    'income': {'type': 'continuous'},
    'season': {
        'type': 'nominal',
        'categories': [1, 2, 3, 4],
    },
}

# Draw toy predictors that match the schema. Both frames are extended in place,
# human first and then llm, with independent draws from the shared generator.
for frame in (human, llm):
    n_rows = len(frame)
    frame['gender'] = rng.integers(0, 2, size=n_rows)   # 0/1
    frame['income'] = rng.normal(0, 1, size=n_rows)     # standard normal
    frame['season'] = rng.integers(1, 5, size=n_rows)   # 1..4

# 4.2 Outcomes to model: one ordered (five levels), one multinomial (three levels).
outcomes = {
    'satisfaction_out': {
        'type': 'ordered',
        'levels': [1, 2, 3, 4, 5],
    },
    'mode_out': {
        'type': 'multinomial',
        'levels': [1, 2, 3],
    },
}

# Toy ordered outcome: a noisy latent index cut at fixed thresholds.
def to_ord_latent(df):
    """Return a five-level ordered outcome (ints 1..5) for each row of `df`.

    The latent index is 0.6*income + 0.4*gender plus standard-normal noise drawn
    from the notebook-level `rng`; it is then binned at fixed cut points.
    """
    noise = rng.normal(0, 1, size=len(df))
    latent = 0.6 * df['income'] + 0.4 * df['gender'] + noise
    thresholds = [-np.inf, -0.4, 0.0, 0.5, 1.2, np.inf]
    level_labels = [1, 2, 3, 4, 5]
    binned = pd.cut(latent, bins=thresholds, labels=level_labels)
    return binned.astype(int)

# Human first, then LLM, so the noise draws keep their order.
for frame in (human, llm):
    frame['satisfaction_out'] = to_ord_latent(frame)

def soft_choice(df):
    """Return a three-way choice (1, 2 or 3) for each row of `df`.

    Each alternative has a linear utility in income and gender (the third is
    fixed at zero); Gumbel noise from the notebook-level `rng` is added and the
    alternative with the highest noisy utility is chosen.
    """
    income, gender = df['income'], df['gender']
    systematic = np.column_stack([
        0.4 * income - 0.2 * gender,
        0.1 * income + 0.3 * gender,
        0.0 * income + 0.0 * gender,
    ])
    noisy = systematic + rng.gumbel(size=systematic.shape)
    # argmax is 0-based; shift so the choices are labelled 1..3
    return noisy.argmax(axis=1) + 1

# Human first, then LLM, so the noise draws keep their order.
for frame in (human, llm):
    frame['mode_out'] = soft_choice(frame)

# 4.3 Tier‑4 evaluation on the extended frames, with alpha set to 0.05;
# the summary built from it is displayed as the cell's output.
evals = evaluate_tier4(
    human,
    llm,
    feature_schema,
    outcomes,
    alpha=0.05,
)
summ = summarize_tier4(evals)
summ


In [None]:
# Inspect coefficient‑level comparison for the ordered outcome:
# first 12 rows of the 'detail' entry that evaluate_tier4 returned for 'satisfaction_out'.
evals['satisfaction_out']['detail'].head(12)


In [None]:
# Inspect coefficient‑level comparison for the multinomial outcome:
# first 12 rows of the 'detail' entry that evaluate_tier4 returned for 'mode_out'.
evals['mode_out']['detail'].head(12)
