In [1]:
import os, re, numpy as np, pandas as pd
from scipy import stats

# --- Notebook configuration -------------------------------------------------
# Input file and the folders that later cells write into.
CSV_PATH   = "mental_health_counseling_conversations_rated.csv"
OUT_DIR    = "data/processed"
REPORT_DIR = "reports"

# Averaged rating columns that get numeric coercion and summary statistics.
RATING_COLS = ["avg_empathy_score", "avg_appropriateness_score", "avg_relevance_score"]

# Raw text columns that are concatenated into the single `text` field.
BASE_TEXT_COLS = ["context", "response"]

# Seed shared by numpy's legacy global RNG and the train/val/test split.
SEED = 42
np.random.seed(SEED)

# Create the output folders up front; existing folders are left as they are.
for _out_path in (OUT_DIR, REPORT_DIR):
    os.makedirs(_out_path, exist_ok=True)

In [2]:
# Optional dependencies. Each one degrades gracefully when it is not installed:
#   regex     -> stdlib `re` (no \p{...} classes, so emoji stripping is disabled)
#   ftfy      -> identity function (no mojibake repair)
#   unidecode -> identity function (no ASCII transliteration)
try:
    import regex as re2
except ImportError:
    re2 = re

try:
    from ftfy import fix_text
except ImportError:
    def fix_text(x): return x

try:
    from unidecode import unidecode
except ImportError:
    def unidecode(x): return x

RE_URL        = re2.compile(r"(https?://\S+|www\.\S+)", flags=re2.IGNORECASE)
RE_EMAIL      = re2.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
RE_PHONE      = re2.compile(r"(?:(?:\+?\d{1,3}[-.\s]*)?(?:\(?\d{2,4}\)?[-.\s]*)?\d{3}[-.\s]?\d{4})")
# `(?<!\S)` means "preceded by whitespace or at the start of the string". It matches the
# same positions as the former `(?<=\s|^)`, but is fixed-width, so it also compiles with
# the stdlib `re` fallback (which rejects variable-width look-behinds).
RE_MENTION    = re2.compile(r"(?<!\S)@\w+")
RE_HASHTAG    = re2.compile(r"(?<!\S)#\w+")
RE_WHITESPACE = re2.compile(r"\s+")
try:
    # Unicode property classes need the `regex` engine; stdlib `re` raises a pattern error.
    RE_EMOJI = re2.compile(r"[\p{So}\p{Sk}\p{Cs}\p{Cn}\U00010000-\U0010FFFF]+", flags=re2.UNICODE)
except re2.error:
    RE_EMOJI = None  # signals to callers that emoji stripping is unavailable

def basic_sanitize(text: str, remove_emoji=True, lowercase=True) -> str:
    """Normalize one piece of free text for downstream processing.

    Steps, in order: encoding repair (`fix_text`), transliteration (`unidecode`),
    replacement of URLs / e-mail addresses / @mentions / #hashtags with a space,
    replacement of phone-like digit runs with " [PHONE] ", optional emoji removal,
    whitespace collapsing, and optional lowercasing.

    Args:
        text: Value to clean. Anything that is not a `str` yields "".
        remove_emoji: Strip emoji/symbol characters when the emoji pattern is available.
        lowercase: Lowercase the final string (the phone placeholder is lowercased too).

    Returns:
        The cleaned, stripped string.
    """
    if not isinstance(text, str):
        return ""

    cleaned = unidecode(fix_text(text))

    # Order matters: URLs and e-mails go first so their digits are not seen as phone numbers.
    replacements = (
        (RE_URL, " "),
        (RE_EMAIL, " "),
        (RE_PHONE, " [PHONE] "),
        (RE_MENTION, " "),
        (RE_HASHTAG, " "),
    )
    for pattern, filler in replacements:
        cleaned = pattern.sub(filler, cleaned)

    if remove_emoji and RE_EMOJI is not None:
        cleaned = RE_EMOJI.sub(" ", cleaned)

    cleaned = RE_WHITESPACE.sub(" ", cleaned).strip()
    return cleaned.lower() if lowercase else cleaned

In [3]:
# Optional third-party helpers with fallbacks, so the cell runs on a bare environment.
try:
    import regex as re2          # richer engine: supports \p{...} Unicode classes
except ImportError:
    re2 = re                     # stdlib engine

try:
    from ftfy import fix_text
except ImportError:
    def fix_text(x): return x    # pass-through when ftfy is missing

try:
    from unidecode import unidecode
except ImportError:
    def unidecode(x): return x   # pass-through when unidecode is missing

RE_URL = re2.compile(r"(https?://\S+|www\.\S+)", re2.IGNORECASE)
RE_EMAIL = re2.compile(r"[A-Za-z0-9_.+-]+@[A-Za-z0-9-]+\.[A-Za-z0-9-.]+")
RE_PHONE = re2.compile(r"(?:(?:\+?\d{1,3}[-.\s]*)?(?:\(?\d{2,4}\)?[-.\s]*)?\d{3}[-.\s]?\d{4})")
# The original look-behind `(?<=\s|^)` is variable-width and cannot be compiled by stdlib
# `re`, which made the fallback path crash. `(?<!\S)` expresses the same condition
# (start of string or preceded by whitespace) and is accepted by both engines.
RE_MENTION = re2.compile(r"(?<!\S)@\w+")
RE_HASHTAG = re2.compile(r"(?<!\S)#\w+")
RE_WS = re2.compile(r"\s+")
try:
    RE_EMOJI = re2.compile(r"[\p{So}\p{Sk}\p{Cs}\p{Cn}\U00010000-\U0010FFFF]+", re2.UNICODE)
except re2.error:
    # stdlib `re` has no \p{...} support; None tells basic_sanitize to skip emoji removal.
    RE_EMOJI = None

def basic_sanitize(text: str, lower: bool = True, rm_emoji: bool = True, *,
                   lowercase=None, remove_emoji=None) -> str:
    """Clean a text value: repair encoding, strip URLs/e-mails/handles, mask phone numbers.

    The other definitions of this function in the notebook, and the call that builds
    `df["text"]`, use the keyword names `remove_emoji` / `lowercase`. They are accepted
    here as keyword-only aliases so this definition works with those call sites too.

    Args:
        text: Value to clean; non-string input returns "".
        lower: Lowercase the result.
        rm_emoji: Remove emoji/symbol characters (only when the emoji pattern compiled).
        lowercase: Alias for `lower`; when given (not None) it takes precedence.
        remove_emoji: Alias for `rm_emoji`; when given (not None) it takes precedence.

    Returns:
        The sanitized string with whitespace collapsed and stripped.
    """
    if lowercase is not None:
        lower = lowercase
    if remove_emoji is not None:
        rm_emoji = remove_emoji

    if not isinstance(text, str):
        return ""
    text = fix_text(text)
    text = unidecode(text)
    text = RE_URL.sub(" ", text)
    text = RE_EMAIL.sub(" ", text)
    text = RE_PHONE.sub(" [PHONE] ", text)
    text = RE_MENTION.sub(" ", text)
    text = RE_HASHTAG.sub(" ", text)
    if rm_emoji and RE_EMOJI:
        text = RE_EMOJI.sub(" ", text)
    text = RE_WS.sub(" ", text).strip()
    if lower:
        # Note: this also lowercases the "[PHONE]" placeholder inserted above.
        text = text.lower()
    return text


In [4]:
# === Patch: redefine basic_sanitize with expected keyword args ===
import re

# Optional packages; every fallback keeps the cell runnable without them.
try:
    import regex as re2
except ImportError:
    re2 = re

try:
    from ftfy import fix_text
except ImportError:
    def fix_text(x): return x

try:
    from unidecode import unidecode
except ImportError:
    def unidecode(x): return x

# compile patterns inside this cell so it's self-contained
RE_URL        = re2.compile(r"(https?://\S+|www\.\S+)", flags=re2.IGNORECASE)
RE_EMAIL      = re2.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
RE_PHONE      = re2.compile(r"(?:(?:\+?\d{1,3}[-.\s]*)?(?:\(?\d{2,4}\)?[-.\s]*)?\d{3}[-.\s]?\d{4})")
# Fixed-width negative look-behind: "not preceded by a non-space character", i.e. at the
# start of the string or after whitespace. Unlike `(?<=\s|^)`, it compiles under stdlib
# `re` as well, so the `re2 = re` fallback no longer raises at import time.
RE_MENTION    = re2.compile(r"(?<!\S)@\w+")
RE_HASHTAG    = re2.compile(r"(?<!\S)#\w+")
RE_WHITESPACE = re2.compile(r"\s+")
try:
    RE_EMOJI = re2.compile(r"[\p{So}\p{Sk}\p{Cs}\p{Cn}\U00010000-\U0010FFFF]+", flags=re2.UNICODE)
except re2.error:
    # \p{...} is only understood by `regex`; with stdlib `re` emoji removal is switched off.
    RE_EMOJI = None

def basic_sanitize(text: str, remove_emoji: bool = True, lowercase: bool = True) -> str:
    """Return a cleaned copy of `text`, or "" when `text` is not a string.

    The pipeline repairs encoding, transliterates, blanks out URLs, e-mail addresses,
    @mentions and #hashtags, masks phone-like numbers as " [PHONE] ", optionally removes
    emoji (if the emoji pattern is available), collapses whitespace and optionally
    lowercases the result.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs applied strictly in this order.
    substitutions = [
        (RE_URL, " "),
        (RE_EMAIL, " "),
        (RE_PHONE, " [PHONE] "),
        (RE_MENTION, " "),
        (RE_HASHTAG, " "),
    ]
    if remove_emoji and RE_EMOJI is not None:
        substitutions.append((RE_EMOJI, " "))
    # Whitespace collapsing must run last so the padding spaces added above are merged.
    substitutions.append((RE_WHITESPACE, " "))

    out = unidecode(fix_text(text))
    for rx, repl in substitutions:
        out = rx.sub(repl, out)
    out = out.strip()

    if lowercase:
        out = out.lower()
    return out

In [5]:
# Load the rated conversations and show the raw schema and size.
df = pd.read_csv(CSV_PATH)
print("Columns:", df.columns.tolist())
print("Shape before:", df.shape)

# Coerce the averaged rating columns to numeric; values that cannot be parsed become NaN.
rating_cols_in_file = [name for name in RATING_COLS if name in df.columns]
for rating_col in rating_cols_in_file:
    df[rating_col] = pd.to_numeric(df[rating_col], errors="coerce")

# Build text: context + response
def build_text(row, cols=None, sep=" [CTX] "):
    """Join the non-empty string fields of one record into a single text value.

    Args:
        row: A mapping-like record (e.g. a DataFrame row as a Series, or a dict)
            supporting `in` and item access by column name.
        cols: Column names to concatenate, in order. Defaults to `BASE_TEXT_COLS`.
        sep: Separator placed between the parts. Defaults to " [CTX] ".

    Returns:
        The stripped parts joined by `sep`. Columns that are missing, not strings
        (e.g. NaN), or blank after stripping are skipped; "" if nothing remains.
    """
    if cols is None:
        # Resolved at call time so the notebook-level setting is always honoured.
        cols = BASE_TEXT_COLS
    parts = [
        row[c].strip()
        for c in cols
        if c in row and isinstance(row[c], str) and row[c].strip()
    ]
    # Joining an empty list already yields "", so no special case is needed.
    return sep.join(parts)

# Combine context and response into one field, sanitize it, and count whitespace-separated tokens.
df["text"] = df.apply(build_text, axis=1)
df["text"] = df["text"].map(lambda raw: basic_sanitize(raw, remove_emoji=True, lowercase=True))
df["text_len"] = df["text"].astype(str).str.split().apply(len)

# Keep rows that have non-blank text of at least two tokens, then drop exact duplicate texts.
before = len(df)
has_text = df["text"].str.strip() != ""
long_enough = df["text_len"] >= 2
df = df[has_text & long_enough]
df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)
print(f"Filtered {before - len(df)} rows; remain: {len(df)}")

# Preview: cleaned text, its length, and whichever rating columns exist.
preview_cols = ["text", "text_len"] + [c for c in RATING_COLS if c in df.columns]
df[preview_cols].head(3)

Columns: ['index', 'context', 'response', 'empathy_llama-3-2-1b', 'empathy_llama-3-2-3b', 'empathy_llama-3-1-8b', 'empathy_qwen-2-5-7b', 'appropriateness_llama-3-2-1b', 'appropriateness_llama-3-2-3b', 'appropriateness_llama-3-1-8b', 'appropriateness_qwen-2-5-7b', 'relevance_llama-3-2-1b', 'relevance_llama-3-2-3b', 'relevance_llama-3-1-8b', 'relevance_qwen-2-5-7b', 'explanation_llama-3-2-1b', 'explanation_llama-3-2-3b', 'explanation_llama-3-1-8b', 'explanation_qwen-2-5-7b', 'generated_text_llama-3-2-1b', 'generated_text_llama-3-2-3b', 'generated_text_llama-3-1-8b', 'generated_text_qwen-2-5-7b', 'avg_empathy_score', 'avg_appropriateness_score', 'avg_relevance_score']
Shape before: (3512, 26)
Filtered 1483 rows; remain: 2029


Unnamed: 0,text,text_len,avg_empathy_score,avg_appropriateness_score,avg_relevance_score
0,i'm going through some things with my feelings...,224,2.5,2.75,4.25
1,i'm going through some things with my feelings...,429,4.5,5.0,4.75
2,i'm going through some things with my feelings...,121,3.0,4.25,3.25


In [6]:
from sklearn.model_selection import train_test_split


# Stratify the split on empathy-score terciles when that column is available.
BIN_COL = None
if "avg_empathy_score" in df.columns:
    # labels=False returns integer bin codes (0, 1, 2, ...). An explicit three-item label
    # list combined with duplicates="drop" raises ValueError whenever tied quantile edges
    # are actually dropped, because the number of labels no longer matches the bin count.
    df["emp_bin"] = pd.qcut(df["avg_empathy_score"], q=3, labels=False, duplicates="drop")
    BIN_COL = "emp_bin"

# NOTE(review): rows are split individually. If the same `context` occurs with several
# responses, near-identical prompts can land in different splits — confirm this is acceptable.

# 80% train, then the 20% hold-out is halved into validation and test (10% each).
train, tv = train_test_split(
    df, test_size=0.2, random_state=SEED,
    stratify=df[BIN_COL] if BIN_COL else None
)
val, test = train_test_split(
    tv, test_size=0.5, random_state=SEED,
    stratify=tv[BIN_COL] if BIN_COL else None
)

print("sizes:", len(train), len(val), len(test))


# Persist the three splits as CSV files (without the index column) under OUT_DIR.
split_files = {"train.csv": train, "val.csv": val, "test.csv": test}
for file_name, split_frame in split_files.items():
    split_frame.to_csv(os.path.join(OUT_DIR, file_name), index=False)
print("Saved to", OUT_DIR)

sizes: 1623 203 203
Saved to data/processed


In [7]:
def iqr_outlier_share(s: pd.Series):
    """Compute Tukey 1.5*IQR fences for a series and the fraction of values outside them.

    Values are coerced to numeric first; entries that cannot be parsed, and NaNs, are ignored.

    Returns:
        A `(low, high, share)` tuple of floats, or three NaNs when no numeric values remain.
    """
    values = pd.to_numeric(s, errors="coerce").dropna()
    if values.empty:
        return np.nan, np.nan, np.nan

    quartiles = values.quantile([0.25, 0.75])
    q1, q3 = quartiles.iloc[0], quartiles.iloc[1]
    spread = q3 - q1
    lower_fence = q1 - 1.5 * spread
    upper_fence = q3 + 1.5 * spread

    # Mean of a boolean mask = share of values strictly outside the fences.
    is_outlier = values.lt(lower_fence) | values.gt(upper_fence)
    return float(lower_fence), float(upper_fence), float(is_outlier.mean())

def ks_pvalue(a: pd.Series, b: pd.Series):
    """Two-sample Kolmogorov–Smirnov p-value for two series.

    Both inputs are coerced to numeric and NaNs are dropped. If either sample then has
    fewer than 50 observations the test is skipped and NaN is returned.
    """
    sample_a = pd.to_numeric(a, errors="coerce").dropna()
    sample_b = pd.to_numeric(b, errors="coerce").dropna()

    if min(len(sample_a), len(sample_b)) < 50:
        return np.nan

    _, p_value = stats.ks_2samp(sample_a, sample_b)
    return float(p_value)

# Missing-value rate per column (train and val)
rating_cols_present = [c for c in RATING_COLS if c in train.columns]
cols_to_check = ["text"] + rating_cols_present
miss_train = train[cols_to_check].isna().mean()
miss_val   = val[cols_to_check].isna().mean()

# Descriptive statistics & IQR
# For every metric present in both splits: compare means, and compute IQR fences on train only.
desc_rows, iqr_rows = [], []
for metric in rating_cols_present + ["text_len"]:
    if metric not in train.columns or metric not in val.columns:
        continue
    train_mean = pd.to_numeric(train[metric], errors="coerce").mean()
    val_mean = pd.to_numeric(val[metric], errors="coerce").mean()
    desc_rows.append({"metric": metric, "train_mean": train_mean, "val_mean": val_mean})

    low, high, share = iqr_outlier_share(train[metric])
    iqr_rows.append({"metric": metric, "low": low, "high": high, "outlier_share": share})

desc_df = pd.DataFrame(desc_rows)
iqr_df  = pd.DataFrame(iqr_rows)


# Train-vs-val distribution check (KS p-value) for the columns available in both splits.
ks = {
    col: ks_pvalue(train[col], val[col])
    for col in ("avg_empathy_score", "text_len")
    if col in train.columns and col in val.columns
}

# Rater consistency
# Individual empathy rating columns ("empathy_*"); the averaged score is never included.
empathy_cols = [
    col
    for col in df.columns
    if col.startswith("empathy_") and col != "avg_empathy_score"
]
def cronbach_alpha(df_sub: pd.DataFrame):
    """Cronbach's alpha across the columns of `df_sub` (one column per item/rater).

    Rows with any missing value are discarded. Returns NaN when fewer than two columns
    or no complete rows are available, or when the variance of the row sums is zero.
    """
    complete = df_sub.dropna(axis=0, how="any")
    n_rows, k = complete.shape
    if k < 2 or n_rows == 0:
        return np.nan

    item_variance_sum = complete.var(axis=0, ddof=1).sum()
    total_var = complete.sum(axis=1).var(ddof=1)
    if total_var == 0:
        # Alpha is undefined when the summed scores do not vary at all.
        return np.nan

    # alpha = k/(k-1) * (1 - sum of item variances / variance of the total score)
    return float((k / (k - 1)) * (1 - item_variance_sum / total_var))

# Alpha needs at least two rater columns; otherwise record it as not available.
if len(empathy_cols) >= 2:
    alpha_empathy = cronbach_alpha(train[empathy_cols])
else:
    alpha_empathy = np.nan

# Show all check results together as the cell output.
desc_df, iqr_df, ks, alpha_empathy

(                      metric  train_mean    val_mean
 0          avg_empathy_score    3.444855    3.357143
 1  avg_appropriateness_score    3.959489    3.859606
 2        avg_relevance_score    4.072089    4.018473
 3                   text_len  230.064695  221.266010,
                       metric     low     high  outlier_share
 0          avg_empathy_score   1.500    5.500       0.008010
 1  avg_appropriateness_score   1.625    6.625       0.012323
 2        avg_relevance_score   2.250    6.250       0.033888
 3                   text_len -76.500  495.500       0.051756,
 {'avg_empathy_score': 0.2909456262837639, 'text_len': 0.3134243942218427},
 0.7731378309976039)

In [8]:
from IPython.display import Markdown, display

# Assemble the Markdown report line by line from the results of the checks above.
lines = []
lines.append("# Trustworthiness Report\n")
lines.append("This report summarizes data trustworthiness checks: missingness, distribution shifts, outliers, and rater consistency.\n")

lines.append("## Missingness\n**Train**")
lines += [f"- {k}: {v:.3f}" for k, v in miss_train.items()]
lines.append("\n**Val**")
lines += [f"- {k}: {v:.3f}" for k, v in miss_val.items()]

lines.append("\n## Descriptive Statistics (train vs. val)\n")
for _, r in desc_df.iterrows():
    lines.append(f"- {r['metric']}: train_mean={r['train_mean']:.3f}, val_mean={r['val_mean']:.3f}")

lines.append("\n## IQR Outlier Share (train)\n")
for _, r in iqr_df.iterrows():
    lines.append(f"- {r['metric']}: bounds=({r['low']:.3f}, {r['high']:.3f}), outlier_share={r['outlier_share']:.3f}")

lines.append("\n## Drift Check (KS test p-values)\n")
for c, p in ks.items():
    if np.isnan(p):
        # A NaN p-value means the KS test did not produce a result, so the column must
        # not be reported as "OK" — it simply was not tested.
        p_text, flag = "NA", "not tested"
    else:
        p_text = p
        flag = "drift suspected" if p < 0.05 else "OK"
    lines.append(f"- {c}: p={p_text} → {flag}")

lines.append("\n## Rater Consistency (Cronbach’s α)\n")
if np.isnan(alpha_empathy):
    lines.append("- empathy_* columns insufficient; α = NA")
else:
    lines.append(f"- empathy_* internal consistency (alpha) = {alpha_empathy:.3f}")

lines.append("\n---\n*Optional integrations:* Evidently (drift dashboards), AIF360 (fairness metrics) can be added if installed.\n")

report_md = "\n".join(lines)

# Write the report to disk as UTF-8, then render it inline and confirm the location.
report_path = os.path.join(REPORT_DIR, "trustworthiness_report.md")
with open(report_path, "w", encoding="utf-8") as report_file:
    report_file.write(report_md)

display(Markdown(report_md))
print("[OK] wrote:", report_path)

# Trustworthiness Report

This report summarizes data trustworthiness checks: missingness, distribution shifts, outliers, and rater consistency.

## Missingness
**Train**
- text: 0.000
- avg_empathy_score: 0.000
- avg_appropriateness_score: 0.000
- avg_relevance_score: 0.000

**Val**
- text: 0.000
- avg_empathy_score: 0.000
- avg_appropriateness_score: 0.000
- avg_relevance_score: 0.000

## Descriptive Statistics (train vs. val)

- avg_empathy_score: train_mean=3.445, val_mean=3.357
- avg_appropriateness_score: train_mean=3.959, val_mean=3.860
- avg_relevance_score: train_mean=4.072, val_mean=4.018
- text_len: train_mean=230.065, val_mean=221.266

## IQR Outlier Share (train)

- avg_empathy_score: bounds=(1.500, 5.500), outlier_share=0.008
- avg_appropriateness_score: bounds=(1.625, 6.625), outlier_share=0.012
- avg_relevance_score: bounds=(2.250, 6.250), outlier_share=0.034
- text_len: bounds=(-76.500, 495.500), outlier_share=0.052

## Drift Check (KS test p-values)

- avg_empathy_score: p=0.2909456262837639 → OK
- text_len: p=0.3134243942218427 → OK

## Rater Consistency (Cronbach’s α)

- empathy_* internal consistency (alpha) = 0.773

---
*Optional integrations:* Evidently (drift dashboards), AIF360 (fairness metrics) can be added if installed.


[OK] wrote: reports\trustworthiness_report.md
