In [24]:
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# All locations are resolved against the directory the notebook is run from.
WORK_DIR = Path.cwd()
DATA_DIR = WORK_DIR.joinpath("raw")        # input CSVs
OUTPUT_DIR = WORK_DIR.joinpath("cleaned")  # exported splits

# Echo the resolved locations so a wrong working directory is spotted early.
print(
    f"WORK_DIR: {WORK_DIR}",
    f"DATA_DIR: {DATA_DIR}",
    f"OUTPUT_DIR: {OUTPUT_DIR}",
    sep="\n",
)

WORK_DIR: /Users/scott/repos/CBM_NLP/dataset/essay
DATA_DIR: /Users/scott/repos/CBM_NLP/dataset/essay/raw
OUTPUT_DIR: /Users/scott/repos/CBM_NLP/dataset/essay/cleaned


In [23]:
# Annotated source table read by the cleaning step.
SRC = DATA_DIR.joinpath("QA_train_annotated.csv")

# "manual" splits are Path objects under OUTPUT_DIR.
OUT_TRAIN_MAN = OUTPUT_DIR.joinpath("train_manual.csv")
OUT_DEV_MAN = OUTPUT_DIR.joinpath("dev_manual.csv")
OUT_TEST_MAN = OUTPUT_DIR.joinpath("test_manual.csv")

# "generated" splits are kept as plain strings (not Path objects).
OUT_TRAIN_GEN = str(OUTPUT_DIR / "train_generated.csv")
OUT_DEV_GEN = str(OUTPUT_DIR / "dev_generated.csv")
OUT_TEST_GEN = str(OUTPUT_DIR / "test_generated.csv")

# Echo every resolved path, one per line.
print(
    f"SRC: {SRC}",
    f"OUT_TRAIN_MAN: {OUT_TRAIN_MAN}",
    f"OUT_DEV_MAN: {OUT_DEV_MAN}",
    f"OUT_TEST_MAN: {OUT_TEST_MAN}",
    f"OUT_TRAIN_GEN: {OUT_TRAIN_GEN}",
    f"OUT_DEV_GEN: {OUT_DEV_GEN}",
    f"OUT_TEST_GEN: {OUT_TEST_GEN}",
    sep="\n",
)

SRC: /Users/scott/repos/CBM_NLP/dataset/essay/raw/QA_train_annotated.csv
OUT_TRAIN_MAN: /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/train_manual.csv
OUT_DEV_MAN: /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/dev_manual.csv
OUT_TEST_MAN: /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/test_manual.csv
OUT_TRAIN_GEN: /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/train_generated.csv
OUT_DEV_GEN: /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/dev_generated.csv
OUT_TEST_GEN: /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/test_generated.csv


In [27]:


# Names of the concept annotation columns copied from the source CSV into the
# output; load_and_transform maps each one through map_concept.
CONCEPT_COLS = ["FC","CC","TU","CP","R","DU","EE","FR"]

def map_concept(v):
    """Translate a raw concept annotation into a categorical string.

    The value is converted with ``int()``. A result of 3 maps to
    ``"Positive"``, 2 maps to ``"unknown"`` and every other integer maps to
    ``"Negative"``. Anything ``int()`` cannot convert (``None``, NaN,
    non-numeric text, ...) also yields ``"unknown"``.
    """
    try:
        code = int(v)
    except Exception:
        return "unknown"
    # Only two codes are special-cased; all remaining integers are negative.
    return {3: "Positive", 2: "unknown"}.get(code, "Negative")

def to_text(row):
    """Build the model input text from one source row.

    Parameters
    ----------
    row : mapping-like (dict or ``pandas.Series``)
        Read through ``row.get`` for the keys ``"question"`` and
        ``"student_answer"``; either key may be absent.

    Returns
    -------
    str
        ``"Q: <question>\\nA: <answer>"`` when both fields are non-empty,
        otherwise whichever field is non-empty (the answer is preferred), or
        ``""`` when both are missing or blank.
    """
    def _field(name):
        value = row.get(name, "")
        # Missing cells arrive as None/NaN/pd.NA. str() would turn them into
        # the literal text "None"/"nan", which would then pass the empty-text
        # filter downstream, so they are mapped to "" here instead.
        if pd.isna(value):
            return ""
        return str(value).strip()

    q = _field("question")
    a = _field("student_answer")
    if q and a:
        return f"Q: {q}\nA: {a}"
    return a or q

def to_label_binary(v, threshold=3.5):
    """Binarise a score: 1 when it reaches ``threshold``, else 0.

    Parameters
    ----------
    v : any
        Raw score; converted with ``float()``.
    threshold : float, default 3.5
        Inclusive cutoff for the positive class. The default keeps the
        original hard-coded behaviour.

    Returns
    -------
    int
        ``1`` if ``float(v) >= threshold``; ``0`` otherwise. Values that
        cannot be converted, and NaN (which compares False), also give ``0``.
    """
    try:
        score = float(v)
    except (TypeError, ValueError, OverflowError):
        # Not a number at all: fall back to the negative class.
        return 0
    return int(score >= threshold)

def load_and_transform():
    """Read the annotated source CSV and build the cleaned table.

    Reads ``SRC`` and returns a new DataFrame with:

    * ``text``  - produced per row by ``to_text``;
    * ``label`` - ``score_avg`` binarised by ``to_label_binary``;
    * one column per name in ``CONCEPT_COLS``, mapped through ``map_concept``
      (filled with ``"unknown"`` when the source lacks that column).

    Rows are dropped when the text is null or blank, or when ``score_avg`` is
    missing or not numeric. The index of the result is reset.

    Raises
    ------
    KeyError
        If the source file has no ``score_avg`` column.
    """
    df = pd.read_csv(SRC)

    # Coerce scores up front. A missing score must not be binarised: NaN
    # compares False against any cutoff and would silently become label 0,
    # which makes a later dropna on "label" a no-op.
    scores = pd.to_numeric(df["score_avg"], errors="coerce")

    # Base fields
    out = pd.DataFrame()
    out["text"] = df.apply(to_text, axis=1)
    out["label"] = scores.apply(to_label_binary)

    # Concept columns
    for c in CONCEPT_COLS:
        out[c] = df[c].apply(map_concept) if c in df.columns else "unknown"

    # Drop rows with no usable text or no usable score
    has_score = scores.notna()
    has_text = out["text"].notna() & (out["text"].astype(str).str.strip() != "")
    return out[has_score & has_text].reset_index(drop=True)

def stratified_split(df, seed=42, holdout_size=0.30, test_share=0.50, label_col="label"):
    """Split ``df`` into train/dev/test, stratified on the label column.

    The split is done in two stages: ``holdout_size`` of the rows are set
    aside, then that holdout is divided into dev and test with ``test_share``
    of it going to test. The defaults reproduce the original 70/15/15 split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``label_col``.
    seed : int, default 42
        ``random_state`` used for both stages.
    holdout_size : float, default 0.30
        Fraction of ``df`` reserved for dev + test.
    test_share : float, default 0.50
        Fraction of the holdout assigned to test.
    label_col : str, default "label"
        Column used for stratification.

    Returns
    -------
    tuple of (train, dev, test) DataFrames, each with a fresh 0..n-1 index.
    """
    train, holdout = train_test_split(
        df, test_size=holdout_size, stratify=df[label_col], random_state=seed
    )
    dev, test = train_test_split(
        holdout, test_size=test_share, stratify=holdout[label_col], random_state=seed
    )
    return tuple(part.reset_index(drop=True) for part in (train, dev, test))

def main():
    """Run the cleaning pipeline and export all six split files.

    Loads and transforms the source data, splits it into train/dev/test,
    writes the ``*_manual.csv`` files and identical ``*_generated.csv``
    baseline copies, then prints the size and positive-label ratio of each
    split.
    """
    # Create the directory we WRITE to. Creating DATA_DIR (the input folder)
    # here left OUTPUT_DIR missing on a fresh checkout, so to_csv failed.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = load_and_transform()

    # Split
    train, dev, test = stratified_split(df)

    # Save the manual files
    train.to_csv(OUT_TRAIN_MAN, index=False)
    dev.to_csv(OUT_DEV_MAN, index=False)
    test.to_csv(OUT_TEST_MAN, index=False)

    # Save the generated files (baseline: plain copies of the manual splits
    # for now; can later be replaced with LLM-generated concepts)
    train.to_csv(OUT_TRAIN_GEN, index=False)
    dev.to_csv(OUT_DEV_GEN, index=False)
    test.to_csv(OUT_TEST_GEN, index=False)

    # Brief summary statistics
    def stats(name, part):
        print(f"{name}: n={len(part)}, label=1比例={part['label'].mean():.3f}")

    stats("train", train)
    stats("dev", dev)
    stats("test", test)

# Run the pipeline when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()

train: n=1591, label=1比例=0.793
dev: n=341, label=1比例=0.795
test: n=341, label=1比例=0.792


# Validation

In [29]:
import pandas as pd
from collections import Counter

# File names of the six exported splits: the manual trio first, then the
# generated trio, each in train/dev/test order.
files = [
    f"{split}_{kind}.csv"
    for kind in ("manual", "generated")
    for split in ("train", "dev", "test")
]
# Concept columns every exported file is expected to contain.
concepts = ["FC", "CC", "TU", "CP", "R", "DU", "EE", "FR"]

def check_file(path):
    """Print a sanity report for one exported CSV.

    Reports, in order: missing required columns (``text``, ``label`` and every
    name in ``concepts``), null counts, the number of blank texts, the label
    distribution, any concept value outside
    ``{"Positive", "Negative", "unknown"}``, and the normalised value
    distribution of each concept column.

    Checks that need a column are skipped when that column is absent, so a
    file with missing columns yields a report instead of a ``KeyError``.

    Parameters
    ----------
    path : str or path-like
        CSV file to inspect; passed to ``pandas.read_csv``.
    """
    df = pd.read_csv(path)
    need_cols = ["text","label"] + concepts
    missing = [c for c in need_cols if c not in df.columns]
    print(f"\n=== {path} ===")
    if missing:
        print("缺失列:", missing)
    else:
        print("列齐全")

    # Restrict the remaining checks to columns that exist; indexing with the
    # full required list would raise right after reporting the missing ones.
    present_cols = [c for c in need_cols if c in df.columns]
    present_concepts = [c for c in concepts if c in df.columns]

    # Null values / blank text
    null_counts = df[present_cols].isnull().sum().to_dict()
    print("空值计数:", {k:int(v) for k,v in null_counts.items()})
    if "text" in df.columns:
        empty_text = (df["text"].astype(str).str.strip()=="").sum()
        print("空文本行数:", int(empty_text))
    # Label values
    if "label" in df.columns:
        print("label取值分布:", Counter(df["label"]))
    # Concept values must come from the allowed set
    for c in present_concepts:
        vals = set(df[c].astype(str).unique().tolist())
        bad = vals - {"Positive","Negative","unknown"}
        if bad:
            print(f"{c} 含非法取值: {bad}")
    # Concept distributions
    for c in present_concepts:
        vc = df[c].value_counts(normalize=True).to_dict()
        print(f"{c} 分布:", {k: round(v,3) for k,v in vc.items()})

# Run the report on every exported split under OUTPUT_DIR.
for f in files:
    check_file(f"{OUTPUT_DIR}/{f}")


=== /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/train_manual.csv ===
列齐全
空值计数: {'text': 0, 'label': 0, 'FC': 0, 'CC': 0, 'TU': 0, 'CP': 0, 'R': 0, 'DU': 0, 'EE': 0, 'FR': 0}
空文本行数: 0
label取值分布: Counter({1: 1261, 0: 330})
FC 分布: {'Positive': 0.488, 'Negative': 0.296, 'unknown': 0.216}
CC 分布: {'Negative': 0.388, 'Positive': 0.385, 'unknown': 0.227}
TU 分布: {'Positive': 0.507, 'unknown': 0.289, 'Negative': 0.204}
CP 分布: {'Positive': 0.409, 'unknown': 0.312, 'Negative': 0.279}
R 分布: {'Positive': 0.747, 'unknown': 0.138, 'Negative': 0.115}
DU 分布: {'Positive': 0.411, 'Negative': 0.375, 'unknown': 0.214}
EE 分布: {'Negative': 0.774, 'unknown': 0.157, 'Positive': 0.069}
FR 分布: {'Positive': 0.806, 'unknown': 0.156, 'Negative': 0.038}

=== /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/dev_manual.csv ===
列齐全
空值计数: {'text': 0, 'label': 0, 'FC': 0, 'CC': 0, 'TU': 0, 'CP': 0, 'R': 0, 'DU': 0, 'EE': 0, 'FR': 0}
空文本行数: 0
label取值分布: Counter({1: 271, 0: 70})
FC 分布: {'Positive': 0.516, 'Negative': 