In [12]:
#!/usr/bin/env python3
"""Data preparation script for essay dataset - PLMs and CBE-PLMs only.

This script processes the raw essay data and creates the required CSV files
for PLMs (standard) and CBE-PLMs (joint) experiments.
"""
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Directory layout, resolved against the current working directory
WORK_DIR = Path.cwd()
DATA_DIR = WORK_DIR.joinpath("raw")
OUTPUT_DIR = WORK_DIR.joinpath("cleaned")

# Raw annotated input file
SRC = DATA_DIR.joinpath("QA_train_annotated.csv")

# Per-split output files (only the "manual" variants are written)
OUT_TRAIN_MAN = OUTPUT_DIR.joinpath("train_manual.csv")
OUT_DEV_MAN = OUTPUT_DIR.joinpath("dev_manual.csv")
OUT_TEST_MAN = OUTPUT_DIR.joinpath("test_manual.csv")

# Names of the concept annotation columns carried into the output
CONCEPT_COLS = [
    "FC",
    "CC",
    "TU",
    "CP",
    "R",
    "DU",
    "EE",
    "FR",
]

In [13]:
def map_concept(v):
    """Translate a numeric concept score into its categorical label.

    The value is coerced with ``int()``. A score of 3 becomes
    ``"Positive"``, 2 becomes ``"unknown"`` and every other integer becomes
    ``"Negative"``. Anything that cannot be coerced to an integer (for
    example ``None``, NaN or free text) is reported as ``"unknown"``.
    """
    labels_by_score = {3: "Positive", 2: "unknown"}
    try:
        score = int(v)
    except Exception:
        # Uncoercible input is treated the same as an undecided score.
        return "unknown"
    return labels_by_score.get(score, "Negative")


def _clean_field(row, key):
    """Return ``row[key]`` as a stripped string, or ``""`` when it is missing."""
    value = row.get(key, "")
    # str(NaN) is "nan" and str(None) is "None"; treat missing cells as empty
    # instead of letting those literals leak into the model input.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value).strip()


def to_text(row):
    """Build the model input text from a row's question and student answer.

    Args:
        row: Mapping-like object with a ``.get`` method (a dict or a pandas
            Series) that may hold ``"question"`` and ``"student_answer"``.

    Returns:
        ``"Q: <question>\\nA: <answer>"`` when both fields are non-empty,
        otherwise whichever single field is non-empty, otherwise ``""``.
        Missing values (``None``/NaN/``pd.NA``) count as empty.
    """
    q = _clean_field(row, "question")
    a = _clean_field(row, "student_answer")
    if q and a:
        return f"Q: {q}\nA: {a}"
    return a or q


def to_label_binary(v, threshold=3.5):
    """Convert an average score into a binary label.

    Args:
        v: Score to classify; anything ``float()`` accepts.
        threshold: Inclusive cutoff for the positive class. Defaults to 3.5.

    Returns:
        1 when the score is greater than or equal to ``threshold``, else 0.
        Values that cannot be parsed as a float, and NaN (which never
        compares ``>=``), yield 0.
    """
    try:
        score = float(v)
    except (TypeError, ValueError, OverflowError):
        # float() raises TypeError for None/containers, ValueError for
        # non-numeric text and OverflowError for oversized integers.
        return 0
    return 1 if score >= threshold else 0


def load_and_transform():
    """Load the raw annotated CSV from ``SRC`` and build the modelling frame.

    The returned DataFrame has a ``text`` column (built by ``to_text``), a
    binary ``label`` column (``to_label_binary`` applied to ``score_avg``)
    and one column per entry of ``CONCEPT_COLS`` (mapped with
    ``map_concept``, or filled with ``"unknown"`` when the raw file lacks
    that column).

    Rows whose ``score_avg`` is missing or non-numeric are dropped before
    labelling. ``to_label_binary`` maps such values to 0, so without this
    filter they would be silently labelled as negatives and the later
    ``dropna`` on ``label`` could never remove them.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index.
    """
    print(f"Loading data from: {SRC}")
    raw = pd.read_csv(SRC)

    # Keep only rows that carry a usable numeric score.
    numeric_score = pd.to_numeric(raw["score_avg"], errors="coerce")
    n_unscored = int(numeric_score.isna().sum())
    if n_unscored:
        print(f"Dropping {n_unscored} rows with missing or non-numeric score_avg")
    scored = raw.loc[numeric_score.notna()]

    out = pd.DataFrame()
    out["text"] = scored.apply(to_text, axis=1)
    out["label"] = scored["score_avg"].apply(to_label_binary)

    # Map concept columns; absent columns are filled with "unknown"
    for c in CONCEPT_COLS:
        if c in scored.columns:
            out[c] = scored[c].apply(map_concept)
        else:
            out[c] = "unknown"

    # Clean data: drop rows with null or whitespace-only text
    out = out.dropna(subset=["text", "label"])
    out = out[out["text"].astype(str).str.strip() != ""].reset_index(drop=True)

    print(f"Loaded {len(out)} samples after cleaning")
    return out


def stratified_split(df, seed=42):
    """Split ``df`` into train/dev/test parts (7:2:1), stratified on ``label``.

    Args:
        df: DataFrame containing a ``label`` column.
        seed: Random state passed to both ``train_test_split`` calls.

    Returns:
        ``(train, dev, test)`` DataFrames, each with a reset index.
    """
    holdout_fraction = 0.30        # dev + test together: 30% of all rows
    test_share_of_holdout = 1 / 3  # 10% of all rows; the other 2/3 (20%) is dev

    train_part, holdout = train_test_split(
        df,
        test_size=holdout_fraction,
        stratify=df["label"],
        random_state=seed,
    )
    dev_part, test_part = train_test_split(
        holdout,
        test_size=test_share_of_holdout,
        stratify=holdout["label"],
        random_state=seed,
    )
    return tuple(
        part.reset_index(drop=True)
        for part in (train_part, dev_part, test_part)
    )


def validate_data(df, name):
    """Print a quality report for one data split.

    The report covers the frame's shape, the normalized label distribution,
    the normalized value distribution of every column in ``CONCEPT_COLS``,
    and the per-column null counts over ``text``, ``label`` and the concept
    columns.

    Args:
        df: Split to inspect.
        name: Display name used in the report header.
    """
    print(f"\n=== {name} Validation ===")
    print(f"Shape: {df.shape}")
    label_share = df["label"].value_counts(normalize=True).to_dict()
    print(f"Label distribution: {label_share}")

    # Concept distributions, one line per concept column
    for concept in CONCEPT_COLS:
        concept_share = df[concept].value_counts(normalize=True).to_dict()
        print(f"{concept}: {concept_share}")

    # Null counts; only columns that actually contain nulls are listed
    null_counts = df[["text", "label", *CONCEPT_COLS]].isnull().sum()
    with_nulls = null_counts[null_counts > 0]
    if with_nulls.empty:
        print("No missing values")
    else:
        print(f"Missing values: {with_nulls.to_dict()}")


def main():
    """Run the whole preparation pipeline and report on the result.

    Creates ``OUTPUT_DIR`` if needed, loads and transforms the raw data,
    splits it into train/dev/test, writes the three "manual" CSV files,
    then prints a validation report and summary statistics for each split.
    """
    rule = "=" * 60

    print(rule)
    print("ESSAY DATASET PREPARATION - PLMs & CBE-PLMs ONLY")
    print(rule)

    # Make sure the destination directory exists before anything is written
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    df = load_and_transform()

    print("\nPerforming stratified split...")
    train, dev, test = stratified_split(df)

    # (display name, frame, destination) for each split, in reporting order
    splits = (
        ("Train", train, OUT_TRAIN_MAN),
        ("Dev", dev, OUT_DEV_MAN),
        ("Test", test, OUT_TEST_MAN),
    )

    print("\nSaving manual files...")
    for _, part, target in splits:
        part.to_csv(target, index=False)

    print(f"Saved to: {OUTPUT_DIR}")

    for title, part, _ in splits:
        validate_data(part, title)

    print("\n" + rule)
    print("SUMMARY STATISTICS")
    print(rule)

    # Size and share of positive labels per split
    for title, part, _ in splits:
        print(f"{title}: n={len(part)}, label=1比例={part['label'].mean():.3f}")

    print("\n" + rule)
    print("DATA PREPARATION COMPLETED")
    print(rule)
    print("Files created:")
    for _, _, target in splits:
        print(f"  - {target}")
    print("\nReady for PLMs and CBE-PLMs experiments!")

In [14]:
# Run the preparation pipeline: loads the raw CSV, splits it and writes the
# train/dev/test CSV files into OUTPUT_DIR.
main()

ESSAY DATASET PREPARATION - PLMs & CBE-PLMs ONLY
Loading data from: /Users/scott/repos/CBM_NLP/dataset/essay/raw/QA_train_annotated.csv
Loaded 2273 samples after cleaning

Performing stratified split...

Saving manual files...
Saved to: /Users/scott/repos/CBM_NLP/dataset/essay/cleaned

=== Train Validation ===
Shape: (1591, 10)
Label distribution: {1: 0.792583280955374, 0: 0.20741671904462602}
FC: {'Positive': 0.48774355751099935, 'Negative': 0.2960402262727844, 'unknown': 0.21621621621621623}
CC: {'Negative': 0.38843494657448147, 'Positive': 0.3846637335009428, 'unknown': 0.22690131992457574}
TU: {'Positive': 0.5065996228786926, 'unknown': 0.2891263356379635, 'Negative': 0.2042740414833438}
CP: {'Positive': 0.40917661847894404, 'unknown': 0.3117536140791955, 'Negative': 0.27906976744186046}
R: {'Positive': 0.7467001885606537, 'unknown': 0.13827781269641734, 'Negative': 0.11502199874292897}
DU: {'Positive': 0.4110622250157134, 'Negative': 0.37460716530483973, 'unknown': 0.2143306096794

# Validation

In [15]:
import glob
import pandas as pd
from collections import Counter

# glob returns matches in arbitrary order; sort so the reports below come out
# in the same order on every run.
files = sorted(glob.glob(f"{OUTPUT_DIR}/*.csv"))
# Reuse the concept list from the configuration cell so the two cannot drift.
concepts = list(CONCEPT_COLS)

def check_file(path, concept_cols=None):
    """Print a sanity report for one prepared CSV file.

    The report lists missing required columns, null counts, the number of
    blank ``text`` rows, the ``label`` value counts, any concept values
    outside ``{"Positive", "Negative", "unknown"}`` and each concept's
    normalized distribution.

    Required columns that are absent from the file are reported and then
    skipped, so the remaining checks still run instead of raising
    ``KeyError``.

    Args:
        path: Path of the CSV file to inspect.
        concept_cols: Concept column names to check. Defaults to the
            module-level ``concepts`` list.
    """
    if concept_cols is None:
        concept_cols = concepts
    concept_cols = list(concept_cols)

    df = pd.read_csv(path)
    need_cols = ["text", "label"] + concept_cols
    missing = [c for c in need_cols if c not in df.columns]
    present = [c for c in need_cols if c in df.columns]
    present_concepts = [c for c in concept_cols if c in df.columns]

    print(f"\n=== {path} ===")
    if missing:
        print("缺失列:", missing)
    else:
        print("列齐全")
    # Null counts, and rows whose text is blank after stripping
    null_counts = df[present].isnull().sum().to_dict()
    print("空值计数:", {k: int(v) for k, v in null_counts.items()})
    if "text" in df.columns:
        empty_text = (df["text"].astype(str).str.strip() == "").sum()
        print("空文本行数:", int(empty_text))
    # Label value counts
    if "label" in df.columns:
        print("label取值分布:", Counter(df["label"]))
    # Concept values must be one of the three allowed labels
    for c in present_concepts:
        vals = set(df[c].astype(str).unique().tolist())
        bad = vals - {"Positive", "Negative", "unknown"}
        if bad:
            print(f"{c} 含非法取值: {bad}")
    # Concept distributions
    for c in present_concepts:
        vc = df[c].value_counts(normalize=True).to_dict()
        print(f"{c} 分布:", {k: round(v, 3) for k, v in vc.items()})

# Report on every CSV collected in `files`.
for csv_path in files:
    check_file(csv_path)


=== /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/train_manual.csv ===
列齐全
空值计数: {'text': 0, 'label': 0, 'FC': 0, 'CC': 0, 'TU': 0, 'CP': 0, 'R': 0, 'DU': 0, 'EE': 0, 'FR': 0}
空文本行数: 0
label取值分布: Counter({1: 1261, 0: 330})
FC 分布: {'Positive': 0.488, 'Negative': 0.296, 'unknown': 0.216}
CC 分布: {'Negative': 0.388, 'Positive': 0.385, 'unknown': 0.227}
TU 分布: {'Positive': 0.507, 'unknown': 0.289, 'Negative': 0.204}
CP 分布: {'Positive': 0.409, 'unknown': 0.312, 'Negative': 0.279}
R 分布: {'Positive': 0.747, 'unknown': 0.138, 'Negative': 0.115}
DU 分布: {'Positive': 0.411, 'Negative': 0.375, 'unknown': 0.214}
EE 分布: {'Negative': 0.774, 'unknown': 0.157, 'Positive': 0.069}
FR 分布: {'Positive': 0.806, 'unknown': 0.156, 'Negative': 0.038}

=== /Users/scott/repos/CBM_NLP/dataset/essay/cleaned/test_manual.csv ===
列齐全
空值计数: {'text': 0, 'label': 0, 'FC': 0, 'CC': 0, 'TU': 0, 'CP': 0, 'R': 0, 'DU': 0, 'EE': 0, 'FR': 0}
空文本行数: 0
label取值分布: Counter({1: 181, 0: 47})
FC 分布: {'Positive': 0.465, 'Negative':