In [1]:
from pathlib import Path
import pandas as pd

In [None]:
# ==============================
# Locate Data_Warehouse
# ==============================
# `__file__` is only defined when this runs as a script/module; in an
# interactive session (e.g. a notebook cell) fall back to the current
# working directory instead.
script_dir = (
    Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
)

def find_data_process_dir(start: Path) -> Path:
    """Return the nearest directory that contains a ``Data_Warehouse`` folder.

    The search begins at *start* itself and then walks up through each of its
    parents, returning the first directory that has a ``Data_Warehouse``
    sub-directory.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first directory (``start`` or one of its ancestors) that contains
        a ``Data_Warehouse`` directory.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor contains a
            ``Data_Warehouse`` directory.
    """
    for candidate in (start, *start.parents):
        # is_dir() rather than exists(): a regular file that merely happens to
        # be called "Data_Warehouse" must not end the search.
        if (candidate / "Data_Warehouse").is_dir():
            return candidate
    raise FileNotFoundError("Could not locate Data_Warehouse folder")

# Resolve the project directory by searching upward from the script location,
# then point at the Data_Warehouse folder inside it.
data_process_dir = find_data_process_dir(script_dir)
warehouse_dir = data_process_dir.joinpath("Data_Warehouse")
print(f"Using Data_Warehouse: {warehouse_dir}")

# ==============================
# Helper to standardize columns
# ==============================
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename known text/label column aliases to ``text`` and ``label``.

    Column names are compared case-insensitively and with surrounding
    whitespace stripped. At most one column is renamed per target so the
    result never gains duplicate ``text``/``label`` columns:

    * if a column is already named exactly ``text`` (or ``label``), it is
      kept and no other alias is renamed onto it;
    * otherwise the first matching alias, in column order, is renamed.

    Columns whose label is not a string (e.g. integer headers) are left
    untouched instead of raising.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame with the selected columns renamed.
    """
    aliases = {
        "text": {"text", "selftext", "content", "sentence", "body", "post"},
        "label": {"label", "labels", "target", "y", "class"},
    }
    col_map = {}
    for target, names in aliases.items():
        candidates = [
            c for c in df.columns
            if isinstance(c, str) and c.lower().strip() in names
        ]
        # Nothing to rename, or the canonical name is already present:
        # renaming another alias would create two columns with the same name.
        if not candidates or target in candidates:
            continue
        col_map[candidates[0]] = target
    return df.rename(columns=col_map)

# ==============================
# Read and check duplicates
# ==============================
# The reports below are written into the same folder that is scanned for
# input. They must be excluded from the glob, otherwise a second run would
# ingest its own output and inflate the duplicate counts.
output_names = {
    "duplicate_report.csv",
    "all_duplicates.csv",
    "combined_mental_condition_dataset.csv",
}

# Sorted so the concatenation order (and therefore which copy of a duplicate
# survives drop_duplicates) is the same on every run and platform.
csv_files = sorted(
    p for p in warehouse_dir.glob("*.csv") if p.name.lower() not in output_names
)
frames = []
summary = []

for f in csv_files:
    try:
        df = pd.read_csv(f)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # One empty or malformed file should not abort the whole report.
        print(f"⚠️ Skipping {f.name}: could not parse CSV ({exc})")
        continue
    df = standardize_columns(df)

    if not {"text", "label"}.issubset(df.columns):
        print(f"⚠️ Skipping {f.name}: missing text/label columns -> {df.columns.tolist()}")
        continue

    total = len(df)
    # Rows that repeat an earlier (text, label) pair of the same file.
    within_dupes = int(df.duplicated(subset=["text", "label"]).sum())
    summary.append({"file": f.name, "rows": total, "within_file_duplicates": within_dupes})
    # Keep only the standardized columns and remember each row's origin.
    frames.append(df[["text", "label"]].assign(file=f.name))

if not frames:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise ValueError(
        f"No CSV file with text/label columns found in {warehouse_dir}"
    )

combined = pd.concat(frames, ignore_index=True)

# Mark every row whose (text, label) pair occurs more than once anywhere,
# whether the repeat is in the same file or in a different one.
combined["is_duplicate"] = combined.duplicated(subset=["text", "label"], keep=False)

# Count cross-file duplicates: first collapse repeats inside each file so a
# pair appears at most once per file, then keep the pairs that still occur
# more than once -- those are present in at least two different files.
# The count is therefore the number of distinct (text, label) pairs of a file
# that also occur in some other file; repeats within one file are reported
# separately in "within_file_duplicates".
once_per_file = combined.drop_duplicates(subset=["text", "label", "file"])
shared_across_files = once_per_file.loc[
    once_per_file.duplicated(subset=["text", "label"], keep=False)
]
cross_file_counts = (
    shared_across_files
    .groupby("file")
    .size()
    .reset_index(name="cross_file_duplicates")
)

# Merge summaries; files without any shared pair get a count of 0.
summary_df = pd.DataFrame(summary).merge(cross_file_counts, on="file", how="left")
summary_df["cross_file_duplicates"] = summary_df["cross_file_duplicates"].fillna(0).astype(int)

# ==============================
# Save outputs
# ==============================
out_summary = warehouse_dir / "duplicate_report.csv"
summary_df.to_csv(out_summary, index=False, encoding="utf-8")

out_all_dupes = warehouse_dir / "all_duplicates.csv"
combined.loc[combined["is_duplicate"]].to_csv(out_all_dupes, index=False, encoding="utf-8")

# Save final combined dataset: one row per distinct (text, label) pair,
# keeping the first occurrence in file order.
out_file = warehouse_dir / "combined_mental_condition_dataset.csv"
final_df = (
    combined[["text", "label"]]
    .drop_duplicates(subset=["text", "label"])
    .reset_index(drop=True)
)
final_df.to_csv(out_file, index=False, encoding="utf-8")

print("\nPer-file duplicate report:")
print(summary_df)
print(f"\n✅ Saved summary at: {out_summary}")
print(f"✅ Saved all duplicate rows at: {out_all_dupes}")
print(f"✅ Final combined dataset saved at: {out_file} ({len(final_df)} rows)")

Using Data_Warehouse: z:\Documents\Projects\PSS_XAI\Data_Process\Data_Warehouse


ValueError: Usecols do not match columns, columns expected but not found: ['text', 'label']