In [None]:
import pandas as pd
import numpy as np
import os

# Input files to clean. Each path follows "<position>_<dataset>_corrupted.csv";
# edit the dataset labels below (or replace the list) to point at other files.
csv_files = [
    f"{position}_{dataset}_corrupted.csv"
    for position, dataset in enumerate(
        (
            "stores",
            "products",
            "customers",
            "promotions",
            "loyalty_rules",
            "sales_headers",
            "sales_line_items",
        ),
        start=1,
    )
]


# CLEANING LOGIC

def _strip_text(value):
    """Return *value* without surrounding whitespace when it is a string."""
    return value.strip() if isinstance(value, str) else value


def _to_number(series):
    """Parse *series* as numbers, tolerating "," thousands separators.

    Already-numeric columns are returned unchanged; values that cannot be
    parsed become NaN instead of raising.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    text = series.astype(str).str.replace(",", "", regex=False).str.strip()
    return pd.to_numeric(text, errors="coerce")


def clean_csv(df):
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Steps, in order:

    1. Consistency: column names are stripped, lower-cased and spaces become
       underscores; text cells are stripped of surrounding whitespace.
    2. Typing: ``price`` and ``quantity`` are parsed as numbers (commas
       removed) and ``date`` is parsed with ``pd.to_datetime``; anything
       unparseable becomes missing.
    3. Completeness: rows missing any of the essential columns that exist
       (``id``, ``date``, ``price``, ``quantity``) are dropped.
    4. Validity: rows with ``price < 0``, ``quantity <= 0`` or a known
       ``age`` outside 0-120 are dropped. A missing ``age`` is kept because
       ``age`` is not an essential column.
    5. Remaining missing values are replaced with ``"Unknown"``.
    6. Uniqueness: exact duplicate rows are removed.
    7. Integrity: ``calculated_total = price * quantity`` is added when both
       columns exist.

    Parameters:
        df: pandas DataFrame to clean.

    Returns:
        A new DataFrame; the original row index labels of surviving rows are
        preserved.
    """
    essential = ("id", "date", "price", "quantity")

    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    # Normalise headers first: every later check looks columns up by their
    # normalised name, so "Price" or " price " must already read "price".
    df.columns = (
        df.columns.astype(str)
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
    )

    for col in df.columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df[col] = column.map(_strip_text)

    # Convert typed columns before filtering so a single bad cell turns into
    # a missing value (and a dropped row) rather than an exception.
    for col in ("price", "quantity"):
        if col in df.columns:
            df[col] = _to_number(df[col])

    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce")

    keep = pd.Series(True, index=df.index)

    present = [col for col in essential if col in df.columns]
    if present:
        keep = keep & df[present].notna().all(axis=1)

    if "price" in df.columns:
        keep = keep & (df["price"] >= 0)

    if "quantity" in df.columns:
        keep = keep & (df["quantity"] > 0)

    if "age" in df.columns:
        # Remember which ages were supplied at all: an unparseable age is
        # invalid and dropped, a genuinely missing one is kept.
        age_known = df["age"].notna()
        df["age"] = _to_number(df["age"])
        keep = keep & (~age_known | df["age"].between(0, 120))

    # One filter + copy, so the assignments below never hit a view.
    df = df.loc[keep].copy()

    # Fill what is still missing. This happens after numeric validation so
    # the "Unknown" marker cannot break the float comparisons above; the
    # cast to object keeps the fill valid for numeric columns.
    for col in df.columns:
        if df[col].isna().any():
            df[col] = df[col].astype(object).fillna("Unknown")

    df = df.drop_duplicates()

    if {"price", "quantity"}.issubset(df.columns):
        df = df.assign(
            calculated_total=df["price"].astype(float) * df["quantity"].astype(float)
        )

    return df

# PROCESS ALL FILES

# Clean every listed file independently: a failure in one file is reported
# and the loop moves on to the next, so one bad input cannot block the rest.
for file in csv_files:
    print(f"\nProcessing → {file}")

    try:
        # Tolerant read: malformed rows are skipped and undecodable bytes
        # are ignored so a damaged file can still be loaded.
        df = pd.read_csv(file, on_bad_lines="skip", encoding_errors="ignore")

        df_clean = clean_csv(df)

        # Build the output name from the extension only. A plain
        # str.replace(".csv", ...) would rewrite every ".csv" in the path
        # and, for a path without ".csv", leave it unchanged so the cleaned
        # data would overwrite the input file.
        root, ext = os.path.splitext(file)
        out_path = f"{root}_cleaned{ext}"
        df_clean.to_csv(out_path, index=False)

        print(f"✔ Cleaned file saved: {out_path}")

    except Exception as e:
        # Deliberate best-effort boundary: report the problem and continue.
        print(f"❌ Error processing {file}: {e}")



Processing → 1_stores_corrupted.csv
✔ Cleaned file saved: 1_stores_corrupted_cleaned.csv

Processing → 2_products_corrupted.csv
✔ Cleaned file saved: 2_products_corrupted_cleaned.csv

Processing → 3_customers_corrupted.csv
✔ Cleaned file saved: 3_customers_corrupted_cleaned.csv

Processing → 4_promotions_corrupted.csv
✔ Cleaned file saved: 4_promotions_corrupted_cleaned.csv

Processing → 5_loyalty_rules_corrupted.csv
✔ Cleaned file saved: 5_loyalty_rules_corrupted_cleaned.csv

Processing → 6_sales_headers_corrupted.csv
✔ Cleaned file saved: 6_sales_headers_corrupted_cleaned.csv

Processing → 7_sales_line_items_corrupted.csv
✔ Cleaned file saved: 7_sales_line_items_corrupted_cleaned.csv
