In [2]:
import hashlib
import os

import numpy as np
import pandas as pd

In [3]:
# Read the raw training CSV into `df`, the frame the cleaning cells operate on.
raw_train_path = "../data/raw/train.csv"
df = pd.read_csv(raw_train_path)


In [4]:
# Drop columns (sparse / noise). Names that are not present in the CSV are
# skipped instead of raising, so the list is safe to run on a file that lacks
# some of them.
drop_cols = [
    "Id", "PoolQC", "MiscFeature", "Alley", "Fence",
    "GarageArea", "BsmtFinSF1", "BsmtUnfSF", "GarageYrBlt",
    "TotRmsAbvGrd", "MoSold", "YrSold", "MiscVal", "1stFlrSF"
]
# A single drop call replaces the per-column in-place loop; errors="ignore"
# preserves the "only if the column exists" behaviour.
df = df.drop(columns=drop_cols, errors="ignore")

In [5]:
# Treat MSSubClass as a categorical: cast the column to object dtype when it
# is present.
# NOTE(review): a CSV does not record dtypes, so a reader of the cleaned file
# will presumably infer a numeric dtype again unless it passes dtype= — confirm
# how the file is loaded downstream.
subclass_col = "MSSubClass"
if subclass_col in df.columns:
    df[subclass_col] = df[subclass_col].astype("object")

In [7]:
# Categorical columns (ordinals included) whose missing entries are replaced
# with the literal string "None". Columns absent from the frame are skipped.
# NOTE(review): recent pandas versions appear to parse the text "None" as
# missing when a CSV is read with default settings — confirm how the cleaned
# file is loaded downstream (e.g. keep_default_na=False), otherwise these
# fills may come back as NaN.
fill_none_cols = [
    "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "MasVnrType", "ExterQual", "KitchenQual", "HeatingQC"
]
present_none_cols = [name for name in fill_none_cols if name in df.columns]
for name in present_none_cols:
    df[name] = df[name].fillna("None")

In [8]:
# Numeric columns whose missing values are filled with the column's own median.
# Columns absent from the frame are skipped.
median_cols = ["LotFrontage", "MasVnrArea"]
for name in (candidate for candidate in median_cols if candidate in df.columns):
    col_median = df[name].median()
    df[name] = df[name].fillna(col_median)

In [9]:
# Electrical: fill the missing entries with the column's mode (the first one
# returned when several values tie).
if "Electrical" in df.columns:
    electrical = df["Electrical"]
    most_common = electrical.mode()[0]
    df["Electrical"] = electrical.fillna(most_common)



In [10]:
# Optional outlier cap: limit LotFrontage to its own 99th percentile. Only an
# upper bound is applied; small values are left untouched.
if "LotFrontage" in df.columns:
    frontage_cap = df["LotFrontage"].quantile(0.99)
    df["LotFrontage"] = df["LotFrontage"].clip(upper=frontage_cap)

# OPTIONAL: the target can be log-transformed either here or in the modeling
# stage. If it is done here, the transform must be inverted before final
# metrics are reported.
# df["SalePrice"] = np.log1p(df["SalePrice"])

In [11]:
# Save cleaned data (without the row index). The output directory is created
# first so this cell also works on a checkout where ../data/clean does not
# exist yet; exist_ok=True makes that a no-op when it is already there.
clean_csv_path = "../data/clean/train_clean.csv"
os.makedirs(os.path.dirname(clean_csv_path), exist_ok=True)
df.to_csv(clean_csv_path, index=False)

In [12]:
# Save a simple checksum for provenance: the SHA-256 hex digest of the cleaned
# CSV's bytes, written to train_clean.sha256 in the same directory.
# The CSV is read inside a `with` block so its file handle is closed
# immediately rather than left open until garbage collection.
with open("../data/clean/train_clean.csv", "rb") as csv_file:
    data_bytes = csv_file.read()
checksum = hashlib.sha256(data_bytes).hexdigest()
with open("../data/clean/train_clean.sha256", "w") as f:
    # Only the hex digest is written (no filename, no trailing newline).
    f.write(checksum)

In [13]:
# Report where the outputs were written and echo the digest computed above.
completion_message = "Data cleaning done. Clean CSV saved at data/clean/train_clean.csv"
checksum_label = "Checksum saved at data/clean/train_clean.sha256:"
print(completion_message)
print(checksum_label, checksum)

Data cleaning done. Clean CSV saved at data/clean/train_clean.csv
Checksum saved at data/clean/train_clean.sha256: d3d32d1400bc8049ed02ba03a0cd568266b1b5b9182011c4c3d1def7a0a71409
