In [10]:
import os

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the raw train / test tables from disk.
train = pd.read_csv("data/raw/train.csv")
test = pd.read_csv("data/raw/test.csv")

# Hold on to the Id column of each table (kept for the submission step later).
train_ids, test_ids = train["Id"].copy(), test["Id"].copy()

# SalePrice is the target; the feature frames carry neither the target nor Id.
y = train["SalePrice"].copy()
train_features = train.drop(["SalePrice", "Id"], axis=1)
test_features = test.drop("Id", axis=1)

print("train_features:", train_features.shape, "test_features:", test_features.shape)

train_features: (1460, 79) test_features: (1459, 79)


In [3]:
# Stack train and test features row-wise under a fresh 0..n-1 index so both
# go through the same cleaning code. The target is not part of this frame,
# but note that any statistic computed over `all_data` also sees the test rows.
all_data = pd.concat([train_features, test_features], ignore_index=True)

n_missing_before = all_data.isna().sum().sum()
print("combined:", all_data.shape)
print("missing before:", n_missing_before)

combined: (2919, 79)
missing before: 15707


# Handling NaNs

In [4]:
# Imputation plan.
#   A, B  : constant fills, independent of the data.
#   C - E : data-driven fills (medians / modes). These statistics are computed
#           from the TRAINING rows only, so nothing about the test distribution
#           leaks into the features.
# `all_data` was built as concat([train_features, test_features]) with
# ignore_index=True, so its first len(train_features) rows are the training rows.
n_train = len(train_features)

# A) Structural categorical missing => "None"
fill_none = [
    "Alley", "MasVnrType",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature"
]
for col in fill_none:
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna("None")

# B) Structural numeric missing => 0
fill_zero = [
    "MasVnrArea",
    "GarageYrBlt", "GarageCars", "GarageArea",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "BsmtFullBath", "BsmtHalfBath"
]
for col in fill_zero:
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna(0)

# C) LotFrontage => training-set Neighborhood median, then training-set global median
if "LotFrontage" in all_data.columns and "Neighborhood" in all_data.columns:
    neighborhood_median = (
        all_data.iloc[:n_train].groupby("Neighborhood")["LotFrontage"].median()
    )
    # map() looks up each row's neighborhood median; fillna aligns on the row index.
    all_data["LotFrontage"] = all_data["LotFrontage"].fillna(
        all_data["Neighborhood"].map(neighborhood_median)
    )
    # Fallback for neighborhoods with no observed LotFrontage among the training rows.
    all_data["LotFrontage"] = all_data["LotFrontage"].fillna(
        all_data["LotFrontage"].iloc[:n_train].median()
    )

# D) Small true-missing categoricals => training-set mode
#    (.mode() can return several values on ties; the first one is used)
fill_mode = [
    "Electrical", "MSZoning", "Utilities", "Exterior1st", "Exterior2nd",
    "KitchenQual", "Functional", "SaleType"
]
for col in fill_mode:
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna(all_data[col].iloc[:n_train].mode()[0])

# E) Catch-all for anything still missing: numeric -> training median,
#    categorical -> training mode
num_cols = all_data.select_dtypes(include=["int64", "float64"]).columns
all_data[num_cols] = all_data[num_cols].fillna(all_data[num_cols].iloc[:n_train].median())

# Same dtype selection as the encoding cell, so string-typed columns are not skipped.
cat_cols = all_data.select_dtypes(include=["object", "string"]).columns
for col in cat_cols:
    if all_data[col].isnull().any():
        all_data[col] = all_data[col].fillna(all_data[col].iloc[:n_train].mode()[0])

print("missing after:", all_data.isnull().sum().sum())

missing after: 0


In [6]:
# Undo the concat: the first len(train_features) rows are train, the rest are test.
n_train_rows = len(train_features)
X, X_test = all_data.iloc[:n_train_rows].copy(), all_data.iloc[n_train_rows:].copy()

for label, frame in (("X:", X), ("X_test:", X_test)):
    print(label, frame.shape)

X: (1460, 79)
X_test: (1459, 79)


In [7]:
# Per-column dtype and non-null count summary of the training features.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [8]:
# Preview the first rows of the training features.
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [9]:
# Numerically coded fields that are cast to strings in both feature frames, so
# that dtype-based column selection treats them as categorical.
coded_categoricals = ["MSSubClass", "MoSold", "YrSold"]
for frame in (X, X_test):
    for col in coded_categoricals:
        if col in frame.columns:
            frame[col] = frame[col].astype(str)

In [11]:
# Column groups are chosen by dtype: string-like columns are one-hot encoded,
# int64/float64 columns are passed through unchanged.
cat_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category not seen during fit encodes as all zeros.
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop",  # columns in neither list are discarded
    # With the default threshold (0.3) the stacked output silently becomes a dense
    # array once overall density reaches it. 1.0 keeps the result sparse, which is
    # what the sparse.save_npz calls later in this notebook require.
    sparse_threshold=1.0,
)

# Fit ONLY on training features (X from train.csv) and transform both
X_encoded = preprocess.fit_transform(X)
X_test_encoded = preprocess.transform(X_test)

print("Encoded shapes:", X_encoded.shape, X_test_encoded.shape)

Encoded shapes: (1460, 331) (1459, 331)


# Train / Validation Split

In [12]:
# Hold out 30% of the training rows for validation; the fixed seed makes the
# shuffle reproducible.
split_parts = train_test_split(X_encoded, y, test_size=0.3, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

print("Split shapes:", *(part.shape for part in split_parts))

Split shapes: (1022, 331) (438, 331) (1022,) (438,)


# Save encoded data (sparse `.npz` features, CSV targets)

In [15]:
# Persist the split under data/processed:
#   * feature matrices as SciPy sparse .npz files
#   * targets as single-column CSV files (index dropped; row order matches
#     the corresponding feature matrix)
# `os` and `sparse` are imported in the notebook's import cell.
os.makedirs("data/processed", exist_ok=True)

sparse_outputs = {
    "data/processed/X_train.npz": X_train,
    "data/processed/X_valid.npz": X_valid,
    "data/processed/X_test.npz": X_test_encoded,
}
for path, matrix in sparse_outputs.items():
    sparse.save_npz(path, matrix)

target_outputs = {
    "data/processed/y_train.csv": y_train,
    "data/processed/y_valid.csv": y_valid,
}
for path, target in target_outputs.items():
    target.to_csv(path, index=False)

# To load later:

In [None]:
from scipy import sparse

# Read the three saved feature matrices back from disk. Note that this rebinds
# X_test, which earlier in this notebook names the un-encoded test DataFrame.
processed_paths = (
    "data/processed/X_train.npz",
    "data/processed/X_valid.npz",
    "data/processed/X_test.npz",
)
X_train, X_valid, X_test = map(sparse.load_npz, processed_paths)