In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
# Load the raw training and test splits (paths are relative to the notebook).
train_df, test_df = (
    pd.read_csv("../data/train.csv"),
    pd.read_csv("../data/test.csv"),
)

# Report (rows, columns) of the training split, then preview its first rows.
print(train_df.shape)
train_df.head(5)


(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Identify feature types.
# "SalePrice" is the prediction target and "Id" is a sequential row identifier
# (1, 2, 3, ... in the preview), so neither should become a model input.
# Dropping them *before* the dtype split keeps them out of both lists no matter
# how pandas typed them; the previous `numeric_features.remove("SalePrice")`
# only protected the numeric list and left "Id" in as a scaled feature.
feature_frame = train_df.drop(columns=["SalePrice"])  # KeyError if the target is missing
feature_frame = feature_frame.drop(columns=["Id"], errors="ignore")  # tolerate data without an Id column

numeric_features = feature_frame.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = feature_frame.select_dtypes(include=['object']).columns.tolist()


In [4]:
# Fill missing values.
# The fill values come from the training split only and are reused for the test
# split: per-column median for numeric features, most frequent value (first
# mode) for categorical features.
numeric_fill = train_df[numeric_features].median()
categorical_fill = train_df[categorical_features].mode().iloc[0]

for frame in (train_df, test_df):
    frame[numeric_features] = frame[numeric_features].fillna(numeric_fill)
    frame[categorical_features] = frame[categorical_features].fillna(categorical_fill)


In [5]:
# Separate the regression target from the predictor columns.
y = train_df["SalePrice"]
X = train_df.drop(columns=["SalePrice"])


In [6]:
# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown="ignore" makes the encoder skip categories it did not see
# during fit instead of raising at transform time.
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [7]:
# Fit the preprocessor on the training features only and transform them in the
# same call.
X_processed = preprocessor.fit_transform(X)
# transform (not fit_transform): the test split reuses what was fitted on the
# training features, so both matrices share the same columns.
X_test_processed = preprocessor.transform(test_df)


In [8]:
# Persist the preprocessed feature matrices and the target for later use.
# `sparse.save_npz` only accepts scipy sparse input, but ColumnTransformer
# returns a dense ndarray whenever the stacked output's density reaches its
# `sparse_threshold` (0.3 by default). Dense results are therefore converted to
# CSR first; results that are already sparse are written unchanged.
outputs = (
    ("../data/X_processed.npz", X_processed),
    ("../data/X_test_processed.npz", X_test_processed),
)
for path, matrix in outputs:
    if not sparse.issparse(matrix):
        matrix = sparse.csr_matrix(matrix)
    sparse.save_npz(path, matrix)

# index=False: write only the target column, not the DataFrame row index.
y.to_csv("../data/y.csv", index=False)

print("Preprocessing completed & files saved!")


Preprocessing completed & files saved!
