In [113]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder
from scipy.sparse import issparse

In [114]:
# Load the house dataset from a CSV path relative to the notebook's working directory.
df = pd.read_csv("processed_data/preprocessed_house_data.csv")

In [115]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(1460, 14)

In [116]:
# Target: kept as a single-column DataFrame (double brackets), not a Series.
Y = df[['SalePrice']]

# Predictors: every remaining column once the target has been removed.
X = df.drop(columns=['SalePrice'])

In [117]:
# Show the (rows, columns) dimensions of the predictor frame.
X.shape

(1460, 13)

In [118]:
# Show the (rows, columns) dimensions of the target frame.
Y.shape

(1460, 1)

In [119]:
# Partition the predictor columns by dtype: object-typed columns are treated
# as categorical, everything else as numerical.
numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [120]:
# Display the names of the object-typed (categorical) predictor columns.
categorical_cols

Index(['Neighborhood', 'ExterQual', 'KitchenQual', 'BsmtQual', 'Foundation',
       'GarageFinish', 'GarageType', 'SaleCondition', 'MSZoning',
       'HouseStyle'],
      dtype='object')

In [121]:
# Display the names of the non-object (numerical) predictor columns.
numerical_cols

Index(['OverallQual', 'GrLivArea', 'GarageCars'], dtype='object')

In [122]:
# Numerical predictors, pinned as a plain list of column names.
numerical_cols = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
]

# Categorical predictors listed as nominal columns.
nominal_cols = [
    "Neighborhood", "Foundation", "GarageType",
    "SaleCondition", "MSZoning", "HouseStyle",
]

In [123]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# -------------------------------------------------------------------
# Feature groups: each list is routed to its own sub-pipeline below.
# -------------------------------------------------------------------
numerical_cols = ["OverallQual", "GrLivArea", "GarageCars"]
ordinal_cols = ["ExterQual", "KitchenQual", "BsmtQual", "GarageFinish"]
nominal_cols = [
    "Neighborhood", "Foundation", "GarageType",
    "SaleCondition", "MSZoning", "HouseStyle",
]

# -------------------------------------------------------------------
# Explicit category orderings for the ordinal encoder. The position of a
# label inside its list is the integer code the encoder assigns to it.
# NOTE(review): presumably ordered worst -> best; confirm against the
# data dictionary and the labels actually present in the CSV.
# -------------------------------------------------------------------
quality_categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_categories = ['NoBsmt', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_finish_categories = ['Unf', 'RFn', 'Fin']

# Numerical features: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Ordinal features: fill gaps with the most frequent label, map labels to
# integer codes using the orderings above, then standardise the codes.
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'ordinalencoder',
        OrdinalEncoder(
            # One ordering per entry of ordinal_cols, in the same order.
            categories=[
                quality_categories,        # ExterQual
                quality_categories,        # KitchenQual
                bsmt_categories,           # BsmtQual
                garage_finish_categories,  # GarageFinish
            ],
            # Labels missing from the lists above are encoded as -1
            # instead of raising.
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ),
    ),
    ('scaler', StandardScaler()),
])

# Nominal features: fill gaps with the most frequent label, then one-hot
# encode; categories unseen during fit are ignored at transform time.
nominal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine the three sub-pipelines, each applied to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('ordinal_pipeline', ordinal_pipeline, ordinal_cols),
    ('nominal_pipeline', nominal_pipeline, nominal_cols),
])

print("✅ Preprocessing Pipeline Created Successfully!")

✅ Preprocessing Pipeline Created Successfully!


In [124]:
from sklearn.model_selection import train_test_split
import pandas as pd
from scipy.sparse import issparse  # ✅ import added

def _to_dense(matrix):
    """Return ``matrix`` as a dense array, converting only SciPy sparse input."""
    return matrix.toarray() if issparse(matrix) else matrix


# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible. The raw splits get their own names so the untransformed
# frames stay available after X_train / X_test are rebuilt below.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=30
)

# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split. Each result is densified on its own
# merits rather than assuming the test matrix mirrors the train matrix.
X_train_transformed = _to_dense(preprocessor.fit_transform(X_train_raw))
X_test_transformed = _to_dense(preprocessor.transform(X_test_raw))

# Column labels generated by the fitted preprocessor.
feature_names = preprocessor.get_feature_names_out()

# A width mismatch means the labels do not describe the data. Fail loudly
# instead of truncating the names, which would silently mislabel columns.
n_columns = X_train_transformed.shape[1]
if n_columns != len(feature_names):
    raise ValueError(
        f"Preprocessor produced {n_columns} columns but reported "
        f"{len(feature_names)} feature names."
    )

# Rebuild labelled DataFrames. Frames built from arrays get a fresh
# 0..n-1 index, so no explicit reset is needed for X_train / X_test.
X_train = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test = pd.DataFrame(X_test_transformed, columns=feature_names)

# The targets still carry the shuffled row labels from the split; renumber
# them 0..n-1 so they line up positionally with X_train / X_test.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Check shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Number of features:", len(feature_names))

X_train shape: (1022, 63)
X_test shape: (438, 63)
Number of features: 63


In [126]:
# Preview the first rows of the transformed training features.
X_train.head()

Unnamed: 0,num_pipeline__OverallQual,num_pipeline__GrLivArea,num_pipeline__GarageCars,ordinal_pipeline__ExterQual,ordinal_pipeline__KitchenQual,ordinal_pipeline__BsmtQual,ordinal_pipeline__GarageFinish,nominal_pipeline__Neighborhood_Blmngtn,nominal_pipeline__Neighborhood_Blueste,nominal_pipeline__Neighborhood_BrDale,...,nominal_pipeline__MSZoning_RL,nominal_pipeline__MSZoning_RM,nominal_pipeline__HouseStyle_1.5Fin,nominal_pipeline__HouseStyle_1.5Unf,nominal_pipeline__HouseStyle_1Story,nominal_pipeline__HouseStyle_2.5Fin,nominal_pipeline__HouseStyle_2.5Unf,nominal_pipeline__HouseStyle_2Story,nominal_pipeline__HouseStyle_SFoyer,nominal_pipeline__HouseStyle_SLvl
0,1.37514,2.261913,0.332543,1.061022,2.274292,0.656312,1.512421,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.522577,-1.222343,0.332543,-0.662928,0.763026,-0.82765,-0.94301,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.073719,-0.072917,0.332543,-0.662928,-0.748239,0.656312,-0.94301,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.798148,-1.222343,0.332543,-0.662928,0.763026,-0.82765,0.284705,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.073719,-0.379683,0.332543,-0.662928,0.763026,-0.82765,0.284705,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
