In [77]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OrdinalEncoder

In [78]:
df = pd.read_csv("processed_data/preprocessed_house_data.csv")

In [79]:
df.shape

(1460, 14)

In [80]:
## Independent and dependent features
X = df.drop(labels=['SalePrice'],axis=1)
Y = df[['SalePrice']]

In [81]:
X.shape

(1460, 13)

In [82]:
Y.shape

(1460, 1)

In [83]:
# Segregating numerical and categorical variables
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [84]:
categorical_cols

Index(['Neighborhood', 'ExterQual', 'KitchenQual', 'BsmtQual', 'Foundation',
       'GarageFinish', 'GarageType', 'SaleCondition', 'MSZoning',
       'HouseStyle'],
      dtype='object')

In [85]:
numerical_cols

Index(['OverallQual', 'GrLivArea', 'GarageCars'], dtype='object')

In [86]:
numerical_cols = ["OverallQual", "GrLivArea", "GarageCars"]

nominal_cols = [
    "Neighborhood",
    "Foundation",
    "GarageType",
    "SaleCondition",
    "MSZoning",
    "HouseStyle"
]

In [91]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# =========================
# Columns
# =========================
numerical_cols = ["OverallQual", "GrLivArea", "GarageCars"]

ordinal_cols = ["ExterQual", "KitchenQual", "BsmtQual", "GarageFinish"]

nominal_cols = [
    "Neighborhood",
    "Foundation",
    "GarageType",
    "SaleCondition",
    "MSZoning",
    "HouseStyle"
]

# =========================
# Domain Categories
# =========================
quality_categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_categories = ['NoBsmt', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_finish_categories = ['Unf', 'RFn', 'Fin']

# =========================
# Numerical Pipeline
# =========================
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# =========================
# Ordinal Pipeline
# =========================
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', OrdinalEncoder(
        categories=[
            quality_categories,
            quality_categories,
            bsmt_categories,
            garage_finish_categories
        ],
        handle_unknown='use_encoded_value',
        unknown_value=-1
    )),
    ('scaler', StandardScaler())
])

# =========================
# Nominal Pipeline
# =========================
nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# =========================
# Column Transformer
# =========================
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_cols),
    ('ordinal_pipeline', ordinal_pipeline, ordinal_cols),
    ('nominal_pipeline', nominal_pipeline, nominal_cols)
])

print("✅ Preprocessing Pipeline Created Successfully!")

✅ Preprocessing Pipeline Created Successfully!


In [92]:
## Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

X_train_transformed = preprocessor.fit_transform(X_train)

print("Transformed shape:", X_train_transformed.shape)
print("Feature names length:", len(preprocessor.get_feature_names_out()))


X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

Transformed shape: (1022, 63)
Feature names length: 63


ValueError: Shape of passed values is (1022, 1), indices imply (1022, 63)

In [90]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1022, 13), (438, 13), (1022, 1), (438, 1))

In [69]:
X_train.shape, y_train.shape

((1022, 13), (1022, 1))

In [61]:
print(X.shape)
print(X.columns)

(1460, 13)
Index(['OverallQual', 'GrLivArea', 'GarageCars', 'Neighborhood', 'ExterQual',
       'KitchenQual', 'BsmtQual', 'Foundation', 'GarageFinish', 'GarageType',
       'SaleCondition', 'MSZoning', 'HouseStyle'],
      dtype='object')


In [35]:
df.columns

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'Neighborhood', 'ExterQual',
       'KitchenQual', 'BsmtQual', 'Foundation', 'GarageFinish', 'GarageType',
       'SaleCondition', 'MSZoning', 'HouseStyle', 'SalePrice'],
      dtype='object')

In [29]:
df["BsmtQual"] = df["BsmtQual"].fillna("NoBsmt")

In [70]:

Categorical_col = ["Neighborhood", "ExterQual", "KitchenQual", "Foundation", "BsmtQual"]
Numerical_col = ["OverallQual", "GrLivArea", "GarageCars", "TotalBsmtSF", "FullBath", "YearBuilt"]
# 1. Ordinal Encoding
ordinal_cols = ["ExterQual", "KitchenQual", "BsmtQual"]
quality_categories = ['NoBsmt', 'Po', 'Fa', 'TA', 'Gd', 'Ex']

encoder = OrdinalEncoder(categories=[quality_categories]*3)
df[ordinal_cols] = encoder.fit_transform(df[ordinal_cols])

# 2. Target Encoding
neighborhood_mean = df.groupby("Neighborhood")["SalePrice"].mean()
df["Neighborhood"] = df["Neighborhood"].map(neighborhood_mean)

# 3. OneHot Encoding
df = pd.get_dummies(df, columns=["Foundation"], drop_first=True)

ValueError: Found unknown categories [nan] in column 2 during fit

In [23]:
df[Categorical_col].describe()

KeyError: "['Foundation'] not in index"