# Setup Imports

In [1]:
import numpy as np
# NumPy is used for numerical operations (arrays, math functions, etc.)

import pandas as pd
# Pandas is used to load, inspect, clean, and manipulate tabular datasets (DataFrames)


from sklearn.model_selection import train_test_split
# Used to split the dataset into training and testing sets
# This helps us evaluate model performance on unseen data


from sklearn.impute import SimpleImputer, KNNImputer
# SimpleImputer: fills missing values using strategies like mean, median, mode, or constant
# KNNImputer: fills missing values using nearest neighbors (more advanced imputation)


from sklearn.preprocessing import (
    StandardScaler,    # Standardizes features (mean=0, std=1)
    MinMaxScaler,      # Scales features to a fixed range (usually 0 to 1)
    RobustScaler,      # Scales using median and IQR (robust to outliers)
    MaxAbsScaler,      # Scales by maximum absolute value (good for sparse data)
    Normalizer         # Normalizes each row to unit length (more common in text/vector data)
)
# These are all feature scaling techniques used before training ML models


from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# OneHotEncoder: converts categorical variables into binary (0/1) columns
# OrdinalEncoder: converts categories into integer labels


from sklearn.compose import ColumnTransformer
# Allows us to apply different preprocessing steps to different column types
# (e.g., scaling numeric columns and encoding categorical columns)


from sklearn.pipeline import Pipeline
# Chains preprocessing steps and the ML model together
# Ensures consistent and leakage-free workflow


from sklearn.linear_model import LinearRegression, LogisticRegression
# LinearRegression: used for predicting continuous values (e.g., house prices)
# LogisticRegression: used for binary classification (e.g., HighPrice = 0 or 1)


from sklearn.metrics import (
    r2_score,               # Measures regression performance (explained variance)
    mean_squared_error,     # Measures regression error magnitude
    accuracy_score,         # Measures classification accuracy
    classification_report   # Provides precision, recall, F1-score
)
# These help evaluate model performance


from pathlib import Path
# Used to create clean and OS-independent file paths (better than hardcoding strings)


import joblib
# Used to save and load trained models or preprocessing pipelines
# Important for deployment and reproducibility


In [2]:
# Mounting your google drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset Overview: House Prices – Advanced Regression Techniques
For this preprocessing demonstration, we are using the House Prices – Advanced Regression Techniques dataset from Kaggle.

**Dataset Link:**
https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

This dataset is based on the Ames Housing dataset, which includes detailed information about residential homes in Ames, Iowa.

**It contains:**

- Numeric features (e.g., LotArea, YearBuilt, TotalBsmtSF)

- Categorical features (e.g., Neighborhood, HouseStyle, GarageType)

- Missing values (useful for demonstrating imputation)

- Target variable: SalePrice (continuous variable)


**Why We Chose This Dataset**

This dataset is ideal for demonstrating preprocessing techniques because:

- It contains both numeric and categorical variables

- It includes missing values

- It allows us to demonstrate scaling and encoding

- It is realistic and slightly messy (like real-world data)

**ML Tasks in This Notebook**

**We use this dataset for:**

- Linear Regression → Predicting SalePrice

- Logistic Regression → Predicting HighPrice
(a binary variable created based on whether the house price is above the median)

This allows us to demonstrate preprocessing techniques for both regression and classification using the same dataset.

# 1) Load Kaggle House Prices dataset (train.csv)

We first load the dataset into a pandas DataFrame called df. This is the raw “table” we will inspect and clean before training ML models.

In [3]:
# Create a path object pointing to the dataset
data_path = Path("/content/drive/MyDrive/ML/train.csv") # --> customize the path according to your folder structure here

# Load dataset
df = pd.read_csv(data_path)

# Quick preview
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


**Note:**
df.head() is used to just get a quick preview of the dataset.

By default, it will show only the first five rows in the dataset. If you wish to see more rows for a better understanding, you can pass the number of rows you want to see as a parameter inside the function.

**Eg.:** df.head(10) --> this will show the first ten rows

# 2) Keep a subset of columns

The original dataset has many columns. To make things easy, we keep a smaller set that still includes both numeric and categorical features (and missing values), so we can demonstrate all preprocessing steps clearly.

In [4]:
cols = [
    "SalePrice",          # Target variable for regression (continuous output)
    "LotArea",            # Numeric feature (lot size in square feet)
    "LotFrontage",        # Numeric feature with missing values
    "OverallQual",        # Numeric feature representing overall material/finish quality (ordinal-like)
    "YearBuilt",          # Numeric feature (year the house was built)
    "TotalBsmtSF",        # Numeric feature (total basement area in square feet)
    "Neighborhood",       # Categorical feature (location of the house)
    "HouseStyle",         # Categorical feature (type/style of house)
    "GarageType",         # Categorical feature with missing values
    "MSZoning"            # Categorical feature with missing values
]

df = df[cols].copy()
# We create an independent copy of only the selected columns. Using .copy()
# prevents SettingWithCopyWarning and ensures modifications to this DataFrame do not affect the original one.

df.head()


Unnamed: 0,SalePrice,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF,Neighborhood,HouseStyle,GarageType,MSZoning
0,208500,8450,65.0,7,2003,856,CollgCr,2Story,Attchd,RL
1,181500,9600,80.0,6,1976,1262,Veenker,1Story,Attchd,RL
2,223500,11250,68.0,7,2001,920,CollgCr,2Story,Attchd,RL
3,140000,9550,60.0,7,1915,756,Crawfor,2Story,Detchd,RL
4,250000,14260,84.0,8,2000,1145,NoRidge,2Story,Attchd,RL


# 3) Create targets for Linear and Logistic Regression

For Linear Regression, we predict the actual **SalePrice**. For Logistic Regression, we create a simple yes/no label called **HighPrice** based on whether a house is above the median price.

In [5]:
# Create a new binary target variable column called "HighPrice".
# If a house's SalePrice is greater than the median SalePrice, assign 1.
# Otherwise, assign 0.
# This converts a continuous regression target into a classification target.

df["HighPrice"] = (df["SalePrice"] > df["SalePrice"].median()).astype(int)


# Display the original SalePrice and the new HighPrice column
# to verify that the binary transformation was created correctly.

df[["SalePrice", "HighPrice"]].head()

Unnamed: 0,SalePrice,HighPrice
0,208500,1
1,181500,1
2,223500,1
3,140000,0
4,250000,1


**Note:**

**astype(int)** converts the data type of a column into integers.

In this case, the comparison: **df["SalePrice"] > df["SalePrice"].median()** produces Boolean values:
- True → price is above the median

- False → price is below or equal to the median

Machine learning models typically require numeric inputs, not Boolean values.
So:

- True becomes 1

- False becomes 0

This converts the logical condition into a proper numeric binary target suitable for classification models like Logistic Regression.



# 4) Split features (X) and targets (y)

We separate inputs (features) from outputs (targets). Models learn patterns from X to predict y.

In [6]:
# Create the feature matrix by removing the target variables.
# X contains only the input variables that will be used to train the models.

X = df.drop(columns=["SalePrice", "HighPrice"])

# Define the regression target variable.
# This will be used for predicting continuous house prices (Linear Regression).

y_reg = df["SalePrice"]

# Define the classification target variable.
# This will be used for predicting whether a house is high-priced (Logistic Regression).

y_clf = df["HighPrice"]


# Display the first few rows of the feature matrix
# to verify that target variables have been removed correctly.

X.head()

Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF,Neighborhood,HouseStyle,GarageType,MSZoning
0,8450,65.0,7,2003,856,CollgCr,2Story,Attchd,RL
1,9600,80.0,6,1976,1262,Veenker,1Story,Attchd,RL
2,11250,68.0,7,2001,920,CollgCr,2Story,Attchd,RL
3,9550,60.0,7,1915,756,Crawfor,2Story,Detchd,RL
4,14260,84.0,8,2000,1145,NoRidge,2Story,Attchd,RL


**Note:**
The **drop()** function is used to remove rows or columns from a dataset.

In this case, it is used to remove specific columns so they are not included in the feature matrix.

The columns parameter tells the function which columns should be removed.

- If you want to remove multiple columns, pass them as a list.

- If you want to remove only one column, you can pass just the column name as a string

# **A) Missing values: detection + handling**

## A1) df.isna() — see missing values (True/False)

This shows where values are missing. True means “this cell is empty/NA” and False means it has data.

In [7]:
# First, check for missing values in the dataset using isna().
# Then, head(10) displays the first 10 rows so we can visually inspect where missing values occur.
# This is an example of function chaining: The output of isna() becomes the input to head().

X.isna().head(10)

Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF,Neighborhood,HouseStyle,GarageType,MSZoning
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False
7,False,True,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False


**Note:**

**isna()** checks each cell in the dataset and identifies whether it contains a missing value.

It returns:

- True if the value is missing (NaN)

- False if the value is present

It does not modify the data — it simply helps detect where missing values exist so they can be handled properly.

## A2) df.isna().sum() — missing values per column

This counts how many missing values each column has. It helps you decide where you need to clean or impute.

In [8]:
# Check the total number of missing values in each column.
# sum() adds up the True values in each column.
# Since True is treated as 1 and False as 0,
# the result shows how many missing values each column contains.

X.isna().sum()

Unnamed: 0,0
LotArea,0
LotFrontage,259
OverallQual,0
YearBuilt,0
TotalBsmtSF,0
Neighborhood,0
HouseStyle,0
GarageType,81
MSZoning,0


**Note:**

isna() → identifies missing values

sum() → counts them

Result → number of missing entries per column

This is one of the most common first checks in preprocessing before building ML models.

# A3) df.isna().sum(axis=1) — missing values per row
This counts missing values per row (per house). It helps identify rows that are “too incomplete” and might be removed.

In [9]:
# Step 1: isna() identifies missing values in every cell (True = missing, False = present).
# Step 2: sum(axis=1) adds across columns for each row.
#         axis=1 means we are summing horizontally (row-wise).
#
# This tells us how many missing values each individual sample (house)

X.isna().sum(axis=1).head(10)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,1
8,0
9,0


# A4) df.dropna() — drop rows that contain any missing value
This removes any row that has at least one missing value. It’s simple, but you can lose a lot of data

In [10]:
# dropna() deletes any row where one or more columns have missing data.
# This is a simple way to clean data, but it can reduce the dataset size significantly.
#
# We then compare the original shape with the new shape
# to see how many rows were removed.
# .shape returns (number_of_rows, number_of_columns).

X_drop_rows = X.dropna()
(X.shape, X_drop_rows.shape)

((1460, 9), (1127, 9))

# A5) df.dropna(axis=1) — drop columns that contain any missing value
This removes columns that have missing values. It’s easy, but you might accidentally delete important features.

In [11]:
# dropna(axis=1) deletes columns instead of rows.
# axis=1 means the operation is performed column-wise.
#
# This keeps only columns that have no missing values at all.
# While simple, this approach can remove important features.
#
# We compare the original shape with the new shape
# and also display the remaining column names
# to see which features were retained.

X_drop_cols = X.dropna(axis=1)
(X.shape, X_drop_cols.shape, X_drop_cols.columns)

((1460, 9),
 (1460, 7),
 Index(['LotArea', 'OverallQual', 'YearBuilt', 'TotalBsmtSF', 'Neighborhood',
        'HouseStyle', 'MSZoning'],
       dtype='object'))

# A6) df.dropna(subset=[...]) — drop rows only if specific columns are missing
Sometimes only certain columns are critical. This drops rows only when the value in those particular columns are missing.

In [12]:
# Remove rows only if specific columns contain missing values.
#
# The subset parameter allows us to specify which columns
# should be checked for missing values.
# In this case, rows will be removed only if
# "LotFrontage" or "GarageType" is missing.
#
# Other columns are ignored when deciding whether to drop the row.
# This is useful when only certain features are considered critical.
#
# We compare the original shape with the new shape
# to see how many rows were removed.

X_drop_subset = X.dropna(subset=["LotFrontage", "GarageType"])
(X.shape, X_drop_subset.shape)

((1460, 9), (1127, 9))

# A7) df.fillna(value) — fill all missing values with one constant
This replaces missing values with a single constant. It’s fast and sometimes useful (like filling missing categories with "Missing").

In [13]:
# Replace all missing values in the dataset with a constant value.
#
# fillna("Missing") tells the dataset to replace every missing entry
# with the string "Missing".
# You can chose to replace it with any other value.

# After filling, we check again for missing values using isna().sum()
# to confirm that no missing values remain.

X_fill_const = X.fillna("Missing")
X_fill_const.isna().sum()

Unnamed: 0,0
LotArea,0
LotFrontage,0
OverallQual,0
YearBuilt,0
TotalBsmtSF,0
Neighborhood,0
HouseStyle,0
GarageType,0
MSZoning,0


# A8) df[col].fillna(df[col].mean()) — mean imputation (one column)
If a numeric column has missing values, replacing them with the column’s average is a common quick fix.

In [16]:
# Create a separate copy of the dataset so the original data remains unchanged.
#
# Replace missing values in the "LotFrontage" column
# with the mean (average) of the existing values in that column.
#
# When calculating the mean, missing values (NaN) are automatically ignored.
# This means the average is computed only using the available (non-missing) values.
#
# This method is called mean imputation and assumes that
# missing values can be reasonably approximated by the overall average.
#
# Finally, we check whether any missing values remain in the "LotFrontage" column.

X_mean = X.copy()
X_mean["LotFrontage"] = X_mean["LotFrontage"].fillna(X_mean["LotFrontage"].mean())
X_mean["LotFrontage"].isna().sum()

np.int64(0)

# A9) df[col].fillna(df[col].median()) — median imputation (one column)
Median is often safer than mean when there are outliers (very large/small values).

In [17]:
# Create a separate copy of the dataset so the original data remains unchanged.
#
# Replace missing values in the "LotFrontage" column
# with the median (middle value) of the existing values in that column.
#
# The median is calculated using only the non-missing values.
# Missing values (NaN) are automatically ignored during the calculation.
#
# Median imputation is often preferred when the data contains outliers,
# because the median is less affected by extremely large or small values.
#
# Finally, we check whether any missing values remain in the "LotFrontage" column.

X_median = X.copy()
X_median["LotFrontage"] = X_median["LotFrontage"].fillna(X_median["LotFrontage"].median())
X_median["LotFrontage"].isna().sum()


np.int64(0)

# A10) df[col].fillna(df[col].mode()[0]) — mode imputation (one column)
Mode means “most common value.” This is often used for categorical columns.

In [18]:
# Create a separate copy of the dataset so the original data remains unchanged.
#
# Replace missing values in the "GarageType" column
# with the mode (the most frequently occurring value) of that column.
#
# mode() returns one or more values that appear most often.
# Since it returns a collection, we select the first value using [0].
#
# Missing values (NaN) are automatically ignored when computing the mode.
#
# Mode imputation is commonly used for categorical features,
# because replacing missing entries with the most common category
# preserves the overall distribution reasonably well.
#
# Finally, we check whether any missing values remain in the "GarageType" column.

X_mode = X.copy()
X_mode["GarageType"] = X_mode["GarageType"].fillna(X_mode["GarageType"].mode()[0])
X_mode["GarageType"].isna().sum()

np.int64(0)

# A11) SimpleImputer(strategy="mean")
SimpleImputer is the scikit-learn way to impute missing values. It’s better for ML because you can fit it on training data only, then reuse it on test data.

In [19]:
# Identify all numeric columns in the dataset.
#
# select_dtypes(include=[np.number]) selects only columns that contain numeric data types.
# .columns extracts just the column names from that selection.
# This helps us apply numeric preprocessing only to numeric features.

numeric_cols = X.select_dtypes(include=[np.number]).columns


# Create a SimpleImputer object with strategy="mean".
#
# This means missing values in numeric columns
# will be replaced with the column's mean (average).

imp_mean = SimpleImputer(strategy="mean")


# Fit the imputer on the numeric columns and transform the data.
# The result is returned as a NumPy array.

X_num_imp_mean = imp_mean.fit_transform(X[numeric_cols])


# Convert the transformed array back into a DataFrame
# and restore the original column names for clarity.
#
# Since the display limit is set higher (e.g., vertical limit = 200),
# more rows can be viewed without truncation.

pd.DataFrame(X_num_imp_mean, columns=numeric_cols).head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,8450.0,65.0,7.0,2003.0,856.0
1,9600.0,80.0,6.0,1976.0,1262.0
2,11250.0,68.0,7.0,2001.0,920.0
3,9550.0,60.0,7.0,1915.0,756.0
4,14260.0,84.0,8.0,2000.0,1145.0


## **Note**
**select_dtypes(include=[np.number])**

Selects only the numeric columns from the dataset.
This ensures that numeric preprocessing (like mean imputation) is applied only to numerical features.

**.columns**

Extracts the column names from the selected subset.
This allows us to reference those numeric columns later.

**SimpleImputer(strategy="mean")**

Creates an imputation object that will replace missing values with the mean of each column.
The mean is calculated using only the non-missing values.

**fit_transform()**

Performs two steps in one call:

- fit() → learns the mean value of each numeric column

- transform() → replaces missing values using those learned means

It returns a NumPy array containing the imputed data.

**pd.DataFrame(..., columns=...)**

Converts the NumPy array back into a DataFrame and restores the original column names for clarity and readability.

# A12) SimpleImputer(strategy="median")
Median imputation using scikit-learn works the same way, and is more robust to outliers.

In [20]:
# Create a SimpleImputer object with strategy="median".
#
# This means missing values in numeric columns
# will be replaced with the median (middle value) of each column.

imp_median = SimpleImputer(strategy="median")

# Fit the imputer on the numeric columns and transform the data.
# The result is returned as a NumPy array.

X_num_imp_median = imp_median.fit_transform(X[numeric_cols])

# Convert the transformed array back into a DataFrame and restore the original column names for clarity.
pd.DataFrame(X_num_imp_median, columns=numeric_cols).head()

Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,8450.0,65.0,7.0,2003.0,856.0
1,9600.0,80.0,6.0,1976.0,1262.0
2,11250.0,68.0,7.0,2001.0,920.0
3,9550.0,60.0,7.0,1915.0,756.0
4,14260.0,84.0,8.0,2000.0,1145.0


# A13) SimpleImputer(strategy="most_frequent")
For categorical columns, filling missing values with the most common category is a typical approach.

In [21]:
# Identify all categorical (non-numeric) columns in the dataset.

cat_cols = X.select_dtypes(exclude=[np.number]).columns


# Create a SimpleImputer with strategy="most_frequent".
#
# This means missing values will be replaced with the most frequently occurring value (mode) in each column.
# This is commonly used for categorical features.

imp_most = SimpleImputer(strategy="most_frequent")


# Fit the imputer on the categorical columns and transform the data.
# The result is returned as a NumPy array.

X_cat_imp_most = imp_most.fit_transform(X[cat_cols])


# Convert the transformed array back into a DataFrame
# and restore the original column names for clarity.

pd.DataFrame(X_cat_imp_most, columns=cat_cols).head()


Unnamed: 0,Neighborhood,HouseStyle,GarageType,MSZoning
0,CollgCr,2Story,Attchd,RL
1,Veenker,1Story,Attchd,RL
2,CollgCr,2Story,Attchd,RL
3,Crawfor,2Story,Detchd,RL
4,NoRidge,2Story,Attchd,RL


##**Note**

**include vs exclude in select_dtypes()**

include selects only the specified data types, while exclude removes the specified data types.

include=[np.number] → keeps only numeric columns

exclude=[np.number] → removes numeric columns and keeps non-numeric (categorical) columns

We use this distinction to separate numeric and categorical features so that each type can be preprocessed appropriately.

# A14) SimpleImputer(strategy="constant")
This replaces missing values with a constant like "Missing" for categories or 0 for numbers. It’s very explicit and easy to explain.

In [22]:
# Create a SimpleImputer with strategy="constant".
#
# This means all missing values in the selected columns will be replaced with a fixed value.
# Here, we set fill_value="Missing", which inserts the string "Missing" wherever a value is absent.
#
# This approach is useful for categorical features,
# where explicitly marking missing entries can preserve information.

imp_const = SimpleImputer(strategy="constant", fill_value="Missing")


# Fit the imputer on the categorical columns and transform the data.
#
# fit_transform() first learns nothing from the data (since the value is constant),
# and then replaces all missing values with the specified constant.
#
# The result is returned as a NumPy array.

X_cat_imp_const = imp_const.fit_transform(X[cat_cols])


# Convert the transformed array back into a DataFrame
# and restore the original column names for readability.

pd.DataFrame(X_cat_imp_const, columns=cat_cols).head()


Unnamed: 0,Neighborhood,HouseStyle,GarageType,MSZoning
0,CollgCr,2Story,Attchd,RL
1,Veenker,1Story,Attchd,RL
2,CollgCr,2Story,Attchd,RL
3,Crawfor,2Story,Detchd,RL
4,NoRidge,2Story,Attchd,RL


# A15) KNNImputer()
KNN imputation fills missing values by looking at “similar rows” (nearest neighbors). It can be more realistic, but it’s heavier than SimpleImputer.

In [23]:
# Create a KNNImputer object.
#
# KNNImputer fills missing values using the k-Nearest Neighbors approach.
# n_neighbors=5 means that for each missing value,
# the algorithm looks at the 5 most similar rows (based on other features)
# and uses their values to estimate the missing one.

knn_imp = KNNImputer(n_neighbors=5)


# Fit the imputer on the numeric columns and transform the data.
#
# The result is returned as a NumPy array.

X_num_knn = knn_imp.fit_transform(X[numeric_cols])


# Convert the transformed array back into a DataFrame
# and restore the original column names for clarity.

pd.DataFrame(X_num_knn, columns=numeric_cols).head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,8450.0,65.0,7.0,2003.0,856.0
1,9600.0,80.0,6.0,1976.0,1262.0
2,11250.0,68.0,7.0,2001.0,920.0
3,9550.0,60.0,7.0,1915.0,756.0
4,14260.0,84.0,8.0,2000.0,1145.0


## **Note**
KNNImputer is not only for KNN models.

It is a preprocessing technique and can be used before any machine learning model (Linear Regression, Logistic Regression, Random Forest, etc.).

It works best with numeric data, because it relies on distance calculations between rows.

It may not be ideal for purely categorical data unless categories are encoded numerically first.

It is computationally heavier than SimpleImputer, especially for large datasets.

Use KNNImputer when:

- You believe missing values are related to patterns in other features.

- You want a more data-driven imputation than simple mean/median filling.

- The dataset is not extremely large (since it can be slow).

# B) Duplicates

# B1) df.duplicated().sum() — count duplicate rows
Duplicates are repeated rows. Counting them helps detect accidental repeats that could bias training.

In [24]:
# duplicated() returns True for rows that are exact copies of a previous row and False otherwise.
#
# sum() counts how many True values exist, which gives the total number of duplicate rows.
#
# Identifying duplicates is important because repeated records
# can bias model training and distort evaluation results.

df.duplicated().sum()

np.int64(0)

# B2) df.drop_duplicates() — remove duplicates
This removes exact duplicate rows. It’s a simple cleanup step before modeling.

In [25]:
# Remove duplicate rows from the dataset.
#
# drop_duplicates() removes rows that are exact copies
# of earlier rows in the dataset.
#
# We then compare the original shape with the cleaned shape
# to see how many rows were removed.
# .shape returns (number_of_rows, number_of_columns).

df_no_dupes = df.drop_duplicates()
(df.shape, df_no_dupes.shape)


((1460, 11), (1460, 11))

# C) Data type fixes and parsing

# C1) Inspect types with df.dtypes
Models need the right data types. This shows which columns are numeric vs categorical (object/string).

In [26]:
# Display the data type of each column in the dataset.
df.dtypes

Unnamed: 0,0
SalePrice,int64
LotArea,int64
LotFrontage,float64
OverallQual,int64
YearBuilt,int64
TotalBsmtSF,int64
Neighborhood,object
HouseStyle,object
GarageType,object
MSZoning,object


##**Note**
**dtypes** shows whether each column is stored as numeric,categorical (object), integer, float, etc.

Understanding data types is important because:
 - Numeric columns are used for mathematical operations and scaling.
- Categorical columns require encoding before modeling.
- Incorrect data types can lead to preprocessing errors.

This step helps verify that each feature is stored in the expected format before applying transformations.

# C2) pd.to_numeric(..., errors="coerce") — convert messy numeric strings to numbers
Sometimes numbers are stored like text (example: "1,234"). to_numeric converts them into real numbers. errors="coerce" turns unparseable values into NaN so you can handle them cleanly.

In [27]:
# Create a separate copy of the dataset so the original data remains unchanged.

X_parse = X.copy()


# Here I am creating a teaching example by converting a numeric column into a messy string format.
# Here, LotArea values are converted into strings with comma separators (e.g., 8450 becomes "8,450").
# This simulates real-world scenarios where numeric data may be stored as text.

X_parse["LotArea_str"] = X_parse["LotArea"].map(lambda v: f"{int(v):,}")


# Convert the messy string column back into a numeric format.

X_parse["LotArea_parsed"] = pd.to_numeric(
    X_parse["LotArea_str"].str.replace(",", ""),
    errors="coerce"
)


# Display original numeric values, the messy string version,
# and the parsed numeric result to verify the transformation.

X_parse[["LotArea", "LotArea_str", "LotArea_parsed"]].head()


Unnamed: 0,LotArea,LotArea_str,LotArea_parsed
0,8450,8450,8450
1,9600,9600,9600
2,11250,11250,11250
3,9550,9550,9550
4,14260,14260,14260


## **Note**
**str.replace()** is used to modify text inside a string column.
In this example: .str.replace(",", "") - It removes commas from values

This step is necessary because commas prevent numeric conversion.
Before converting text to numbers, we must clean formatting characters such as:

- Commas (,)

- Currency symbols ($)

- Spaces

- Percentage signs (%)

The errors parameter controls what happens if a value cannot be converted to a number.
- errors="raise" → throws an error (default behavior)

- errors="coerce" → converts invalid values to NaN

- errors="ignore" → leaves the value unchanged

In preprocessing, we usually use: errors="coerce"

This is safer because:

- It prevents the program from crashing.

- Any invalid value becomes NaN.

- We can then handle those NaN values using imputation.

**You can do the string character replacement and conversion into numeric separately, rather than chaining the functions**

# C3) pd.to_datetime(...) — parse dates
Dates are often strings or integers. Turning them into datetime lets you extract year/month/day and do proper time-based features later.

In [28]:
# Create a separate copy of the dataset so the original data remains unchanged.

X_dates = X.copy()


# Create a synthetic date column from the YearBuilt feature.
#
# Step 1: Convert the YearBuilt values to string format.
# Step 2: Append "-01-01" so each year becomes a full date (January 1 of that year).
# Step 3: Convert the resulting string into a proper datetime object.

X_dates["BuiltDate"] = pd.to_datetime(
    X_dates["YearBuilt"].astype(str) + "-01-01",
    errors="coerce"
)

# Display the original YearBuilt column and the new BuiltDate column to verify the conversion.

X_dates[["YearBuilt", "BuiltDate"]].head()


Unnamed: 0,YearBuilt,BuiltDate
0,2003,2003-01-01
1,1976,1976-01-01
2,2001,2001-01-01
3,1915,1915-01-01
4,2000,2000-01-01


## **Note**
**pd.to_datetime()** converts data into a datetime format that pandas can understand and manipulate. It takes values such as "2005-01-01", "03/15/2010", "1998", and converts them into structured datetime objects.

When to_datetime() runs:

- It reads the input values (usually strings).

- It tries to detect the date format automatically.

- It parses the year, month, and day.

- It converts the value into a standardized datetime representation.

If it cannot interpret a value, behavior depends on the errors parameter.


# C4) astype("category") — convert a column to categorical type
This tells pandas “this column is a category, not free-form text.” It can help memory usage and makes the intent clearer.

In [29]:
# Create a separate copy of the dataset so the original data remains unchanged.

X_cat_type = X.copy()


# Convert the "Neighborhood" column to categorical type.
#
# astype("category") tells the dataset that this column
# represents discrete categories rather than free-form text.
#
# This can improve memory efficiency and clearly signals that the feature is categorical in nature.

X_cat_type["Neighborhood"] = X_cat_type["Neighborhood"].astype("category")


# Display the data type of the column to confirm that it has been converted to a categorical type.

X_cat_type["Neighborhood"].dtype


CategoricalDtype(categories=['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr',
                  'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR',
                  'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes',
                  'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer',
                  'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'],
, ordered=False, categories_dtype=object)

## **Note**
We pass "category" as a string because astype() expects the name of the target data type.

In pandas, data types can be specified either:

- As built-in Python types (int, float, str)

- Or as pandas-recognized type names written as strings ("category", "float64", etc.)

Here, "category" tells pandas: “Convert this column into a categorical data type.”

When a column is converted to categorical type:

- It identifies all unique values in that column.

- It stores those unique values as a fixed set of categories.

- Internally, it replaces the actual text values with integer codes.

- It maintains a mapping between codes and category labels.

Converting to "category": Does NOT automatically encode it for machine learning. You still need encoding (e.g., OneHotEncoder) before feeding it into most ML models.

# C5) .str.strip() — remove extra spaces in strings
Extra spaces can create fake categories (like "NAmes" vs "NAmes "). Stripping whitespace prevents that.

In [30]:
# Create a separate copy of the dataset so the original data remains unchanged.

X_strip = X.copy()


# Teaching example: artificially add extra spaces to the first few rows of the "MSZoning" column.
#
# This simulates real-world messy data where text fields may contain
# unintended leading or trailing whitespace.

X_strip.loc[X_strip.index[:5], "MSZoning"] = (
    X_strip.loc[X_strip.index[:5], "MSZoning"].astype(str) + "  ")


# Clean the column by removing leading and trailing spaces.
#
# astype(str) ensures all values are treated as strings.
# str.strip() removes whitespace from both ends of each value.
#
# This prevents duplicate categories caused by hidden spaces
# (e.g., "RL" vs "RL  ").

X_strip["MSZoning"] = X_strip["MSZoning"].astype(str).str.strip()


# Display the first 10 rows to verify that extra spaces have been removed.

X_strip["MSZoning"].head(10)


Unnamed: 0,MSZoning
0,RL
1,RL
2,RL
3,RL
4,RL
5,RL
6,RL
7,RL
8,RM
9,RL


# C6) .str.lower() — standardize text to lowercase
Lowercasing reduces category duplicates caused by capitalization differences (like "RL" vs "rl").

In [31]:
# Create a separate copy of the dataset so the original data remains unchanged.

X_lower = X.copy()


# Convert all values in the "HouseStyle" column to lowercase.
#
# astype(str) ensures that all values are treated as strings.
# str.lower() converts each value to lowercase.
#
# This helps standardize categorical text data and prevents
# duplicate categories caused by inconsistent capitalization
# (e.g., "TwoStory" vs "twostory").

X_lower["HouseStyle"] = X_lower["HouseStyle"].astype(str).str.lower()


# Display the first few rows to verify that the text has been converted to lowercase.

X_lower["HouseStyle"].head()


Unnamed: 0,HouseStyle
0,2story
1,1story
2,2story
3,2story
4,2story


# D) Z-score standardization

# D1) Manual Z-score: (x - mean) / std
Z-score standardization rescales values so they have mean 0 and standard deviation 1. This helps many models (including logistic regression) train more smoothly.

In [32]:
# Select a numeric column for demonstration.
# Here, we choose "LotArea" to manually compute z-score standardization.

col = "LotArea"


# Convert the selected column to float type.
# This ensures that mathematical operations can be performed safely.

x = X[col].astype(float)


# Manually compute the z-score for each value in the column.
#
# Step 1: x.mean() calculates the average of the column.
# Step 2: x.std(ddof=0) calculates the standard deviation.
#         ddof=0 means we are using population standard deviation.
# Step 3: (x - mean) centers the data around zero.
# Step 4: Dividing by standard deviation scales the spread.
#
# The result is a standardized column with:
# - Mean approximately 0
# - Standard deviation approximately 1

z_manual = (x - x.mean()) / x.std(ddof=0)


# Display the first few standardized values.

z_manual.head()



Unnamed: 0,LotArea
0,-0.207142
1,-0.091886
2,0.07348
3,-0.096897
4,0.375148


**Note:** You generally do not use manual z-score standardization. You will use the automatic method StandardScaler() as shown below.

# E) Scalers (fit on train only)


# E0) Train/test split (common base for scaler demos)
We split data into train and test sets. We fit preprocessing on the training set only, then apply it to both sets. This prevents “peeking” at the test set.

In [33]:
# Split the dataset into training and testing sets.
#
# X contains the input features.
# y_reg contains the regression target (SalePrice).
#
# test_size=0.2 means 20% of the data will be used for testing,
# and the remaining 80% will be used for training.
#
# random_state=42 ensures reproducibility.
# This means the split will be the same every time the code runs.
#
# The function returns:
# - X_train: features for training
# - X_test:  features for testing
# - y_train_reg: target values for training
# - y_test_reg:  target values for testing

X_train, X_test, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)


# Display the shapes of the training and testing feature sets
# to confirm the split proportions.

(X_train.shape, X_test.shape)


((1168, 9), (292, 9))

# E1) StandardScaler (fit on train, transform train/test)
StandardScaler does z-score standardization automatically. It learns mean/std from training data, then applies the same scaling to test data.

In [34]:
# Identify all numeric feature columns.
#
# select_dtypes(include=[np.number]) selects only numeric columns.
# .columns extracts the column names.
# .tolist() converts them into a regular Python list so they can be indexed easily.

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Create a StandardScaler object.
#
# StandardScaler performs z-score standardization automatically
#
# This is especially useful for models like Linear Regression and Logistic Regression.

scaler = StandardScaler()

# Fit the scaler on the training data and transform it.
#
# IMPORTANT: We fit only on training data to prevent data leakage.

X_train_scaled = scaler.fit_transform(X_train[num_cols])


# Transform the test data using the same scaling parameters learned from the training data.
# We do NOT call fit() on the test data.

X_test_scaled = scaler.transform(X_test[num_cols])


# Convert the scaled NumPy array back into a DataFrame and restore the column names for readability.

pd.DataFrame(X_train_scaled, columns=num_cols).head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,-0.212896,-0.013818,-0.820445,-0.455469,0.572612
1,-0.265245,-0.455871,-0.088934,0.718609,-0.596547
2,-0.177841,-0.134378,-0.820445,-1.988293,-0.603357
3,-0.324474,-0.415684,-0.820445,-1.107734,-0.750921
4,-0.529035,-0.81755,-0.820445,-1.531707,-0.081209


# E2) MinMaxScaler
MinMaxScaler squeezes values into a fixed range (usually 0 to 1). It’s useful when you want all numeric features on the same bounded scale.

In [35]:
# Create a MinMaxScaler object.
#
# MinMaxScaler rescales numeric features to a fixed range, typically between 0 and 1.
#
# Formula:
# (value - min) / (max - min)
#
# This preserves the shape of the distribution but changes the scale of the values.

scaler = MinMaxScaler()


# Fit the scaler on the training data and transform it.
#
# We fit only on training data to avoid data leakage.

X_train_scaled = scaler.fit_transform(X_train[num_cols])


# Transform the test data using the same learned min/max values.
# We do NOT fit again on the test data.

X_test_scaled = scaler.transform(X_test[num_cols])


# Convert the scaled NumPy array back into a DataFrame and restore column names for readability.

pd.DataFrame(X_train_scaled, columns=num_cols).head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,0.033186,0.167808,0.444444,0.615942,0.215057
1,0.030555,0.130137,0.555556,0.876812,0.130769
2,0.034948,0.157534,0.444444,0.275362,0.130278
3,0.027577,0.133562,0.444444,0.471014,0.11964
4,0.017294,0.099315,0.444444,0.376812,0.167921


# E3) RobustScaler
RobustScaler uses median and IQR instead of mean/std, so it’s less sensitive to outliers (extreme values).

In [36]:
# Create a RobustScaler object.
#
# RobustScaler scales features using the median and the interquartile range (IQR),
# instead of the mean and standard deviation.
#
# Formula conceptually:
# (value - median) / IQR
#
# This makes it less sensitive to extreme values (outliers) compared to StandardScaler.

scaler = RobustScaler()

# Fit the scaler on the training data and transform it.
#
# Fitting only on training data prevents data leakage.

X_train_scaled = scaler.fit_transform(X_train[num_cols])

# Transform the test data using the same median and IQR learned from the training data.

X_test_scaled = scaler.transform(X_test[num_cols])

# Convert the scaled NumPy array back into a DataFrame and restore the column names for clarity.

pd.DataFrame(X_train_scaled, columns=num_cols).head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,-0.291776,0.0,-0.5,-0.3125,0.628912
1,-0.428667,-0.52381,0.0,0.4375,-0.394436
2,-0.200109,-0.142857,-0.5,-1.291667,-0.400397
3,-0.583551,-0.47619,-0.5,-0.729167,-0.529558
4,-1.118473,-0.952381,-0.5,-1.0,0.056632


# E4) MaxAbsScaler
MaxAbsScaler scales each feature by its maximum absolute value. It’s often used when working with sparse data, but still useful to know.

In [37]:
# Create a MaxAbsScaler object.
#
# MaxAbsScaler scales each numeric feature by dividing
# by its maximum absolute value.
#
# Formula conceptually:
# value / max(|value|)
#
# This rescales features into the range [-1, 1] while preserving zero values and sparsity.
#
# It is especially useful for sparse data, but can also be applied to general numeric features.

scaler = MaxAbsScaler()


# Fit the scaler on the training data and transform it.
#
# Fitting only on training data prevents data leakage.

X_train_scaled = scaler.fit_transform(X_train[num_cols])


# Transform the test data using the same scaling parameters learned from the training data.

X_test_scaled = scaler.transform(X_test[num_cols])

# Convert the scaled NumPy array back into a DataFrame and restore the column names for readability.

pd.DataFrame(X_train_scaled, columns=num_cols).head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,0.039025,0.223642,0.5,0.973632,0.215057
1,0.03641,0.188498,0.6,0.991542,0.130769
2,0.040777,0.214058,0.5,0.950249,0.130278
3,0.03345,0.191693,0.5,0.963682,0.11964
4,0.023229,0.159744,0.5,0.957214,0.167921


# E5) Normalizer
Normalizer scales each row (each sample) so the row length becomes 1. This is more common in text/vector problems, but it’s good to understand what it does.

In [38]:
# Create a Normalizer object.
#
# Normalizer rescales each individual row (sample), not each column (feature).
#
# It adjusts the values in a row so that the entire row has a unit length (magnitude = 1).
#
# This is commonly used in vector-based data
# (e.g., text features), but can be applied to numeric data as well.

normalizer = Normalizer()


# Replace missing values with 0 before normalization.
#
# Normalizer cannot handle missing values,
# so we temporarily fill them with 0 to allow computation.

X_train_norm = normalizer.fit_transform(X_train[num_cols].fillna(0))


# Transform the test data using the same normalization rule.
# Unlike scalers, Normalizer does not learn statistics like mean or min/max.
# It simply rescales each row independently.

X_test_norm = normalizer.transform(X_test[num_cols].fillna(0))


# Convert the normalized NumPy array back into a DataFrame
# and restore column names for readability.

pd.DataFrame(X_train_norm, columns=num_cols).head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,0.962778,0.008023,0.000573,0.224304,0.150606
1,0.96443,0.007261,0.000738,0.245261,0.098326
2,0.97329,0.00743,0.000554,0.211802,0.088269
3,0.961026,0.008009,0.000667,0.258543,0.097571
4,0.916591,0.009166,0.000917,0.352704,0.188084


# F) Categorical encoding

# F1) pd.get_dummies(..., drop_first=True)
One-hot encoding turns categories into 0/1 columns. drop_first=True avoids creating redundant columns (helps reduce collinearity).

In [39]:
# Apply one-hot encoding to categorical columns using pandas.
#
# pd.get_dummies() converts categorical variables into binary (0/1) columns.
# For each unique category in a column, a new column is created.
#
# columns=cat_cols specifies which columns should be encoded.
# Only the categorical columns are transformed.
#
# drop_first=True removes the first category from each encoded feature.
# This helps prevent multicollinearity (also known as the dummy variable trap),
# which is especially important for Linear and Logistic Regression.
#
# The result is a new DataFrame where categorical features
# are replaced with numeric 0/1 indicator columns.

X_dummies = pd.get_dummies(X, columns=cat_cols, drop_first=True)


# Display the first few rows to observe the new encoded structure.

X_dummies.head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,...,HouseStyle_SLvl,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM
0,8450,65.0,7,2003,856,False,False,False,False,True,...,False,True,False,False,False,False,False,False,True,False
1,9600,80.0,6,1976,1262,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
2,11250,68.0,7,2001,920,False,False,False,False,True,...,False,True,False,False,False,False,False,False,True,False
3,9550,60.0,7,1915,756,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
4,14260,84.0,8,2000,1145,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False


# F2) OneHotEncoder(handle_unknown="ignore")
This is the scikit-learn version of one-hot encoding. handle_unknown="ignore" prevents errors if the test set has a category never seen in training.

In [40]:
# Create a OneHotEncoder object.
#
# OneHotEncoder converts categorical variables into binary (0/1) columns,
# similar to pd.get_dummies(), but designed for use in machine learning pipelines.
#
# handle_unknown="ignore" ensures that if new categories appear in future data (e.g., test set or production),
# the encoder will not raise an error.
#
# sparse_output=False returns a dense NumPy array instead of a sparse matrix,
# making it easier to view and convert into a DataFrame for demonstration.

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)


# Fit the encoder on the categorical columns and transform them.
#
# NOTE: Here we fit on the full dataset only for demonstration purposes.
# In real machine learning workflows, we should fit only on training data to avoid data leakage.

X_ohe = ohe.fit_transform(X[cat_cols])


# Convert the encoded NumPy array into a DataFrame.
#
# get_feature_names_out() retrieves the generated column names
# (e.g., Neighborhood_CollgCr, HouseStyle_2Story, etc.)
# so the output is interpretable.

pd.DataFrame(X_ohe, columns=ohe.get_feature_names_out(cat_cols)).head()


Unnamed: 0,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,...,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_nan,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# F3) OrdinalEncoder with unknown handling
Ordinal encoding replaces categories with integers (e.g., “A”→0, “B”→1). It’s compact, but the model may assume an order that doesn’t really exist—so use it carefully.

In [41]:
# Create an OrdinalEncoder object.
#
# OrdinalEncoder converts each category into an integer value.
#
# handle_unknown="use_encoded_value" ensures that if a new, previously unseen category
# appears during transformation, the encoder will not raise an error.
#
# unknown_value=-1 assigns the value -1 to any unseen category.
# This prevents failures during inference on new data.

ord_enc = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1)

# Fit the encoder on the categorical columns and transform them.
#
# Unlike one-hot encoding, this produces one numeric column per feature.

X_ord = ord_enc.fit_transform(X[cat_cols])


# Convert the encoded NumPy array back into a DataFrame
# and restore the original column names for readability.

pd.DataFrame(X_ord, columns=cat_cols).head()


Unnamed: 0,Neighborhood,HouseStyle,GarageType,MSZoning
0,5.0,5.0,1.0,3.0
1,24.0,2.0,1.0,3.0
2,5.0,5.0,1.0,3.0
3,6.0,5.0,5.0,3.0
4,15.0,5.0,1.0,3.0


# G) Train/test split + fit only on train


# G1) train_test_split(..., random_state=...) for regression
This creates a repeatable split (same result every run). Great for consistent demos and debugging.

In [42]:
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)
(X_train_r.shape, X_test_r.shape)

((1168, 9), (292, 9))

# G2) train_test_split(..., stratify=y) for classification
Stratification keeps the class balance similar in train and test. This is important when classes are imbalanced.

In [43]:
# stratify=y_clf ensures that the class distribution
# (proportion of 0s and 1s) remains similar in both the training and test sets.
# This is especially important for classification tasks, particularly when classes are imbalanced.

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# Compute the mean of the binary target in both sets.
#
# Since the target is 0 and 1, the mean represents the proportion of class 1 (HighPrice).
#
# Similar values confirm that stratification worked correctly.

(y_train_c.mean(), y_test_c.mean())

(np.float64(0.4982876712328767), np.float64(0.5))

# G3) Fit transformer on train only (example: imputer)
We learn imputation values from the training set only. Then we apply the same learned values to the test set. This prevents test information from influencing training.

In [44]:
# Create a SimpleImputer with strategy="median".
#
# This means missing values in numeric columns
# will be replaced with the median (middle value) of each column.

imp = SimpleImputer(strategy="median")


# Fit the imputer on the training data and transform it.
#
# Fitting only on training data prevents data leakage.

X_train_num_imp = imp.fit_transform(X_train_r[num_cols])


# Apply the same learned medians to the test data.
# We do NOT call fit() again on the test set.

X_test_num_imp = imp.transform(X_test_r[num_cols])


# Convert the imputed NumPy array back into a DataFrame
# and restore the original column names for readability.

pd.DataFrame(X_train_num_imp, columns=num_cols).head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,8400.0,70.0,5.0,1957.0,1314.0
1,7837.0,59.0,6.0,1993.0,799.0
2,8777.0,67.0,5.0,1910.0,796.0
3,7200.0,60.0,5.0,1937.0,731.0
4,5000.0,50.0,5.0,1924.0,1026.0


# G4) Fit transformer on train only (example: scaler)
Scaling must be fit on training data only. Otherwise, the test set influences the scaling parameters (data leakage).

In [45]:
# Create a StandardScaler object.

scaler = StandardScaler()


# Replace missing values with 0 temporarily before scaling.
#
# StandardScaler cannot handle missing values directly,
# so we fill them here to allow the transformation.
# (In practice, imputation should ideally be done separately
# before scaling using a proper imputer.)

X_train_num_scaled = scaler.fit_transform(X_train_r[num_cols].fillna(0))


# Transform the test data using the same scaling parameters learned from the training data.
# We do NOT call fit() on the test data to prevent data leakage.

X_test_num_scaled = scaler.transform(X_test_r[num_cols].fillna(0))


# Convert the scaled NumPy array back into a DataFrame
# and restore the original column names for readability.

pd.DataFrame(X_train_num_scaled, columns=num_cols).head()


Unnamed: 0,LotArea,LotFrontage,OverallQual,YearBuilt,TotalBsmtSF
0,-0.212896,0.359535,-0.820445,-0.455469,0.572612
1,-0.265245,0.048743,-0.088934,0.718609,-0.596547
2,-0.177841,0.274773,-0.820445,-1.988293,-0.603357
3,-0.324474,0.076997,-0.820445,-1.107734,-0.750921
4,-0.529035,-0.205542,-0.820445,-1.531707,-0.081209


# H) Prebuilt sklearn workflow building blocks

# H1) ColumnTransformer (numeric pipeline + categorical pipeline)
Different columns need different preprocessing: numbers get imputed + scaled, categories get imputed + one-hot encoded. ColumnTransformer applies the right steps to the right columns.

In [46]:
# Define the list of numeric and categorical features.
#
# This separation allows us to apply different preprocessing steps to different types of data.

numeric_features = num_cols
categorical_features = cat_cols.tolist()


# Create a preprocessing pipeline for numeric features.
#
# Step 1: Impute missing values using the median.
# Step 2: Scale the features using StandardScaler.
#
# These steps will be applied sequentially to numeric columns.

numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])


# Create a preprocessing pipeline for categorical features.
#
# Step 1: Impute missing values using the most frequent category.
# Step 2: Apply one-hot encoding to convert categories into 0/1 columns.

categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])


# Combine both pipelines using ColumnTransformer.
#
# ("num", numeric_pipe, numeric_features)
#   → Apply the numeric pipeline to numeric columns.
#
# ("cat", categorical_pipe, categorical_features)
#   → Apply the categorical pipeline to categorical columns.
#
# remainder="drop" ensures that any columns not specified
# are removed from the output.
#
# This creates a single preprocessing object
# that can handle mixed data types automatically.

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_features),
        ("cat", categorical_pipe, categorical_features)
    ],
    remainder="drop"
)


# Display the full preprocessing configuration.

preprocess



## **Note**
A **Pipeline** chains multiple preprocessing steps (and optionally a model) into a single object.

It ensures that:

- Steps run in order

- The same transformations are applied consistently

- There is no data leakage

- Code stays clean and reusable

**ColumnTransformer** applies different preprocessing pipelines to different columns.

**Pipeline** = sequential steps

**ColumnTransformer** = parallel processing of column groups

Together they create a structured preprocessing system.

You only need **ColumnTransformer** when: You have different types of columns and they need different preprocessing steps

**Important:** Defining Pipeline or ColumnTransformer does NOT process data.
It only sets up the structure.

# H2) Full Pipeline: preprocessing + LinearRegression
A pipeline bundles preprocessing and modeling together. It ensures the same preprocessing is applied consistently, and it prevents leakage because the pipeline is fit only on training data.

In [49]:
# Create a full pipeline that combines preprocessing and the model.
#
# Step 1: "preprocess" applies all transformations defined earlier
#         (imputation, scaling, encoding).
# Step 2: "model" applies LinearRegression to the transformed data.
#
# This ensures that preprocessing and modeling are executed
# sequentially and consistently.

lin_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LinearRegression())
])


# Fit the entire pipeline on the training data.
#
# During fit():
# 1. The preprocessing steps learn required statistics from X_train_r
#    (e.g., medians, scaling parameters, category mappings).
# 2. The transformed data is passed to the LinearRegression model.
# 3. The model learns relationships between features and y_train_r.

lin_pipe.fit(X_train_r, y_train_r)


# Generate predictions on the test data.
#
# During predict():
# 1. The test data is transformed using the SAME preprocessing rules
#    learned from training data.
# 2. The trained model makes predictions on the transformed test data.

y_pred = lin_pipe.predict(X_test_r)


# Evaluate model performance.
#
# R^2 measures how much variance in the target is explained by the model (higher is better).
#
# RMSE (Root Mean Squared Error) measures the average prediction error in the same units as the target.
# Lower RMSE indicates better performance.

print("R^2:", r2_score(y_test_r, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test_r, y_pred)))


R^2: 0.7953218012731841
RMSE: 39622.5762884514


# H3) Full Pipeline: preprocessing + LogisticRegression
Logistic regression is sensitive to scaling, so the numeric scaling inside the pipeline is especially helpful. This pipeline handles missing values, encoding, and scaling automatically.

In [50]:
# Create a full pipeline that combines preprocessing and Logistic Regression.
#
# Step 1: "preprocess" applies all transformations
#         (imputation, scaling, encoding).
# Step 2: "model" applies LogisticRegression for binary classification.
#
# max_iter=2000 increases the maximum number of iterations
# allowed for the optimization algorithm to converge.
# This is helpful when working with scaled and encoded data.

log_pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=2000))
])


# Fit the entire pipeline on the training data.
#
# During fit():
# 1. Preprocessing steps learn parameters from X_train_c.
# 2. The transformed training data is passed to the Logistic Regression model.
# 3. The model learns how features relate to the binary target y_train_c.

log_pipe.fit(X_train_c, y_train_c)


# Generate predictions on the test data.
#
# During predict():
# 1. The test data is transformed using the SAME preprocessing rules.
# 2. The trained logistic model outputs predicted class labels (0 or 1).

y_pred = log_pipe.predict(X_test_c)


# Evaluate classification performance.
#
# Accuracy measures the proportion of correct predictions.
#
# classification_report provides detailed metrics:
# - Precision
# - Recall
# - F1-score
# - Support (number of samples per class)

print("Accuracy:", accuracy_score(y_test_c, y_pred))
print(classification_report(y_test_c, y_pred))


Accuracy: 0.886986301369863
              precision    recall  f1-score   support

           0       0.90      0.88      0.89       146
           1       0.88      0.90      0.89       146

    accuracy                           0.89       292
   macro avg       0.89      0.89      0.89       292
weighted avg       0.89      0.89      0.89       292



# I) Saving artifacts


# I1) joblib.dump(...) — save the trained pipeline
Saving lets you reuse the exact same preprocessing + model later without retraining. This is how real ML systems stay consistent in production.

In [51]:
# Save the trained pipelines to disk.
#
# joblib.dump() serializes (saves) the entire pipeline object,
# including:
# - All preprocessing steps (imputers, scalers, encoders)
# - The trained machine learning model
# - All learned parameters
#
# This allows the exact same pipeline to be loaded later
# for prediction without retraining.
#
# The file extension ".joblib" is commonly used for saved
# scikit-learn models and preprocessing objects.

joblib.dump(log_pipe, "logistic_preprocess_pipeline.joblib")
joblib.dump(lin_pipe, "linear_preprocess_pipeline.joblib")

print("Saved pipelines to disk.")


Saved pipelines to disk.


**Note:** The file is saved in the current working directory (usually the same folder where your notebook is running).

If you want it to be saved in a particular directory, follow the method below.

In [52]:
# Using Path to save it in another directory if required

from pathlib import Path

model_dir = Path("/content/drive/MyDrive/ML")

# Define full paths
log_path = model_dir / "logistic_preprocess_pipeline.joblib"
lin_path = model_dir / "linear_preprocess_pipeline.joblib"

# Save models
joblib.dump(log_pipe, log_path)
joblib.dump(lin_pipe, lin_path)

print("Saved pipelines to:", model_dir.resolve())


Saved pipelines to: /content/drive/MyDrive/ML


# I2) joblib.load(...) — load the pipeline and predict again
Loading proves your preprocessing and model can be reused exactly as-is. This is essential for deployment and reproducibility.

In [53]:
# Load the previously saved pipeline objects from disk.
#
# joblib.load() reads the serialized pipeline file
# and restores the entire object exactly as it was saved,
# including:
# - Preprocessing steps
# - Learned parameters
# - Trained model weights

loaded_log_pipe = joblib.load("logistic_preprocess_pipeline.joblib")
loaded_lin_pipe = joblib.load("linear_preprocess_pipeline.joblib")


# Use the loaded pipelines to make predictions.
#
# Even though we only provide raw feature data (X_test),
# the pipeline automatically:
# 1. Applies preprocessing (imputation, scaling, encoding)
# 2. Passes the transformed data into the trained model
# 3. Returns predictions
#
# This demonstrates that the saved pipeline
# preserves the entire preprocessing + modeling workflow.

print("Logistic predictions (first 5):", loaded_log_pipe.predict(X_test_c.head()))
print("Linear predictions (first 5):", loaded_lin_pipe.predict(X_test_r.head()))


Logistic predictions (first 5): [0 1 0 1 1]
Linear predictions (first 5): [159094.60987531 325807.48417395 117873.40409151 154493.38232904
 278289.12438333]
