# House price prediction dataset preprocessing

In [2]:
import pandas as pd
df = pd.read_csv("house_price_train.csv",encoding="utf-8",delimiter=",")
print(df.head(5))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

#Data Preprocessing

In [3]:
#Handling null values
#Can use ffill but may lead to baised prediction of house price
#So, let's seperate categorical and numerical columns
print("Datatypes group by:")
print(df.columns.to_series().groupby(df.dtypes).apply(list))
num_fea = df.select_dtypes(include=["number"]).columns.tolist()
cat_fea = df.select_dtypes(include=["object","category"]).columns.tolist()
df[num_fea]=df[num_fea].fillna(df[num_fea].median())
df[cat_fea]=df[cat_fea].fillna("Unknown")

Datatypes group by:
int64      [Id, MSSubClass, LotArea, OverallQual, Overall...
float64               [LotFrontage, MasVnrArea, GarageYrBlt]
object     [MSZoning, Street, Alley, LotShape, LandContou...
dtype: object


In [4]:
#Encode categorical features
pd.get_dummies(df,columns=cat_fea,drop_first=True)
print(df.head(5))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street    Alley LotShape  \
0   1          60       RL         65.0     8450   Pave  Unknown      Reg   
1   2          20       RL         80.0     9600   Pave  Unknown      Reg   
2   3          60       RL         68.0    11250   Pave  Unknown      IR1   
3   4          70       RL         60.0     9550   Pave  Unknown      IR1   
4   5          60       RL         84.0    14260   Pave  Unknown      IR1   

  LandContour Utilities  ... PoolArea   PoolQC    Fence MiscFeature MiscVal  \
0         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
1         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
2         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
3         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   
4         Lvl    AllPub  ...        0  Unknown  Unknown     Unknown       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2

In [5]:
#Data Splitting
from sklearn.model_selection import train_test_split
X=df.drop("SalePrice",axis=1)
y=df["SalePrice"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
#stratify=y maintains the same class distribution in train and test sets.
#random_state=42 The number 42 is just a popular arbitrary choice among programmers

## Feature Selection

## Filter Methods for Feature Selection

### Basic Filter Methods
1. Constant Features Removal  
2. Quasi-Constant Features Removal  
3. Duplicate Features Removal  

### Statistical Filter Methods
1. Pearson Correlation  
2. Spearman Rank Correlation  
3. ANOVA F-test  
4. Mutual Information (regression)  
5. Chi-square Test  
6. Mutual Information (classification)  
7. Cramér’s V



In [7]:
# Basic Filter methods
# Identify categorical and numerical columns
cat_fea = X_train.select_dtypes(include=["object"]).columns.tolist()
num_fea = X_train.select_dtypes(include=["number"]).columns.tolist()

# 1. Encoding categorical features using get_dummies
X_train = pd.get_dummies(X_train, columns=cat_fea, drop_first=True)
X_train = X_train.astype(int)  # Optional: converts uint8 to int

X_test = pd.get_dummies(X_test, columns=cat_fea, drop_first=True)
X_test = X_test.astype(int)  # Optional

# Align columns of test set to train set (add missing columns with zeros)
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0
# Reorder test columns to match train
X_test = X_test[X_train.columns]

print(df.dtypes)  # Verify all are numeric

# 2. Removing constant features
const = []
for features in X_train:
    if X_train[features].std() == 0:
        const.append(features)
print("Number of constant features:", len(const))
X_train.drop(labels=const, axis=1, inplace=True)
X_test.drop(labels=const, axis=1, inplace=True)  # Apply same removal to X_test

# 3. Removing quasi-constant features
quasi_constant = []
for feature in X_train.columns:
    predominant = (X_train[feature].value_counts() / float(len(X_train))).sort_values(ascending=False).values[0]
    if predominant > 0.999:
        quasi_constant.append(feature)
print("Number of quasi constant features:", len(quasi_constant))
X_train.drop(labels=quasi_constant, axis=1, inplace=True)
X_test.drop(labels=quasi_constant, axis=1, inplace=True)  # Apply same removal to X_test

# 4. Removing duplicated features
duplicates = []
for i in range(len(X_train.columns)):
    col1 = X_train.columns[i]
    for col2 in X_train.columns[i+1:]:
        if X_train[col1].equals(X_train[col2]):  # Checks content equality
            duplicates.append(col2)
print("Number of duplicate features:", len(duplicates))
X_train.drop(labels=duplicates, axis=1, inplace=True)
X_test.drop(labels=duplicates, axis=1, inplace=True, errors='ignore')  # Ignore missing columns to avoid KeyError



Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object
Number of constant features: 0
Number of quasi constant features: 0
Number of duplicate features: 16


In [8]:
# Statistical Filter Methods
# Even though we’ve already one-hot encoded everything, statistical filter methods still differ based on original feature type
# Target is SalePrice (numerical), and this is a regression problem

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from scipy.stats import pearsonr, spearmanr
import pandas as pd
import numpy as np

# Separate numerical and categorical features based on unique values in X_train
num_features = [col for col in X_train.columns if not set(X_train[col].unique()).issubset({0, 1})]
cat_features = [col for col in X_train.columns if set(X_train[col].unique()).issubset({0, 1})]

# 1. Numerical input and numerical output → Pearson Correlation and Spearman Rank Correlation
pearson_corr = []
spearman_corr = []

for col in num_features:
    p_corr, _ = pearsonr(X_train[col], y_train)
    s_corr, _ = spearmanr(X_train[col], y_train)
    pearson_corr.append((col, p_corr))
    spearman_corr.append((col, s_corr))

pearson_df = pd.DataFrame(pearson_corr, columns=["Feature", "PearsonCorr"]).set_index("Feature")
spearman_df = pd.DataFrame(spearman_corr, columns=["Feature", "SpearmanCorr"]).set_index("Feature")

print("Top 10 features by absolute Pearson correlation:")
print(pearson_df["PearsonCorr"].abs().sort_values(ascending=False).head(10))

print("\nTop 10 features by absolute Spearman correlation:")
print(spearman_df["SpearmanCorr"].abs().sort_values(ascending=False).head(10))

# 2. Numerical input and numerical output → ANOVA F-test (f_regression)
f_values, p_values = f_regression(X_train[num_features], y_train)
anova_df = pd.DataFrame({
    "Numerical Features": num_features,
    "F_values": f_values,
    "P_values": p_values
})
anova_df.sort_values(by="P_values", inplace=True)
significant_numeric_features = anova_df[anova_df["P_values"] < 0.05]["Numerical Features"].tolist()
print("\nSignificant numerical features from ANOVA (p<0.05):")
print(significant_numeric_features)

# 3. Categorical input and numerical output → Mutual Information
# Note: chi2 is not suitable here because it expects positive discrete data and categorical output
selector_mi = SelectKBest(score_func=mutual_info_regression, k="all")
selector_mi.fit(X_train[cat_features], y_train)
mi_scores = selector_mi.scores_

mi_df = pd.DataFrame({
    "Categorical Features": cat_features,
    "Mutual_Info_Scores": mi_scores
})
mi_df.sort_values(by="Mutual_Info_Scores", ascending=False, inplace=True)

# Select categorical features with MI > 0 (non-zero dependence)
significant_cat_features = mi_df[mi_df["Mutual_Info_Scores"] > 0]["Categorical Features"].tolist()
print("\nSignificant categorical features from Mutual Information (MI > 0):")
print(significant_cat_features)

# Final selected features: union of significant numerical and categorical features
final_selected_features = list(set(significant_numeric_features + significant_cat_features))

# Filter the datasets to keep only selected features
X_train_filtered = X_train[final_selected_features]
for col in final_selected_features:
    if col not in X_test.columns:
        X_test[col] = 0
X_test_filtered = X_test[final_selected_features]

Top 10 features by absolute Pearson correlation:
Feature
OverallQual     0.784720
GrLivArea       0.689238
GarageCars      0.642689
GarageArea      0.621937
TotalBsmtSF     0.590017
1stFlrSF        0.583132
FullBath        0.549164
TotRmsAbvGrd    0.519634
YearBuilt       0.512206
YearRemodAdd    0.512190
Name: PearsonCorr, dtype: float64

Top 10 features by absolute Spearman correlation:
Feature
OverallQual     0.804608
GrLivArea       0.722232
GarageCars      0.684316
YearBuilt       0.636620
GarageArea      0.636605
FullBath        0.619064
TotalBsmtSF     0.594487
YearRemodAdd    0.571357
1stFlrSF        0.565796
GarageYrBlt     0.549312
Name: SpearmanCorr, dtype: float64

Significant numerical features from ANOVA (p<0.05):
['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'Fireplaces', 'GarageYrBlt', 'MasVnrArea', 'BsmtFinSF1', 'WoodDeckSF', 'LotFrontage', '2ndFlrSF', 'HalfBath', 'OpenPorchS

In [9]:
#Dataset after preprocessing and feature selection
print(X_train.head(10))

        Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  \
135    136          20           80    10400            7            6   
1452  1453         180           35     3675            5            5   
762    763          60           72     8640            7            5   
932    933          20           84    11670            9            5   
435    436          60           43    10667            7            6   
629    630          80           82     9020            6            5   
1210  1211          60           70    11218            6            5   
1118  1119          80           85    13825            5            6   
1084  1085          60           69    13031            6            5   
158    159          60          100    12552            7            5   

      YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  SaleType_Con  \
135        1970          1970         288           0  ...             0   
1452       2005          2005    