In [215]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import plotly.express as px
import seaborn as sns

# Load dataset
df = pd.read_csv('train.csv')  # Replace with your actual file path


In [216]:
# Select only numerical columns with NaN values
numerical_missing = df.select_dtypes(include='number').columns[df.select_dtypes(include='number').isnull().any()]

print(numerical_missing)

Index(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], dtype='object')


In [217]:
# Select only object or category columns
cat_cols = df.select_dtypes(include=["object", "category"]).columns

# Fill NaN values in only those columns
df[cat_cols] = df[cat_cols].fillna("Missing")

## 1.1 convert types

In [218]:
#from Id	LotArea	LotShape	BldgType	HouseStyle	BsmtFinSF1	BsmtFinType2	BsmtFinSF2	BsmtUnfSF	TotalBsmtSF	1stFlrSF	2ndFlrSF	
#GrLivArea	BsmtFullBath	BsmtHalfBath	FullBath	HalfBath	BedroomAbvGr	KitchenAbvGr	TotRmsAbvGrd	GarageCars	GarageArea
#we drop

df[['MSSubClass', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'YrSold']] = df[['MSSubClass', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'YrSold']].astype('object')

In [219]:
threshold = 0.30  # 30%

# Get missing value ratio per column
missing_ratio = df.isna().mean()

# Identify columns to drop
missing_ratio[missing_ratio > threshold].index.tolist()

[]

Keeping features with more than 30% missing values can still improve your model score if those features:

1. Have Strong Predictive Power
Even if many values are missing, the non-missing values might strongly correlate with house prices. For example:

A rare feature like "Luxury Finish Quality" might be present in only 40% of houses but highly predictive of high prices.

2. Missingness Is Informative
The fact that a value is missing can itself be predictive. For instance:

"Pool Size" missing might imply no pool → possibly lower price.

So, missing = meaningful, not just a problem.

3. Effective Imputation
If you handle missing values well (e.g., with mean/median, KNN, or even a separate "missing" category), the feature can still contribute positively without hurting the model.

4. Regularized Models Can Handle Noise
Algorithms like XGBoost, LightGBM, and Random Forest can handle missing values internally or tolerate noisy features due to their robustness.

🔑 In Short:
Dropping features just based on missingness may cause you to lose valuable information. If the signal outweighs the noise, even sparse features can boost prediction accuracy.

## 1.4 keep means for missing quantitative + use them train

In [220]:
#save numeric means for missing test
means = df.select_dtypes(include='number').mean()

#imputer numeric par moyenne dans train
df = df.fillna(0)

  df = df.fillna(0)


## 1.5 create variables

In [221]:
df_dropped = df.copy()

#'YearBuilt', 'YearRemodAdd, 'MoSold', 'YrSold' convert to quantitative
# List of columns to convert
columns_to_convert = ['YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold']

# Convert each to integer safely
for col in columns_to_convert:
    df_dropped.loc[:, col] = df_dropped[col].astype(int)

# Set reference year
reference_year = df_dropped['YrSold'].max()

# Create quantitative features safely
df_dropped.loc[:, 'HouseAge'] = reference_year - df_dropped['YearBuilt']
df_dropped.loc[:, 'YearsSinceRemod'] = reference_year -df_dropped['YearRemodAdd']
df_dropped.loc[:, 'TimeIndex'] = (
    (df_dropped['YrSold'] - df_dropped['YrSold'].min()) * 12 + df_dropped['MoSold']
)

df_dropped = df_dropped.drop([
    'YearBuilt', 'YearRemodAdd', 'YrSold', 'MoSold'
], axis=1) 

# Convert object columns to numeric, forcing errors to NaN
cols_to_convert = ['HouseAge', 'YearsSinceRemod', 'TimeIndex']
df_dropped[cols_to_convert] = df_dropped[cols_to_convert].apply(pd.to_numeric, errors='coerce')

## 1.6 correct skewness

In [222]:
#case 1: only binary no log

#Most houses don't have low-quality finished square footage.
#Very few do, with highly varied amounts.
cols_to_binary_only_0 = ['BsmtHalfBath', 'EnclosedPorch', 'ScreenPorch']

for col in cols_to_binary_only_0:
    df_dropped[f'Has{col}'] = (df_dropped[col] > 0).astype(int)
    df_dropped.drop(columns=[col], inplace=True)

In [223]:
    

# = 1 or not

df_dropped['HasKitchen'] = (df_dropped['KitchenAbvGr'] == 1).astype(int)
df_dropped.drop(columns=['KitchenAbvGr'], inplace=True)


#case 2: binary+log

cols = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '2ndFlrSF', 'WoodDeckSF', 'OpenPorchSF']  # Replace with actual column names

for col in cols:
    df_dropped[f'Has{col}'] = (df_dropped[col] > 0).astype(int)
    df_dropped[f'{col}_log'] = np.log1p(df_dropped[col])

In [224]:
for col in cols:
    non_numeric = df_dropped[col].apply(lambda x: not pd.api.types.is_number(x))
    if non_numeric.any():
        print(f"Column '{col}' contains non-numeric values:")
        print(df_dropped.loc[non_numeric, col].unique())

In [225]:
#case 3: just drop 
df_dropped.drop(columns=['LowQualFinSF','3SsnPorch', 'PoolArea', 'MiscVal'], inplace=True)

# case 4: log only

#correlation = df_cat_quant['HouseAge'].corr(df_cat_quant['YearsSinceRemod'], method='pearson')
#print(f"Pearson correlation: {correlation:.3f}")
df_dropped.drop(columns=[ 'TimeIndex'], inplace=True)# cyclique, pas significative

# List of variables to transform
vars_to_log = ['LotFrontage', 'LotArea', 'TotalBsmtSF', 'GrLivArea',  'SalePrice', 'BsmtUnfSF', '1stFlrSF']

# Create log-transformed versions with "_log" suffix
for col in vars_to_log:
    df_dropped[col + '_log'] = np.log1p(df_dropped[col])  # log1p handles zero safely

# Drop the original columns
df_dropped.drop(columns=vars_to_log, inplace=True)

### cap

## 1.7 cap outliers

In [226]:
# cap outliers
#outliers cap

num_features = [col for col in df_dropped.select_dtypes(include='number') if col != 'Id']
 
# Store the limits for each column
caps = {}

for col in num_features:
    q_low = df_dropped[col].quantile(0.01)
    q_high = df_dropped[col].quantile(0.99)
    
    # Save the thresholds
    caps[col] = (q_low, q_high)
    
    # Apply clipping
    df_dropped[col] = df_dropped[col].clip(lower=q_low, upper=q_high)



In [227]:
print(list(df_dropped.columns)) 

['Id', 'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '2ndFlrSF', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'HouseAge', 'YearsSinceRemod', 'HasBsmtHalfBath', 'HasEnclosedPorch', 'HasScreenPorch', 'HasKitchen', 'HasMasVnrArea', 'MasVnrArea_log', 'HasBsmtFinSF1', 'Bsm

In [228]:
df_dropped['HasScreenPorch'].dtype

dtype('int32')

In [229]:
import pandas as pd

# Assume df is your DataFrame
str_columns = df_dropped.select_dtypes(include='object').columns.tolist()

print(str_columns)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


## 1.8 Categorical variable (missing+rare categories)

In [230]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

X_train = df_dropped.drop(['Id','SalePrice_log'],axis=1)
y_train = df_dropped.SalePrice_log


# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"]


# Preprocessing for categorical data
#categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'  # <-- this keeps the numerical columns
)

model_GBR =  GradientBoostingRegressor(n_estimators=1100, loss='squared_error', subsample = 0.35, learning_rate = 0.05,random_state=1)
GBR_Pipeline = Pipeline(steps=[('preprocessor', preprocessor),('model', model_GBR)])
GBR_Pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [231]:
preprocessor

In [232]:
# 1. Get column names from numerical and categorical features
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

# 2. Get the OneHotEncoder from the pipeline
ohe = GBR_Pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']

# 3. Get one-hot encoded feature names
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)

# 4. Combine with numerical columns
all_feature_names = list(numerical_cols) + list(ohe_feature_names)

# Print result
print(f"Number of features used: {len(all_feature_names)}")
print("Example feature names:", all_feature_names)

Number of features used: 299
Example feature names: ['MSSubClass', 'OverallQual', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', '2ndFlrSF', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'HouseAge', 'YearsSinceRemod', 'MasVnrArea_log', 'BsmtFinSF1_log', 'BsmtFinSF2_log', 'BsmtUnfSF_log', '2ndFlrSF_log', 'WoodDeckSF_log', 'OpenPorchSF_log', 'LotFrontage_log', 'LotArea_log', 'TotalBsmtSF_log', 'GrLivArea_log', '1stFlrSF_log', 'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'Alley_Grvl', 'Alley_Missing', 'Alley_Pave', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_Bnk', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'Utilities_AllPub', 'Utilities_NoSeWa', 'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'LandSlope_Gtl', 'La

In [233]:
 [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

['MSSubClass',
 'OverallQual',
 'OverallCond',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 '2ndFlrSF',
 'BsmtFullBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'HouseAge',
 'YearsSinceRemod',
 'MasVnrArea_log',
 'BsmtFinSF1_log',
 'BsmtFinSF2_log',
 'BsmtUnfSF_log',
 '2ndFlrSF_log',
 'WoodDeckSF_log',
 'OpenPorchSF_log',
 'LotFrontage_log',
 'LotArea_log',
 'TotalBsmtSF_log',
 'GrLivArea_log',
 '1stFlrSF_log']

## 1.9 predict

# Test Quantitative columns (outliers capping...)

In [234]:
# test

#from Id	LotArea	LotShape	BldgType	HouseStyle	BsmtFinSF1	BsmtFinType2	BsmtFinSF2	BsmtUnfSF	TotalBsmtSF	1stFlrSF	2ndFlrSF	
#GrLivArea	BsmtFullBath	BsmtHalfBath	FullBath	HalfBath	BedroomAbvGr	KitchenAbvGr	TotRmsAbvGrd	GarageCars	GarageArea
#we drop
d_test = pd.read_csv('test.csv') 

In [235]:
# Select only numerical columns with NaN values
numerical_missing = d_test.select_dtypes(include='number').columns[d_test.select_dtypes(include='number').isnull().any()]

print(numerical_missing)

Index(['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea'],
      dtype='object')


## 2.1 convert types

In [236]:
d_test[['MSSubClass', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'YrSold']] = d_test[['MSSubClass', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'YrSold']].astype('object')

## 2.2 drop most missing

In [237]:
# Select only object or category columns
cat_cols = d_test.select_dtypes(include=["object", "category"]).columns

# Fill NaN values in only those columns
d_test[cat_cols] = d_test[cat_cols].fillna("Missing")

  d_test[cat_cols] = d_test[cat_cols].fillna("Missing")


## 2.4 fill na with means of train

In [238]:
# Imputation dans le test set (numeric)
d_test.fillna(0, inplace=True)

## 2.5 create variables

In [239]:
#'YearBuilt', 'YearRemodAdd, 'MoSold', 'YrSold' convert to quantitative
# List of columns to convert
columns_to_convert = ['YearBuilt', 'YearRemodAdd', 'MoSold', 'YrSold']

# Convert each to integer safely
for col in columns_to_convert:
    d_test.loc[:, col] = d_test[col].astype(int)

# Set reference year
reference_year = d_test['YrSold'].max()

# Create quantitative features safely
d_test.loc[:, 'HouseAge'] = reference_year - d_test['YearBuilt']
d_test.loc[:, 'YearsSinceRemod'] = reference_year - d_test['YearRemodAdd']


d_test = d_test.drop([
    'YearBuilt', 'YearRemodAdd', 'YrSold', 'MoSold'
], axis=1) 

# Convert object columns to numeric, forcing errors to NaN
cols_to_convert = ['HouseAge', 'YearsSinceRemod']
d_test[cols_to_convert] = d_test[cols_to_convert].apply(pd.to_numeric, errors='coerce')

## 2.6 correct skewness

In [240]:
#case 1: only binary no log

#Most houses don't have low-quality finished square footage.
#Very few do, with highly varied amounts.
cols_to_binary_only_0 = ['BsmtHalfBath', 'EnclosedPorch', 'ScreenPorch']

for col in cols_to_binary_only_0:
    d_test[f'Has{col}'] = (d_test[col] > 0).astype(int)
    d_test.drop(columns=[col], inplace=True)

# = 1 or not

d_test['HasKitchen'] = (d_test['KitchenAbvGr'] == 1).astype(int)
d_test.drop(columns=['KitchenAbvGr'], inplace=True)


#case 2: binary+log

cols = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '2ndFlrSF', 'WoodDeckSF', 'OpenPorchSF']  # Replace with actual column names

for col in cols:
    d_test[f'Has{col}'] = (d_test[col] > 0).astype(int)
    d_test[f'{col}_log'] = np.log1p(d_test[col])

#case 3: just drop 
d_test.drop(columns=['LowQualFinSF','3SsnPorch', 'PoolArea', 'MiscVal'], inplace=True)

# case 4: log only

#correlation = df_cat_quant['HouseAge'].corr(df_cat_quant['YearsSinceRemod'], method='pearson')
#print(f"Pearson correlation: {correlation:.3f}")

# List of variables to transform
vars_to_log = ['LotFrontage', 'LotArea', 'TotalBsmtSF', 'GrLivArea',  'BsmtUnfSF', '1stFlrSF']


# Create log-transformed versions with "_log" suffix
for col in vars_to_log:
    d_test[col + '_log'] = np.log1p(d_test[col])  # log1p handles zero safely

# Drop the original columns
d_test.drop(columns=vars_to_log, inplace=True)

## 2.7 cap outliers

In [241]:
num_features = [col for col in num_features if col != "SalePrice_log"]

for col in num_features:
    q_low, q_high = caps[col]
    d_test[col] = d_test[col].clip(lower=q_low, upper=q_high)


## 2.8 Categorical variable (missing+rare categories)

In [242]:
# One-hot encode test set
X_test = d_test.drop(['Id'],axis=1)

y_pred_log = GBR_Pipeline.predict(X_test)

y_pred = np.expm1(y_pred_log)  # reverse np.log1p()
d_test['SalePrice'] = y_pred
d_test[['Id', 'SalePrice']].to_csv("predictions_g_p.csv", index=False)

In [243]:
# Get the fitted preprocessor from the pipeline
preprocessor = GBR_Pipeline.named_steps['preprocessor']

# Apply the preprocessor to X_test (this will apply one-hot encoding, etc.)
X_test_transformed = preprocessor.transform(X_test)

# Get final feature names used (same as in training)
categorical_cols = [cname for cname in X_test.columns if X_test[cname].dtype == "object"]
numerical_cols = [cname for cname in X_test.columns if X_test[cname].dtype in ['int64', 'float64']]

# Get one-hot encoded feature names
ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)

# Combine with numerical column names
final_feature_names = list(numerical_cols) + list(ohe_feature_names)

print(f"Number of features used: {len(final_feature_names)}")
print("Example feature names used in prediction:", final_feature_names)

Number of features used: 299
Example feature names used in prediction: ['MSSubClass', 'OverallQual', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', '2ndFlrSF', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'HouseAge', 'YearsSinceRemod', 'MasVnrArea_log', 'BsmtFinSF1_log', 'BsmtFinSF2_log', 'BsmtUnfSF_log', '2ndFlrSF_log', 'WoodDeckSF_log', 'OpenPorchSF_log', 'LotFrontage_log', 'LotArea_log', 'TotalBsmtSF_log', 'GrLivArea_log', '1stFlrSF_log', 'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'Alley_Grvl', 'Alley_Missing', 'Alley_Pave', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_Bnk', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'Utilities_AllPub', 'Utilities_NoSeWa', 'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', '

In [244]:

pd.set_option('display.max_columns', None)
print(X_test_aligned.columns)

NameError: name 'X_test_aligned' is not defined

In [None]:
print(list(df_dropped.columns)) 

In [None]:
import os
os.getcwd()