In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures, TargetEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import  r2_score, mean_squared_error, mean_absolute_error

from sklearn.inspection import permutation_importance

from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Single random seed reused below by the train/test split, TargetEncoder, PCA,
# ElasticNet and the Optuna sampler so that reruns are reproducible.
seed = 42

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data.csv')

In [3]:
# (rows, columns) of the raw frame.
df.shape

(1460, 81)

In [4]:
# Per-column non-null counts and dtypes; columns with fewer non-null rows than
# the frame length contain missing values that the imputation step must handle.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Features are every column except the last; the target is the natural log of
# the last column.
# NOTE(review): `Id` (column 0 in df.info()) remains in X and therefore reaches
# the model as a numeric feature — confirm that this is intended.
X, y = df.iloc[:, :-1], np.log(df.iloc[:, -1])

In [6]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((1460, 80), (1460,))

In [7]:
# Hold out 40% of the rows for testing; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=seed,
)

#### Imputation Component

In [65]:
# Column groups, one per imputation rule.
binary_imp = ['CentralAir']                          # missing -> 'N'
mode_imp = ['Electrical']                            # missing -> most frequent value
zero_imp = X_train.select_dtypes('number').columns   # missing -> 0
# Every remaining object column: missing -> the literal string 'NA'.
na_imp = X_train.select_dtypes('object').columns.drop(mode_imp + binary_imp)

# One SimpleImputer per group. The transformer order fixes the output column
# order, and the result is returned as a DataFrame with the original column
# names (no transformer prefix) so later steps can select columns by name.
impute = ColumnTransformer(
    transformers=[
        ('na_imp', SimpleImputer(strategy='constant', fill_value='NA'), na_imp),
        ('zero_imp', SimpleImputer(strategy='constant', fill_value=0), zero_imp),
        ('mode_imp', SimpleImputer(strategy='most_frequent'), mode_imp),
        ('binary_imp', SimpleImputer(strategy='constant', fill_value='N'), binary_imp),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
)
impute = impute.set_output(transform='pandas')

#### Encoding Component

In [11]:
# --- Feature groups -------------------------------------------------------
# Binary flag, encoded with its own OrdinalEncoder.
bi_feat = ['CentralAir']
# Numeric columns are not encoded; they fall through via remainder='passthrough'.
num_feat = X_train.select_dtypes('number').columns
# Columns encoded with an explicit category order.
ord_feat = [
    'LotShape', 'Utilities', 'LandSlope', 'BldgType', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'Electrical', 'KitchenQual', 'FireplaceQu', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
]
# Everything else is treated as nominal and target-encoded.
cat_feat = X_train.columns.drop([*ord_feat, *num_feat, *bi_feat])

# --- Category order per ordinal column ------------------------------------
# The position of a label in its list is the integer it is encoded to.
# 'rest' is the fallback scale for every ordinal column without its own entry.
# NOTE(review): some lists (e.g. BsmtExposure, PavedDrive, Fence) do not look
# monotone in quality — confirm the intended order against the data dictionary.
ord_dic = {
    'LotShape': ['NA', 'Reg', 'IR1', 'IR2', 'IR3'],
    'Utilities': ['NA', 'AllPub', 'NoSeWr', 'NoSeWa', 'ELO'],
    'LandSlope': ['NA', 'Gtl', 'Mod', 'Sev'],
    'BldgType': ['NA', '1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
    'BsmtExposure': ['NA', 'No', 'Gd', 'Av', 'Mn'],
    'BsmtFinType1': ['NA', 'GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf'],
    'BsmtFinType2': ['NA', 'GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf'],
    'GarageFinish': ['NA', 'Fin', 'RFn', 'Unf'],
    'PavedDrive': ['NA', 'Y', 'N', 'P'],
    'Fence': ['NA', 'MnPrv', 'GdWo', 'GdPrv', 'MnWw'],
    'Electrical': ['NA', 'SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix'],
    'rest': ['NA', 'Ex', 'Gd', 'TA', 'Fa', 'Po'],
}

# One category list per entry of ord_feat, in the same order as ord_feat.
categories = [ord_dic.get(col, ord_dic['rest']) for col in ord_feat]

# --- Encoder --------------------------------------------------------------
# Labels not present in the configured/learned categories are encoded as -1
# by both OrdinalEncoders instead of raising.
encode = ColumnTransformer(
    transformers=[
        (
            'oe',
            OrdinalEncoder(
                categories=categories,
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
            ord_feat,
        ),
        ('te', TargetEncoder(cv=10, shuffle=True, random_state=seed), cat_feat),
        (
            'oe_bi',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            bi_feat,
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
)

In [None]:
# Draft ElasticNet pipeline:
#   impute -> encode -> scale -> polynomial expansion -> scale -> PCA -> ElasticNet
#
# Every step after `encode` is instantiated inline so the cell runs on a fresh
# kernel without relying on names bound in later cells. The two scaling steps
# are separate StandardScaler objects: a Pipeline fits each step object in
# place, so one shared instance would be refit by the second step and would no
# longer hold the statistics learned for the first.
en = Pipeline([
    ('impute',impute),
    ('encode',encode),
    ('standard1',StandardScaler()),
    ('poly',PolynomialFeatures(include_bias=False).set_output(transform='pandas')),
    ('standard2',StandardScaler()),
    ('pca',PCA(random_state=seed)),
    ('en',ElasticNet(random_state=seed))
])

#### PCA Component

In [145]:
# PCA step used by the model pipelines. n_components is left at its default
# here and is set per trial through `pca__n_components` during tuning.
pca = PCA(random_state=seed)

#### Polynomial Feature Component

In [129]:
# Polynomial expansion without the constant (bias) column, returning a
# DataFrame. The degree is set per trial through `poly__degree` during tuning.
poly = PolynomialFeatures(include_bias=False).set_output(transform='pandas')

### Model Training

#### ElasticNet

In [139]:
# %pip install optuna
import optuna 

# ElasticNet pipeline:
#   impute -> encode -> scale -> polynomial expansion -> scale -> PCA -> ElasticNet
# Each scaling step gets its own StandardScaler instance.
en = Pipeline([
    ('impute',impute),
    ('encode',encode),
    ('standard1',StandardScaler()),
    ('poly',poly),
    ('standard2',StandardScaler()),
    ('pca',pca),
    ('en',ElasticNet(random_state=seed))
])

def objective(trial):
    """Return the mean 5-fold cross-validated R2 of `en` for one Optuna trial.

    The trial parameter names are exactly the pipeline parameter names
    (`<step>__<param>`), so `study.best_trial.params` can later be passed
    straight to `en.set_params`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean R2 over the 5 folds of (X_train, y_train); the study maximizes it.
    """
    model_params = {
        'poly__degree':trial.suggest_int('poly__degree',1,2),
        'pca__n_components':trial.suggest_int('pca__n_components',10,50),
        'en__alpha':trial.suggest_float('en__alpha',0.1,1),
        'en__l1_ratio':trial.suggest_float('en__l1_ratio',0.1,1),
        # suggest_int takes integer bounds: use int literals rather than 1e3/1e4 floats.
        'en__max_iter':trial.suggest_int('en__max_iter',1000,10000,log=True),
        'en__tol':trial.suggest_float('en__tol',1e-5,1e-2,log=True),
        'en__selection':trial.suggest_categorical('en__selection',['random','cyclic'])
    }

    # Note: this mutates the module-level `en` pipeline in place.
    en.set_params(**model_params)
    score = cross_val_score(en, X_train, y_train, cv=5, scoring='r2').mean()
    return score

# Seeded TPE sampler so the search is reproducible.
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler(seed=seed))
study.optimize(objective,n_trials=100)

# Refit on the full training split with the best parameters found.
en.set_params(**study.best_trial.params)
en.fit(X_train,y_train)

In [185]:
from pprint import pprint

# \033[92m ... \033[0m renders the heading in green on ANSI-capable outputs.
print('\033[92mBest Elastic Net Parameters\033[0m')
pprint(study.best_trial.params)

# Predict once per split and reuse the result for both metrics.
en_train_pred = en.predict(X_train)
en_test_pred = en.predict(X_test)

# y is the log of the raw target, so the MSE values are in squared log units.
print('Train MSE:               ',mean_squared_error(y_train,en_train_pred))
print('Test MSE:                ',mean_squared_error(y_test,en_test_pred))
print('Train R2 Score:          ',r2_score(y_train,en_train_pred))
print('Test R2 Score:           ',r2_score(y_test,en_test_pred))
# Mean cross-validated R2 of the best trial (computed on the training split only).
print('Optuna Best Trial Value: ',study.best_trial.value)

[92mBest Elastic Net Parameters[0m
{'en__alpha': 0.1261934209292713,
 'en__l1_ratio': 0.10014738370229638,
 'en__max_iter': 2241,
 'en__selection': 'random',
 'en__tol': 2.9858801743799305e-05,
 'pca__n_components': 36,
 'poly__degree': 1}
Train MSE:                0.023183941933739485
Test MSE:                 0.022206372250748275
Trian R2 Score:           0.8467083629068829
Test R2 Score:            0.8707188531070004
Optuna Best Trial Value:  0.8283418574018248


In [None]:
# SVR pipeline: impute -> encode -> scale -> PCA -> SVR.
# NOTE: this cell rebinds `en`, `objective` and `study`, and it refits the
# shared `impute`, `encode` and `pca` objects, so the ElasticNet pipeline fitted
# earlier is no longer available after it runs.
en = Pipeline([
    ('impute',impute),
    ('encode',encode),
    ('standard',StandardScaler()),
    ('pca',pca),
    ('svr',SVR())
])

def objective(trial):
    """Return the mean 5-fold cross-validated R2 of the SVR pipeline for one trial.

    The trial parameter names are exactly the pipeline parameter names
    (`<step>__<param>`), so `study.best_trial.params` can later be passed
    straight to `en.set_params`.

    Search space:
      * `svr__kernel`        - one of 'linear', 'poly', 'rbf', 'sigmoid'
      * `pca__n_components`  - integer in [10, 60]
      * `svr__C`             - log-uniform in [1e-2, 1], sampled for every
                               kernel so no value is carried over from an
                               earlier trial on the shared pipeline
      * `svr__degree`        - integer in [1, 2], sampled only for the 'poly'
                               kernel (the only kernel that uses it)

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean R2 over the 5 folds of (X_train, y_train); the study maximizes it.
    """
    kernel = trial.suggest_categorical('svr__kernel',['linear','poly','rbf','sigmoid'])

    model_params = {
        'svr__kernel':kernel,
        'pca__n_components':trial.suggest_int('pca__n_components',10,60),
        'svr__C':trial.suggest_float('svr__C',1e-2,1,log=True),
    }
    if kernel == 'poly':
        # The polynomial degree must be an integer.
        model_params['svr__degree'] = trial.suggest_int('svr__degree',1,2)

    # Note: this mutates the module-level `en` pipeline in place.
    en.set_params(**model_params)
    score = cross_val_score(en, X_train, y_train, cv=5, scoring='r2').mean()
    return score

# Seeded TPE sampler so the search is reproducible.
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler(seed=seed))
study.optimize(objective,n_trials=100)

# Refit on the full training split with the best parameters found.
en.set_params(**study.best_trial.params)
en.fit(X_train,y_train)