# Iteration 3: One-hot encoding

## Load data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('./data/housing_iteration_3_classification.csv')
df.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
0,8450,65.0,856,3,0,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
1,9600,80.0,1262,3,1,0,2,298,0,0,RL,Feedr,GasA,Pave,Y,CBlock
2,11250,68.0,920,3,1,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
3,9550,60.0,756,3,1,0,3,0,0,0,RL,Norm,GasA,Pave,Y,BrkTil
4,14260,84.0,1145,4,1,0,3,192,0,0,RL,Norm,GasA,Pave,Y,PConc


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1460 non-null   int64  
 1   LotFrontage   1201 non-null   float64
 2   TotalBsmtSF   1460 non-null   int64  
 3   BedroomAbvGr  1460 non-null   int64  
 4   Fireplaces    1460 non-null   int64  
 5   PoolArea      1460 non-null   int64  
 6   GarageCars    1460 non-null   int64  
 7   WoodDeckSF    1460 non-null   int64  
 8   ScreenPorch   1460 non-null   int64  
 9   Expensive     1460 non-null   int64  
 10  MSZoning      1460 non-null   object 
 11  Condition1    1460 non-null   object 
 12  Heating       1460 non-null   object 
 13  Street        1460 non-null   object 
 14  CentralAir    1460 non-null   object 
 15  Foundation    1460 non-null   object 
dtypes: float64(1), int64(9), object(6)
memory usage: 182.6+ KB


In [4]:
cat_cols = ['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir']
for col in cat_cols:
    print(df[col].value_counts())

MSZoning
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: count, dtype: int64
Condition1
Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: count, dtype: int64
Heating
GasA     1428
GasW       18
Grav        7
Wall        4
OthW        2
Floor       1
Name: count, dtype: int64
Street
Pave    1454
Grvl       6
Name: count, dtype: int64
CentralAir
Y    1365
N      95
Name: count, dtype: int64


In [5]:
y = df.pop('Expensive')

In [6]:
X = df.copy()

## Train-test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Build Pipeline

### Prepare imputing and encoding in pipeline branches

In [8]:
# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = Pipeline(
    steps=[('num_imputer', SimpleImputer()), 
           ('num_scaler', MinMaxScaler())
          ])

 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe = Pipeline(
    steps=[('cat_imputer', SimpleImputer(strategy='constant', fill_value='N_A')), 
           ('cat_encoder', OneHotEncoder(drop='first', 
                                         sparse_output=False, 
                                         handle_unknown='ignore'#,infrequent_if_exist', 
                                         #min_frequency=0.03
                                        ))
          ])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, X_num_columns), 
        ('cat', categoric_pipe, X_cat_columns)
    ])

### Combine pipeline parts

In [10]:
pipeline = make_pipeline(preprocessor,
                         DecisionTreeClassifier()
                        ).set_output(transform='pandas')

In [11]:
pipeline

### Exploratory hyperparameter search

In [12]:
# Define coarse parameter grid
param_grid = {
    'columntransformer__num__num_imputer__strategy':['mean', 'median'],
    'decisiontreeclassifier__max_depth': range(2, 14),
    'decisiontreeclassifier__min_samples_leaf': range(3, 15, 2),
    'decisiontreeclassifier__min_samples_split': range(3, 30, 5),
    'decisiontreeclassifier__criterion':['gini', 'entropy']
    }

In [13]:
# Define cross validation
search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1)

In [14]:
# Fit CV
search.fit(X_train, y_train)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits




In [15]:
# cross validation average accuracy
search.best_score_

0.9212097868750229

In [16]:
# best parameters
search.best_params_

{'columntransformer__num__num_imputer__strategy': 'mean',
 'decisiontreeclassifier__criterion': 'gini',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__min_samples_leaf': 9,
 'decisiontreeclassifier__min_samples_split': 13}

### Refined hyperparameter search

In [18]:
param_grid_fine = {
    'columntransformer__num__num_imputer__strategy':['mean', 'median'],
    'decisiontreeclassifier__max_depth': range(3, 7),
    'decisiontreeclassifier__min_samples_leaf': range(7, 12),
    'decisiontreeclassifier__min_samples_split': range(11, 16),
    'decisiontreeclassifier__criterion':['gini', 'entropy']
    }

search = GridSearchCV(pipeline, param_grid_fine, cv=5, verbose=1)
search.fit(X_train, y_train)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits




In [19]:
search.best_score_

0.9212097868750229

In [20]:
search.best_params_

{'columntransformer__num__num_imputer__strategy': 'median',
 'decisiontreeclassifier__criterion': 'gini',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__min_samples_leaf': 9,
 'decisiontreeclassifier__min_samples_split': 12}

In [21]:
best_estimator = search.best_estimator_
best_estimator

## Check performance on test data

Either using the best_estimator_ object or explicitly by running .predict on the gridsearchcv object.

In [22]:
y_train_pred = best_estimator.predict(X_train)

In [23]:
best_estimator.score(X_train, y_train)

0.9272260273972602

In [24]:
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9272260273972602

In [25]:
y_test_pred = search.predict(X_test)

In [26]:
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9315068493150684

In [27]:
best_estimator.score(X_test, y_test)

0.9315068493150684