# Iteration 2: Pipelines and GridSearch

## Load data

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Read the housing classification dataset and preview its first rows.
csv_path = './data/housing_iteration_0_2_classification.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive
0,8450,65.0,856,3,0,0,2,0,0,0
1,9600,80.0,1262,3,1,0,2,298,0,0
2,11250,68.0,920,3,1,0,2,0,0,0
3,9550,60.0,756,3,1,0,3,0,0,0
4,14260,84.0,1145,4,1,0,3,192,0,0


In [3]:
# Separate the target from the features without mutating `df`, so this cell
# can be re-run safely (a second `df.pop('Expensive')` would raise KeyError).
y = df['Expensive']
# `drop` returns a new frame, so X holds every column except the target.
X = df.drop(columns='Expensive')

## Train-test split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Build Pipeline

### Initialization

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [7]:
# Baseline components: mean imputation followed by a depth-limited decision tree.
imputer = SimpleImputer(strategy='mean')
dtree = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=20,
    random_state=42,
)

In [8]:
# Chain the imputer and the tree into a single estimator.
pipe = make_pipeline(imputer, dtree)
# Request pandas output from the transform steps; set_output returns the pipeline itself.
pipe = pipe.set_output(transform='pandas')

### Fit and predict pipeline

In [9]:
# Fit the whole baseline pipeline on the training split only.
pipe.fit(X_train, y_train)

In [11]:
# Baseline predictions on the data the pipeline was fitted on.
y_train_pred = pipe.predict(X_train)

In [12]:
# Training accuracy of the baseline pipeline.
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9126712328767124

In [13]:
# Baseline predictions on the held-out test split.
y_test_pred = pipe.predict(X_test)

In [14]:
# Test accuracy of the baseline pipeline.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9246575342465754

## Define GridSearchCV

In [15]:
# 1. Initialize transformers & model; the hyperparameters to be tuned are left
#    unset here because the grid search assigns them from the parameter grid.
imputer = SimpleImputer()
scaler = MinMaxScaler()
# Fix the seed (it is not a tuned hyperparameter): the tree permutes features at
# each split, so equally good splits could otherwise be picked differently
# between runs, making the search results non-reproducible.
dtree = DecisionTreeClassifier(random_state=42)

In [16]:
# 2. Create a pipeline: impute -> scale -> classify
pipe = make_pipeline(imputer, scaler, dtree)
# Request pandas output from the transform steps; set_output returns the pipeline itself.
pipe = pipe.set_output(transform='pandas')

### Exploratory hyperparameter search

In [17]:
# 3. Define coarse parameter grid (2 * 12 * 6 * 10 * 2 = 2880 candidates).
#    Keys follow the `<step name>__<parameter>` pattern of the pipeline.
param_grid = dict(
    simpleimputer__strategy=['mean', 'median'],
    decisiontreeclassifier__max_depth=range(2, 14),
    decisiontreeclassifier__min_samples_leaf=range(3, 15, 2),
    decisiontreeclassifier__min_samples_split=range(3, 50, 5),
    decisiontreeclassifier__criterion=['gini', 'entropy'],
)

In [18]:
# 4. Define cross validation: 5-fold CV over every grid candidate.
#    - scoring='accuracy' spells out the metric (the classifier's default score).
#    - n_jobs=-1 runs the candidate fits in parallel on all CPU cores; the
#      coarse grid needs thousands of fits, which is slow when run serially.
search = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

In [None]:
# 5. Fit CV: cross-validate every grid candidate on the training split only,
#    so the test split stays unseen during model selection.
search.fit(X_train, y_train)

Fitting 5 folds for each of 2880 candidates, totalling 14400 fits


In [None]:
# Mean cross-validated score of the best parameter combination
search.best_score_

In [None]:
# Parameter combination that achieved the best mean cross-validated score
search.best_params_

### Refined hyperparameter search

In [None]:
# Narrow grid (2 * 5 * 5 * 5 * 1 = 250 candidates).
# NOTE(review): these ranges are hard-coded — presumably centred on the coarse
# search's best_params_; re-check them whenever the coarse search is re-run.
param_grid_fine = {
    'simpleimputer__strategy': ['mean', 'median'],
    'decisiontreeclassifier__max_depth': range(2, 7),
    'decisiontreeclassifier__min_samples_leaf': range(11, 16),
    'decisiontreeclassifier__min_samples_split': range(31, 36),
    'decisiontreeclassifier__criterion': ['gini'],
}

# Same CV setup as the coarse search: 5 folds, accuracy as the metric
# (the classifier's default score), candidate fits parallelised over all cores.
# `search` is intentionally rebound, so the cells below use the refined result.
search = GridSearchCV(
    pipe,
    param_grid_fine,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best candidate in the refined grid
search.best_score_

In [None]:
# Best parameter combination found by the refined search
search.best_params_

In [None]:
# Keep a handle on the pipeline that GridSearchCV refitted with the best parameters.
best_estimator = search.best_estimator_
best_estimator

### Check performance on test data

The tuned model can be evaluated either through the `best_estimator_` object or by calling `.predict` directly on the fitted `GridSearchCV` object, which delegates to that same best estimator.

In [None]:
# Predictions of the tuned pipeline on the training split.
y_train_pred = best_estimator.predict(X_train)

In [None]:
# Training score computed directly by the pipeline.
best_estimator.score(X_train, y_train)

In [None]:
# Training accuracy computed from the stored predictions.
accuracy_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Predict on the held-out test split through the search object itself.
y_test_pred = search.predict(X_test)

In [None]:
# Test accuracy computed from the stored predictions.
accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# Test score computed directly by the tuned pipeline.
best_estimator.score(X_test, y_test)