## Creating a Pipeline and tuning the model with Grid Search Cross Validation

### Data reading and preprocessing

In [31]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix

In [32]:
#read text file into a Dataframe
df = pd.read_csv("housing-classification-iter-0-2.csv")
df.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive
0,8450,65.0,856,3,0,0,2,0,0,0
1,9600,80.0,1262,3,1,0,2,298,0,0
2,11250,68.0,920,3,1,0,2,0,0,0
3,9550,60.0,756,3,1,0,3,0,0,0
4,14260,84.0,1145,4,1,0,3,192,0,0


## Setting X and y
X: columns that help us make a prediction.
y: the column that we want to predict.

In [33]:
X = df
y = df.pop('Expensive')

### Feature Selection: keeping only numerical features

In [34]:
X_num = X.select_dtypes(include="number")
X.shape

(1460, 9)

## Data Splitting (Train - Test)

In [35]:
from sklearn.model_selection import train_test_split

X_num_train, X_num_test, y_train, y_test = train_test_split(X_num, y, test_size=0.2, random_state=31416)

In [36]:
y_test

1253    1
202     0
213     0
1331    0
120     0
       ..
978     0
1057    0
782     0
125     0
1409    0
Name: Expensive, Length: 292, dtype: int64

## Dealing with missing Data

#### Imputing missing values
(Fit on train, transform train & test)

In [37]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer() # initialize
my_imputer.fit(X_num_train) # fit on the train set
X_num_imputed_train = my_imputer.transform(X_num_train) # transform the train set
X_num_imputed_test = my_imputer.transform(X_num_test) # transform the test set

## Modelling: Decision Tree

In [39]:
# 1. import the model
from sklearn.tree import DecisionTreeClassifier 

# 2. initialize the model
my_tree = DecisionTreeClassifier(max_depth=5,
                                 min_samples_leaf=12
                                )

# 3. fit the model to the train data
my_tree.fit(X = X_num_imputed_train, 
            y = y_train)

## Check accuracy on the train set

In [44]:
from sklearn.metrics import accuracy_score

y_pred_tree_train = my_tree.predict(X_num_imputed_train)

train_accuracy= accuracy_score(y_true = y_train,
               y_pred = y_pred_tree_train)

round(train_accuracy, 3)


0.931

## Check accuracy on the test set

In [47]:
y_pred_tree_test = my_tree.predict(X_num_imputed_test)

test_accuracy = accuracy_score(y_true = y_test,
               y_pred = y_pred_tree_test)   

round(test_accuracy, 3)

0.932

## Creating a Pipeline

Before moving forward in our quest to improve the model, take a moment to learn how to use Scikit-Learn Pipelines. They will not increase your performance, but they are a necessary tool to compress all the steps in the data preparation + modelling phases into a single one, and this will become very relevant as we move forward and keep adding new steps:

In [48]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

# 1. initialize transformers &amp; model
imputer = SimpleImputer(strategy="median")
dtree = DecisionTreeClassifier(max_depth=5,
                               min_samples_leaf=12)
 
# 2. Create a pipeline*
pipe = make_pipeline(imputer, dtree)
 
# 3. Fit the pipeline to the training data
pipe.fit(X_num_train, y_train)
 
# 4. Use the pipeline to make predictions
pipe.predict(X_num_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int64)

Now, the object pipie can take (almost) raw data as input and output predictions. We no longer need to impute missing values and use the model to make predictions in separate steps.

## Using GridsearchCV to find the best parameters

So far, we tuned the hyperparameters of the decision tree manually. This is not ideal, for two reasons:

It's not efficient in terms of quickly finding the best combination of parameters.
If we keep checking the performance on the test set over and over again, we might end up creating a model that fits that particular test set, but does not generalize as well with new data. Test sets are meant to reamain unseen until the very last moment of ML development —we have been cheating a bit!

In [49]:
# 1. initialize transformers &amp; model
imputer = SimpleImputer()
dtree = DecisionTreeClassifier()
 
# 2. Create a pipeline*
pipe = make_pipeline(imputer, dtree)

param_grid = {
    'decisiontreeclassifier__max_depth': range(2, 12),
    'decisiontreeclassifier__min_samples_leaf': range(3, 10, 2),
    'decisiontreeclassifier__min_samples_split': range(3, 40, 5),
    'decisiontreeclassifier__criterion':['gini', 'entropy']
    }
    
from sklearn.model_selection import GridSearchCV
 
search = GridSearchCV(pipe, # you have defined this beforehand
                      param_grid, # your parameter grid
                      cv=5, # the value for K in K-fold Cross Validation
                      scoring='accuracy', # the performance metric to use, 
                      verbose=1) # we want informative outputs during the training process

In [51]:
search.fit(X_num_imputed_train, y_train)

Fitting 5 folds for each of 640 candidates, totalling 3200 fits


In [52]:
search.best_params_

{'decisiontreeclassifier__criterion': 'gini',
 'decisiontreeclassifier__max_depth': 5,
 'decisiontreeclassifier__min_samples_leaf': 5,
 'decisiontreeclassifier__min_samples_split': 38}

In [54]:
search.best_score_     

0.9280950808847805

In [57]:
# 1. import the model
from sklearn.tree import DecisionTreeClassifier 

# 2. initialize the model
my_tree = DecisionTreeClassifier(max_depth=5,
                                 min_samples_leaf=5
                                )

# 3. fit the model to the train data
my_tree.fit(X = X_num_imputed_train, 
            y = y_train)

In [60]:
from sklearn.metrics import accuracy_score

y_pred_tree_train = my_tree.predict(X_num_imputed_train)

train_accuracy= accuracy_score(y_true = y_train,
               y_pred = y_pred_tree_train)

round(train_accuracy,2)

0.94

In [61]:
from sklearn.metrics import accuracy_score

y_pred_tree_test = my_tree.predict(X_num_imputed_test)

test_accuracy= accuracy_score(y_true = y_test,
               y_pred = y_pred_tree_test)

round(test_accuracy,2)

0.93

In [55]:
confusion_matrix(y_test, y_pred_tree_test, labels=None, sample_weight=None, normalize=None)

array([[245,   9],
       [ 11,  27]], dtype=int64)

## Confusion Matrix

In [56]:
from sklearn.metrics import classification_report

In [63]:
print(classification_report(y_test, y_pred_tree_test))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       369
           1       0.76      0.61      0.68        69

    accuracy                           0.91       438
   macro avg       0.85      0.79      0.81       438
weighted avg       0.90      0.91      0.90       438

