In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
import warnings
warnings.filterwarnings('ignore')

import random
random.seed(10)

In [2]:
iris_data = pd.read_csv('../data/Iris.csv')

In [3]:
iris_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


<img height="600" width="750" src="./img/classification/iris_flowers.png">

### Scikit-Learn uses CART (Classification And Regression Tree) algorithm, which will generate binary tree.
  * The algorithm first splits the train set into two subsets using single feature K, threshold $t_k$ (example "petal lenght < 2.45 cm").
  * The algorithm searches for $(K, t_k)$ the purest subset, the cost function is privided by hyper-parameter "criterion" 
    * **For Classification** this can be "gini" for the **Gini impurity** and "entropy" for the **information gain**).
    * **For Regression** this can be "mse" or "mae"
    * CART algorithm is "greedy".
    * Algorithm Complexity: It requires O(exp(m)), m-samples, to find optimal tree. This is the reason way we should settle for "reasonably good" tree. 

<img height="600" width="550" src="./img/classification/dtree_iris_sample.png">

### Regularization parameters
  * It stops recursing once it reaches the maximum depth ("max_depth" hyper parameter), or if it cannot find a split that can reduce impurity.
  * min_samples_split : Minimum number of samples a node must have before it can be split.
  * min_samples_leaf : Minimum number of samples a leaf mush have.
  * min_weight_fraction_leaf: Same as min_samples_leaf but expressed as a freaction of the total number of weighted instances.
  * max_leaf_nodes : Maximum number of leaf nodes.
  * max_features : Maximum number of features that are evaluated for splitting at each node.
  * **Increasing min_* OR reducing max_* hyperparameters will regularize the model.** 


In [4]:
X, y = iris_data.iloc[:,1:-1], iris_data.iloc[:, -1]

In [5]:
y.shape

(150,)

In [6]:
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2, random_state=42)

## DecisionTree Classification

In [7]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best')

In [8]:
tree_clf.predict_proba(X_train)[:5, :]

array([[ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.97297297,  0.02702703],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ]])

In [9]:
pred_y_train = tree_clf.predict(X_train)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

def printCLFAccuracyScores(y, pred_y):
    conf_matrix = confusion_matrix(y, pred_y)
    print("conf_matrix : ", conf_matrix)
    prec_score = precision_score(y, pred_y, average='macro')
    print("precision_score : ", prec_score)
    rec_score = recall_score(y, pred_y, average='macro')
    print("recall_score : ", rec_score)
    fl_score = f1_score(y, pred_y, average='macro')
    print("fl_score : ", fl_score)

In [None]:
printCLFAccuracyScores(y_train, pred_y_train)

In [None]:
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(tree_clf, X_train, y_train, cv=10, scoring="f1_macro")
print("cv_score : ", cv_score)

### This looks good lets try GridSearchCV and try to find best parameters.

In [None]:
param_grid = [
    {'max_leaf_nodes':[2,4,5,6,8,10,12],
     'max_depth' : [2,3,4,5,6,7,8]}
  ]

grd_tree_clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(grd_tree_clf, param_grid, cv=5,
                           scoring='f1_macro')
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
best_tree_clf = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=8, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
best_tree_clf.fit(X_train, y_train)
pred_y_train = best_tree_clf.predict(X_train)

printCLFAccuracyScores(y_train, pred_y_train)

In [None]:
for (feature, score) in zip (X.columns, best_tree_clf.feature_importances_):
    print("Feature - \"{0}\" - Importance Score {1} : ".format(feature, score))

In [None]:
pred_y_test = best_tree_clf.predict(X_test)
printCLFAccuracyScores(y_test, pred_y_test)

In [19]:
from sklearn.tree import export_graphviz
import pydot
X_features  = [feature for feature in iris_data.columns if feature not in ['Id']]
export_graphviz(
        best_tree_clf,
        out_file="./iris_classification_tree.dot",
        feature_names=X_features,
        class_names=iris_data.Species.unique(),
        rounded=True,
        filled=True
    )      

In [20]:
# To convert dot file to png you may have to install graphviz on OS (Linux - comand prompt/Windows - http://www.graphviz.org/Download_windows.php)
(graph,) = pydot.graph_from_dot_file('./iris_classification_tree.dot')
graph.write_png('./iris_classification_tree.png')

True

## Decision Tree Regression

In [21]:
from sklearn.tree import DecisionTreeRegressor

In [22]:
housing_train_set = pd.read_csv('../data/cleaned_housing_train_set.csv')
X_features  = [feature for feature in housing_train_set.columns if feature not in ['SalePrice', 'Id']]
y_feature = 'SalePrice'

In [23]:
class InputOutputFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, in_attr_list, our_feature):
        self.in_attr_list = in_attr_list
        self.our_feature = our_feature
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.in_attr_list].values, X[self.our_feature].values

In [24]:
def defineInOutPipeling(X_feat, y_feat):
    pipeline_inout_features = Pipeline([
        ('inputOutputFeatureSelector', InputOutputFeatureSelector(X_feat, y_feat))
    ])
    return pipeline_inout_features

In [25]:
pipeline_inout_features = defineInOutPipeling(X_features, y_feature)

In [26]:
X_train, y_train = pipeline_inout_features.fit_transform(housing_train_set)

In [27]:
tree_reg = DecisionTreeRegressor(max_depth=3, random_state=32)
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=32,
           splitter='best')

In [28]:
y_train_pred =tree_reg.predict(X_train)

In [29]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
def printREGAccuracyScores(y, pred_y):
    mse =  mean_squared_error(y, pred_y)
    print("Root Mean Squared Error : ", np.sqrt(mse))
    r2 = r2_score(y, pred_y)
    print("R-Squared Error : ", r2)

In [30]:
printREGAccuracyScores(y_train, y_train_pred)

Root Mean Squared Error :  41635.144072
R-Squared Error :  0.743141538019


### The above scores doesn't look good, lets try "RandomizedSearchCV" to find best parameters.

In [31]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'max_leaf_nodes': randint(low=1, high=200),
        'max_depth': randint(low=1, high=100),
        'min_samples_leaf' : randint(low=1, high=100)
    }
grd_tree_reg = DecisionTreeRegressor(random_state=32)
rnd_tree_search = RandomizedSearchCV(grd_tree_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=32)
rnd_tree_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=32,
           splitter='best'),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_leaf_nodes': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000017203AE8320>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000017203A720B8>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000017203ADB9E8>},
          pre_dispatch='2*n_jobs', random_state=32, refit=True,
          return_train_score=True, scoring='neg_mean_squared_error',
          verbose=0)

In [32]:
rnd_tree_search.best_params_

{'max_depth': 4, 'max_leaf_nodes': 66, 'min_samples_leaf': 35}

In [33]:
rnd_tree_search.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=66, min_impurity_split=1e-07,
           min_samples_leaf=35, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=32,
           splitter='best')

In [34]:
best_tree_reg = DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=66, min_impurity_split=1e-07,
           min_samples_leaf=35, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=32,
           splitter='best')
best_tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=66, min_impurity_split=1e-07,
           min_samples_leaf=35, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=32,
           splitter='best')

In [35]:
best_y_train_pred = best_tree_reg.predict(X_train)

In [36]:
printREGAccuracyScores(y_train, best_y_train_pred)

Root Mean Squared Error :  39307.0434977
R-Squared Error :  0.771063787923


In [37]:
housing_test_set= pd.read_csv('../data/cleaned_housing_test_set.csv')
X_test, y_test = pipeline_inout_features.transform(housing_test_set)

In [38]:
best_y_test_pred = best_tree_reg.predict(X_test)

In [39]:
printREGAccuracyScores(y_test, best_y_test_pred)

Root Mean Squared Error :  36029.2990228
R-Squared Error :  0.718760796865
