In [1]:
import os
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [2]:
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

In [3]:
iris = load_iris()

In [4]:
X = iris.data[:, 2:] # petal length and width
y = iris.target

In [5]:
tree_clf_2 = DecisionTreeClassifier(max_depth=2)
tree_clf_2.fit(X, y)

DecisionTreeClassifier(max_depth=2)

In [6]:
from sklearn.tree import export_graphviz

In [7]:
export_graphviz(
    tree_clf_2, 
    out_file=os.path.join(IMAGES_PATH, "iris_tree_2.dot"),
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

In [8]:
tree_clf_3 = DecisionTreeClassifier(max_depth=3)
tree_clf_3.fit(X, y)

DecisionTreeClassifier(max_depth=3)

In [9]:
export_graphviz(
    tree_clf_3, 
    out_file=os.path.join(IMAGES_PATH, "iris_tree_3.dot"),
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

In [10]:
tree_clf_2.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [12]:
tree_clf_2.predict([[5, 1.5]])

array([1])

# Regression

In [13]:
from sklearn.tree import DecisionTreeRegressor

In [14]:
tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X, y)

DecisionTreeRegressor(max_depth=2)

In [15]:
export_graphviz(
    tree_reg, 
    out_file=os.path.join(IMAGES_PATH, "iris_tree_reg_2.dot"),
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

# Excercise:

## Ans.7

In [153]:
from sklearn.datasets import make_moons

In [154]:
moons = make_moons(n_samples=1000, noise=0.4)
type(moons)

tuple

In [155]:
X = moons[0]
y = moons[1]

In [156]:
from sklearn.model_selection import train_test_split

In [157]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [158]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [159]:
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("dt_clf", DecisionTreeClassifier()),
])

In [160]:
params = {
    "dt_clf__max_depth": np.random.randint(3, 10, size=5),
    "dt_clf__max_leaf_nodes": np.random.randint(5, 10, size=4),
    "dt_clf__presort": [True, False],
}

In [161]:
best_model = GridSearchCV(estimator=pipe, param_grid=params, cv=5, scoring='accuracy',                                   verbose=1, return_train_score=True)

In [162]:
best_model.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.8s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('dt_clf', DecisionTreeClassifier())]),
             param_grid={'dt_clf__max_depth': array([6, 4, 9, 3, 3]),
                         'dt_clf__max_leaf_nodes': array([7, 6, 5, 9]),
                         'dt_clf__presort': [True, False]},
             return_train_score=True, scoring='accuracy', verbose=1)

In [163]:
best_model.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('dt_clf',
                 DecisionTreeClassifier(max_depth=6, max_leaf_nodes=7,
                                        presort=True))])

In [164]:
predictions = best_model.predict(X_test)

In [165]:
from sklearn.metrics import accuracy_score

In [166]:
accuracy = accuracy_score(y_test, predictions)

In [167]:
accuracy

0.855

## Ans.8
### Random Forest Classifier

In [168]:
from sklearn.model_selection import ShuffleSplit

In [169]:
splitter = ShuffleSplit(n_splits=1000, train_size=0.1, test_size=0.9)
predictions = np.zeros(shape=(1000, X_test.shape[0]))
acc_scores = np.zeros(shape=(1000))

In [170]:
predictions.shape

(1000, 200)

In [171]:
acc_scores.shape

(1000,)

In [172]:
indices = splitter.split(X)

In [173]:
zip(np.arange(0, 1000), indices)

<zip at 0x7f4dcb7a48c0>

In [174]:
for i, index in zip(np.arange(0, 1000), indices):
    
    # train_index = index[0]
    # test_index = index[1]
    X_train, y_train = X[index[0]], y[index[0]]
    
    # fitting the data on the model having best hyperparameters 
    # from the above questions
    dt_clf = best_model.best_estimator_
    dt_clf.fit(X_train, y_train)
    

    # predicting using test set (the original one used in above ques)
    predictions[i, :] = dt_clf.predict(X_test)
    acc_scores[i] = accuracy_score(y_test, predictions[i, :])

In [175]:
predictions[0]

array([1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
       1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1.,
       0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1.,
       1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1.,
       0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1.])

In [176]:
acc_scores[:10]

array([0.88 , 0.815, 0.825, 0.83 , 0.865, 0.83 , 0.835, 0.78 , 0.84 ,
       0.825])

In [177]:
acc_scores.mean(), acc_scores.max(), acc_scores.min()

(0.82375, 0.88, 0.7)

In [178]:
from scipy.stats import mode

In [179]:
final_predictions = np.array(mode(predictions, axis=0))
final_predictions.shape

(2, 1, 200)

In [180]:
final_accuracy = accuracy_score(y_test, final_predictions[0][0])
final_accuracy

0.87