In [78]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from scipy.stats import mode
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [4]:
iris = load_iris()
X = iris.data[:, 2:]
y = iris["target"]

In [7]:
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X,y)

In [10]:
export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

In [13]:
tree_clf.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [16]:
tree_clf.predict([[5, 1.5]])

array([1])

In [19]:
tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X, y)

In [22]:
export_graphviz(
    tree_reg,
    out_file="iris_tree2.dot",
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

## EXERCISES

### 7

In [25]:
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
param_grid = {"max_leaf_nodes": list(range(2, 100)), "min_samples_split": [2, 3, 4]}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring="accuracy")
grid_search.fit(X,y)

In [33]:
best_params = grid_search.best_params_
best_params

{'max_leaf_nodes': 35, 'min_samples_split': 2}

In [35]:
best_tree = DecisionTreeClassifier(**best_params, random_state=42)
best_tree.fit(X_train, y_train)

In [89]:
accuracy = best_tree.score(X_test, y_test)
print(f"Test set accuracy: {accuracy:.2%}") 

Test set accuracy: 87.20%


### 8

In [41]:
n_trees = 1000
n_instances = 100
rs = ShuffleSplit(n_splits=n_trees, train_size=n_instances, random_state=42)
mini_sets = [(X_train[train_idx], y_train[train_idx]) for train_idx, _ in rs.split(X_train)]

In [44]:
tree_clf2 = DecisionTreeClassifier(**best_params, random_state=42)

In [60]:
accuracies = []
forest = []
for X_mini, y_mini in mini_sets:
    tree_clf2.fit(X_mini, y_mini)
    accuracy = tree_clf2.score(X_test, y_test)
    accuracies.append(accuracy)
    forest.append(tree_clf2)

In [91]:
np.mean(accuracies)

0.7981655

In [93]:
predictions = np.array([tree.predict(X_test) for tree in forest])
majority_vote, _ = mode(predictions, axis=0)
y_pred = majority_vote.flatten()

In [94]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.2%}")

Random Forest Accuracy: 82.55%
