In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

%matplotlib inline

In [None]:
# Build a synthetic 2-D dataset: 500 points spread over 4 blob centers
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=500, centers=4, cluster_std=2.1, random_state=101)

# Shared font settings for the axis labels
font = dict(family='serif', color='black', weight='normal', size=18)

# Scatter the samples, colored by their blob label
plt.figure(figsize=(6, 6))
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet')
plt.xlabel('X', fontdict=font)
plt.ylabel('Y', fontdict=font);


In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
def visualize_tree(classifier, X, y,
                   boundaries=True,
                   xlim=None, ylim=None):
    '''
    Fit a classifier on 2-D data and plot its decision regions.

    The classifier is (re)fitted on X and y as a side effect. Its
    predictions over a 100 x 100 grid are drawn as a translucent
    background, the training points are scattered on top and, when
    requested, the split lines of a single decision tree are overlaid.

    INPUTS:
        classifier: estimator exposing fit(X, y) and predict(X).
        X: two-feature array; column 0 is the horizontal axis and
            column 1 the vertical axis.
        y: numeric class labels, used to color the points.
        boundaries: when True, draw the split thresholds read from
            classifier.tree_. Estimators without a tree_ attribute
            (e.g. a RandomForestClassifier) are plotted without lines.
        xlim, ylim: optional (min, max) pairs; each defaults to the
            data range padded by 0.1 on both sides.
    OUTPUTS: None. A new matplotlib figure is created and drawn on.
    '''
    # Fit the X and y data to the model
    classifier.fit(X, y)

    # Default the plot limits to the data range (+/- 0.1)
    if xlim is None:
        xlim = (X[:, 0].min() - 0.1, X[:, 0].max() + 0.1)
    if ylim is None:
        ylim = (X[:, 1].min() - 0.1, X[:, 1].max() + 0.1)
    x_min, x_max = xlim
    y_min, y_max = ylim

    # Predict a class for every node of a regular grid over the plot area
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Use one color scale for both layers. Without it each layer is
    # normalized to its own min/max, so a model that predicts only some
    # of the classes paints regions in colors that do not match the points.
    vmin, vmax = np.min(y), np.max(y)

    plt.figure(figsize=(10, 10))
    plt.pcolormesh(xx, yy, Z, alpha=0.2, cmap='jet', vmin=vmin, vmax=vmax)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='jet', vmin=vmin, vmax=vmax)

    # Set limits
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

    font = {'family': 'serif',
            'color':  'black',
            'weight': 'normal',
            'size': 18,
            }

    plt.xlabel('X', fontdict=font)
    plt.ylabel('Y', fontdict=font)

    def plot_boundaries(i, xlim, ylim):
        '''
        Draw the split of node i inside the rectangle xlim x ylim,
        then recurse into both children with the rectangle narrowed.
        '''
        # Child indices are negative where there is no child
        if i < 0:
            return

        tree = classifier.tree_
        split = tree.threshold[i]

        if tree.feature[i] == 0:
            # Split on the horizontal feature: vertical line
            plt.plot([split, split], ylim, '-k')
            plot_boundaries(tree.children_left[i], [xlim[0], split], ylim)
            plot_boundaries(tree.children_right[i], [split, xlim[1]], ylim)
        elif tree.feature[i] == 1:
            # Split on the vertical feature: horizontal line
            plt.plot(xlim, [split, split], '-k')
            plot_boundaries(tree.children_left[i], xlim, [ylim[0], split])
            plot_boundaries(tree.children_right[i], xlim, [split, ylim[1]])
        # Any other feature value (e.g. a leaf) draws nothing

    # Only a single fitted tree exposes tree_; skip the overlay otherwise
    # instead of failing after the figure has already been drawn.
    if boundaries and hasattr(classifier, 'tree_'):
        plot_boundaries(0, plt.xlim(), plt.ylim())

In [None]:
# Depth-1 tree: a single split, so the plane is cut into two regions
clf = DecisionTreeClassifier(random_state=20, max_depth=1)
visualize_tree(clf, X, y)

In [None]:
# Gini impurity formula
def gini(lst):
    '''
    Gini impurity of a node given its per-class sample counts.

    Computes 1 - sum(p_k ** 2), where p_k is each count divided by the
    total of all counts.

    INPUTS: lst - iterable of non-negative class counts (list, NumPy
            array, generator, ...).
    OUTPUTS: the impurity as a float; 0.0 when there are no samples
             (empty input or all counts zero), since such a node holds
             no mix of classes.
    '''
    # Materialize first: a one-shot iterator would be exhausted by sum()
    # and leave nothing for the second pass.
    counts = list(lst)
    soma = sum(counts)

    # Avoid dividing by zero for an empty node
    if soma == 0:
        return 0.0

    return 1 - sum((e / soma) ** 2 for e in counts)

In [None]:
# Root node: count how many samples carry each label in y, then print
# those counts together with their Gini impurity.
no_raiz = np.bincount(y)
print('Gini de', no_raiz,':', gini(no_raiz))

In [None]:
# Hard-coded per-class counts for a left and a right node.
# NOTE(review): presumably the two children of the depth-1 tree fitted
# above - confirm against the fitted model before relying on them.
no_esq = [0, 125, 0, 2]
no_dir = [125, 0, 125, 123]

# Print each node's counts with its Gini impurity rounded to 5 decimals
for contagens in (no_esq, no_dir):
    print('Gini de', contagens, ':', round(gini(contagens), 5))

In [None]:
# Depth-2 tree: at most four leaf regions
clf = DecisionTreeClassifier(random_state=99, max_depth=2)
visualize_tree(clf, X, y)

In [None]:
# Deep tree: allowed to grow up to 20 levels
clf = DecisionTreeClassifier(max_depth=20, random_state=1000)
visualize_tree(clf, X, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 50 trees (n_estimators), each limited to depth 4
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=4,
    random_state=0,
    n_jobs=-1,
)

# The borders between regions take on 'smoother' contours,
# breaking the dichotomy inherent to the decision tree algorithm.
visualize_tree(clf, X, y, boundaries=False)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import metrics

import seaborn as sns
%matplotlib inline

In [None]:
# Predict labels for the same X that visualize_tree fitted clf on, so
# every metric computed from y_pred is a training-set metric.
# NOTE(review): assuming the cells ran top to bottom, clf is the
# RandomForestClassifier defined above - confirm.
y_pred = clf.predict(X)

In [None]:
# Fraction of samples whose predicted label equals the true label in y
print (accuracy_score(y, y_pred))

In [None]:
# Confusion matrix: rows are the true labels, columns the predicted ones
confusion_matrix(y, y_pred)

In [None]:
# Same confusion matrix drawn as a heatmap; annot=True writes the value
# in each cell and fmt='d' formats it as an integer.
sns.heatmap(confusion_matrix(y, y_pred), annot=True, fmt='d', cmap = 'coolwarm')

In [None]:
# Text report with per-class precision, recall, F1-score and support
print(classification_report(y, y_pred))

In [None]:
# accuracy_score was already imported in an earlier cell and
# roc_auc_score is not used in this cell; only f1_score is needed here.
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# F1 averaged over the classes, each weighted by its number of true samples
f1_score(y, y_pred, average = 'weighted')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with 25 stages of depth-1 trees; a node needs at
# least 250 samples before it may be split.
clf = GradientBoostingClassifier(
    n_estimators=25,
    max_depth=1,
    min_samples_split=250,
    random_state=0,
)

# No single tree_ to read split lines from, so only the regions are drawn
visualize_tree(clf, X, y, boundaries=False)