In [34]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.datasets import load_wine
from IPython.display import SVG
from IPython.display import display
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Install graphviz
#conda install python-graphviz
from graphviz import Source  

#Install ipywidgets
#conda install -c conda-forge ipywidgets
from ipywidgets import interactive

In [3]:
# Load the pickled train/test splits of both datasets (same read order as before).
# NOTE(review): unpickling can execute arbitrary code -- only load .pkl files you trust.
df1_train, df1_test = pd.read_pickle("./df1_train.pkl"), pd.read_pickle("./df1_test.pkl")
df2_train, df2_test = pd.read_pickle("./df2_train.pkl"), pd.read_pickle("./df2_test.pkl")

# Row counts, printed as "<df1 train> <df1 test> || <df2 train> <df2 test>".
print("{} {} || {} {}".format(len(df1_train), len(df1_test), len(df2_train), len(df2_test)))

2644 689 || 5605 1416


In [43]:
'''
criterion         : The function to measure the quality of a split.
                    “gini” for the Gini impurity and “entropy” for the information gain.
splitter          : The strategy used to choose the split at each node
min_samples_split : The minimum number of samples required to split an internal node
                    If int, then consider min_samples_split as the minimum number
                    If float, then min_samples_split is a fraction and ceil(min_samples_split * n_samples) are the minimum number
                    of samples for each split.
    ***[i.e. a fraction of the training dataset size]
    ***    0.01 of 2644 -> ceil(26.44) = 27, i.e. a node with fewer than 27 entries cannot be split!!! (57 for df2)
min_samples_leaf : The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered
                    if it leaves at least min_samples_leaf training samples in each of the left and right branches.
                    This may have the effect of smoothing the model, especially in regression.
                    ***Same int/float as above
'''

def _select_dataset(dataset):
    """Return the ``(train_df, test_df)`` pair for dataset id 1 or 2.

    Raises:
        ValueError: if ``dataset`` is neither 1 nor 2.
    """
    if dataset == 1:
        return df1_train, df1_test
    if dataset == 2:
        return df2_train, df2_test
    raise ValueError("dataset must be 1 or 2, got {!r}".format(dataset))


def _show_confusion_matrix(cnf_matrix, heading):
    """Print ``heading`` and draw ``cnf_matrix`` as an annotated heatmap."""
    print(heading)
    fig, ax = plt.subplots()
    # heatmap installs its own tick positions/labels (the DataFrame's default
    # 0..k-1 labels), so no manual xticks/yticks calls are needed beforehand.
    sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g', ax=ax)
    ax.xaxis.set_label_position("top")
    fig.tight_layout()
    ax.set_title('Confusion matrix', y=1.1)
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    plt.show()


def run_decission_tree(dataset, criteria, split_type, max_depth, min_split, min_leaf):
    """Fit a decision tree on one of the two datasets, then visualise and score it.

    The last column of the selected frames is used as the target and all other
    columns as features. The fitted tree is rendered as SVG via graphviz,
    confusion matrices (normalised by the number of samples) are plotted for
    the training and test sets, and accuracy / precision / recall / F1 on the
    test set are printed.

    Args:
        dataset: 1 to use df1_train/df1_test, 2 to use df2_train/df2_test.
        criteria: passed to DecisionTreeClassifier as ``criterion``.
        split_type: passed as ``splitter``.
        max_depth: passed as ``max_depth`` (``None`` for no explicit limit).
        min_split: passed as ``min_samples_split``.
        min_leaf: passed as ``min_samples_leaf``.

    Returns:
        The fitted DecisionTreeClassifier.

    Raises:
        ValueError: if ``dataset`` is neither 1 nor 2.

    Note:
        No ``random_state`` is passed to the classifier, so repeated runs are
        not guaranteed to produce the same tree.
    """
    # Validate the dataset id first so a bad value fails with a clear message
    # instead of an UnboundLocalError further down.
    train_df, test_df = _select_dataset(dataset)

    X_train, X_test = train_df.iloc[:, :-1], test_df.iloc[:, :-1]
    # Select the target as a 1-D Series rather than a one-column DataFrame.
    y_train, y_test = train_df.iloc[:, -1], test_df.iloc[:, -1]
    feature_names = list(train_df.columns[:-1])

    decision_tree = DecisionTreeClassifier(
        criterion=criteria,
        splitter=split_type,
        max_depth=max_depth,
        min_samples_split=min_split,
        min_samples_leaf=min_leaf,
    )
    decision_tree.fit(X_train, y_train)

    # Render the fitted tree inline; the class names assume binary 0/1 labels.
    dot_source = tree.export_graphviz(
        decision_tree,
        out_file=None,
        feature_names=feature_names,
        class_names=['0', '1'],
        filled=True,
    )
    display(SVG(Source(dot_source).pipe(format='svg')))

    y_pred_train = decision_tree.predict(X_train)
    y_pred_test = decision_tree.predict(X_test)

    # Dividing by the sample count turns raw counts into fractions of the set.
    cnf_matrix_train = metrics.confusion_matrix(y_train, y_pred_train) / len(y_train)
    cnf_matrix_test = metrics.confusion_matrix(y_test, y_pred_test) / len(y_test)

    _show_confusion_matrix(cnf_matrix_train, "Confusion Matrix Training set")
    _show_confusion_matrix(cnf_matrix_test, "Confusion Matrix Test set")

    print("=================On test set=================")
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred_test))
    print("Precision:", metrics.precision_score(y_test, y_pred_test))
    print("Recall:", metrics.recall_score(y_test, y_pred_test))
    print("F1 Score:", metrics.f1_score(y_test, y_pred_test))

    return decision_tree

# Build the interactive control panel for run_decission_tree. Each keyword is
# an ipywidgets abbreviation: a list yields a dropdown of its values, and a
# (min, max) float tuple is the slider abbreviation.
inter = interactive(
    run_decission_tree,
    dataset=[1, 2],
    criteria=["gini", "entropy"],
    split_type=["best", "random"],
    max_depth=[None] + list(range(1, 11)),
    min_split=(0.01, 0.5),
    min_leaf=(0.01, 0.5),
)
display(inter)

interactive(children=(Dropdown(description='dataset', options=(1, 2), value=1), Dropdown(description='criteria…