In [76]:
import numpy as np
import pandas as pd

In [77]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit, ShuffleSplit
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

# Iris Dataset

In [110]:
df1 = pd.read_csv('data/iris.data', header=None)
df1.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [111]:
features = df1.iloc[:, :-1]
labels = df1[4]

In [112]:
le = LabelEncoder().fit(labels)

In [113]:
le.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [114]:
labels = le.transform(labels)
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [115]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=7)

In [116]:
dtclf = DecisionTreeClassifier()
dtclf.fit(X_train, y_train)

DecisionTreeClassifier()

In [117]:
dtclf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [118]:
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, dtclf.predict(X_train))}")

y_pred_dtclf = dtclf.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_dtclf)}")

f1 = f1_score(y_test, y_pred_dtclf, average='weighted')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_dtclf))
print("Confusion Matrix: \n", confusion_matrix(y_pred_dtclf, y_test))

Accuracy Score of Training Set:  1.0
Accuracy Score of Test Set: 0.9
F1 Score of Test Set: 0.9
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.91      0.83      0.87        12
           2       0.83      0.91      0.87        11

    accuracy                           0.90        30
   macro avg       0.91      0.91      0.91        30
weighted avg       0.90      0.90      0.90        30

Confusion Matrix: 
 [[ 7  0  0]
 [ 0 10  1]
 [ 0  2 10]]


In [119]:
param_grid = {
    "max_depth" : [1,3,5,7,9,11,12],
    "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
    "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] 
}

grid_search = GridSearchCV(estimator=dtclf,
                               param_grid=param_grid,
                               scoring='accuracy',
                               cv=5)

In [120]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 3, 5, 7, 9, 11, 12],
                         'max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60, 70,
                                            80, 90],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             scoring='accuracy')

In [121]:
grid_search.best_params_

{'max_depth': 3, 'max_leaf_nodes': None, 'min_samples_leaf': 2}

In [122]:
grid_search.best_score_

0.9833333333333334

In [123]:
bestNB = grid_search.best_estimator_
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, bestNB.predict(X_train))}")

y_pred_bestNB = bestNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_bestNB)}")

f1 = f1_score(y_test, y_pred_bestNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_bestNB))
print("Confusion Matrix: \n", confusion_matrix(y_pred_bestNB, y_test))

Accuracy Score of Training Set:  0.9916666666666667
Accuracy Score of Test Set: 0.9
F1 Score of Test Set: 0.9
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.91      0.83      0.87        12
           2       0.83      0.91      0.87        11

    accuracy                           0.90        30
   macro avg       0.91      0.91      0.91        30
weighted avg       0.90      0.90      0.90        30

Confusion Matrix: 
 [[ 7  0  0]
 [ 0 10  1]
 [ 0  2 10]]


# Diabetes Dataset

In [124]:
diabetes = pd.read_csv('data/diabetes.tab.txt', delimiter = "\t")

In [125]:
features = diabetes.loc[:, diabetes.columns != 'SEX']
labels = diabetes['SEX']

In [126]:
scaler = StandardScaler().fit(features)

In [127]:
X_scaled = scaler.transform(features)

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, labels, test_size=0.2, random_state=8) 

In [129]:
dtclf1 = DecisionTreeClassifier()

In [130]:
dtclf1.fit(X_train, y_train)

DecisionTreeClassifier()

In [131]:
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, dtclf1.predict(X_train))}")

y_pred_dtclf1 = dtclf1.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_dtclf1)}")

f1 = f1_score(y_test, y_pred_dtclf1, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_dtclf1))
print("Confusion Matrix: \n", confusion_matrix(y_pred_dtclf1, y_test))

Accuracy Score of Training Set:  1.0
Accuracy Score of Test Set: 0.6404494382022472
F1 Score of Test Set: 0.6404494382022472
Classification Report
              precision    recall  f1-score   support

           1       0.65      0.68      0.67        47
           2       0.62      0.60      0.61        42

    accuracy                           0.64        89
   macro avg       0.64      0.64      0.64        89
weighted avg       0.64      0.64      0.64        89

Confusion Matrix: 
 [[32 17]
 [15 25]]


In [132]:
param_grid = {
    "max_depth" : [1,3,5,7,9,11,12],
    "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
    "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] 
}

grid_search = GridSearchCV(estimator=dtclf1,
                            param_grid=param_grid,
                            scoring='f1_micro',
                            cv=5)

In [133]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 3, 5, 7, 9, 11, 12],
                         'max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60, 70,
                                            80, 90],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             scoring='f1_micro')

In [134]:
grid_search.best_params_

{'max_depth': 7, 'max_leaf_nodes': 10, 'min_samples_leaf': 6}

In [135]:
bestNB = grid_search.best_estimator_
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, bestNB.predict(X_train))}")

y_pred_bestNB = bestNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_bestNB)}")

f1 = f1_score(y_test, y_pred_bestNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_bestNB))
print("Confusion Matrix: \n", confusion_matrix(y_pred_bestNB, y_test))

Accuracy Score of Training Set:  0.7790368271954674
Accuracy Score of Test Set: 0.6179775280898876
F1 Score of Test Set: 0.6179775280898876
Classification Report
              precision    recall  f1-score   support

           1       0.64      0.62      0.63        47
           2       0.59      0.62      0.60        42

    accuracy                           0.62        89
   macro avg       0.62      0.62      0.62        89
weighted avg       0.62      0.62      0.62        89

Confusion Matrix: 
 [[29 16]
 [18 26]]


# Breast Cancer Dataset

In [136]:
data = pd.read_csv("data/breast-cancer-wisconsin.data", header=None)

In [137]:
data = data[data[6] != '?']

In [138]:
X = data.iloc[:, 1: -1]
y = data[10]

In [139]:
y = y.replace(2, 0)
y = y.replace(4, 1)

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10) 

In [141]:
dtclf2 = DecisionTreeClassifier()
dtclf2.fit(X_train, y_train)

DecisionTreeClassifier()

In [143]:
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, dtclf2.predict(X_train))}")

y_pred_dtclf2 = dtclf2.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_dtclf2)}")

f1 = f1_score(y_test, y_pred_dtclf2, average='weighted')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_dtclf2))
print("Confusion Matrix: \n", confusion_matrix(y_pred_dtclf2, y_test))

Accuracy Score of Training Set:  1.0
Accuracy Score of Test Set: 0.9635036496350365
F1 Score of Test Set: 0.9634135086932353
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.98      0.97        89
           1       0.96      0.94      0.95        48

    accuracy                           0.96       137
   macro avg       0.96      0.96      0.96       137
weighted avg       0.96      0.96      0.96       137

Confusion Matrix: 
 [[87  3]
 [ 2 45]]


In [144]:
param_grid = {
    "max_depth" : [1,3,5,7,9,11,12],
    "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
    "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] 
}

grid_search = GridSearchCV(estimator=dtclf2,
                            param_grid=param_grid,
                            scoring='f1',
                            cv=5)

In [145]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 3, 5, 7, 9, 11, 12],
                         'max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60, 70,
                                            80, 90],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             scoring='f1')

In [146]:
grid_search.best_params_

{'max_depth': 11, 'max_leaf_nodes': 80, 'min_samples_leaf': 1}

In [147]:
bestNB = grid_search.best_estimator_
print(f"Accuracy Score of Training Set:  {accuracy_score(y_train, bestNB.predict(X_train))}")

y_pred_bestNB = bestNB.predict(X_test)
print(f"Accuracy Score of Test Set: {accuracy_score(y_test, y_pred_bestNB)}")

f1 = f1_score(y_test, y_pred_bestNB, average='micro')
print(f"F1 Score of Test Set: {f1}")
      
print("Classification Report")    
print(classification_report(y_test, y_pred_bestNB))
print("Confusion Matrix: \n", confusion_matrix(y_pred_bestNB, y_test))

Accuracy Score of Training Set:  1.0
Accuracy Score of Test Set: 0.9708029197080292
F1 Score of Test Set: 0.9708029197080292
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        89
           1       0.96      0.96      0.96        48

    accuracy                           0.97       137
   macro avg       0.97      0.97      0.97       137
weighted avg       0.97      0.97      0.97       137

Confusion Matrix: 
 [[87  2]
 [ 2 46]]
