## Decision Tree
### Our Goal:
To diagnostically predict whether a patient has diabetes.

### To Explore:
1. DecisionTreeClassifier
2. BaggingClassifier
3. AdaBoostClassifier
4. RandomForestClassifier
    

In [1]:
#General libraries needed
import numpy as np
import pandas as pd

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier


# Import train_test_split function
from sklearn.model_selection import train_test_split 
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics 

In [None]:
# Column names to assign to the CSV columns; the last one, 'label', is the
# value the models below are trained to predict.
col_names = [
    'pregnant', 'glucose', 'bp', 'skin',
    'insulin', 'bmi', 'pedigree', 'age',
    'label',
]

# Load the dataset. header=None makes pandas read the file's first line as
# data, so the column names come from col_names rather than from the file.
df = pd.read_csv("Diabetes.csv", names=col_names, header=None)

# Preview the first rows (displayed as the cell output).
df.head()

In [None]:
# Separate the predictors from the outcome we are trying to predict.
# 'label' encodes the outcome: 1 = has diabetes, 0 = healthy.

# Predictor columns fed to the models ('skin' is loaded but not used here).
feature_cols = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']

X = df.loc[:, feature_cols]   # feature matrix (independent variables)
y = df['label']               # target vector (dependent variable / result)

In [None]:
# Split the dataset into a training set (70%) and a test set (30%).
# Rows are partitioned at random; random_state pins the shuffle so that
# re-running the notebook reproduces the same split (and therefore the same
# accuracy figures) instead of a different partition on every run.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Build an unconstrained decision tree and fit it on the training split
# (fit returns the fitted estimator itself, so the two steps can be chained).
clf = DecisionTreeClassifier().fit(X_train, y_train)

# Classify the held-out test rows.
y_pred = clf.predict(X_test)

# Accuracy: the fraction of test rows whose predicted label matches the true one.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

In [None]:
# Render the complete fitted tree (clf) to an image file.

from sklearn import tree
from sklearn.tree import export_graphviz

# Step 1: dump the fitted classifier to a Graphviz .dot file, passing the
# feature names so the nodes are labelled with them.
export_graphviz(clf, out_file='tree.dot', feature_names=feature_cols)

import pydot

# Step 2: parse the .dot file and write it out as a PNG. The single-element
# unpacking expects the file to contain exactly one graph.
dot_graphs = pydot.graph_from_dot_file('tree.dot')
(graph,) = dot_graphs
graph.write_png('full_tree.png')

In [None]:
# Pruning: cap the tree at max_depth=4 so it stops growing before it is fully
# expanded, which limits overfitting to the training data.
# Open question from the author: how do we know when the cap is too strict?

dptree = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)

# Evaluate the depth-limited tree: predict the test rows, then score them
# against the true labels.
y_pred = dptree.predict(X_test)
pruned_accuracy = metrics.accuracy_score(y_test, y_pred)

print("Accuracy:", pruned_accuracy)

In [None]:
# Sweep max_depth from 1 to 19 and, for each depth, report the test accuracy
# averaged over many random 70/30 splits.
#
# Parameters DecisionTreeClassifier accepts, with their defaults:
# SEE: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
#
# criterion = "gini" or "entropy"
# max_depth: int (default None -- grow until the leaves cannot be split further)
# min_samples_split: int (default 2); a float is interpreted as a fraction of
#                    the number of training samples

trials = 100  # random splits averaged per depth; constant, so set once outside the loops

for depth in range(1, 20):

    pred_sum = 0

    for trial in range(trials):
        # Use loop-local names for the split. Rebinding X_train / X_test /
        # y_train / y_test here would overwrite the notebook-level split, and
        # the cells that follow would then silently train and evaluate on
        # whatever the last random trial happened to produce.
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)  # 70% training and 30% test

        decision_tree_test = DecisionTreeClassifier(max_depth=depth)
        dt = decision_tree_test.fit(X_tr, y_tr)

        dt_pred = dt.predict(X_te)
        pred_sum += metrics.accuracy_score(y_te, dt_pred)

    print("Depth = ", depth , "Average Accuracy:", pred_sum/trials)

<h1> Bagging (with Decision Tree) </h1>

In [None]:
# Create the Bagging classifier. With no base estimator given, scikit-learn
# bags decision trees.
# - n_estimators is the number of base classifiers (trees) trained in the ensemble.
model = BaggingClassifier(n_estimators=200)

# Fit on the training features (X) and training labels (y).
model.fit(X_train, y_train)

# Use the trained model to predict the test data.
y_pred = model.predict(X_test)

# Confusion matrix of the result. scikit-learn's signature is
# confusion_matrix(y_true, y_pred): rows are the true labels and columns the
# predicted labels. Passing the arguments the other way round produces the
# transposed matrix, which swaps the false-positive and false-negative counts.
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Accuracy of the bagged model on the test split.
# Arguments are given in scikit-learn's (y_true, y_pred) order, matching the
# decision-tree accuracy calls earlier in the notebook. Accuracy is symmetric
# in its two arguments, so the reported value is the same either way.

bagged_results = metrics.accuracy_score(y_test, y_pred)
print(bagged_results)

<h1>AdaBoost (with Decision Tree) </h1>

*Note: the default base estimator for AdaBoost in scikit-learn is a decision tree (a depth-1 "stump").*

In [None]:
# Build the AdaBoost ensemble. With no base estimator given, scikit-learn
# boosts decision trees.
# - n_estimators: number of base classifiers (i.e. weak learners)
# - learning_rate: controls the weight adjustment of each base classifier
#   (default 1)
# Changing the learning rate or the number of weak learners changes the final
# accuracy.
model = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.1,
)

# Train on the training features (X) and training labels (y).
# Note kept from the original author: an SVC classifier takes a long time to
# run but gives a very high accuracy.
model.fit(X_train, y_train)

In [None]:
# Use the trained model to predict the test data.
y_pred = model.predict(X_test)

# Confusion matrix of the result. scikit-learn's signature is
# confusion_matrix(y_true, y_pred): rows are the true labels and columns the
# predicted labels. Passing the arguments the other way round produces the
# transposed matrix, which swaps the false-positive and false-negative counts.
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy of the result, also in (y_true, y_pred) order. Accuracy is
# symmetric in its arguments, so the value itself is unaffected by the order.
asr = metrics.accuracy_score(y_test, y_pred)
print(asr)

In [None]:
# learning_rate scales the weight applied to each classifier at every boosting
# iteration, so a larger value makes each tree contribute more.
# Sweep learning_rate over 0.3**1 ... 0.3**7 and, for each value, report the
# test accuracy averaged over several random 70/30 splits.

trials = 20  # random splits averaged per learning rate; constant, so set once

for learningrate in range(1, 8):

    # Compute the rate once so the model and the printed report cannot disagree.
    rate = 0.3 ** learningrate
    pred_sum = 0

    for trial in range(trials):
        # Use loop-local names for the split. Rebinding X_train / X_test /
        # y_train / y_test here would overwrite the notebook-level split, and
        # the cells that follow would then silently train and evaluate on
        # whatever the last random trial happened to produce.
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)  # 70% training and 30% test

        adaboost_test = AdaBoostClassifier(n_estimators=200, learning_rate=rate)

        boosted = adaboost_test.fit(X_tr, y_tr)

        boosted_pred = boosted.predict(X_te)

        pred_sum += metrics.accuracy_score(y_te, boosted_pred)

    print("learningrate = ", rate)
    print("Average Accuracy:", pred_sum/trials)
    print("-----------------------------------")

<h1> Random Forest </h1>

In [None]:
# Random Forest classifier.
# - n_estimators is the number of base classifiers, i.e. how many trees the
#   forest contains.
# - By default each split considers sqrt(m) of the m features.

model = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)

# Predict the test rows and score them against the true labels.
y_pred = model.predict(X_test)
forest_accuracy = metrics.accuracy_score(y_test, y_pred)

# Note kept from the original author: an SVC classifier takes a long time to
# run but gives a very high accuracy.

print("Accuracy:", forest_accuracy)

In [None]:
# Sweep the forest size over powers of five (5**0 ... 5**6) and report the
# test accuracy for each size on the current train/test split.
# TODO (from the original author): put a trial loop inside.
for power in range(7):

    n_estimators = pow(5, power)

    # bootstrap=True: each tree is fit on a bootstrap sample of the training data.
    # n_jobs=-1: use all available processors.
    forest_params = {
        'n_estimators': n_estimators,
        'bootstrap': True,
        'n_jobs': -1,
    }
    random_forest_test = RandomForestClassifier(**forest_params)

    random_forest_test.fit(X_train, y_train)
    y_pred = random_forest_test.predict(X_test)

    score = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy for n_estimators:", n_estimators, " is ", score)

Adapted from code written by your DA seniors (the Pioneer Batch) — **Ding Yang, Linus Cheng, Tan Kin Meng, and Kaelyn** — which they used in their DAP sharing :)