In [2]:
#General libraries needed
import numpy as np
import pandas as pd

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB


# Import train_test_split function
from sklearn.model_selection import train_test_split 
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics 

In [4]:
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# load dataset
df = pd.read_csv("diabetes.csv", header=None, names=col_names)

In [5]:
df.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
#split dataset in features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = df[feature_cols] # Features
y = df.label # Target variable

In [7]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

In [8]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [9]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7142857142857143


In [10]:
#printing the full tree
from sklearn import tree
from sklearn.tree import export_graphviz
tree.export_graphviz(clf, out_file='tree.dot', feature_names=feature_cols) #produces dot file
import pydot
(graph,) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('full_tree.png')

In [11]:
#Well, you got a classification rate of 67.53%, considered as good accuracy. 

In [12]:
#Pruning! basically stop the tree before it max. 
#Train
#Directly implement the DecisionTreeClassifier on the training set. To ensure pruning, we set the max_depth=4.
dptree = DecisionTreeClassifier(max_depth=4, criterion="entropy")
dptree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [13]:
#Test!
# Get the predicted y array
y_pred = dptree.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0], dtype=int64)

In Scikit-learn, optimization of decision tree classifier performed by only pre-pruning. Maximum depth of the tree can be used as a control variable for pre-pruning. In the following the example, you can plot a decision tree on the same data with max_depth=3. Other than pre-pruning parameters, You can also try other attribute selection measure such as entropy.

In [14]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

#Damn son we got an accuracy of 77.1%! That's even better.

Accuracy: 0.7705627705627706


In [15]:
#printing the pruned tree
tree.export_graphviz(clf, out_file='tree.dot', feature_names=feature_cols) #produces dot file
(graph,) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('pruned_tree.png')

<h1> Bagging (with Decision Tree) </h1>

In [16]:
#Create the Bagging classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
model = BaggingClassifier(n_estimators=50)

#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=50, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [17]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)

In [18]:
model = GaussianNB()
#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [19]:
# Find the confusion matrix of the result
cm = metrics.confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy of the result
asr = metrics.accuracy_score(y_pred, y_test)
print(asr)

#Accuracy of 77.9%!~ DUPER HIGH SON.

[[128  35]
 [ 18  50]]
0.7705627705627706


<h1>AdaBoost (with Decision Tree) </h1>

*Note that the default AdaBoost implementation in SKLearn is Decision Tree

In [20]:
#Create the AdaBoost classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
# - learning_rate controls the weight adjustments of each base classifiers. Default is 1
model = AdaBoostClassifier(n_estimators=50,learning_rate=1)  #if you change learning_rates/ tune the no of weak base classifier, the 
#final accuracy will change 

#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

#SVC classifer takes long time to run BUT it actually gives a very high accuracy

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None)

In [21]:
#Use the trained model to predict the test data
y_pred = model.predict(X_test)

In [22]:
# Find the confusion matrix of the result
cm = metrics.confusion_matrix(y_pred, y_test)
print(cm)

# Find the accuracy of the result
asr = metrics.accuracy_score(y_pred, y_test)
print(asr)

#Accuracy of 77.9%!~ also duper high 

[[129  34]
 [ 17  51]]
0.7792207792207793


In [23]:
#Thanks frens!

#Create the Random Forest classifier.
# - n_estimator is the number of base classifiers (i.e. weak learners)
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

#SVC classifer takes long time to run BUT it actually gives a very high accuracy

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7878787878787878


In [None]:
#Random Forest 0.805% Accuracy!