In [6]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier        # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics                            # Import scikit-learn metrics module for accuracy calculation

In [7]:
# load dataset
pima = pd.read_csv("datasets/diabetes.csv")
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
pima.columns.tolist()

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [9]:
# split dataset in features and target variable
feature_cols = pima.columns.tolist()
X = pima.drop(['Outcome'], axis=1)  # Features
y = pima[['Outcome']]              # Target variable

In [4]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [8]:
y.head()

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


In [10]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)  # 70% training and 30% test

In [11]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [12]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Accuracy:  0.7012987012987013


export_graphviz function converts decision tree classifier into dot file and pydotplus convert this dot file to png or displayable form on Jupyter.

In [13]:
from sklearn.tree import export_graphviz

export_graphviz(
    clf, 
    out_file="results/pima_tree.dot", 
    filled=True, 
    rounded=True, 
    special_characters=True, 
    feature_names = X.columns.tolist(), 
    class_names=['0','1']
    )

# then go to cmd.exe, cd folder of .dot : dot -Tpng pima_tree.dot -o pima_tree.png

"graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  \ngraph.write_png('diabetes.png')\nImage(graph.create_png())"

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:]  # petal length and width
y = iris.target
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(max_depth=2)

In [4]:
# visualize the trained Decision Tree
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf, 
    out_file = "results/iris_tree.dot",
    feature_names = iris.feature_names[2:],
    class_names = iris.target_names,
    rounded=True,
    filled=True
   )

# then go to cmd.exe, cd folder of .dot : dot -Tpng iris_tree.dot -o iris_tree.png

In [14]:
tree_clf.predict_proba([[5, 1.5]])   # petals are 5 cm long and 1.5 cm wide

array([[0.        , 0.90740741, 0.09259259]])

In [15]:
tree_clf.predict([[5, 1.5]])

array([1])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)], voting = 'hard')
voting_clf.fit(X_train, y_train)