In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import classification_report

In [2]:
# load dataset
pima = pd.read_csv("https://raw.githubusercontent.com/manishanker/stats_ml_jun2020/master/diabetes_csv.csv")

In [3]:
pima.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


In [4]:
pima.columns

Index(['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class'], dtype='object')

In [8]:
pima.describe()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [9]:
#split dataset in features and target variable
#feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age','Glucose','BloodPressure', 
#                'SkinThickness', 'DiabetesPedigreeFunction']
X = pima.drop(["class"] , axis =1)
y = pima["class"]

In [10]:
X.describe()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [11]:
y.describe()

count                 768
unique                  2
top       tested_negative
freq                  500
Name: class, dtype: object

In [13]:
y.value_counts()

tested_negative    500
tested_positive    268
Name: class, dtype: int64

In [14]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3, random_state=555) # 70% training and 30% test

In [15]:
pima.isna().sum()

preg     0
plas     0
pres     0
skin     0
insu     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [16]:
pima.shape

(768, 9)

In [17]:
y_train.value_counts()

tested_negative    340
tested_positive    197
Name: class, dtype: int64

In [18]:
y_test.value_counts()

tested_negative    160
tested_positive     71
Name: class, dtype: int64

In [19]:
# Create Decision Tree classifer object
model = DecisionTreeClassifier()

# Train Decision Tree Classifer
model.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = model.predict(X_test)

In [20]:
# Model Accuracy, how often is the classifier correct?
print("test Accuracy:", metrics.accuracy_score(y_test, y_pred))

test Accuracy: 0.7272727272727273


In [21]:
y_pred_train = model.predict(X_train)

In [22]:
print("Train Accuracy:", metrics.accuracy_score(y_pred_train, y_train))

Train Accuracy: 1.0


In [23]:
print(classification_report(y_test, y_pred))

                 precision    recall  f1-score   support

tested_negative       0.81      0.79      0.80       160
tested_positive       0.55      0.58      0.57        71

       accuracy                           0.73       231
      macro avg       0.68      0.69      0.68       231
   weighted avg       0.73      0.73      0.73       231



In [24]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = X.columns,
                class_names=['tested_positive','tested_negative'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('diabetes_pruned.png')
Image(graph.create_png())

ModuleNotFoundError: No module named 'sklearn.externals.six'

In [35]:
from sklearn.ensemble import RandomForestClassifier
clf_random_forest = RandomForestClassifier(random_state=71)
clf_random_forest.fit(X_train, y_train)
y_pred = clf_random_forest.predict(X_test)

print("test Accuracy:",metrics.accuracy_score(y_test, y_pred))

test Accuracy: 0.8008658008658008


In [29]:
#from sklearn.svm import SVC
# clf_svm = SVC()
# clf_svm.fit(X_train, y_train)
# y_pred = clf_svm.predict(X_test)
# print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [36]:
import pickle
# create your model as above

from sklearn.ensemble import RandomForestClassifier
clf_random_forest = RandomForestClassifier(random_state=71)
clf_random_forest.fit(X_train, y_train)
y_pred = clf_random_forest.predict(X_test)

pkl_filename = "rf_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf_random_forest, file)


In [61]:
from sklearn.ensemble import RandomForestClassifier
clf_random_forest = RandomForestClassifier(n_estimators=170)
clf_random_forest.fit(X_train, y_train)
y_pred = clf_random_forest.predict(X_test)

print("test Accuracy:",metrics.accuracy_score(y_test, y_pred))

test Accuracy: 0.7922077922077922


In [62]:
#Load from pickle file

with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

In [64]:
pickle_model

RandomForestClassifier(random_state=71)

In [68]:
clf_random_forest.feature_importances_

array([0.09252112, 0.24920664, 0.09274662, 0.07158762, 0.06966791,
       0.17084083, 0.12195373, 0.13147553])

In [69]:
pima.columns

Index(['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class'], dtype='object')

In [None]:
# plas has most imporatcnce on target var then followed 'mass' ,'age'