In [1]:
#Pima Indian Diabetes Prediction
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Plotting setup for the notebook.
# sns.set(...) switches matplotlib over to seaborn's styling; per the seaborn docs,
# color_codes=True additionally remaps matplotlib's single-letter colour shorthands
# to the seaborn palette.
sns.set(color_codes=True)
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

In [2]:
# Loading and reviewing the data.
# The path is written as a raw string: in a normal literal "\J" and "\d" are
# invalid escape sequences, which Python reports with a DeprecationWarning /
# SyntaxWarning. The r-prefix yields exactly the same path text without
# depending on that lenient behaviour.
# NOTE(review): this is an absolute path on a local D: drive, so the notebook
# only runs on a machine with that layout - consider a configurable data directory.
data_frame = pd.read_csv(r"D:\Jupyter\dataset.csv")

In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data_frame.shape

(768, 9)

In [4]:
# Preview the first five rows to check column names and typical values.
data_frame.head(5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,skin_thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Report whether at least one cell anywhere in the frame is null/NaN.
# NOTE(review): the previewed rows contain 0 for insulin and skin_thickness;
# isnull() does not flag zeros - confirm whether 0 encodes a missing measurement.
has_missing = data_frame.isnull().values.any()
print(has_missing)

False


In [6]:
# Class balance: count the rows labelled 1 and the rows labelled 0 in the
# 'diabetes' column and print each count with its share of all rows.
num_obs = len(data_frame)
diabetes_flag = data_frame['diabetes']
num_true = int((diabetes_flag == 1).sum())
num_false = int((diabetes_flag == 0).sum())

# Shares expressed as percentages of the total row count.
pct_true = float(num_true) / float(num_obs) * 100
pct_false = float(num_false) / float(num_obs) * 100

print("Number of True cases:  {0} ({1:2.2f}%)".format(num_true, pct_true))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, pct_false))

Number of True cases:  268 (34.90%)
Number of False cases: 500 (65.10%)


In [9]:
# Split the data: 70% for training, 30% for testing

from sklearn.model_selection import train_test_split

# Input columns fed to the models, and the column holding the label.
feature_col_names = [
    'num_preg', 'glucose_conc', 'diastolic_bp', 'skin_thickness',
    'insulin', 'bmi', 'diab_pred', 'age',
]
predicted_class_names = ['diabetes']

# Selecting with a list of names yields 2-D arrays with one row per observation:
# X has one column per feature (m x 8); y is a single label column
# (m x 1, 1=true, 0=false).
X = data_frame[feature_col_names].values
y = data_frame[predicted_class_names].values

# Hold out 30% of the rows; the fixed seed makes the shuffle repeatable.
split_test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_test_size,
    random_state=42,
)

In [10]:
# Confirm the split came out at the intended 70% train / 30% test proportions.

total_rows = float(len(data_frame.index))
trainval = len(X_train) / total_rows
testval = len(X_test) / total_rows

# Each fraction is scaled to a percentage for display.
print("{0:0.2f}% in training set".format(trainval * 100))
print("{0:0.2f}% in test set".format(testval * 100))

69.92% in training set
30.08% in test set


In [36]:
#1. Logistic Regression

from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression object.
# C=0.7 is the inverse regularisation strength; random_state fixes the seed.
# The solver is pinned to 'liblinear': without it the choice is left to the
# library default, which changed from liblinear to lbfgs in scikit-learn 0.22
# (older versions emit a FutureWarning about it). Naming it keeps the fit the
# same across versions.
lr_model = LogisticRegression(C=0.7, solver='liblinear', random_state=42)

# ravel() flattens the label array to 1-D before fitting; as the last
# expression, the fitted estimator is displayed as the cell output.
lr_model.fit(X_train, y_train.ravel())



LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
# Predict labels for the training rows with the logistic-regression model.
lr_predict_train = lr_model.predict(X_train)

# training metrics: accuracy is computed against y_train, i.e. on the training
# split itself rather than on held-out rows.
lr_train_accuracy = metrics.accuracy_score(y_train, lr_predict_train)
print("Accuracy of LogisticRegression : {0:.4f}".format(lr_train_accuracy))

Accuracy of LogisticRegression : 0.7784


In [38]:
#2. Decision Tree

from sklearn.tree import DecisionTreeClassifier

# Build a decision-tree classifier; only the seed is set, every other
# hyper-parameter is left at the library default.
dt_model = DecisionTreeClassifier(
    random_state=42,
)

# Flatten the label array to 1-D, then fit. The fit call is the last
# expression, so the fitted estimator is displayed as the cell output.
dt_train_labels = y_train.ravel()
dt_model.fit(X_train, dt_train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [45]:
# Predict labels for the training rows with the decision tree.
dt_predict_train = dt_model.predict(X_train)

# training metrics: scored against y_train (the training split), so this
# figure says nothing about performance on unseen rows.
dt_report = "Accuracy of Decision Tree : {0:.4f}"
dt_train_accuracy = metrics.accuracy_score(y_train, dt_predict_train)
print(dt_report.format(dt_train_accuracy))

Accuracy of Decision Tree : 1.0000


In [40]:
#3. RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier

# Create a RandomForestClassifier object.
# n_estimators is pinned to 10, the value this notebook was actually run with:
# the library default changed from 10 to 100 trees in scikit-learn 0.22 (older
# versions emit a FutureWarning about it), so leaving it implicit would make
# the result depend on the installed version. random_state fixes the seed.
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)

# ravel() flattens the label array to 1-D before fitting; as the last
# expression, the fitted estimator is displayed as the cell output.
rf_model.fit(X_train, y_train.ravel())



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [46]:
# Predict labels for the training rows with the random forest.
rf_predict_train = rf_model.predict(X_train)

# training metrics: fraction of y_train entries the forest reproduces.
rf_train_accuracy = metrics.accuracy_score(y_train, rf_predict_train)
rf_message = "Accuracy of RandomForestClassifier : {0:.4f}".format(rf_train_accuracy)
print(rf_message)

Accuracy of RandomForestClassifier : 0.9888


In [42]:
#4. Support Vector Classifier

from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, penalty parameter C=1 and a
# fixed seed.
svm_model = SVC(
    kernel='linear',
    C=1,
    random_state=42,
)

# Flatten the label array to 1-D, then fit. The fit call is the last
# expression, so the fitted estimator is displayed as the cell output.
svm_train_labels = y_train.ravel()
svm_model.fit(X_train, svm_train_labels)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=42,
    shrinking=True, tol=0.001, verbose=False)

In [47]:
# Predict labels for the training rows with the support-vector classifier.
svm_predict_train = svm_model.predict(X_train)

# training metrics: accuracy against y_train, i.e. on the training split.
svm_train_accuracy = metrics.accuracy_score(y_train, svm_predict_train)
svm_report = "Accuracy of SupporVectorClassifier : {0:.4f}"
print(svm_report.format(svm_train_accuracy))

Accuracy of SupporVectorClassifier : 0.7803
