# Machine learning program to find if someone will get affected  from diabetes in near future

# Importing important libraries
#Pandas to read data from CSV file
#numpy to deal with arrays
#matplotlib.pyplot to visualize data

In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset=pd.read_csv("/Users/macair/Desktop/rishu/AI/diabetes.csv")

number of rows or number of data in our dataset

In [91]:
len(dataset)

768

To find number of rows and columns in our dataset

In [92]:
dataset.shape

(768, 9)

To look at first 10 datas of our dataset

In [93]:
dataset.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


To check if our dataset contains any null values

In [94]:
dataset.isnull().values.any()

False

Finding correlation between each variables and output

In [95]:
dataset.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


To check whether our data is proper, means if our data contains enough number of diabetic and non-diabetic persons data

In [99]:

dataset.pivot_table(index = ['Outcome'], aggfunc ='size')

Outcome
0    500
1    268
dtype: int64

Since 500 people are non-diabetic and 268 people are diabetic. So somehow our data is proper

We will split our data into input and output for our model

In [100]:
## Train Test Split

from sklearn.model_selection import train_test_split
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure','SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',]
predicted_class = ['Outcome']

X = dataset[feature_columns].values
y = dataset[predicted_class].values

We will split our data into 70% trainig data and 30% test data

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=10)

# Checking if  how many other missing(zero) values

In [102]:
print("total number of rows : {0}".format(len(dataset)))
print("number of rows missing glucose_conc: {0}".format(len(dataset.loc[dataset['Glucose'] == 0])))
print("number of rows missing glucose_conc: {0}".format(len(dataset.loc[dataset['Glucose'] == 0])))
print("number of rows missing diastolic_bp: {0}".format(len(dataset.loc[dataset['BloodPressure'] == 0])))
print("number of rows missing insulin: {0}".format(len(dataset.loc[dataset['Insulin'] == 0])))
print("number of rows missing bmi: {0}".format(len(dataset.loc[dataset['BMI'] == 0])))
print("number of rows missing diab_pred: {0}".format(len(dataset.loc[dataset['Outcome'] == 0])))
print("number of rows missing age: {0}".format(len(dataset.loc[dataset['Age'] == 0])))
print("number of rows missing skin: {0}".format(len(dataset.loc[dataset['SkinThickness'] == 0])))

total number of rows : 768
number of rows missing glucose_conc: 5
number of rows missing glucose_conc: 5
number of rows missing diastolic_bp: 35
number of rows missing insulin: 374
number of rows missing bmi: 11
number of rows missing diab_pred: 500
number of rows missing age: 0
number of rows missing skin: 227


Since we don't have a lot of data(if we have a lot of data then we call remove all the rows, which doesn't contain proper data value) so for missing values, we will fill it with average of that column

In [103]:
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

Now we will try different classification algorithm and will calculate accuracy. In last we will select the model with highest accuracy

In [104]:
## Apply Random Forest Algorithm

from sklearn.ensemble import RandomForestClassifier
# Create Random Forest classifer object
random_forest_model = RandomForestClassifier(random_state=10)
# Train Random Forest classifer
random_forest_model.fit(X_train, y_train.ravel())

RandomForestClassifier(random_state=10)

In [105]:
#Predict the response for test dataset
predict_train_data = random_forest_model.predict(X_test)

calculating accuracy of model

In [106]:
from sklearn import metrics

print("Accuracy = {0:.3f}".format(metrics.accuracy_score(y_test, predict_train_data)))

Accuracy = 0.749


So this random forest model has accuracy of around 75%

 Let's try Decision Tree classifier model

In [107]:
from sklearn.tree import DecisionTreeClassifier

In [108]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

calculating accuracy of model

In [109]:
from sklearn import metrics

print("Accuracy = {0:.3f}".format(metrics.accuracy_score(y_test, y_pred)))

Accuracy = 0.710


 So this Decision Tree classifier model has accuracy of around 70%

Let's try Logistic Regression classification model

In [110]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg=logreg.fit(X_train,y_train)

#predict the response for test dataset
y_pred=logreg.predict(X_test)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


calculating accuracy of model

In [111]:
from sklearn import metrics

print("Accuracy = {0:.3f}".format(metrics.accuracy_score(y_test, y_pred)))

Accuracy = 0.740


 So this logistic regression classifier model has accuracy of around 70%

Accuracy of Different Models

Random Forest=75%

Decision Tree=70%

Logistic Regression=74%

# Out of these three models, we will select Random Forest as it gives maximum accuracy