Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Processing

In [None]:
# loading the csv data to a Pandas DataFrame
data = pd.read_csv('diabetes_prediction_dataset.csv' )

In [3]:
# print first 5 rows of the dataset
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
# print last 5 rows of the dataset
data.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0
99999,Female,57.0,0,0,current,22.43,6.6,90,0


In [5]:
# number of rows and columns in the dataset
data.shape

(100000, 9)

In [6]:
# getting some info about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [7]:
data.duplicated().sum()

3854

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
# checking for missing values
data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [10]:
# statistical measures about the data
data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,96146.0,96146.0,96146.0,96146.0,96146.0,96146.0,96146.0
mean,41.794326,0.077601,0.040803,27.321461,5.532609,138.218231,0.08822
std,22.462948,0.267544,0.197833,6.767716,1.073232,40.909771,0.283616
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.4,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,59.0,0.0,0.0,29.86,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [11]:
data

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99994,Female,36.0,0,0,No Info,24.60,4.8,145,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [12]:
# checking the distribution of Target Variable
data['diabetes'].value_counts()

diabetes
0    87664
1     8482
Name: count, dtype: int64

In [13]:
from imblearn.over_sampling import RandomOverSampler
X = data.drop(columns='diabetes', axis=1)
Y = data['diabetes']
ros = RandomOverSampler(random_state=42)
X, Y = ros.fit_resample(X, Y)

In [14]:
X

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,Female,80.0,0,1,never,25.19,6.6,140
1,Female,54.0,0,0,No Info,27.32,6.6,80
2,Male,28.0,0,0,never,27.32,5.7,158
3,Female,36.0,0,0,current,23.45,5.0,155
4,Male,76.0,1,1,current,20.14,4.8,155
...,...,...,...,...,...,...,...,...
175323,Female,66.0,0,1,former,32.67,7.0,280
175324,Male,71.0,0,0,ever,28.69,9.0,145
175325,Male,63.0,0,0,not current,21.46,9.0,126
175326,Female,57.0,0,0,never,30.93,6.0,300


In [15]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
X["gender"]=le.fit_transform(X["gender"])
X["smoking_history"]=le.fit_transform(X["smoking_history"])
X

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,0,80.0,0,1,4,25.19,6.6,140
1,0,54.0,0,0,0,27.32,6.6,80
2,1,28.0,0,0,4,27.32,5.7,158
3,0,36.0,0,0,1,23.45,5.0,155
4,1,76.0,1,1,1,20.14,4.8,155
...,...,...,...,...,...,...,...,...
175323,0,66.0,0,1,3,32.67,7.0,280
175324,1,71.0,0,0,2,28.69,9.0,145
175325,1,63.0,0,0,5,21.46,9.0,126
175326,0,57.0,0,0,4,30.93,6.0,300


In [16]:
print(Y)

0         0
1         0
2         0
3         0
4         0
         ..
175323    1
175324    1
175325    1
175326    1
175327    1
Name: diabetes, Length: 175328, dtype: int64


Splitting the Data into Training data & Test Data

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

In [27]:
print(X.shape, X_train.shape, X_test.shape)

(175328, 8) (140262, 8) (35066, 8)


Model Training

Logistic Regression

In [28]:
model = LogisticRegression()

In [29]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [30]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [31]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.8514850779255964


In [40]:
# accuracy on test data
X_test_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_pred, Y_test)

In [41]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.8494267951862202


In [45]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(Y_test, X_test_pred))

[[15285  2248]
 [ 3032 14501]]


In [38]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, Y_train)
X_test_prediction = rf_model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test,X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9916728454913591


In [39]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(Y_test, X_test_prediction))

[[17245   288]
 [    4 17529]]


Building a Predictive System

Saving the trained model

In [47]:
import pickle

In [48]:

pickle.dump(rf_model, open("rf_model_Diabetes", 'wb'))