In [20]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Importing dataset

In [2]:
df = pd.read_csv('./database/csv_files/pima.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Data cleaning & summery

In [3]:
# No null values

df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
# summery

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# statistical summery

df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [14]:
# checking for any imbalance in datset

df.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [16]:
# it is a little bit biased ,i.e, for every diabetic patient there exist 2 non-diabetic patient
500/268

1.8656716417910448

# Normalizing

In [18]:
scaler = MinMaxScaler()
df2 = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df2.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1.0


# Train Test Split

In [21]:
# We used stratify to maintain the same ratio of imbalance we have in the OG dataset in both train and test dataset
X_train, X_test, y_train, y_test = train_test_split(df.drop('Outcome', axis = 1), df.Outcome, stratify= df.Outcome,random_state= 45)

# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [26]:
scores = cross_val_score(LogisticRegression(max_iter = 1000), X_train,y_train,cv = 10)
scores

array([0.75862069, 0.77586207, 0.87931034, 0.68965517, 0.74137931,
       0.81034483, 0.73684211, 0.70175439, 0.77192982, 0.71929825])

In [37]:
# cross_val score
scores.mean()

0.7584996975196612

In [41]:
# test scores
model = LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8020833333333334

# Logistic Regression with Bagging

In [29]:
from sklearn.ensemble import BaggingClassifier

In [43]:
# base_estimator : ML model used
# n_estimators : number of sub-datasets needed to be created
# max_samples : percentage of samples selected for each sub-datset
# oob_score : to enable oob_scoring

bag_model = BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000),
                              n_estimators= 10,
                              max_samples=0.8,
                              oob_score= True,
                              random_state=45)

In [46]:
# oob score
bag_model.fit(X_train,y_train)
bag_model.oob_score_

  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /


0.7586805555555556

In [47]:
bag_scores = cross_val_score(bag_model,X_train,y_train,cv = 10)
bag_scores

  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /
  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /
  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /
  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /
  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /
  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /
  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /
  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /
  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /
  warn("Some inputs do not have OOB scores. "
  oob_decision_function = (predictions /


array([0.75862069, 0.79310345, 0.84482759, 0.70689655, 0.75862069,
       0.81034483, 0.75438596, 0.70175439, 0.77192982, 0.75438596])

In [48]:
# cross_val score
bag_scores.mean()

0.7654869933454326

In [49]:
# test score
bag_model.score(X_test,y_test)

0.796875