In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
df = pd.read_csv('diabetes-dataset.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               2000 non-null   int64  
 1   Glucose                   2000 non-null   int64  
 2   BloodPressure             2000 non-null   int64  
 3   SkinThickness             2000 non-null   int64  
 4   Insulin                   2000 non-null   int64  
 5   BMI                       2000 non-null   float64
 6   DiabetesPedigreeFunction  2000 non-null   float64
 7   Age                       2000 non-null   int64  
 8   Outcome                   2000 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 140.8 KB


In [5]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
x = df.drop(['Outcome'], axis=1)
y = df.Outcome

In [8]:
x_norm = StandardScaler().fit_transform(x)
x_norm = pd.DataFrame(x_norm)
x_norm.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.515394,0.524553,-0.372481,0.873645,-0.722016,0.172683,-1.063246,1.180424
1,-1.120495,-1.159756,0.67008,0.625186,0.402563,0.737249,-0.735551,-0.856326
2,-1.120495,0.74289,-3.604422,-1.300374,-0.722016,1.473638,0.491759,-0.177409
3,-1.120495,0.43098,-0.059713,1.308449,1.527142,1.240448,-0.327478,-0.771462
4,-0.817945,0.555744,-0.372481,1.246334,3.596367,1.044077,0.201161,-1.026055


In [9]:
summary = sm.Logit(y, sm.add_constant(x_norm)).fit()
print(summary.summary())

Optimization terminated successfully.
         Current function value: 0.478583
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                 2000
Model:                          Logit   Df Residuals:                     1991
Method:                           MLE   Df Model:                            8
Date:                Fri, 16 Apr 2021   Pseudo R-squ.:                  0.2550
Time:                        01:27:56   Log-Likelihood:                -957.17
converged:                       True   LL-Null:                       -1284.7
Covariance Type:            nonrobust   LLR p-value:                3.333e-136
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.8848      0.059    -14.936      0.000      -1.001      -0.769
0              0.4177      0.

In [10]:
ds = x_norm.drop(columns=[3])
ds

Unnamed: 0,0,1,2,4,5,6,7
0,-0.515394,0.524553,-0.372481,-0.722016,0.172683,-1.063246,1.180424
1,-1.120495,-1.159756,0.670080,0.402563,0.737249,-0.735551,-0.856326
2,-1.120495,0.742890,-3.604422,-0.722016,1.473638,0.491759,-0.177409
3,-1.120495,0.430980,-0.059713,1.527142,1.240448,-0.327478,-0.771462
4,-0.817945,0.555744,-0.372481,3.596367,1.044077,0.201161,-1.026055
...,...,...,...,...,...,...,...
1995,-0.515394,-1.440474,-0.268225,-0.227201,-0.305970,-0.312021,-0.007680
1996,1.299907,1.803381,0.148800,0.447546,0.062225,0.766899,0.246914
1997,0.694807,-1.128565,0.461568,-0.722016,-0.121872,-0.274924,0.756101
1998,-1.120495,0.243835,2.129667,0.447546,4.284191,-0.469686,-0.601732


In [13]:
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.5, random_state=0)

lr = LogisticRegression()
LogReg = lr.fit(x_train, y_train)

LogReg.intercept_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([-3.83668195])

In [14]:
pred = lr.predict(x_test)

y_pred_proba = lr.predict_proba(x_test)[:,1]

df_prob = pd.DataFrame(x_test)
df_prob['Probability Value'] = y_pred_proba
df_prob['Probability'] = pred

df_prob

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Probability Value,Probability
405,2,123,48,32,165,42.1,0.520,26,0,0.014363,0
1190,2,106,56,27,165,29.0,0.426,22,0,0.012419,0
1132,6,195,70,0,0,30.9,0.328,31,1,0.978728,1
731,8,120,86,0,0,28.4,0.259,22,1,0.968943,1
1754,4,145,82,18,0,32.5,0.235,70,1,0.992555,1
...,...,...,...,...,...,...,...,...,...,...,...
693,7,129,68,49,125,38.5,0.439,43,1,0.987472,1
914,4,117,62,12,0,29.7,0.380,30,1,0.983926,1
1683,2,99,0,0,0,22.2,0.108,23,0,0.028508,0
1668,3,173,84,33,474,35.7,0.258,22,1,0.977691,1


In [15]:
print(df_prob['Probability'].value_counts())

0    668
1    332
Name: Probability, dtype: int64


In [16]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       668
           1       1.00      1.00      1.00       332

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

