In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection  import train_test_split


In [32]:
df = pd.read_csv("/content/diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [33]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [34]:
from sklearn.preprocessing import MinMaxScaler
Scaler = MinMaxScaler()
data = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','Age']
df[data] = Scaler.fit_transform(df[data])

In [35]:
x = df.drop(columns = "Outcome")
y = df["Outcome"]

In [36]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)# here we have to split it into tranning and testing model

In [37]:
import statsmodels.api as sm
logm1 = sm.GLM(y_train,(sm.add_constant(x_train)), family = sm.families.Binomial())
logm1.fit().summary()

0,1,2,3
Dep. Variable:,Outcome,No. Observations:,576.0
Model:,GLM,Df Residuals:,567.0
Model Family:,Binomial,Df Model:,8.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-277.93
Date:,"Mon, 20 Mar 2023",Deviance:,555.86
Time:,03:28:35,Pearson chi2:,616.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.2876
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-7.9592,0.798,-9.979,0.000,-9.522,-6.396
Pregnancies,1.4779,0.604,2.448,0.014,0.295,2.661
Glucose,6.6072,0.847,7.802,0.000,4.947,8.267
BloodPressure,-1.3659,0.753,-1.815,0.070,-2.841,0.109
SkinThickness,0.5827,0.800,0.728,0.466,-0.985,2.151
Insulin,-0.8714,0.862,-1.011,0.312,-2.562,0.819
BMI,5.9036,1.157,5.103,0.000,3.636,8.171
DiabetesPedigreeFunction,0.8935,0.342,2.613,0.009,0.223,1.564
Age,1.3216,0.645,2.049,0.040,0.058,2.586


In [38]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
from sklearn.feature_selection import RFE
rfe = RFE(logreg,n_features_to_select= 5)           
rfe = rfe.fit(x_train, y_train)

In [39]:
rfe.support_

array([ True,  True, False, False, False,  True,  True,  True])

In [40]:
list(zip(x_train.columns, rfe.support_, rfe.ranking_))

[('Pregnancies', True, 1),
 ('Glucose', True, 1),
 ('BloodPressure', False, 3),
 ('SkinThickness', False, 2),
 ('Insulin', False, 4),
 ('BMI', True, 1),
 ('DiabetesPedigreeFunction', True, 1),
 ('Age', True, 1)]

In [41]:
col = x_train.columns[rfe.support_]

In [42]:
x_train.columns[~rfe.support_]

Index(['BloodPressure', 'SkinThickness', 'Insulin'], dtype='object')

In [43]:
X_train_sm = sm.add_constant(x_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

0,1,2,3
Dep. Variable:,Outcome,No. Observations:,576.0
Model:,GLM,Df Residuals:,570.0
Model Family:,Binomial,Df Model:,5.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-280.02
Date:,"Mon, 20 Mar 2023",Deviance:,560.03
Time:,03:28:35,Pearson chi2:,618.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.2824
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-8.3357,0.752,-11.089,0.000,-9.809,-6.862
Pregnancies,1.4452,0.595,2.427,0.015,0.278,2.612
Glucose,6.2507,0.784,7.976,0.000,4.715,7.787
BMI,5.7073,1.060,5.386,0.000,3.631,7.784
DiabetesPedigreeFunction,0.8712,0.337,2.584,0.010,0.210,1.532
Age,1.1429,0.620,1.844,0.065,-0.072,2.358


In [44]:
y_train_pred = res.predict(X_train_sm)

In [45]:
y_train_pred_final = pd.DataFrame({'Churn':y_train.values, 'Churn_Prob':y_train_pred})
y_train_pred_final.head()

Unnamed: 0,Churn,Churn_Prob
762,0,0.075231
127,0,0.190565
564,0,0.110702
375,1,0.829338
663,1,0.754822


In [46]:
y_train_pred_final['predicted'] = y_train_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

Unnamed: 0,Churn,Churn_Prob,predicted
762,0,0.075231,0
127,0,0.190565,0
564,0,0.110702,0
375,1,0.829338,1
663,1,0.754822,1


In [47]:
from sklearn import metrics
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.predicted )
print(confusion)

[[326  44]
 [ 87 119]]


In [48]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFsvif = pd.DataFrame()
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif['Features'] = x_train[col].columns
vif['VIF'] = [variance_inflation_factor(x_train[col].values, i) for i in range(x_train[col].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

Unnamed: 0,Features,VIF
1,Glucose,13.04
2,BMI,11.86
4,Age,3.19
0,Pregnancies,3.14
3,DiabetesPedigreeFunction,3.05


In [49]:
col = col.drop('Glucose', 1)
col

Index(['Pregnancies', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype='object')

In [50]:
X_train_sm = sm.add_constant(x_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm3.fit()
res.summary()

0,1,2,3
Dep. Variable:,Outcome,No. Observations:,576.0
Model:,GLM,Df Residuals:,571.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-319.63
Date:,"Mon, 20 Mar 2023",Deviance:,639.25
Time:,03:28:35,Pearson chi2:,558.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.1766
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-5.1954,0.563,-9.234,0.000,-6.298,-4.093
Pregnancies,1.1597,0.546,2.125,0.034,0.090,2.229
BMI,6.9144,0.996,6.939,0.000,4.961,8.867
DiabetesPedigreeFunction,0.8809,0.299,2.950,0.003,0.296,1.466
Age,2.2584,0.557,4.053,0.000,1.166,3.350


In [51]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [52]:
y_train_pred_final['Outcome_Prob'] = y_train_pred

In [53]:
# Creating new column 'predicted' with 1 if Churn_Prob > 0.5 else 0
y_train_pred_final['predicted'] = y_train_pred_final.Outcome_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

Unnamed: 0,Churn,Churn_Prob,predicted,Outcome_Prob
762,0,0.075231,0,0.156284
127,0,0.190565,0,0.199324
564,0,0.110702,0,0.249477
375,1,0.829338,1,0.820602
663,1,0.754822,1,0.645731


In [54]:
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted))

0.703125


In [55]:
from sklearn.metrics import classification_report
print(classification_report(y_train,y_train_pred_final['predicted'] ))


              precision    recall  f1-score   support

           0       0.73      0.86      0.79       370
           1       0.63      0.42      0.50       206

    accuracy                           0.70       576
   macro avg       0.68      0.64      0.65       576
weighted avg       0.69      0.70      0.69       576

