In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
import seaborn as sns
from sklearn.metrics import accuracy_score,mean_squared_error,classification_report,confusion_matrix,precision_score,recall_score,roc_curve,auc
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import data
data=pd.read_csv('health care diabetes.csv')
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
data['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [4]:
x=data.drop(['Outcome'],axis=1)
x.head()
y=data.Outcome

In [5]:
data.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [6]:
import statsmodels.api as sm
reg=sm.Logit(y,x)
result=reg.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.608498
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  768
Model:                          Logit   Df Residuals:                      760
Method:                           MLE   Df Model:                            7
Date:                Mon, 14 Nov 2022   Pseudo R-squ.:                 0.05922
Time:                        23:33:28   Log-Likelihood:                -467.33
converged:                       True   LL-Null:                       -496.74
Covariance Type:            nonrobust   LLR p-value:                 2.583e-10
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Pregnancies                  0.1284      0.029      4.484      0.000       0.072

In [7]:
trainx,testx,trainy,testy=train_test_split(x,y,test_size=0.20,random_state=44)

In [8]:
#print("Before OverSampling, counts of label '1': {}".format(sum(trainy == 1)))
#print("Before OverSampling, counts of label '0': {} \n".format(sum(trainy == 0)))
  
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state =63)
trainx_res,trainy_res = sm.fit_resample(trainx,trainy)  
#print('After OverSampling, the shape of train_X: {}'.format(trainx_res.shape))
#print('After OverSampling, the shape of train_y: {} \n'.format(trainy_res.shape))
  

#print("After OverSampling, counts of label '1': {}".format(sum(trainy_res == 1)))
#print("After OverSampling, counts of label '0': {}".format(sum(trainy_res == 0)))

## LogisticRegression

In [9]:
logreg=LogisticRegression()

In [10]:
logreg.fit(trainx_res,trainy_res)

In [11]:
logreg_test_pred=logreg.predict(testx)
logreg_train_pred=logreg.predict(trainx)

In [12]:
print(accuracy_score(testy,logreg_test_pred))
print(accuracy_score(trainy,logreg_train_pred))

0.7987012987012987
0.755700325732899


In [13]:
confusion_matrix(testy,logreg_test_pred)

array([[78, 12],
       [19, 45]], dtype=int64)

In [14]:
print(classification_report(testy,logreg_test_pred))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83        90
           1       0.79      0.70      0.74        64

    accuracy                           0.80       154
   macro avg       0.80      0.78      0.79       154
weighted avg       0.80      0.80      0.80       154



## RandomForestClassifier

In [15]:
rf=RandomForestClassifier(random_state=70)

In [16]:
rf.fit(trainx_res,trainy_res)

In [17]:
rf_test_pred=rf.predict(testx)
rf_train_pred=rf.predict(trainx)

In [18]:
print(accuracy_score(testy,rf_test_pred))
print(accuracy_score(trainy,rf_train_pred))

0.8181818181818182
1.0


In [38]:
import pickle
filename='rf_healthcare.pkl'
pickle.dump(rf,open(filename,'wb'))

In [41]:
model=pickle.load(open('rf_healthcare.pkl','rb'))

In [48]:
ans=model.predict([[6,148,72,35,0,33.6,0.627,50]])
ans
if ans == 0:
    print('non-diabetics')
else:
    print('diabetics')

diabetics
