<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:white;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="padding: 10px;color:#3cb371;font-size:200%;text-align:center">Logistic Regression to predict whether a patient is diabetic</p>
</div>  


<p align="center">
  <img width="1000" height="900" src="https://www.niddk.nih.gov/-/media/Images/Health-Information/Diabetes/diabetes-monitor-fruits-vegetables-small_597x347.png">
</p>



In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns 
import pandas_profiling as pp 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,f1_score,precision_score,recall_score,roc_curve,roc_auc_score

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the diabetes CSV into the working DataFrame.
DATA_PATH = "../input/diabetes-dataset/diabetes2.csv"
df = pd.read_csv(DATA_PATH)

**ABOUT COLUMNS**
* Pregnancies: Number of times pregnant
* Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* BloodPressure: Diastolic blood pressure (mm Hg)
* SkinThickness: Triceps skin fold thickness (mm)
* Insulin: 2-Hour serum insulin (mu U/ml)
* BMI: Body mass index (weight in kg/(height in m)^2)
* DiabetesPedigreeFunction: Diabetes pedigree function
* Age: Age (years)
* Outcome: Class variable (0 or 1)

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [None]:
# Show the (rows, columns) dimensions of the DataFrame.
df.shape

In [None]:
# Print the DataFrame summary produced by pandas (dtypes and non-null counts).
df.info()

In [None]:
# Count the missing (null) entries in every column.
df.isna().sum()

In [None]:
# Columns checked for literal zeros: Glucose, BloodPressure, SkinThickness, Insulin, BMI
zero_attributes = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]


def zero_values(df, zero_attributes):
    """Print the percentage of zero entries for each requested column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``zero_attributes``.
    zero_attributes : iterable of str
        Names of the columns to inspect.

    Returns
    -------
    None
        The result is only printed, one line per column.

    Notes
    -----
    The percentage is computed against the number of non-null values in the
    column and rounded to two decimals. The printed label says "percentage"
    because the value is a share of the column, not an absolute count.
    """
    for column in zero_attributes:
        # Comparing against 0 yields a boolean mask; summing it counts the zeros.
        zero_count = (df[column] == 0).sum()
        percentage = round((zero_count / df[column].count()) * 100, 2)
        print(f'The percentage of zero values in column {column} is {percentage}%')
# Report, for each column in zero_attributes, the share of entries equal to 0.
zero_values(df,zero_attributes)

In [None]:
# Replace literal zeros in these measurement columns with NaN so that they are
# counted as missing values by the cells that follow.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
# Percentage of missing values in each column.
missing_counts = df.isna().sum()
100 * missing_counts / len(df)

In [None]:
# Fill the missing values of each listed column with that column's mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection: the in-place form is a chained
# assignment that pandas deprecates and that has no effect under Copy-on-Write.
# SkinThickness and Insulin are not imputed in this cell.
for column in ['Glucose', 'BloodPressure', 'BMI']:
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Remove the SkinThickness and Insulin columns from the frame, in place.
df.drop(columns=['SkinThickness', 'Insulin'], inplace=True)

In [None]:
# Count the missing values per column at this point in the notebook.
df.isna().sum()

In [None]:
# Keep a separate copy of the frame; the modelling cells build X and y from `de`.
de=df.copy()

In [None]:
# Build a pandas-profiling report for the frame; as the last expression it is displayed by the notebook.
pp.ProfileReport(df)

In [None]:
# Box plot of every feature grouped by Outcome, one panel per feature.
# The grid is sized from the number of features, so no blank panels are left
# over and no manual index counter is needed.
cols = df.columns[:-1]  # every column except the last one (assumed to be Outcome)
n_plot_cols = 3
n_plot_rows = max(1, -(-len(cols) // n_plot_cols))  # ceiling division
fig, axs = plt.subplots(n_plot_rows, n_plot_cols, figsize=(20, 12), squeeze=False)
panels = axs.ravel()
for ax, feature in zip(panels, cols):
    sns.boxplot(x="Outcome", y=feature, data=df, ax=ax)
# Hide any panel that did not receive a feature.
for ax in panels[len(cols):]:
    ax.set_visible(False)

In [None]:
# Annotated heatmap of the pairwise column correlations.
correlations = df.corr()
sns.heatmap(correlations, annot=True)
plt.show()

In [None]:
# Split the copied frame into the target column and the predictor columns.
y = de["Outcome"]
X = de.drop(columns="Outcome")

# Column names of df with "Outcome" removed.
datalist = df.columns.tolist()
datalist.remove("Outcome")

## Model building

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.25, random_state=120)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Fit the min-max scaler on the training split only, so that the test split is
# later transformed with the training minima and maxima.
scaler = MinMaxScaler()
# continuous
data_scaled = ['Glucose', 'BloodPressure', 'BMI',
               'DiabetesPedigreeFunction', 'Age']

# Take an explicit copy before assigning: the frame returned by
# train_test_split is derived from X, and writing columns into it otherwise
# raises SettingWithCopyWarning.
X_train = X_train.copy()
X_train[data_scaled] = scaler.fit_transform(X_train[data_scaled])

In [None]:
# Train a logistic-regression classifier with its default settings and record
# its score on the training split.
model = LogisticRegression().fit(X_train, y_train)
trainscore = model.score(X_train, y_train)

In [None]:

# continuous
# Reuse the scaler fitted on the training split (transform only, no re-fit).
# An explicit copy is taken first because the frame returned by
# train_test_split is derived from X, and writing columns into it otherwise
# raises SettingWithCopyWarning.
X_test = X_test.copy()
X_test[data_scaled] = scaler.transform(X_test[data_scaled])

testscore = model.score(X_test, y_test)

In [None]:
# Show both scores as percentages, then predict labels for the test split.
print(f"test score: {testscore*100} \ntrain score: {trainscore*100}", '\n')

y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true test labels against the predicted labels.
conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : \n", conf_mat)

In [None]:
# Print each scalar metric as a percentage, followed by the full report.
scalar_metrics = (
    (' f1 score: ', f1_score),
    (' Accuracy: ', accuracy_score),
    (' precision score: ', precision_score),
    (' recall score: ', recall_score),
)
for label, metric in scalar_metrics:
    print(label, metric(y_test, y_pred)*100, '\n')
print(" Classification report: \n", classification_report(y_test, y_pred))

In [None]:
# Predicted probability of the positive class (second column of predict_proba).
probabilityValues = model.predict_proba(X_test)[:,1]
# ROC AUC must be computed from the scores/probabilities: passing the hard 0/1
# predictions collapses the curve to a single threshold and misreports the AUC.
auc = roc_auc_score(y_test, probabilityValues)
print("AUC Score: ",auc*100)


In [None]:
# ROC curve of the classifier on the test split, with the diagonal as the
# reference line of a classifier that guesses at random.
fpr, tpr, threshold = roc_curve(y_test, probabilityValues)

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], linestyle='--', label='Chance')
ax.plot(fpr, tpr, label='Logistic regression')
ax.set(xlabel='False positive rate', ylabel='True positive rate', title='ROC curve')
ax.legend()
plt.show()

![](https://thebritishschoolofetiquette.com/wp-content/uploads/2018/12/Article-Size-Pictures7.webp)