In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame.
diab_df = pd.read_csv(
    filepath_or_buffer='../input/pima-indians-diabetes-database/diabetes.csv'
)

In [None]:
# Preview the first five rows (head() defaults to n=5).
diab_df.head()

Checking for duplicated values in dataset

In [None]:
# Number of rows that exactly repeat an earlier row.
diab_df.duplicated(keep='first').sum()

In [None]:
# (number of rows, number of columns) of the loaded frame.
diab_df.shape

Checking for null values in dataset

In [None]:
# Per-column count of missing values in the freshly loaded data.
diab_df.isna().sum()

So, there are 768 records and 9 columns. There are also no null values and no duplicate rows.

However, we can observe that there are a lot of 0s in the dataset. It is better to replace these zeros with NaN, which makes them easier to count, and then to replace them with suitable values.

In [None]:
# Columns in which this notebook treats a 0 as a missing measurement.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
diab_df[zero_as_missing_cols] = diab_df[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Per-column count of missing values now that zeros have been marked as NaN.
diab_df.isna().sum()

In [None]:
# One histogram per column, drawn on an 11x11 inch figure.
hist_figsize = (11, 11)
diab_df.hist(figsize=hist_figsize)

Replacing NaN values with mean for numerical attributes

In [None]:
# Impute missing values with the mean of their own column.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that form is chained assignment,
# which emits a FutureWarning in pandas 2.x and leaves diab_df unchanged once
# Copy-on-Write is enabled.
imputed_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diab_df[imputed_cols] = diab_df[imputed_cols].fillna(diab_df[imputed_cols].mean())

In [None]:
# Per-column count of missing values after mean imputation.
diab_df.isna().sum()

After replacing NaN Values, the dataset is almost clean now. We can move ahead with our EDA.

In [None]:
#Sample proportion of diabetes:
import matplotlib.pyplot as plt

# Labels and colours are paired by position with the slices produced by
# value_counts(), which orders classes by descending frequency.
labels = ['No Diabetes','Diabetes']
# A list, not a set: a set has no guaranteed iteration order, so the two slice
# colours could swap from one kernel session to the next.
colormap = ['lightgrey','tab:orange']
diab_df['Outcome'].value_counts().plot.pie(startangle=90, colors=colormap, labels=labels)
plt.title("Diabetes Overall Sample Proportion", fontsize=15)
plt.ylabel('')
# Overlay a white disc on the centre to turn the pie into a donut chart.
circle = plt.Circle((0,0),0.7,color="white")
p = plt.gcf()
p.gca().add_artist(circle)
plt.show()

In [None]:
# Number of records in each Outcome class.
diab_df.loc[:, 'Outcome'].value_counts()

From the plot above, we can see that there are fewer diabetic patients than non-diabetic patients in the dataset.

EDA Features against Outcome target

In [None]:
import seaborn as sns

# Features compared against Outcome, listed in row-major order of the 2x4 grid.
boxplot_features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]

fig, axes = plt.subplots(2, 4, figsize=(18, 12))
fig.suptitle('Diabetes Outcome Distribution WRT All Independent Variables', fontsize=16)

# One boxplot per feature, split and coloured by Outcome. A single loop replaces
# eight copy-pasted call pairs so the panels cannot drift out of sync.
for panel, feature in zip(axes.flat, boxplot_features):
    sns.boxplot(ax=panel, x=diab_df['Outcome'], y=diab_df[feature], hue=diab_df['Outcome'], palette=('#23C552','#C52219'))
    panel.set_title(f"Diabetes Outcome vs {feature}", fontsize=12)

From the boxplots above, we can see that those who are diabetic tend to have higher Glucose levels, Age, BMI, Pregnancies and Insulin measures.

In [None]:
# Pairwise scatter plots of every column, coloured by Outcome.
sns.pairplot(
    data=diab_df,
    hue='Outcome',
    palette=('#23C552','#C52219'),
)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = diab_df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn')
plt.title("Feature Correlation Matrix", fontsize=20)
plt.show()

Inference: We can see that a few of the features are moderately correlated - Age and number of Pregnancies, Insulin and Glucose levels, Skin Thickness and BMI - but not so much as to cause concern.

Splitting data into x and y

In [None]:
from sklearn.model_selection import train_test_split
# x holds every column except the target; y is the Outcome column itself.
y = diab_df['Outcome']
x = diab_df.drop(columns=['Outcome'])

Scaling data 

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the feature frame, then apply it to get x_scaled.
sc = StandardScaler()
sc.fit(x)
x_scaled = sc.transform(x)

In [None]:
# Split the raw features first, then fit the scaler on the training rows only.
# Fitting the scaler on the full dataset before splitting leaks test-set
# statistics (mean and variance) into the training features.
# Row assignment is the same as splitting the pre-scaled array, because the
# shuffle depends only on the number of samples and random_state.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)
# NOTE(review): metrics computed downstream may shift slightly from values
# obtained with the leaky full-data scaling; re-check any quoted accuracy.

Training Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression
# Train a default-configured logistic regression, then predict the test set.
# fit() returns the estimator itself, so construction and fitting are chained.
logreg = LogisticRegression().fit(x_train, y_train)
y_pred = logreg.predict(x_test)

Performing 10-fold cross validation (k-fold CV)

In [None]:
from sklearn.model_selection import cross_val_score
# Score the estimator with 10-fold cross-validation on the training split.
scores = cross_val_score(estimator=logreg, X=x_train, y=y_train, cv=10)
scores

Coefficients of trained Logistic Regression model

In [None]:
# Show the fitted coefficient array followed by the intercept.
model_params = (logreg.coef_, logreg.intercept_)
print(*model_params)

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted ones. Passing the arguments the other
# way round transposes the matrix and swaps false positives with false negatives.
confmat = confusion_matrix(y_test, y_pred)
confmat

In [None]:
from sklearn import metrics
# ConfusionMatrixDisplay labels the y-axis "True label" and the x-axis
# "Predicted label", so the matrix must be built as (y_true, y_pred) for the
# plot to be read correctly.
cm = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(y_test, y_pred, labels=logreg.classes_),
    display_labels=logreg.classes_,
)
cm.plot(cmap="magma")

Computing accuracy of logistic regression model

In [None]:
# Fraction of test samples classified correctly. Arguments follow
# scikit-learn's (y_true, y_pred) convention; accuracy is symmetric, so the
# value is the same, but the order now matches the other metric calls' signature.
accuracy_score(y_test, y_pred)

THUS, Model Accuracy with Logistic Regression: 76.19%