In [None]:
!pip install -q chart_studio

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly import tools
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
from IPython.display import HTML, Image
#reading dataset
df = pd.read_csv('/content/drive/My Drive/diabetes.csv')
#checking data info and head
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own; passing it to display() would render a stray "None".
df.info()
display(df.head())
#summary statistics (last expression of the cell, shown as rich output)
df.describe()


In [None]:
#univariate variable analysis
#One histogram with a KDE overlay per column, laid out on a 4x2 grid.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat="density")
# is its documented replacement. kde_kws={"cut": 3} keeps the KDE curve
# extending past the data range the way distplot drew it.
dist_columns = ['Age', 'Glucose', 'BloodPressure', 'SkinThickness',
                'Insulin', 'DiabetesPedigreeFunction', 'BMI', 'Outcome']
fig, ax = plt.subplots(4, 2, figsize=(16, 16))
# ax.flat walks the grid row by row, matching the original panel order
# (ax[0,0], ax[0,1], ax[1,0], ...).
for column, axis in zip(dist_columns, ax.flat):
    sns.histplot(df[column], bins=20, kde=True, stat="density",
                 kde_kws={"cut": 3}, ax=axis)
    axis.set_title(column)
fig.tight_layout()




In [None]:
#Data Preparation & Outlier Detection
#count of NaN entries in each column
df.isna().sum()


In [None]:
#there are no null values, so we inspect boxplots for outliers/suspicious values
import missingno as msno

columns = ['Glucose','BloodPressure','SkinThickness',
       'Insulin','BMI', 'DiabetesPedigreeFunction','Age','Outcome']
for col in columns:
    sns.boxplot(x = df[col])
    plt.show()

#replacing missing values: mean for Glucose/BloodPressure, median for the rest
# Each column is filled from its OWN statistic (Age was previously filled with
# the BMI median). A single DataFrame.fillna with a per-column dict replaces
# the chained `df[col].fillna(..., inplace=True)` calls, which pandas
# deprecates because they may operate on a temporary copy.
# NOTE(review): fillna only touches NaN entries. If df.isnull().sum() reports
# no NaNs, every fill below is a no-op; if this dataset encodes missing
# measurements as 0, those would have to be converted to NaN first -- confirm.
fill_values = {
    'Glucose': df['Glucose'].mean(),
    'BloodPressure': df['BloodPressure'].mean(),
    'SkinThickness': df['SkinThickness'].median(),
    'Insulin': df['Insulin'].median(),
    'BMI': df['BMI'].median(),
    'DiabetesPedigreeFunction': df['DiabetesPedigreeFunction'].median(),
    'Age': df['Age'].median(),
}
df = df.fillna(value=fill_values)

#checking for missing values again
p = msno.bar(df)
#the bar chart shows whether any missing (NaN) values remain

In [None]:
#data visualization
# Class balance of the target: bar chart on the left, pie chart on the right.
f, ax = plt.subplots(1, 2, figsize = (15, 7))
f.suptitle("Diabetes?", fontsize = 18.)

# value_counts() orders by frequency, so attaching "No"/"Yes" by position would
# swap the labels whenever class 1 is the majority. Sort by class value and
# rename the index so each label is tied to its class.
# assumes Outcome is coded 0 = no diabetes, 1 = diabetes -- TODO confirm
outcome_counts = (
    df.Outcome.value_counts()
    .sort_index()
    .rename(index = {0: "No", 1: "Yes"})
)
outcome_colors = (sns.color_palette()[0], sns.color_palette()[2])

outcome_counts.plot.bar(ax = ax[0], rot = 0, color = outcome_colors)
outcome_counts.plot.pie(autopct = "%.2f%%", label = "", fontsize = 13., ax = ax[1],
                        colors = outcome_colors,
                        wedgeprops = {"linewidth": 1.5, "edgecolor": "#F7F7F7"})
# Recolour every second text on the pie axes (previously texts[1] and
# texts[3]); with autopct set these are the percentage labels.
for pct_text in ax[1].texts[1::2]:
    pct_text.set_color("#F7F7F7")

In [None]:
#with the target visualised, look at feature relationships before fitting an ML model
#PAIR PLOTS
#pairwise plots of the variables against each other, coloured by Outcome
sns.pairplot(hue='Outcome', data=df)

#correlation between features
#cells with a stronger colour mark variables that are more highly correlated
corr = df.corr()

sns.set(font_scale=1.15)
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    corr,
    vmax=.8,
    linewidths=0.01,
    linecolor="black",
    square=True,
    annot=True,
    cmap='YlGnBu',
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation between features');


In [None]:
#predictive modelling
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
#splitting dataset
# The target is selected by name rather than by position, so the split does
# not silently pick the wrong column if the column order ever changes.
X = df.drop(columns='Outcome')
y = df['Outcome']

# 75/25 train/test split; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

for split_name, split in (("X_train", X_train), ("X_test", X_test),
                          ("y_train", y_train), ("y_test", y_test)):
    print(f"{split_name}: ", len(split))

#logistic regression
# max_iter is raised from scikit-learn's default of 100: the features are fed
# in unscaled, and the default lbfgs solver frequently stops at the iteration
# cap (ConvergenceWarning) on unscaled data.
# NOTE(review): standardising the features would be the more thorough fix.
LR = LogisticRegression(max_iter=1000)

#fitting the model
LR.fit(X_train, y_train)

#prediction
y_pred = LR.predict(X_test)

#Accuracy
# Scored from y_pred so the predictions above are reused instead of being
# recomputed inside LR.score; the value is the same mean accuracy.
print("Logistic Regression's Accuracy is", accuracy_score(y_test, y_pred)*100)


In [None]:
#interpreting ml model
# One row per feature holding its fitted logistic-regression coefficient,
# sorted ascending, with a flag for whether the coefficient is positive.
coeff = list(LR.coef_[0])
labels = list(X_train.columns)
features = (
    pd.DataFrame({'Features': labels, 'importance': coeff})
    .sort_values(by='importance')
    .assign(positive=lambda table: table['importance'] > 0)
    .set_index('Features')
)

# Horizontal bars: blue for positive coefficients, red for the others.
# NOTE(review): these are raw coefficients of a model fitted on X_train as-is;
# their magnitudes depend on each feature's scale -- confirm before reading
# them as a ranking.
bar_colors = features['positive'].map({True: 'blue', False: 'red'})
importance_ax = features['importance'].plot(kind='barh', figsize=(11, 6), color=bar_colors)
importance_ax.set_xlabel('Importance')
