In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import missingno as msno

import warnings 
# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices raised by the libraries used below.
warnings.simplefilter('ignore')

# Default plot theme; later cells call plt.style.use again to switch themes.
plt.style.use("dark_background")
# Line magic that selects the inline matplotlib backend for this notebook.
%matplotlib inline

**Preprocessing**

---



---



In [None]:
# Read the diabetes CSV from the Kaggle input directory into `df`.
df = pd.read_csv(
    filepath_or_buffer='../input/pima-indians-diabetes-database/diabetes.csv',
)

In [None]:
# Preview the first ten rows of the frame.
df.head(n=10)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Transposed summary statistics, one row per column, with a data bar on the
# mean and colour gradients on the standard deviation and the median.
summary_stats = df.describe().T
(
    summary_stats.style
    .bar(subset=['mean'], color='#205fA2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='cividis')
)

In [None]:
# Count how many entries in each column are exactly zero.
feat = df.columns
col = df[feat].eq(0).sum()
print(col)

In [None]:
# Zeros in these measurement columns are treated as missing values, so they
# are converted to NaN before imputation.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Number of missing values per column.
df.isna().sum()

**Data Visualization**

---



---



In [None]:
# Missingness matrix for the five measurement columns.
measurement_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
msno.matrix(df[measurement_columns], figsize=(12, 8))
plt.grid()

In [None]:
# Median-impute Glucose, BloodPressure and BMI.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df[col] selection: that chained in-place call
# acts on an intermediate object and does not update df under Copy-on-Write.
for column in ('Glucose', 'BloodPressure', 'BMI'):
    df[column] = df[column].fillna(df[column].median())

In [None]:
def fillna_insulin(series):
    """Return `series` with its missing entries replaced by its own median."""
    group_median = series.median()
    return series.fillna(group_median)


# Despite its name, this grouping is keyed on Glucose only.
Glucose_Age_Insulin = df.groupby(['Glucose'])

# First pass: fill Insulin with the median of rows sharing the same Glucose
# value. Second pass: anything still missing falls back to the overall mean
# of the partially filled column.
insulin_by_glucose = Glucose_Age_Insulin['Insulin'].transform(fillna_insulin)
df['Insulin'] = insulin_by_glucose.fillna(insulin_by_glucose.mean())

In [None]:
# Median-impute SkinThickness. The result is assigned back to the column
# because fillna(inplace=True) on the df[col] selection is a chained in-place
# call that does not update df under Copy-on-Write.
df['SkinThickness'] = df['SkinThickness'].fillna(df['SkinThickness'].median())

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# True if at least one missing value remains anywhere in the frame.
df.isna().to_numpy().any()

In [None]:
from matplotlib.pyplot import figure, show
plt.style.use("ggplot")
figure(figsize=(7,4))
# Pin the bar order to 0, 1 so the tick labels below always match the bars.
ax = sb.countplot(x='Outcome', data=df, order=[0, 1], palette="husl", edgecolor='black', lw=3)
ax.set_xticks([0, 1])
ax.set_xticklabels(["Healthy","Diabetic"])
# Look the counts up by class label. value_counts() sorts by frequency, so
# unpacking its values positionally is only right while class 0 is the majority.
outcome_counts = df['Outcome'].value_counts()
healthy = outcome_counts.get(0, 0)
diabetics = outcome_counts.get(1, 0)
print("Number of healthy people = ",healthy)
print("Number of diabetic people = ",diabetics)

In [None]:
plt.style.use("dark_background")
labels = ['Healthy','Diabetic']
# Order the counts by class (0, then 1) so each slice receives the matching
# label; value_counts() on its own orders the slices by frequency.
class_counts = df['Outcome'].value_counts().reindex([0, 1], fill_value=0)
class_counts.plot(kind='pie',labels=labels,subplots=True,autopct='%10.0f%%',labeldistance=2,figsize=(3,3))

In [None]:
plt.style.use("dark_background")
# Grid of pairwise relationships between all columns, coloured by Outcome.
sb.pairplot(data=df, hue="Outcome", palette="dark")

In [None]:
plt.style.use("dark_background")
plt.figure(dpi=90,figsize=(4,4))
# Compute the correlation matrix once and reuse it for the mask and the plot.
corr_matrix = df.corr()
# Mask the upper triangle and the diagonal; they mirror the lower triangle.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sb.heatmap(corr_matrix, mask=mask, fmt=".1f", annot=True, lw=1, cmap='BuGn')
plt.yticks(rotation = 0)
plt.xticks(rotation = 90)
plt.title('Correlation Heatmap')
plt.show()

**Splitting the dataset**


---



---

In [None]:
# Every column except the last becomes the feature matrix; the last column
# becomes the target vector.
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]
x = feature_frame.to_numpy()
y = target_series.to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed makes the partition repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Report the shape of each array produced by the split.
for caption, split_array in (
    ("xtrain data : ", xtrain),
    ("ytrain data : ", ytrain),
    ("xtest data : ", xtest),
    ("ytest data : ", ytest),
):
    print(caption, split_array.shape)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# The target is a binary class label, so the linear baseline is a classifier.
# With LinearRegression, score() returned R^2 on the 0/1 labels, which cannot
# be compared with the accuracy reported for the other models; for
# LogisticRegression, score() is mean accuracy.
# max_iter is raised above the default because this cell comes before the
# scaling cell, and the solver may need more iterations on unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(xtrain,ytrain)
model.score(xtest,ytest)

**Distribution**

---



---



Distribution of pregnancies



In [None]:
plt.style.use("dark_background")
plt.figure()
ax = plt.gca()
# sb.distplot is deprecated in seaborn; each outcome group is drawn with its
# replacements instead: histplot (density histogram + KDE) and rugplot.
# Labels are attached to the artists so the legend cannot get out of order.
for outcome_value, group_label, group_color in ((1, 'Diabetes', 'C0'), (0, 'No Diabetes', 'C1')):
    group_values = df.loc[df.Outcome == outcome_value, 'Pregnancies']
    sb.histplot(x=group_values, stat='density', kde=True, kde_kws={'cut': 3},
                color=group_color, label=group_label, ax=ax)
    sb.rugplot(x=group_values, color=group_color, ax=ax)
ax.legend()

Distribution of glucose

In [None]:
plt.style.use("dark_background")
plt.figure()
ax = plt.gca()
# sb.distplot is deprecated in seaborn; each outcome group is drawn with its
# replacements instead: histplot (density histogram + KDE) and rugplot.
# Labels are attached to the artists so the legend cannot get out of order.
for outcome_value, group_label, group_color in ((1, 'Diabetes', 'C0'), (0, 'No Diabetes', 'C1')):
    group_values = df.loc[df.Outcome == outcome_value, 'Glucose']
    sb.histplot(x=group_values, stat='density', kde=True, kde_kws={'cut': 3},
                color=group_color, label=group_label, ax=ax)
    sb.rugplot(x=group_values, color=group_color, ax=ax)
ax.legend()

Distribution of BloodPressure

In [None]:
plt.style.use("dark_background")
plt.figure()
ax = plt.gca()
# sb.distplot is deprecated in seaborn; each outcome group is drawn with its
# replacements instead: histplot (density histogram + KDE) and rugplot.
# Labels are attached to the artists so the legend cannot get out of order.
for outcome_value, group_label, group_color in ((1, 'Diabetes', 'C0'), (0, 'No Diabetes', 'C1')):
    group_values = df.loc[df.Outcome == outcome_value, 'BloodPressure']
    sb.histplot(x=group_values, stat='density', kde=True, kde_kws={'cut': 3},
                color=group_color, label=group_label, ax=ax)
    sb.rugplot(x=group_values, color=group_color, ax=ax)
ax.legend()

Distribution of Insulin

In [None]:
plt.style.use("dark_background")
plt.figure()
ax = plt.gca()
# sb.distplot is deprecated in seaborn; each outcome group is drawn with its
# replacements instead: histplot (density histogram + KDE) and rugplot.
# Labels are attached to the artists so the legend cannot get out of order.
for outcome_value, group_label, group_color in ((1, 'Diabetes', 'C0'), (0, 'No Diabetes', 'C1')):
    group_values = df.loc[df.Outcome == outcome_value, 'Insulin']
    sb.histplot(x=group_values, stat='density', kde=True, kde_kws={'cut': 3},
                color=group_color, label=group_label, ax=ax)
    sb.rugplot(x=group_values, color=group_color, ax=ax)
ax.legend()

Distribution of Age

In [None]:
plt.style.use("dark_background")
plt.figure()
ax = plt.gca()
# sb.distplot is deprecated in seaborn; each outcome group is drawn with its
# replacements instead: histplot (density histogram + KDE) and rugplot.
# Labels are attached to the artists so the legend cannot get out of order.
for outcome_value, group_label, group_color in ((1, 'Diabetes', 'C0'), (0, 'No Diabetes', 'C1')):
    group_values = df.loc[df.Outcome == outcome_value, 'Age']
    sb.histplot(x=group_values, stat='density', kde=True, kde_kws={'cut': 3},
                color=group_color, label=group_label, ax=ax)
    sb.rugplot(x=group_values, color=group_color, ax=ax)
ax.legend()

Distribution of BMI

In [None]:
plt.style.use("dark_background")
plt.figure()
ax = plt.gca()
# sb.distplot is deprecated in seaborn; each outcome group is drawn with its
# replacements instead: histplot (density histogram + KDE) and rugplot.
# Labels are attached to the artists so the legend cannot get out of order.
for outcome_value, group_label, group_color in ((1, 'Diabetes', 'C0'), (0, 'No Diabetes', 'C1')):
    group_values = df.loc[df.Outcome == outcome_value, 'BMI']
    sb.histplot(x=group_values, stat='density', kde=True, kde_kws={'cut': 3},
                color=group_color, label=group_label, ax=ax)
    sb.rugplot(x=group_values, color=group_color, ax=ax)
ax.legend()

Distribution of SkinThickness

In [None]:
plt.style.use("dark_background")
plt.figure()
ax = plt.gca()
# sb.distplot is deprecated in seaborn; each outcome group is drawn with its
# replacements instead: histplot (density histogram + KDE) and rugplot.
# Labels are attached to the artists so the legend cannot get out of order.
for outcome_value, group_label, group_color in ((1, 'Diabetes', 'C0'), (0, 'No Diabetes', 'C1')):
    group_values = df.loc[df.Outcome == outcome_value, 'SkinThickness']
    sb.histplot(x=group_values, stat='density', kde=True, kde_kws={'cut': 3},
                color=group_color, label=group_label, ax=ax)
    sb.rugplot(x=group_values, color=group_color, ax=ax)
ax.legend()

Distribution of DiabetesPedigreeFunction

In [None]:
plt.style.use("dark_background")
plt.figure()
ax = plt.gca()
# sb.distplot is deprecated in seaborn; each outcome group is drawn with its
# replacements instead: histplot (density histogram + KDE) and rugplot.
# Labels are attached to the artists so the legend cannot get out of order.
for outcome_value, group_label, group_color in ((1, 'Diabetes', 'C0'), (0, 'No Diabetes', 'C1')):
    group_values = df.loc[df.Outcome == outcome_value, 'DiabetesPedigreeFunction']
    sb.histplot(x=group_values, stat='density', kde=True, kde_kws={'cut': 3},
                color=group_color, label=group_label, ax=ax)
    sb.rugplot(x=group_values, color=group_color, ax=ax)
ax.legend()

**Standard Scaling**

---



---



In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
sc = StandardScaler().fit(xtrain)
xtrain = sc.transform(xtrain)
xtest = sc.transform(xtest)

In [None]:
from sklearn.metrics import confusion_matrix,auc,accuracy_score,classification_report,roc_curve

**SVM Model**

---



---



In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the training split.
# fit() returns the estimator itself, so it can be bound directly.
svmm = SVC(kernel='rbf').fit(xtrain, ytrain)
svmm

In [None]:
# Hard class predictions on the held-out split, scored by accuracy.
ypred = svmm.predict(xtest)
accuracy_score(y_true=ytest, y_pred=ypred)

In [None]:
# Text report of per-class metrics for the predictions held in ypred.
svm_report = classification_report(ytest, ypred)
print(svm_report)

In [None]:
# roc_curve sweeps a threshold over a continuous score. Feeding it the hard
# 0/1 predictions collapses the curve to a single operating point, so the
# SVM's signed distance to the decision boundary is used instead.
svm_scores = svmm.decision_function(xtest)
fpr, tpr, _ = roc_curve(ytest, svm_scores)
roc_auc = auc(fpr, tpr)
plt.style.use("dark_background")
plt.figure()
# '%.2f' prints the AUC with two decimals; '%2.f' rounded it to a whole number.
plt.plot(fpr, tpr, label='ROC Curve(area=%.2f)' % roc_auc)
# Chance diagonal in gray: a black ('k--') line is invisible on this theme.
plt.plot([0,1],[0,1], linestyle='--', color='gray')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC Curve")
plt.legend(loc='lower right')

**Random Forest Model**

---



---



In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so its random sampling, and therefore the metrics reported
# below, are identical on every run (42 matches the seed used for the split).
classifier = RandomForestClassifier(random_state=42)
classifier.fit(xtrain,ytrain)

In [None]:
# Hard class predictions on the held-out split, scored by accuracy.
Ypred = classifier.predict(xtest)
accuracy_score(y_true=ytest, y_pred=Ypred)

In [None]:
# Text report of per-class metrics for the predictions held in Ypred.
forest_report = classification_report(ytest, Ypred)
print(forest_report)

In [None]:
# roc_curve sweeps a threshold over a continuous score. Feeding it the hard
# 0/1 predictions collapses the curve to a single operating point, so the
# predicted probability of the positive class (column 1) is used instead.
forest_scores = classifier.predict_proba(xtest)[:, 1]
fpr, tpr, _ = roc_curve(ytest, forest_scores)
roc_auc = auc(fpr, tpr)
plt.style.use("dark_background")
plt.figure()
# '%.2f' prints the AUC with two decimals; '%2.f' rounded it to a whole number.
plt.plot(fpr, tpr, label='ROC Curve(area=%.2f)' % roc_auc)
# Chance diagonal in gray: a black ('k--') line is invisible on this theme.
plt.plot([0,1],[0,1], linestyle='--', color='gray')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC Curve")
plt.legend(loc='lower right')

**KNN Model**

---



---



In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier; print its score on the held-out split.
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(xtrain, ytrain)
knn_score = clf.score(xtest, ytest)
print(knn_score)

In [None]:
# Hard class predictions on the held-out split and their per-class report.
Yypred = clf.predict(xtest)
knn_report = classification_report(ytest, Yypred)
print(knn_report)

In [None]:
# roc_curve sweeps a threshold over a continuous score. Feeding it the hard
# 0/1 predictions collapses the curve to a single operating point, so the
# predicted probability of the positive class (column 1) is used instead.
knn_scores = clf.predict_proba(xtest)[:, 1]
fpr, tpr, _ = roc_curve(ytest, knn_scores)
roc_auc = auc(fpr, tpr)
plt.style.use("dark_background")
plt.figure()
# '%.2f' prints the AUC with two decimals; '%2.f' rounded it to a whole number.
plt.plot(fpr, tpr, label='ROC Curve(area=%.2f)' % roc_auc)
# Chance diagonal in gray: a black ('k--') line is invisible on this theme.
plt.plot([0,1],[0,1], linestyle='--', color='gray')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.05])
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC Curve")
plt.legend(loc='lower right')