In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# Import useful libraries for data understanding, pre-processing, data exploration,
# feature engineering and selection.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler

In [None]:
# Load the diabetes dataset from the input directory and preview its first ten rows.
diabetes_ds = pd.read_csv(filepath_or_buffer='../input/diabetes-pred/diabetes_pred.csv')
diabetes_ds.head(n=10)

In [None]:
# Check the dataset dimensions: (number of rows, number of columns).
diabetes_ds.shape

In [None]:
# Count the missing entries in each column of the dataset.
# Per the original author's note, this check reports no missing values for this dataset.
# NOTE: only NaN/None/NaT entries are counted here; a placeholder value standing in for
# "not measured" would not be detected by this check.
diabetes_ds.isna().sum()

In [None]:
# Check the datatype of every feature.
# Per the original author's note, the dataset contains only discrete and continuous features.
diabetes_ds.info()

In [None]:
# Statistical analysis: summary statistics for each column.
diabetes_ds.describe()

In [None]:
# Outlier analysis: one box plot per feature, grouped by the Outcome class.
# x and y are passed by keyword because recent seaborn releases accept only `data`
# positionally (positional x/y raises a TypeError there). Outcome itself is skipped
# because plotting it against itself carries no information.
for feature in diabetes_ds.columns.drop('Outcome'):
    sns.boxplot(data=diabetes_ds, x='Outcome', y=feature, color='r')
    plt.show()

In [None]:
# Grid of pairwise plots across all columns of the dataset.
sns.pairplot(data=diabetes_ds, height=2.5)

In [None]:
# Distribution of every column: a histogram with a kernel density estimate overlaid.
# sns.distplot is deprecated in seaborn, so histplot is used instead; stat='density'
# keeps the density-normalised y-axis that distplot produced.
for column in diabetes_ds.columns:
    sns.histplot(diabetes_ds[column], kde=True, stat='density', color='y')
    plt.show()

In [None]:
# Bar plot of each feature per Outcome class (bar height is seaborn's default estimator).
# x and y are passed by keyword because recent seaborn releases accept only `data`
# positionally. Outcome is skipped because plotting it against itself is uninformative.
for feature in diabetes_ds.columns.drop('Outcome'):
    sns.barplot(data=diabetes_ds, x='Outcome', y=feature)
    plt.show()

In [None]:
# Boxen (letter-value) plot of each feature, grouped by the Outcome class.
# x and y are passed by keyword because recent seaborn releases accept only `data`
# positionally. Outcome is skipped because plotting it against itself is uninformative.
for feature in diabetes_ds.columns.drop('Outcome'):
    sns.boxenplot(data=diabetes_ds, x='Outcome', y=feature)
    plt.show()

In [None]:
# Violin plot of each feature, grouped by the Outcome class.
# x and y are passed by keyword because recent seaborn releases accept only `data`
# positionally. Outcome is skipped because plotting it against itself is uninformative.
for feature in diabetes_ds.columns.drop('Outcome'):
    sns.violinplot(data=diabetes_ds, x='Outcome', y=feature)
    plt.show()

In [None]:
# Separate the independent features (every column except the last) from the
# dependent target (the last column), then preview the first rows of each.
Diab_X, Diab_y = diabetes_ds.iloc[:, :-1], diabetes_ds.iloc[:, -1]
for part in (Diab_X, Diab_y):
    print(part.head())

In [None]:
# View the pairwise correlation between the independent features as an annotated heatmap.
feature_corr = Diab_X.corr()
sns.heatmap(feature_corr, annot=True, cmap="RdYlGn")

In [None]:
# Standardise the feature values with StandardScaler.
# The scaler is fitted on every row of Diab_X and then applied to those same rows.
SS = StandardScaler().fit(Diab_X)
Scaled_Diab_X = SS.transform(Diab_X)

In [None]:
# Split the dataset into train and test sets.
# The raw features are split first and then standardised with statistics learned from
# the training rows only, so no information about the held-out test rows leaks into
# training. With the same test_size and random_state the same rows land in each split
# as when splitting pre-scaled data; only the scaling statistics differ.
X_Train_raw, X_Test_raw, y_Train, y_Test = train_test_split(
    Diab_X, Diab_y, test_size=0.2, random_state=0
)
Split_SS = StandardScaler().fit(X_Train_raw)  # fit on the training rows only
X_Train = Split_SS.transform(X_Train_raw)
X_Test = Split_SS.transform(X_Test_raw)

In [None]:
# Check the importance of each feature with an extra-trees ensemble.
# random_state is fixed so that the reported importances are reproducible across
# runs; without it every execution yields different values.
from sklearn.ensemble import ExtraTreesClassifier
ETC = ExtraTreesClassifier(random_state=0)
ETC.fit(Scaled_Diab_X, Diab_y)
ETC.feature_importances_

In [None]:
# Horizontal bar chart of the largest feature importances (at most nine), labelled
# with the feature column names.
feat_importances = pd.Series(ETC.feature_importances_, index=Diab_X.columns)
fig, ax = plt.subplots(figsize=(12, 8))
feat_importances.nlargest(9).plot(kind='barh', ax=ax)
plt.show()

In [None]:
# Import all the classification algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Initialisation of the models, each with its default hyper-parameters.
KNN_Model = KNeighborsClassifier()
LR_Model = LogisticRegression()
SVC_Model = SVC()
# The decision tree breaks ties between equally good splits at random, so its seed is
# fixed to keep the results reproducible across runs.
DTC_Model = DecisionTreeClassifier(random_state=0)
NBC = GaussianNB()

In [None]:
# Use shuffled 10-fold cross-validation on the training split to check which model
# gives the best accuracy score.

from sklearn.model_selection import cross_val_score,KFold
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

cv = KFold(n_splits=10, random_state=1, shuffle=True)
model = [KNN_Model, LR_Model, SVC_Model, DTC_Model, NBC]
for estimator in model:
    scores = cross_val_score(estimator, X_Train, y_Train, scoring='accuracy', cv=cv, n_jobs=-1)
    # Prefix each line with the estimator's class name so every score can be attributed
    # to its model; the values are the mean and standard deviation over the folds.
    print('%s accuracy: %.3f (%.3f)' % (type(estimator).__name__, np.mean(scores), np.std(scores)))

In [None]:
# Fit the KNN model on the training split, predict the test split, and compute the
# accuracy score, confusion matrix and classification report for those predictions.
KNN_y_predict = KNN_Model.fit(X_Train, y_Train).predict(X_Test)
knn_AS, KNN_CM, KNN_CR = (
    metric(y_Test, KNN_y_predict)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

In [None]:
# Show the KNN accuracy, confusion matrix and classification report, one after the
# other, separated by a line of asterisks.
print(knn_AS, '*****', KNN_CM, '*****', KNN_CR, sep='\n')

In [None]:
# Fit the logistic regression model on the training split, predict the test split, and
# compute the accuracy score, confusion matrix and classification report.
LR_y_predict = LR_Model.fit(X_Train, y_Train).predict(X_Test)
LR_AS, LR_CM, LR_CR = (
    metric(y_Test, LR_y_predict)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

In [None]:
# Show the logistic regression accuracy, confusion matrix and classification report,
# one after the other, separated by a line of asterisks.
print(LR_AS, '*****', LR_CM, '*****', LR_CR, sep='\n')

In [None]:
# Fit the decision tree model on the training split, predict the test split, and
# compute the accuracy score, confusion matrix and classification report.
DT_y_predict = DTC_Model.fit(X_Train, y_Train).predict(X_Test)
DT_AS, DT_CM, DT_CR = (
    metric(y_Test, DT_y_predict)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

In [None]:
# Show the decision tree accuracy, confusion matrix and classification report, one
# after the other, separated by a line of asterisks.
print(DT_AS, '*****', DT_CM, '*****', DT_CR, sep='\n')

In [None]:
# Fit the support vector classifier on the training split, predict the test split, and
# compute the accuracy score, confusion matrix and classification report.
SVC_y_predict = SVC_Model.fit(X_Train, y_Train).predict(X_Test)
SVC_AS, SVC_CM, SVC_CR = (
    metric(y_Test, SVC_y_predict)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

In [None]:
# Show the SVC accuracy, confusion matrix and classification report, one after the
# other, separated by a line of asterisks.
print(SVC_AS, '*****', SVC_CM, '*****', SVC_CR, sep='\n')

In [None]:

# Build the submission file: one row per logistic regression test prediction, plus a
# 1-based "id" column derived from the row position, written to CSV without the index.
output = pd.DataFrame({'Outcome': LR_y_predict}).assign(id=lambda frame: frame.index + 1)
output.to_csv('Diabetes_my_submission.csv', index=False)
print("Your submission was successfully saved!")