**Importing libraries**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

**Additional details about the attributes:**

1)Pregnancies: Number of times pregnant

2)Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test

3)BloodPressure: Diastolic blood pressure (mm Hg)

4)SkinThickness: Triceps skin fold thickness (mm)

5)Insulin: 2-Hour serum insulin (mu U/ml)

6)BMI: Body mass index (weight in kg/(height in m)^2)

7)DiabetesPedigreeFunction: Diabetes pedigree function

8)Age: Age (years)

9)Outcome: Class variable (0 or 1)

**Importing the dataset**

In [None]:
# Read the diabetes CSV from the Kaggle input directory into `df`
# and preview its first five rows.
df = pd.read_csv(
    '../input/pima-indians-diabetes-database/diabetes.csv',
    encoding='utf8',
    engine='python',
)
df.head()

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# True if at least one cell anywhere in the table is NaN/None.
# NOTE(review): this only detects real nulls; placeholder values (e.g. 0 in a
# measurement column) would not be flagged -- compare with the minimums printed below.
df.isnull().values.any()

In [None]:
# Number of rows per Outcome class, to see how balanced the two classes are.
df.groupby("Outcome").size()  

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Largest observed value of each measurement column.
# The column name is printed alongside the value so the seven output lines
# can be told apart; the loop replaces seven copy-pasted print statements.
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
               'BMI', 'DiabetesPedigreeFunction', 'Age']:
    print(column, "maximum value=", df[column].max())

In [None]:
# Smallest observed value of each measurement column.
# Fixes the "manimum" typo in the printed label and prefixes every line with
# the column name so the values can be told apart.
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
               'BMI', 'DiabetesPedigreeFunction', 'Age']:
    print(column, "minimum value=", df[column].min())

**Count plot**

In [None]:
# Bar chart with the number of records in each Outcome class.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
sns.countplot(data=df, x="Outcome", palette="plasma", ax=ax)

**Relation between pregnancies and diabetes**

In [None]:
# Record counts per number of pregnancies, split into one bar per Outcome class.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 4))
sns.countplot(data=df, x="Pregnancies", hue="Outcome", palette="plasma", ax=ax)

**Histogram**

In [None]:
# One histogram per numeric column, drawn on a shared 12x12 inch figure.
df.hist(figsize=(12,12))  

**Correlation matrix**

In [None]:
def plot_corr(df, size=11):
    """Draw the pairwise correlation matrix of ``df`` as a colour-coded grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column-to-column correlations (``df.corr()``) are plotted.
    size : int or float, default 11
        Width and height of the square figure, in inches.
    """
    corr = df.corr()  # pairwise correlation of the columns
    fig, ax = plt.subplots(figsize=(size, size))
    image = ax.matshow(corr)  # colour each cell by its correlation value
    # Without a colour bar the colours cannot be mapped back to numbers.
    fig.colorbar(image, ax=ax)
    tick_positions = range(len(corr.columns))
    ax.set_xticks(tick_positions)
    # Rotate the x labels so long column names do not run into each other.
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(corr.columns)


plot_corr(df)


In [None]:
# Numeric table of the pairwise column correlations.
df.corr()


**Boxplot**

In [None]:
# One box plot per column on a 3x3 grid; axes are not shared so every
# column keeps its own scale.
df.plot.box(
    subplots=True,
    layout=(3, 3),
    sharex=False,
    sharey=False,
    figsize=(12, 12),
)

**KDE Plot**

In [None]:
# Kernel-density curves of Glucose for Outcome == 1 (red) and Outcome == 0 (green).
f, ax = plt.subplots(1, 1, figsize=(25, 4))
# The keyword is `label` (lowercase): artist property names are case-sensitive
# and `Label=` is rejected by Matplotlib releases that no longer lower-case them.
# NOTE(review): `shade` is kept for compatibility with older seaborn; newer
# releases name this option `fill`.
sns.kdeplot(df.loc[(df['Outcome']==1), 'Glucose'], color='r', shade=True, label='1', ax=ax)
sns.kdeplot(df.loc[(df['Outcome']==0), 'Glucose'], color='g', shade=True, label='0', ax=ax)
ax.set_xlabel('Glucose')
# Draw the legend explicitly so the two curves are always identified.
ax.legend(title='Outcome')

In [None]:
# Kernel-density curves of BloodPressure for Outcome == 1 (cyan) and Outcome == 0 (magenta).
f, ax = plt.subplots(1, 1, figsize=(25, 4))
# The keyword is `label` (lowercase): artist property names are case-sensitive
# and `Label=` is rejected by Matplotlib releases that no longer lower-case them.
# NOTE(review): `shade` is kept for compatibility with older seaborn; newer
# releases name this option `fill`.
sns.kdeplot(df.loc[(df['Outcome']==1), 'BloodPressure'], color='c', shade=True, label='1', ax=ax)
sns.kdeplot(df.loc[(df['Outcome']==0), 'BloodPressure'], color='m', shade=True, label='0', ax=ax)
ax.set_xlabel('BloodPressure')
# Draw the legend explicitly so the two curves are always identified.
ax.legend(title='Outcome')

**Scatter matrix**

In [None]:
from pandas.plotting import scatter_matrix

# Grid of pairwise scatter plots for every column pair, drawn in magenta.
scatter_matrix(
    df,
    color='m',
    figsize=(20, 20),
)
plt.show()

**Pairplot**

In [None]:
# Pairwise relationships between the eight predictors, coloured by Outcome.
sns.pairplot(
    data=df,
    hue='Outcome',
    vars=[
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    ],
)

**Violinplot**

In [None]:
# Side-by-side violin plots: Glucose (left) and BloodPressure (right) per Outcome class.
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
left_axis, right_axis = ax
box1 = sns.violinplot(data=df, x="Outcome", y="Glucose", ax=left_axis)
box2 = sns.violinplot(data=df, x="Outcome", y="BloodPressure", ax=right_axis)

**Feature matrix and target vector**

In [None]:
# Names of the first eight columns, used as the model inputs.
attributes = df.columns[:8].tolist()
# Feature matrix as a plain NumPy array.
X = df.loc[:, attributes].to_numpy()
# Target vector: the Outcome column as a NumPy array.
y = df['Outcome'].to_numpy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column of X with a StandardScaler.
# NOTE(review): the scaler is fitted on the complete X, i.e. before the
# train/test split performed later in the notebook.
sc_X = StandardScaler()
sc_X.fit(X)
X = sc_X.transform(X)

**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for the final evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# X was standardised on the full dataset before this split, so the held-out
# rows contributed to the scaling statistics. Re-fit a scaler on the training
# rows only and apply it to both parts. Standardisation is invariant to the
# earlier per-column shift/scale, so this gives the same result as scaling
# the raw training data directly.
train_scaler = StandardScaler().fit(X_train)
X_train = train_scaler.transform(X_train)
X_test = train_scaler.transform(X_test)

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# (display name, estimator) pairs to be compared by cross-validation.
# Estimators that draw random numbers get a fixed random_state so repeated
# runs of the notebook produce the same scores.
models = [
    ("LR", LogisticRegression()),
    ("GNB", GaussianNB()),
    ("KNN", KNeighborsClassifier()),
    ("DecisionTree", DecisionTreeClassifier(random_state=0)),
    ("LDA", LinearDiscriminantAnalysis()),
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("AdaBoost", AdaBoostClassifier(random_state=0)),
    ("SVM Linear", SVC(kernel="linear")),
    ("SVM RBF", SVC(kernel="rbf")),
    ("Random Forest", RandomForestClassifier(random_state=0)),
    ("Bagging", BaggingClassifier(random_state=0)),
    ("Calibrated", CalibratedClassifierCV()),
    ("GradientBoosting", GradientBoostingClassifier(random_state=0)),
    ("LinearSVC", LinearSVC(random_state=0)),
    ("Ridge", RidgeClassifier()),
]

In [None]:
# Score every candidate with 10-fold cross-validation on the training split.
# shuffle=True is required for random_state to take effect; current
# scikit-learn raises a ValueError when a seed is given without shuffling.
# The splitter is built once so every model is scored on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

results = []
for name, model in models:
    cv_result = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    # Keep (name, mean accuracy, standard deviation across folds).
    results.append((name, cv_result.mean(), cv_result.std()))

# Rank once, best mean accuracy first, after all models have been scored.
results.sort(key=lambda x: x[1], reverse=True)
for model_name, mean_accuracy, std_accuracy in results:
    print('{:20s} {:2.2f} (+/-) {:2.2f} '.format(model_name, mean_accuracy * 100, std_accuracy * 100))

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the regularisation strength C of a linear-kernel SVC,
# scored by accuracy with 10-fold cross-validation on the training split.
model = SVC()
paramaters = [{'C': [0.01, 0.1, 1, 10, 100, 1000], 'kernel': ['linear']}]
grid_search = GridSearchCV(
    estimator=model,
    param_grid=paramaters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,  # use every available CPU core
)
grid_search.fit(X_train, y_train)

# Best mean cross-validated score and the parameter set that achieved it.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best accuracy : ', best_accuracy)
print('Best parameters :', best_parameters)

In [None]:
# Fit the chosen linear SVM on the training split and evaluate it on the test split.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# NOTE(review): C=0.1 is hard-coded; presumably copied from the grid-search
# output above -- confirm it still matches grid_search.best_params_.
final_model = SVC(C = 0.1, kernel = 'linear')
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class) and accuracy in percent.
cf = confusion_matrix(y_test, y_pred)
print(cf)
print(accuracy_score(y_test, y_pred) * 100)

# Per-class precision, recall and F1.
report = classification_report(y_test, y_pred)
print(report)

# Reuse the matrix computed above instead of computing it a second time.
cm = cf
# fmt='d' prints the counts as integers; the default float format switches to
# scientific notation for counts of 100 or more.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, auc

In [None]:
# ROC curve of the final model on the test split.
# A ROC curve needs a continuous score per sample. Feeding the hard 0/1
# predictions collapses the curve to a single operating point and distorts
# the AUC, so use the signed distance to the SVM decision boundary instead.
y_score = final_model.decision_function(X_test)
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_score)
roc_auc = auc(false_positive_rate, true_positive_rate)

plt.figure(figsize = (10,7))
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, color = 'red', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], linestyle = '--')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

In [None]:
# Accuracy of the final model on the training and test splits; a large gap
# between the two would point to over-fitting.
train_score, test_score = (
    final_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
print(f'Training Accuracy of our model is: {train_score}')
print(f'Test Accuracy of our model is: {test_score}')