In [None]:
# Import the required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score,roc_curve,recall_score,accuracy_score,precision_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from warnings import filterwarnings
filterwarnings('ignore')

### Loading data

In [None]:
# Load the diabetes CSV from the Kaggle input directory into a pandas DataFrame
df = pd.read_csv ('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Preview the first rows of the data
df.head()

### Understanding Data

In [None]:
# Dimensions of the data as (rows, columns)
df.shape

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics of the numeric columns
df.describe()

### Univariate Analysis

In [None]:
# Box plot of every independent feature to check for outliers.
# The target is excluded by name (not by position) so the loop covers the same
# columns whether or not 'Outcome' has already been removed from df.
for col in df.columns.drop('Outcome', errors='ignore'):
    fig, ax = plt.subplots()
    # Pass the series as `y` explicitly: a vertical box plot on every seaborn version.
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"Box plot of {col}")
    plt.show()

- The box plots above show no abnormal outliers, so no outlier treatment is needed.

In [None]:
# Distribution (histogram + KDE) of every independent feature.
# sns.distplot is deprecated; histplot(kde=True, stat="density") draws the same
# density-scaled histogram with a KDE overlay.
for col in df.columns.drop('Outcome', errors='ignore'):
    fig, ax = plt.subplots()
    sns.histplot(df[col], kde=True, stat="density", ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()

## Data processing

#### Check for Missing Value

In [None]:
# Count null (NaN) entries per column
df.isnull().sum()

- No missing value present in the data 
- No need of missing value treatment

### Splitting the data

In [None]:
# 'Outcome' is the target to classify. Select it without mutating df so this
# cell can be re-run safely (df.pop would raise KeyError on a second run).
y = df['Outcome']
# All remaining columns are the independent features
X = df.drop(columns='Outcome')

In [None]:
# Hold out 30% of the rows for testing; stratify on y so both partitions keep
# the same class balance, and fix the seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Preview the first rows of the training features
X_train.head()

In [None]:
# Preview the first values of the training target
y_train.head()

In [None]:
# Print the shape of each partition, one per line:
# train features, train target, test features, test target.
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape)

### Data Scaling

In [None]:
## Standardize the features with StandardScaler (zero mean, unit variance)

In [None]:
# Create the StandardScaler; it is fitted on the training data in a later cell
scaler=StandardScaler()

- Applying fit and transform on training data using standard scaler object

In [None]:
# Fit the scaler on the training data only and transform it.
# Keep the original row index so X_train stays label-aligned with y_train.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [None]:
# Preview the training features after scaling
X_train.head()

- Apply only `transform` (not `fit`) to the test data, reusing the scaler fitted on the training data, to prevent data leakage

In [None]:
# Transform the test data with the scaler already fitted on the training data.
# Wrap the result in a DataFrame so the test set carries the same feature names
# (and row index) as the data the models are fitted on.
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Distribution of every feature AFTER scaling.
# Plot the scaled training frame (X_train), not the raw df, and use histplot
# because sns.distplot is deprecated.
for col in X_train.columns:
    fig, ax = plt.subplots()
    sns.histplot(X_train[col], kde=True, stat="density", ax=ax)
    ax.set_title(f"Distribution of {col} (scaled)")
    plt.show()

## KNN Classifier

In [None]:
# Empty results table; each evaluation call adds one row per model
df_output = pd.DataFrame()

In [None]:
# Baseline KNN classifier with default hyperparameters; n_jobs=-1 uses all CPU cores
knn=KNeighborsClassifier(n_jobs=-1)

In [None]:
# Fit the baseline KNN model on the scaled training data
knn.fit(X_train,y_train)

In [None]:
## created a evaluation function to print Classification report, ROC curve, Accuracy,Specificity,Sensitivity, Recall,Precision,F1score
def evaluation(df_output,X_test,y_test,model,ModelName):
    """Evaluate a fitted binary classifier on held-out data and record its metrics.

    Prints the confusion matrix, summary metrics, the classification report and
    the ROC AUC, plots the ROC curve, and returns the results table extended
    with one row for this model.

    Parameters
    ----------
    df_output : pandas.DataFrame
        Results collected so far (may be empty). It is not modified in place.
    X_test : array-like
        Test features passed to ``model.predict`` and ``model.predict_proba``.
    y_test : array-like
        True labels for ``X_test``. Assumed binary with 1 as the positive
        class (the default ``pos_label`` of the precision/recall scorers).
    model : fitted classifier
        Must provide ``predict`` and ``predict_proba``.
    ModelName : str
        Label stored in the ``ModelName`` column of the new row.

    Returns
    -------
    pandas.DataFrame
        ``df_output`` plus one row with the columns ModelName, Accuracy,
        Sensitivity, Specificity, precision, recall, f1score, roc_value and
        threshold.
    """
    y_pred = model.predict(X_test)
    # Second column of predict_proba = probability of the second class.
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    print("-"*15,"Evaluation Details on Test data set","-"*15,"\n")
    print("-"*10,"Confusion matrix","-"*10,"\n")
    # scikit-learn expects (y_true, y_pred): rows are actual classes and
    # columns are predicted classes. Swapping them turns "sensitivity" into
    # precision and "specificity" into negative predictive value.
    cm = confusion_matrix(y_test, y_pred)
    print(cm,"\n")
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tn + tp) / cm.sum()
    # Guard the ratios so a degenerate split yields 0.0 instead of NaN.
    sensitivity = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    if precision + recall > 0:
        f1score = 2 * precision * recall / (precision + recall)
    else:
        f1score = 0.0

    print("-"*10,"Sensitivity  Specificity f1- score recall precision","-"*10)
    # Argument order matches the labels in the format string (Recall, then Precision).
    print("Accuracy: {} \nSensitivity: {} \nSpecificity: {} \nF1-score: {} \nRecall: {} \nPrecision: {}\n".format(accuracy,sensitivity,specificity,f1score,recall,precision))

    print("-"*10,"classification report","-"*10,"\n")
    print(classification_report(y_test, y_pred))
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy score : ",accuracy,"\n")

    area = roc_auc_score(y_test, y_pred_prob)
    print("Area under ROC curve : ", area,"\n")
    fpr, tpr, thrs = roc_curve(y_test, y_pred_prob, drop_intermediate=False)
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % area)
    # Chance diagonal, red dashed (a single format string, no conflicting colour).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.title("ROC Curve")
    plt.legend(loc=4)  # lower right
    plt.show()

    # Threshold that maximises TPR - FPR (Youden's J statistic).
    threshold = thrs[np.argmax(tpr - fpr)]
    print("Threshold ",threshold)

    row = pd.DataFrame([{
        'ModelName': ModelName,
        'Accuracy': accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'precision': precision,
        'recall': recall,
        'f1score': f1score,
        'roc_value': area,
        'threshold': threshold,
    }])
    # DataFrame.append was removed in pandas 2.0; use concat instead. Skip the
    # concat for an empty table to avoid empty-frame dtype warnings.
    if df_output is None or df_output.empty:
        return row
    return pd.concat([df_output, row], ignore_index=True)

In [None]:
# Evaluate the baseline KNN model on the test set and add its row to the results table
df_output=evaluation(df_output,X_test,y_test,knn,'KNN_base')

### Hypertuning KNN

In [None]:
## Function to do hypertuning of model
def hypertunemodel(params,basemodel,X_train,y_train,cv_num=5):
    """Grid-search ``basemodel`` over ``params`` using stratified K-fold CV.

    Candidates are scored by ROC AUC. Progress messages, the best
    cross-validated score and the best parameter set are printed.

    Parameters
    ----------
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    basemodel : estimator
        Estimator to tune.
    X_train, y_train : array-like
        Training features and labels used for the search.
    cv_num : int, default 5
        Number of shuffled stratified folds (seeded with 42).

    Returns
    -------
    tuple
        ``(model_cv, model_results)``: the ``GridSearchCV`` object and the
        value returned by its ``fit`` call.
    """
    folds = StratifiedKFold(n_splits=cv_num, shuffle=True, random_state=42)
    print("Hypertuning model started")
    model_cv = GridSearchCV(
        basemodel,
        param_grid=params,
        scoring='roc_auc',
        cv=folds,
        n_jobs=-1,
        return_train_score=True,
        verbose=2,
    )
    model_results = model_cv.fit(X_train, y_train)
    print("Best roc_auc score", model_results.best_score_)
    print(model_cv.best_params_)
    print("Hypertuning model finished")
    return model_cv, model_results

In [None]:
# Search grid for KNN: neighbour count 5..25, vote weighting,
# neighbour-search algorithm and Minkowski power parameter p.
knn_params = {
    'n_neighbors': range(5, 26),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2, 3],
}

In [None]:
# Run the grid search for KNN over knn_params on the training data
model_knn_cv,model_knn_results=hypertunemodel(knn_params,knn,X_train,y_train)

In [None]:
# Best KNN estimator found by the grid search (GridSearchCV)
best_knn_model=model_knn_cv.best_estimator_

In [None]:
# Hyperparameter values of the best KNN model
model_knn_cv.best_params_

In [None]:
# Evaluate the tuned KNN model on the test set and add its row to the results table
df_output=evaluation(df_output,X_test,y_test,best_knn_model,'KNN_hypertuned')

### Decision Tree Classifier

In [None]:
# Baseline decision tree classifier; random_state fixed for repeatable results
dtc=DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the baseline decision tree on the training data
dtc.fit(X_train,y_train)

In [None]:
# Evaluate the baseline decision tree on the test set and add its row to the results table
df_output=evaluation(df_output,X_test,y_test,dtc,'DecisionTree')

### Hypertuning Decision Tree

In [None]:
# Search grid for the decision tree.
# min_samples_split must be an integer >= 2 in scikit-learn; a value of 1 is
# rejected and makes every candidate that uses it fail to fit, so start at 2.
dtc_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 8],
    'min_samples_leaf': range(1, 11, 2),
    'min_samples_split': range(2, 11, 2),
}

In [None]:
# Run the grid search for the decision tree over dtc_params on the training data
model_dtc_cv,model_dct_results=hypertunemodel(dtc_params,dtc,X_train,y_train)

In [None]:
# Best decision tree estimator found by the grid search (GridSearchCV)
best_dtc_model=model_dtc_cv.best_estimator_

In [None]:
# Hyperparameter values of the best decision tree model
model_dtc_cv.best_params_

In [None]:
# Evaluate the tuned decision tree on the test set and add its row to the results table
df_output=evaluation(df_output,X_test,y_test,best_dtc_model,'DecisionTree_hypertuned')

In [None]:
# Show the results collected so far
df_output

### SVC

In [None]:
# Baseline support vector classifier. probability=True enables predict_proba,
# which the evaluation function needs for the ROC curve.
svc = SVC(probability=True, random_state=42)
# Train on the scaled training data
svc.fit(X_train, y_train)
# Score the baseline SVC on the test set and add its row to the results table
df_output = evaluation(df_output, X_test, y_test, svc, 'SVC_base')

### Hypertuning SVC

In [None]:
# Search grid for the SVC: kernel type, regularisation strength C and gamma setting
svm_params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1],
    'gamma': ['scale', 'auto'],
}
# Run the grid search for the SVC on the training data
model_svc_cv, model_svc_results = hypertunemodel(svm_params, svc, X_train, y_train)

In [None]:
# Best SVC estimator found by the grid search (GridSearchCV)
best_svc_model=model_svc_cv.best_estimator_

In [None]:
# Hyperparameter values of the best SVC model
model_svc_cv.best_params_

In [None]:
# Evaluate the tuned SVC model on the test set and add its row to the results table
df_output=evaluation(df_output,X_test,y_test,best_svc_model,'SVC_hypertuned')

In [None]:
# Show the results collected so far
df_output

### Random Forest

In [None]:
# Baseline random forest classifier. random_state is fixed so the baseline
# (and any model tuned from it) gives the same result on every run, matching
# the other seeded models in this notebook. n_jobs=-1 uses all CPU cores.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)
# Fit the baseline random forest on the training data
rfc.fit(X_train, y_train)

In [None]:
# Evaluate the baseline random forest on the test set and add its row to the results table
df_output=evaluation(df_output,X_test,y_test,rfc,'RandomForest_base')

### Hypertuning Randomforest

In [None]:
# Search grid for the random forest classifier.
rfc_params = {
    'max_depth': [4, 8],
    'min_samples_leaf': range(1, 11, 2),
    'criterion': ["gini", "entropy"],
    # min_samples_split must be an integer >= 2; a value of 1 is rejected by
    # scikit-learn and makes those candidates fail to fit.
    'min_samples_split': range(2, 11, 2),
    'n_estimators': [100, 200],
    # "auto" was an alias of "sqrt" for classifiers and was removed in
    # scikit-learn 1.3, so it is dropped from the grid.
    'max_features': ["sqrt", "log2"],
}
# Run the grid search for the random forest on the training data
model_rfc_cv,model_rfc_results=hypertunemodel(rfc_params,rfc,X_train,y_train)

In [None]:
# Best random forest estimator found by the grid search (GridSearchCV)
best_rfc_model=model_rfc_cv.best_estimator_

In [None]:
# Hyperparameter values of the best random forest model
model_rfc_cv.best_params_

In [None]:
# Evaluate the tuned random forest on the test set and add its row to the results table
df_output=evaluation(df_output,X_test,y_test,best_rfc_model,'RFC_hypertuned')

In [None]:
# Show the final results table for all models
df_output

## Conclusion

- Based on the above results
 - SVC hypertuned model gives best ROC score of all the models.
 - Random Forest classifier provides the second best ROC score
- However, when both sensitivity and specificity need to be high
  - Random forest classifier is the best model to use for this classification