In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Get the dataset and information

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first rows
DATA_PATH = '../input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the dataset as (rows, columns)
df.shape

In [None]:
# Column labels of the DataFrame
df.columns

In [None]:
# Per-column dtype and non-null count, plus overall memory usage
df.info()

# Finding the Null Value

In [None]:
# Number of missing (NaN) entries in each column.
# NOTE(review): isna() only detects NaN; placeholder values such as 0 would not be
# counted here - confirm how missing measurements are encoded in this CSV.
df.isna().sum()

# Statistical Table

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

# Correlation Matrix

In [None]:
# Pairwise correlation between all columns, rounded so the cell annotations stay readable
corr = df.corr().round(2)

# Boolean mask that hides the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Large square canvas so the annotated cells do not overlap
f, ax = plt.subplots(figsize=(20, 20))

# Diverging palette centred on 0: negative and positive correlations get distinct hues
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the masked heatmap on the axes created above, fixing the colour scale to [-1, 1]
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, ax=ax)

plt.tight_layout()

# Heatmap 


In [None]:
# Full (unmasked) correlation matrix on a sequential blue palette
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True, cmap='Blues', ax=ax)
fig.tight_layout()

# Kde Plot for visualization

In [None]:
# Kernel density estimate of the Age column
plt.figure(figsize=(10,10))
# `fill=True` is the current spelling of the deprecated `shade=True`, and the keyword
# must be lowercase `label`: matplotlib no longer accepts case-insensitive property names.
sns.kdeplot(df['Age'], fill=True, label="Age")
plt.xlabel('Age')
plt.ylabel('Probability Density')
plt.show()

* Inference: most of the patients are between roughly 20 and 37 years old.

# Age Statistical Value

In [None]:
# Age summary with every decile plus the 95th percentile (the 1.0 quantile is the maximum)
df['Age'].describe(percentiles=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,1])


In [None]:
# Number of rows for each value of the Outcome (target) column
df['Outcome'].value_counts()

# Visualization of the Outcomes

In [None]:
# Print the per-class counts of the target, then show the same counts as a bar chart
outcome_counts = df['Outcome'].value_counts()
print("The outcome values", outcome_counts)

plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='Outcome')
plt.show()


# Getting the arrays of the feature and target columns

In [None]:
# Target vector: the Outcome column; feature matrix: every other column (both NumPy arrays)
y = df['Outcome'].to_numpy()
X = df.drop(columns=['Outcome']).to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 45% of the rows for testing; stratifying on y keeps the class proportions
# the same in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.45,
    stratify=y,
    random_state=45,
)

Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use stratified sampling 

# KNeighbours Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes k = 1..8, with one accuracy slot per k for each split
neighbors = np.arange(1, 9)
train_accuracy = np.empty(neighbors.size)
test_accuracy = np.empty(neighbors.size)

# Fit one classifier per k and record its accuracy on the training and the test rows
for slot in range(neighbors.size):
    knn = KNeighborsClassifier(n_neighbors=neighbors[slot])
    knn.fit(X_train, y_train)
    train_accuracy[slot] = knn.score(X_train, y_train)
    test_accuracy[slot] = knn.score(X_test, y_test)

In [None]:
# Train and test accuracy as a function of k, drawn on one set of axes for comparison
plt.figure(figsize=(10, 10))
plt.plot(neighbors, test_accuracy, label="Testing Accuracy")
plt.plot(neighbors, train_accuracy, label="Train Accuracy")
plt.title("K-NN varying number of neighbour")
plt.xlabel("Number of Neighbour")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

# Inference 
As the test accuracy is highest at k=7, we adopt the KNeighborsClassifier with the number of neighbours set to 7

In [None]:
# Rebuild the classifier with the chosen k=7 and fit it on the training split
knn=KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

In [None]:
# Accuracy of the fitted model on the test split, expressed as a percentage
print(knn.score(X_test, y_test)*100)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [None]:
# Predicted class labels for the training rows (input to the training-set metrics)
y_pred=knn.predict(X_train)

# Evaluation of the model

In [None]:
# Training-set evaluation.
# Accuracy, F1, precision and recall are threshold metrics and use the hard 0/1
# predictions. Average precision, log loss and ROC-AUC are defined on scores, so they
# are given the predicted probability of class 1 instead of the hard labels.
y_train_prob = knn.predict_proba(X_train)[:, 1]

print("Accuracy Score:-", metrics.accuracy_score(y_train, y_pred))
print("F1 Score:-", metrics.f1_score(y_train, y_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_train, y_train_prob))
print("Log Loss:-", metrics.log_loss(y_train, y_train_prob))
print("Precision Score:-", metrics.precision_score(y_train, y_pred))
print("Recall Score:-", metrics.recall_score(y_train, y_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_train, y_train_prob))

# Evaluation of the test dataset

In [None]:
# Predict class labels for the held-out test rows with the fitted k-NN model
y_test_pred=knn.predict(X_test)

# Evaluation Model

In [None]:
# Test-set evaluation.
# Accuracy, F1, precision and recall are threshold metrics and use the hard 0/1
# predictions. Average precision, log loss and ROC-AUC are defined on scores, so they
# are given the predicted probability of class 1 instead of the hard labels.
y_test_prob = knn.predict_proba(X_test)[:, 1]

knn_acc=metrics.accuracy_score(y_test, y_test_pred)
print("Accuracy Score:-",knn_acc)
print("F1 Score:-", metrics.f1_score(y_test, y_test_pred))
print("Average Precision Score:-", metrics.average_precision_score(y_test, y_test_prob))
print("Log Loss:-", metrics.log_loss(y_test, y_test_prob))
print("Precision Score:-", metrics.precision_score(y_test, y_test_pred))
print("Recall Score:-", metrics.recall_score(y_test, y_test_pred))
print("ROC-AUC Score:-", metrics.roc_auc_score(y_test, y_test_prob))

# Confusion Matrix
* Calculation

In [None]:
# 2x2 confusion matrix for the test split: rows are true classes, columns are predictions
cfm = confusion_matrix(y_test, y_test_pred)

# Flatten row by row: [0][0], [0][1], [1][0], [1][1]
trueNegative, falsePossitive, false_negative, truePositive = cfm.ravel()

In [None]:
# Show the raw confusion-matrix array
print("Confusion Matrix", cfm)

# Visualize the Confusion Matrix

In [None]:
# Wrap the 2x2 matrix in a DataFrame so the heatmap ticks show the class labels 0 and 1
cfm_df=pd.DataFrame(cfm, range(2), range(2))
plt.figure(figsize=(10,10))
# fmt='d' prints whole counts; the default '.2g' would render e.g. 190 as 1.9e+02
sns.heatmap(cfm_df, cmap='Reds', annot=True, fmt='d')
# confusion_matrix puts true classes on the rows and predicted classes on the columns
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Table of confusion Matrix

In [None]:
# Same counts as a labelled table; margins=True appends the row and column totals ("All")
pd.crosstab(y_test, y_test_pred, rownames=['True'], colnames=['Predicted'], margins=True)


In [None]:
# Print each of the four confusion-matrix counts next to its caption
for caption, count in (
    ("true negative", trueNegative),
    ("False Positive", falsePossitive),
    ("false Negative", false_negative),
    ("True Positive", truePositive),
):
    print(caption, count)

# Accuracy 

In [None]:
# Share of test predictions on the matrix diagonal, as a percentage with one decimal
correct_pct = round((trueNegative + truePositive) / len(y_test_pred) * 100, 1)
print("correct prediction", correct_pct, '%')

In [None]:
from sklearn.metrics import classification_report

* There are four ways to check if the predictions are right or wrong:
* TN / True Negative: the case was negative and predicted negative
* TP / True Positive: the case was positive and predicted positive
* FN / False Negative: the case was positive but predicted negative
* FP / False Positive: the case was negative but predicted positive
* Precision — What percent of your predictions were correct?
* Precision is the ability of a classifier not to label an instance positive that is actually negative. For each class, it is defined as the ratio of true positives to the sum of a true positive and false positive.
* Precision:- Accuracy of positive predictions.
* Precision = TP/(TP + FP)
* Recall — What percent of the positive cases did you catch?
* Recall is the ability of a classifier to find all positive instances. For each class it is defined as the ratio of true positives to the sum of true positives and false negatives.
* Recall:- Fraction of positives that were correctly identified.
* Recall = TP/(TP+FN)
* F1 score — How well does the model balance precision and recall?
* The F1 score is a weighted harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0. F1 scores are lower than accuracy measures as they embed precision and recall into their computation. As a rule of thumb, the weighted average of F1 should be used to compare classifier models, not global accuracy.
* F1 Score = 2*(Recall * Precision) / (Recall + Precision)
* Support
* Support is the number of actual occurrences of the class in the specified dataset. Imbalanced support in the training data may indicate structural weaknesses in the reported scores of the classifier and could indicate the need for stratified sampling or rebalancing. Support doesn’t change between models but instead diagnoses the evaluation process.
* 

# Summary table of Precision, Recall, F1-Score, Support

In [None]:
# Per-class precision, recall, F1 and support for the test-set predictions
print(classification_report(y_test, y_test_pred))

# ROC (Receiver Operating Characteristic) Curve

In [None]:
# Second column of predict_proba for each test row, i.e. the predicted probability of
# the second class in knn.classes_ (the positive class when the labels are 0/1)
y_test_pred_prob=knn.predict_proba(X_test)[:,1]


In [None]:
from sklearn.metrics import roc_curve

In [None]:
# False-positive rate, true-positive rate and the score thresholds that produce them
fpr, tpr,thresholds=roc_curve(y_test,y_test_pred_prob)

In [None]:
# ROC curve of the k-NN model against the chance diagonal
plt.figure(figsize=(10,10))
# Dashed diagonal: the ROC of a classifier that guesses at random
plt.plot([0,1],[0,1],'k--')
plt.plot(fpr, tpr, label='knn')
plt.xlabel("fpr (False Possitive rate)")
plt.ylabel("tpr-(True Positive rate)")
plt.title("ROC_AUC (k_nn=7)")
# Without a legend the label given to the curve above is never displayed
plt.legend()
plt.show()

# Inference
* The k-NN model performs well: its ROC curve lies above the diagonal, i.e. the AUC is more than 0.5, so it is clearly better than a model that guesses at random.

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve computed from the predicted probabilities, as a percentage
print(roc_auc_score(y_test, y_test_pred_prob)*100)

In [None]:
# Test accuracy for k = 1..25.
# Loop-local names (`knn_k`, `n_k`) are used so the sweep does not overwrite the fitted
# k=7 model in `knn` or the training predictions in `y_pred` that earlier cells rely on.
k_range = list(range(1, 26))
scores = []
for n_k in k_range:
    knn_k = KNeighborsClassifier(n_neighbors=n_k)
    knn_k.fit(X_train, y_train)
    scores.append(metrics.accuracy_score(y_test, knn_k.predict(X_test)))

plt.plot(k_range, scores)
plt.xlabel("Number of neighbours (k)")
plt.ylabel("Test accuracy")
plt.show()

# Cross Validation

Now before getting into the details of Hyperparamter tuning, let us understand the concept of Cross validation.

The trained model's performance depends on the way the data is split, so a single split might not be representative of the model’s ability to generalize.

The solution is cross validation.

Cross-validation is a technique to evaluate predictive models by partitioning the original sample into a training set to train the model, and a test set to evaluate it.
# K-fold cross-validation
In k-fold cross-validation, the original sample is randomly partitioned into k equal size subsamples. Of the k subsamples, a single subsample is retained as the validation data for testing the model, and the remaining k-1 subsamples are used as training data. The cross-validation process is then repeated k times (the folds), with each of the k subsamples used exactly once as the validation data. The k results from the folds can then be averaged (or otherwise combined) to produce a single estimation. The advantage of this method is that all observations are used for both training and validation, and each observation is used for validation exactly once.

# Hyperparameter tuning

The value of k (i.e 7) we selected above was selected by observing the curve of accuracy vs number of neighbors. This is a primitive way of hyperparameter tuning.

There is a better way of doing it which involves:

1) Trying a bunch of different hyperparameter values

2) Fitting all of them separately

3) Checking how well each performs

4) Choosing the best performing one

5) Using cross-validation every time

Scikit-learn provides a simple way of achieving this using GridSearchCV i.e Grid Search cross-validation.

In almost any Machine Learning project, we train different models on the dataset and select the one with the best performance. However, there is almost always room for improvement, as we cannot say for sure that this particular model is the best for the problem at hand; hence our aim is to improve the model in any way possible. One important factor in the performance of these models is their hyperparameters: once we set appropriate values for these hyperparameters, the performance of a model can improve significantly. In this article, we will find out how we can find optimal values for the hyperparameters of a model by using GridSearchCV.

# GridSearchCV
* It is the process of performing hyperparameter tuning in order to determine the optimal values for a given model
* As mentioned above, the performance of a model significantly depends on the value of hyperparameters.
*  Note that there is no way to know in advance the best values for hyperparameters so ideally, we need to try all possible values to know the optimal values.
* Doing this manually could take a considerable amount of time and resources and thus we use GridSearchCV to automate the tuning of hyperparameters.
* This method of classifier is optimized by cross-validation, which is done using the GridSearchCV object on a development set that comprises only half of the available labeled data.
* The performance of the selected hyper-parameters and trained model is then measured on a dedicated evaluation set that was not used during the model selection step.


# How does GridSearchCV work?
* GridSearchCV tries all the combinations of the values passed in the dictionary and evaluates the model for each combination using the Cross-Validation method. 
* Hence after using this function we get accuracy/loss for every combination of hyperparameters and we can choose the one with the best performance.

# Use of GridSearch done below

In [None]:
from sklearn.model_selection import  GridSearchCV

In [None]:
# Candidate values for the classifier's `n_neighbors` parameter (k = 1..49).
# The key has to match the estimator's parameter name exactly; the British spelling
# 'n_neighbours' is not a KNeighborsClassifier parameter and would be rejected.
param_grid={'n_neighbors':np.arange(1,50)}

In [None]:
# Candidate neighbourhood sizes 1..30 for the grid search
k_range = [*range(1, 31)]
print(k_range)

In [None]:
# Search space for GridSearchCV: estimator parameter name -> list of values to try
param_grid = {'n_neighbors': k_range}
print(param_grid)

In [None]:
# 10-fold cross-validated search over param_grid, scored by accuracy.
# NOTE(review): the search is fitted on the full X, y, which includes the rows held out
# as the test split earlier - confirm this is intended before comparing its score with
# the test-set accuracy.
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    return_train_score=False,
)
grid.fit(X, y)

1. estimator: Pass the model instance for which you want to check the hyperparameters.
2. param_grid: the dictionary object that holds the hyperparameters you want to try
3. scoring: evaluation metric that you want to use, you can simply pass a valid string/ object of evaluation metric
4. cv: number of cross-validation folds to use for each selected set of hyperparameters
5. verbose: you can set it to 1 to get the detailed print out while you fit the data to GridSearchCV
6. n_jobs: number of processes you wish to run in parallel for this task; if it is -1, it will use all available processors. 

In [None]:
# Mean cross-validated test score of every candidate in the grid, in grid order
grid_mean_scores = grid.cv_results_['mean_test_score']
print(grid_mean_scores)

In [None]:
# Mean cross-validated score of the best candidate
grid.best_score_

In [None]:
# Parameter setting that achieved the best mean cross-validated score
grid.best_params_

# Conclusion
* As we can see, the accuracy score after the grid search is a bit higher
* The grid search worked in our case
* The ROC-AUC is more than 0.5, hence the model performs better than random guessing
