### Import packages

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline

### Import data

In [None]:
# Load the diabetes dataset into a DataFrame, then preview the first rows.
csv_path = "../input/diabetes.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Dimensions of the loaded frame, shown as (rows, columns).
n_rows, n_cols = data.shape
(n_rows, n_cols)

### Data exploration

In [None]:
# Bar chart of the row count for each value of the Outcome column.
fig, ax = plt.subplots()
sns.countplot(data=data, x="Outcome", ax=ax)
plt.show()

### Split the dataset

In [None]:
# Separate the target column from the remaining (feature) columns.
target_col = "Outcome"
y = data[target_col]
X = data.drop(columns=[target_col])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

### Build Model and prediction

In [None]:
from sklearn.linear_model import LogisticRegression

# Build a logistic regression with default parameters and train it on the
# training split; fit() returns the estimator itself, so the call is chained.
logreg = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the held-out test rows.
y_pred = logreg.predict(X_test)

### Model Evaluation using Confusion Matrix

A confusion matrix is a table that is used to evaluate the performance of a classification model. 

In [None]:
from sklearn import metrics

# Tabulate actual (rows) against predicted (columns) labels for the test split.
cf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cf_matrix

In [None]:
# Draw the confusion matrix as an annotated heatmap.
ax = sns.heatmap(pd.DataFrame(cf_matrix), cmap="YlGnBu", annot=True, fmt='g')
plt.tight_layout()
ax.set_title('Confusion matrix', y=1.1)  # y=1.1 lifts the title above the axes
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.show()

Let's evaluate the model using model evaluation metrics such as accuracy, precision, and recall.

In [None]:
# Report each test-set score, one per line, as "<label> <value>".
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(y_test, y_pred))

Precision: Precision is about being precise, i.e., how accurate your model's positive predictions are. In other words, when the model predicts that a patient has diabetes, how often it is correct.

Recall: Of the patients in the test set who actually have diabetes, recall is the fraction your Logistic Regression model identifies; for example, a recall of 0.5 means it identifies them 50% of the time.

### ROC Curve

In [None]:
# Probability of the positive class (second column of predict_proba).
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.legend(loc=4)  # loc=4 places the legend in the lower-right corner
ax.plot([0, 1], [0, 1], 'r--')  # dashed red diagonal as a reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
plt.show()

* AUC score for this case is **0.766**
* AUC score 1 represents perfect classifier, and 0.5 represents a worthless classifier.

## Feature Engineering

### Recursive Feature Elimination

**Recursive Feature Elimination** (or RFE) works by recursively removing attributes and building a model on those attributes that remain.
It uses the fitted model's feature weights (e.g. coefficients or feature importances) to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute, dropping the weakest ones at each step.

In [None]:
from sklearn.feature_selection import RFE

# Feature selection with recursive feature elimination: keep 5 features,
# using a default logistic regression as the ranking estimator.
# n_features_to_select must be passed by keyword -- current scikit-learn
# releases reject it as a positional argument with a TypeError.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X_train, y_train)
print("Num Features: ", fit.n_features_)
# support_ is a boolean mask over the columns of X_train (True = kept).
print("Selected Features: ",  fit.support_)
# ranking_ assigns rank 1 to every kept feature; larger ranks were dropped.
print("Feature Ranking: ", fit.ranking_)

**RFE** chose the top 5 features as Pregnancies, Glucose, BloodPressure, BMI, DiabetesPedigreeFunction.

In [None]:
# Restrict both splits to the five columns listed below.
selected_features = ["Pregnancies", "Glucose", "BloodPressure", "BMI", "DiabetesPedigreeFunction"]
X_train_f = X_train[selected_features]
X_test_f = X_test[selected_features]

# Retrain the existing logreg estimator on the reduced feature set ...
logreg.fit(X_train_f, y_train)

# ... and replace y_pred with its predictions for the reduced test split.
y_pred = logreg.predict(X_test_f)

In [None]:
# Recompute the confusion matrix for the current y_pred
# (rows = actual labels, columns = predicted labels).
cf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cf_matrix

In [None]:
# Draw the current confusion matrix as an annotated heatmap.
ax = sns.heatmap(pd.DataFrame(cf_matrix), cmap="YlGnBu", annot=True, fmt='g')
plt.tight_layout()
ax.set_title('Confusion matrix', y=1.1)  # y=1.1 lifts the title above the axes
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Report each test-set score for the current y_pred as "<label> <value>".
for label, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, scorer(y_test, y_pred))

In [None]:
# Probability of the positive class for the reduced-feature test split.
y_pred_proba = logreg.predict_proba(X_test_f)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.legend(loc=4)  # loc=4 places the legend in the lower-right corner
ax.plot([0, 1], [0, 1], 'r--')  # dashed red diagonal as a reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
plt.show()

* AUC score for this case is **0.7677** with just 5 variables
* AUC score 1 represents perfect classifier, and 0.5 represents a worthless classifier.

### Grid SearchCV

* Looking at the misclassified instances, we can observe that 27 Diabetes cases have been classified incorrectly as Non-Diabetes (False negatives).

* A false negative is more serious as a disease has been ignored, which can lead to the death of the patient. At the same time, a false positive would lead to an unnecessary treatment — incurring additional cost.

* Let’s try to minimize the false negatives by using Grid Search to find the optimal parameters. Grid search can be used to improve any specific evaluation metric.

**The metric we need to focus on to reduce false negatives is Recall.**

The hyperparameters we tuned are:

    Penalty: l1 or l2, which specifies the norm used in the penalization.
    C: Inverse of regularization strength- smaller values of C specify stronger regularization.

In [None]:
#Grid Search
from sklearn.model_selection import GridSearchCV

# scikit-learn's current default solver (lbfgs) does not support the 'l1'
# penalty, so every l1 candidate in the grid would fail to fit. liblinear
# supports both 'l1' and 'l2', so it is selected explicitly.
clf = LogisticRegression(solver='liblinear')
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
# scoring='recall' makes the search keep the candidate with the best
# cross-validated recall.
grid_clf = GridSearchCV(clf, param_grid = grid_values,scoring = 'recall')
grid_clf.fit(X_train, y_train)

#Predict values based on new parameters
y_pred = grid_clf.predict(X_test)

# New Model Evaluation metrics 
print('Accuracy Score : ' + str(metrics.accuracy_score(y_test,y_pred)))
print('Precision Score : ' + str(metrics.precision_score(y_test,y_pred)))
print('Recall Score : ' + str(metrics.recall_score(y_test,y_pred)))

In [None]:
#Logistic Regression (Grid Search) Confusion matrix
cf_matrix = metrics.confusion_matrix(y_test,y_pred)
cf_matrix

In [None]:
# Draw the current confusion matrix as an annotated heatmap.
ax = sns.heatmap(pd.DataFrame(cf_matrix), cmap="YlGnBu", annot=True, fmt='g')
plt.tight_layout()
ax.set_title('Confusion matrix', y=1.1)  # y=1.1 lifts the title above the axes
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.show()

**Now the misclassified instances reduce to 24 Diabetes cases that are classified incorrectly as Non-Diabetes (False negatives).**

In [None]:
# A ROC curve is traced by sweeping a threshold over continuous scores.
# Feeding it the hard 0/1 labels in y_pred collapses the curve to a single
# operating point and yields an AUC that is not comparable with the earlier
# cells, so use the positive-class probabilities of the refit best estimator.
y_pred_proba = grid_clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.plot([0, 1], [0, 1],'r--')  # dashed red diagonal as a reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.show()