## Importing required libraries 

In [None]:
# Importing required libraries

%matplotlib inline
# import necessary libraries and specify that graphs should be plotted inline. 
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV,KFold
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,confusion_matrix,roc_curve, auc,matthews_corrcoef
from matplotlib.legend_handler import HandlerLine2D
from sklearn.preprocessing import Normalizer,MinMaxScaler
import scikitplot as skplt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

## Loading Data 

In [None]:
# The Wisconsin Diagnostic Breast Cancer data ships with scikit-learn.
# The bundled copy is used here because the Id column of the given data
# file is not needed, and both copies hold the same data.
from sklearn.datasets import load_breast_cancer

wdbc = load_breast_cancer()

# Print the data set description that comes with the bundle.
print(wdbc.DESCR)

In [None]:
## Explore the data set
# The feature matrix shape unpacks into (number of samples, number of features).
n_samples, n_features = wdbc.data.shape

print('The dimensions of the data set are', n_samples, 'by', n_features)

# Each remaining fact is printed below a row of asterisks.
for label, value in (
    ('The classes are: ', wdbc.target_names),
    ('The features in the data set are:', wdbc.feature_names),
):
    print('*'*75)
    print(label, value)



In [None]:
## Explore the data set
# Show the first two feature rows and their matching target labels.
first_two = slice(None, 2)
print('Data:', wdbc.data[first_two])
print('*'*75)
print('Target:', wdbc.target[first_two])

## Decision Tree

In [None]:
##************************************************************************************
## 
## DECISION TREE
## 
##************************************************************************************

# Splitting data into train and test in 80-20 ratio.
X_train, X_test, y_train, y_test = train_test_split(wdbc.data, wdbc.target, test_size=0.2)

max_depths = range(1,30)
min_leaf_size = range(1,30)
impurity_values = [.0001,.001,.01,.1,1,10,]


def _sweep_tree_hyperparameter(param_name, param_values, X_train, X_test, y_train, y_test):
    """Score one gini decision tree per value of a single hyperparameter.

    Parameters
    ----------
    param_name : str
        Keyword argument of ``DecisionTreeClassifier`` to vary
        (e.g. ``'max_depth'``).
    param_values : iterable
        Values to try; also used as the index of the returned frame.
    X_train, X_test, y_train, y_test
        The train/test split every tree is fitted and scored on.

    Returns
    -------
    pandas.DataFrame
        One row per value with the columns 'Train accuracies',
        'Test accuracies', 'AUC train results' and 'AUC test results'.
    """
    train_accuracies, test_accuracies = [], []
    auc_train_results, auc_test_results = [], []

    for value in param_values:
        model = tree.DecisionTreeClassifier(criterion="gini", **{param_name: value})
        # Fit exactly once per value. The tree has no fixed random_state, so
        # refitting before every metric could score four different trees.
        model.fit(X_train, y_train)

        ## ******************************************************
        # We can use AUC-ROC curve to select our hyperparameter.
        # Here, ROC is a probability curve and AUC represents degree of separability.
        # It tells how much model is capable of distinguishing between True and False output.
        # Higher the AUC means the model is better at predicting 0s as 0s and 1s as 1s.
        ## ******************************************************

        # auc score for training data (probability of class 1 is the ranking score)
        false_positive_rate, true_positive_rate, _ = roc_curve(y_train, model.predict_proba(X_train)[:,1])
        auc_train_results.append(auc(false_positive_rate, true_positive_rate))

        # auc score for testing data
        false_positive_rate, true_positive_rate, _ = roc_curve(y_test, model.predict_proba(X_test)[:,1])
        auc_test_results.append(auc(false_positive_rate, true_positive_rate))

        # accuracy for testing and training data
        test_accuracies.append(model.score(X_test, y_test))
        train_accuracies.append(model.score(X_train, y_train))

    return pd.DataFrame(
        {
            'Train accuracies': train_accuracies,
            'Test accuracies': test_accuracies,
            'AUC train results': auc_train_results,
            'AUC test results': auc_test_results,
        },
        index=param_values,
    )


# One sweep per hyperparameter, all on the same train/test split.
depths = _sweep_tree_hyperparameter('max_depth', max_depths, X_train, X_test, y_train, y_test)
leafs = _sweep_tree_hyperparameter('min_samples_leaf', min_leaf_size, X_train, X_test, y_train, y_test)
impurity = _sweep_tree_hyperparameter('min_impurity_decrease', impurity_values, X_train, X_test, y_train, y_test)

## ***************************************************************************************

# 2x3 grid: accuracy on the top row, AUC on the bottom row, one column per
# hyperparameter (depth, min leaf size, min impurity decrease).
fig = plt.figure(figsize=(15,8))
ax1 = fig.add_subplot(231)
ax3 = fig.add_subplot(232)
ax5 = fig.add_subplot(233)
ax2 = fig.add_subplot(234)
ax4 = fig.add_subplot(235)
ax6 = fig.add_subplot(236)

# The impurity values span several orders of magnitude, hence the log axis.
ax5.set_xscale('log')
ax6.set_xscale('log')

depths[['Train accuracies','Test accuracies']].plot(ax=ax1)
depths[['AUC train results','AUC test results']].plot(ax=ax2)
leafs[['Train accuracies','Test accuracies']].plot(ax=ax3)
leafs[['AUC train results','AUC test results']].plot(ax=ax4)
impurity[['Train accuracies','Test accuracies']].plot(ax=ax5)
impurity[['AUC train results','AUC test results']].plot(ax=ax6)
ax1.title.set_text('Accuracy vs Tree Depth')
ax2.title.set_text('AUC vs Tree Depth')
ax3.title.set_text('Accuracy vs Min leaf size')
ax4.title.set_text('AUC vs min leaf size')
ax5.title.set_text('Accuracy vs Min impurity decrease')
ax6.title.set_text('AUC vs vs Min impurity decrease')
plt.show()

print('*'*100,'\n')
print('Using GridSearchCV fidning the best model')
# Grid search tunes the three hyperparameters jointly, scored by F1 with
# 7-fold cross-validation over the full data set.
parameters = {'min_impurity_decrease':[.01,.1,0,1,10], 'max_depth':list(range(1,10)),'min_samples_leaf':[1,5,10,15,20]}
dtree = tree.DecisionTreeClassifier()
clf = GridSearchCV(dtree, parameters, cv=7,scoring='f1')
clf.fit(wdbc.data, wdbc.target)
print('The best hyperparameters are:',clf.best_params_)
print('The best score is:',clf.best_score_)

In [None]:
# Final Model
clf = tree.DecisionTreeClassifier(criterion="gini", max_depth=9,min_samples_leaf=5,min_impurity_decrease=0)

# cross_val_score fits its own clones of the estimator on every fold, so no
# fit on the full data set is needed (or useful) before calling it.
scores = cross_val_score(clf, wdbc.data, wdbc.target, cv=7,scoring='f1')
print(scores)
# Report the mean F1 over the folds with a +/- two-standard-deviation spread.
print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

print('*'*100)

# Hold-out evaluation: fit on 80% of the data, report on the remaining 20%.
X_train, X_test, y_train, y_test = train_test_split(wdbc.data, wdbc.target, test_size=0.2,random_state = 1)
clf = clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# Class probabilities are kept for the ROC / precision-recall / lift plots.
probas = clf.predict_proba(X_test)
print(classification_report(y_test,predictions))
print('*'*100)
print('Confusion Matrix\n',confusion_matrix(y_test, predictions))
print('*'*100)
print('Matthews Corrcoef',matthews_corrcoef(y_test,predictions))
print('*'*100)

### ROC, Lift and Precision Recall Curve

In [None]:
## ******************************************************
# The ROC curve is a probability curve and its AUC measures separability:
# how well the model tells the True output from the False output.
# A higher AUC means the model is better at predicting 0s as 0s and 1s as 1s.
## ******************************************************

# One row of three panels: precision-recall, ROC and lift, all drawn from
# the hold-out labels and the predicted class probabilities.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,4))
skplt.metrics.plot_precision_recall_curve(y_test, probas, ax=ax1)
skplt.metrics.plot_roc(y_test, probas, ax=ax2)
skplt.metrics.plot_lift_curve(y_test, probas, ax=ax3)
plt.show()

### Decision Tree Description and Conclusion

**Model perform for several different parameter values**

For the decision tree, I explored multiple hyperparameters, measuring both their individual effect and their combined effect (the cross product of their values, searched with GridSearchCV). The hyperparameters I selected are the depth of the tree, the minimum number of samples per leaf, and the minimum impurity decrease. To assess each hyperparameter, I iterated over its values and plotted the accuracy and the area under the ROC curve (the true positive rate vs. false positive rate curve) *(ROC-AUC is described in the code comments above)*. 

I split the data into a test and train with a 20-80 ratio. I used the training data to fit the model and then predicted the outcome. The test data and the prediction are used to plot the accuracy and AUC graphs. This process was repeated for the other two parameters. 

To compare different models, we can refer to the plots, which show the model performance for each of their parameter values. 

**Overfitting and underfitting?**

From the graphs, we can see that when the tree depth is small (i.e. 1 or 2) or the minimum leaf size is large, we observe under-fitting. This makes sense: when the tree depth is small or the minimum leaf size is large, the trees are short and give very poor predictions. Similarly, when the tree depth is large (i.e. 20-30) or the minimum leaf size is small, we observe over-fitting. Again, this is expected, as the tree has grown so big that it fits the training data perfectly but fails to perform on the testing data. To determine a sweet spot, we need a high AUC and good accuracy. Hence, the maximum depth would be around 5, the minimum leaf size around 10, and the minimum impurity decrease close to .1. 

However, there is a possibility that the model might perform better if we tune multiple hyperparameters together. Hence, we cross product to generate the combination using GridsearchCV to find the model with a high f1 score. I chose f1 score over accuracy as our data is related to breast cancer and sensitivity is an important factor. If we imagined a cost matrix, we need to penalize a decision 'where a patient has cancer and we predict not cancer' higher over the other predictions. Hence, I chose f1 over accuracy to evaluate the best model. The best hyperparameters combination is {'max_depth': 8, 'min_impurity_decrease': 0, 'min_samples_leaf': 5}. 

**Goodness of model**

Finally, we need to evaluate the goodness of our best model. We build a model with the best hyperparameters settings and test it with the 80-20 train and test split to obtain confusion matrix, classification error, precision, recall, f-measure, and MCC score. Further, to determine the mean accuracy we run cross-validation.

We obtain a mean F1 score of 0.94 with a spread of +/- 0.06 (two standard deviations across the folds). Since the data is related to the diagnosis of breast cancer, it is important to have good recall (sensitivity). From the classification report and confusion matrix, we obtain recall, precision and F1 score. The above model shows good performance on the cancer data, and accuracy might not be the best indicator, as mentioned above. Finally, we calculate the Matthews correlation coefficient to get a balanced view of the true positive and false positive rates: 0.886.

**ROC, Precision-Recall and Lift Curve**

To evaluate the goodness of the final established model, I plotted the ROC, Precision-Recall and lift curve for my best model. Both ROC and Precision-Recall show very close to ideal classification with an AUC of 0.95. The lift curve generated also has a lift ratio, i.e. the model performs well with respect to random guess.

## Logistic regression

In [None]:
##************************************************************************************
## 
## Logistic regression
## 
##************************************************************************************
from sklearn.pipeline import make_pipeline

c_set = range(1,20)

# 'liblinear' supports both the 'l1' and the 'l2' penalty. It is set explicitly
# because the default solver of recent scikit-learn releases ('lbfgs')
# rejects penalty='l1'.
LOGIT_SOLVER = 'liblinear'


def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` once on the training split and score it on both splits.

    Parameters
    ----------
    model
        Unfitted classifier exposing ``fit``, ``predict_proba`` and ``score``.
    X_train, X_test, y_train, y_test
        The train/test split to fit and evaluate on.

    Returns
    -------
    dict
        Train/test accuracy and train/test ROC-AUC, keyed by the column
        names used in the plots below.
    """
    model.fit(X_train, y_train)

    # The probability of class 1 is the ranking score for the ROC curve.
    false_positive_rate, true_positive_rate, _ = roc_curve(y_train, model.predict_proba(X_train)[:,1])
    auc_train = auc(false_positive_rate, true_positive_rate)

    false_positive_rate, true_positive_rate, _ = roc_curve(y_test, model.predict_proba(X_test)[:,1])
    auc_test = auc(false_positive_rate, true_positive_rate)

    return {
        'Train accuracies': model.score(X_train, y_train),
        'Test accuracies': model.score(X_test, y_test),
        'AUC train results': auc_train,
        'AUC test results': auc_test,
    }


# Splitting data
X_train, X_test, y_train, y_test = train_test_split(wdbc.data, wdbc.target, test_size=0.2,random_state=1)

# --- Non-scaled data: L1 penalty, varying C (inverse of regularization strength)
rows = []
for c in c_set:
    logreg = LogisticRegression(C=c, penalty='l1', solver=LOGIT_SOLVER)
    # Score the logistic regression itself, not a model left over from an
    # earlier cell.
    rows.append(_fit_and_score(logreg, X_train, X_test, y_train, y_test))
c_change_non_normal = pd.DataFrame(rows, index=c_set)

print('Logistic Regression for non scaled data')
logreg = LogisticRegression(C=5, penalty='l1', solver=LOGIT_SOLVER)
scores = cross_val_score(logreg, wdbc.data, wdbc.target, cv=7,scoring='f1')
print(scores)
# Mean F1 over the folds with a +/- two-standard-deviation spread.
print("\nF1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# ********************************************************************************************

# Scaling the data: the scaler is fitted on the training split only and then
# applied to both splits, so the test data does not influence the scaling.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Scaled data: L2 penalty, varying C
rows = []
for c in c_set:
    logreg = LogisticRegression(C=c, penalty='l2', solver=LOGIT_SOLVER)
    rows.append(_fit_and_score(logreg, X_train, X_test, y_train, y_test))
c_change_normal = pd.DataFrame(rows, index=c_set)

# ********************************************************************************************
print('*'*100)
print('Logistic Regression for scaled data')
logreg = LogisticRegression(C=5, penalty='l2', solver=LOGIT_SOLVER)
# The pipeline re-fits the scaler inside every fold, so this cross-validation
# really runs on scaled features (the raw wdbc.data is unscaled).
scores = cross_val_score(make_pipeline(MinMaxScaler(), logreg), wdbc.data, wdbc.target, cv=7,scoring='f1')
print(scores)
# Mean F1 over the folds with a +/- two-standard-deviation spread.
print("\nF1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print('*'*100)

# --- Scaled data: default C, varying penalty
penalty_names = ['l1','l2']
rows = []
for p in penalty_names:
    logreg = LogisticRegression(penalty=p, solver=LOGIT_SOLVER)
    rows.append(_fit_and_score(logreg, X_train, X_test, y_train, y_test))
penalties = pd.DataFrame(rows, index=penalty_names)

## ***************************************************************************************
# 2x3 grid: accuracy on the top row, AUC on the bottom row; columns are
# C on non-scaled data, C on scaled data, and penalty type.
fig = plt.figure(figsize=(15,8))
ax1 = fig.add_subplot(231)
ax3 = fig.add_subplot(232)
ax5 = fig.add_subplot(233)
ax2 = fig.add_subplot(234)
ax4 = fig.add_subplot(235)
ax6 = fig.add_subplot(236)

# No log scale here: the penalty panels are bar charts of scores in [0, 1],
# and a log value axis would distort bars that start at zero.

c_change_non_normal[['Train accuracies','Test accuracies']].plot(ax=ax1)
c_change_non_normal[['AUC train results','AUC test results']].plot(ax=ax2)
c_change_normal[['Train accuracies','Test accuracies']].plot(ax=ax3)
c_change_normal[['AUC train results','AUC test results']].plot(ax=ax4)
penalties[['Train accuracies','Test accuracies']].plot(ax=ax5,kind = 'barh')
penalties[['AUC train results','AUC test results']].plot(ax=ax6,kind = 'barh')
ax1.title.set_text('Accuracy vs C(Inverse of regularization strength) Non Scaled Data')
ax2.title.set_text('AUC vs C(Inverse of regularization strength) Non Scaled Data')
ax3.title.set_text('Accuracy vs C(Inverse of regularization strength) Scaled Data')
ax4.title.set_text('AUC vs C(Inverse of regularization strength) Scaled Data')
ax5.title.set_text('Accuracy vs Penality')
ax6.title.set_text('AUC vs vs Penality')
plt.show()

In [None]:
# GRIDSEARCH
print('*'*100,'\n')
print('Using GridSearchCV fidning the best model')
# Grid search tunes C and the penalty type jointly, scored by F1 with 7-fold
# cross-validation over the full data set.
parameters = {'C':range(1,10), 'penalty':['l1','l2']}
# The grid contains penalty='l1', so the estimator needs a solver that
# supports it; 'liblinear' handles both 'l1' and 'l2'.
logit = LogisticRegression(solver='liblinear')
clf = GridSearchCV(logit, parameters, cv=7, scoring='f1')
clf.fit(wdbc.data, wdbc.target)
print('The best hyperparameters are:',clf.best_params_)
print('The best score is:',clf.best_score_)

In [None]:
# Final Model - LOGIT
# 'liblinear' is set explicitly because it supports the 'l1' penalty.
clf = LogisticRegression(C=8, penalty='l1', solver='liblinear')

# Cross-validate the final model itself (clf), not an estimator left over
# from an earlier cell.
scores = cross_val_score(clf, wdbc.data, wdbc.target, cv=7,scoring='f1')
print(scores)
# Mean F1 over the folds with a +/- two-standard-deviation spread.
print("\nF1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

print('*'*100)

# Hold-out evaluation: fit on 80% of the data, report on the remaining 20%.
X_train, X_test, y_train, y_test = train_test_split(wdbc.data, wdbc.target, test_size=0.2,random_state = 1)
clf = clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# Class probabilities are kept for the ROC / precision-recall / lift plots.
probas = clf.predict_proba(X_test)
print(classification_report(y_test,predictions))
print('*'*100)
print('Confusion Matrix\n',confusion_matrix(y_test, predictions))
print('*'*100)
print('Matthews Corrcoef',matthews_corrcoef(y_test,predictions))

### ROC, Lift and Precision Recall Curve

In [None]:
# One row of three panels: precision-recall, ROC and lift, all drawn from
# the hold-out labels and the predicted class probabilities.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,4))
skplt.metrics.plot_precision_recall_curve(y_test, probas, ax=ax1)
skplt.metrics.plot_roc(y_test, probas, ax=ax2)
skplt.metrics.plot_lift_curve(y_test, probas, ax=ax3)
plt.show()

### Logit Description and Conclusion

**Model perform for several different parameter values**

For the logistic regression, I explored multiple hyperparameters, measuring both their individual effect and their combined effect (the cross product of their values, searched with GridSearchCV). The hyperparameters I selected are C (the inverse of the regularization strength) and the penalty, for both scaled and unscaled data. To assess each hyperparameter, I iterated over its values and plotted the accuracy and the area under the ROC curve (the true positive rate vs. false positive rate curve) *(ROC-AUC is described in the code comments above)*. 

I split the data into test and train with a 20-80 ratio. I used the training data to fit the model and then predict the outcome. The test data and the prediction are used to plot the accuracy and AUC graphs. This process was repeated for the other parameter. 

**Overfitting and when is underfitting?**

From the graphs, we can see that when penalties are low we can observe under-fitting. As when the C value is smaller that is stronger regularization we see the accuracy and AUC curve to show lower values. Similarly, when we have higher C value which is lower regularization we can observe over-fitting. The classification would not perform well and hence we observe lower accuracy. Further, we explore the different penalties('l1', 'l2') which can be attributed to the model. Both the penalties try to bring the beta coefficients of the logit model toward zero. But the difference is that 'l1' penalizes the coefficients linearly and 'l2' penalizes the coefficients in a parabolic curve where outlier are more penalized. From the observation, we don't find a big difference between both the penalties.

Again, it is possible that the model performs better if we tune multiple hyperparameters together. Hence, we take the cross product of the hyperparameter values using GridSearchCV to find the model with a high f1 score. I chose the f1 score over accuracy because our data is related to breast cancer and sensitivity is an important factor. If we imagine a cost matrix, we need to penalize the decision 'the patient has cancer and we predict no cancer' more heavily than the other predictions. Hence, I chose f1 over accuracy to evaluate the best model. The best hyperparameter combination is: {'C': 8, 'penalty': 'l1'}

**Goodness of model**

Finally, we need to evaluate the goodness of our best model. We build a model with the best hyperparameters settings and test it with the 80-20 train and test split to obtain confusion matrix, classification error, precision, recall, f-measure, and MCC score. Further, to determine the mean accuracy we run cross-validation.

Our logit has a high mean accuracy of 0.95 with a range +/- .05. We obtain Matthews Corrcoef 0.9 which is desirable. From the classification report and confusion matrix, we obtain recall, precision, and F score. For the above model accuracy might not be the best indicator as mentioned above. Hence, we considered the f1 score to evaluate the model.

**ROC, Precision-Recall and Lift Curve**

To evaluate the goodness of the final established model, I plotted the ROC, Precision-Recall and lift curve for my best model. Both ROC and Precision-Recall show very close to ideal classification with an AUC of 0.99. The lift curve generated also has a lift ratio, i.e. the model performs well with respect to random guess.

## K-Nearest Neighbors

In [None]:
##******************************************************
## 
## KNN
## 
##******************************************************
from sklearn.pipeline import make_pipeline

# Optimize KNN classifier and detect (potential) over-fitting

neighbors_values = range(1,30)
weight_options = ["uniform", "distance"]


def _mean_cv_f1(knn):
    """Return the mean 7-fold cross-validated F1 score of ``knn`` on scaled data.

    KNN is distance based, so the features are min-max scaled first. The
    scaler sits in a pipeline so that it is re-fitted on the training part of
    every fold instead of seeing the whole data set.
    """
    scaled_knn = make_pipeline(MinMaxScaler(), knn)
    return cross_val_score(scaled_knn, wdbc.data, wdbc.target, cv=7,scoring = 'f1').mean()


# Mean F1 for a varying number of neighbours (default weights).
neighbors = pd.DataFrame(
    {'Cross validation score': [_mean_cv_f1(KNeighborsClassifier(n_neighbors = n)) for n in neighbors_values]},
    index=neighbors_values,
)

# Mean F1 for each weighting scheme (default number of neighbours).
weights = pd.DataFrame(
    {'Cross validation score': [_mean_cv_f1(KNeighborsClassifier(weights = wt)) for wt in weight_options]},
    index=weight_options,
)

# Only the top row of a 2x2 grid is used.
fig = plt.figure(figsize=(15,8))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)


neighbors[['Cross validation score']].plot(ax=ax1)
weights[['Cross validation score']].plot(kind='bar',ax=ax2)
ax1.title.set_text('F1 Score vs Neighbours')
# Both panels show the cross-validated F1 score, not AUC.
ax2.title.set_text('F1 Score vs Weights')
plt.show()



In [None]:
# GRIDSEARCH
print('*'*100,'\n')
print('Using GridSearchCV fidning the best model')

# Min-max scale the whole feature matrix; the fitted scaler is kept in
# `scaler` and the scaled features in `x`.
scaler = MinMaxScaler()
x = scaler.fit_transform(wdbc.data)

# Grid search tunes the weighting scheme and the number of neighbours
# jointly, scored by F1 with 7-fold cross-validation.
parameters = {'weights':["uniform", "distance"], 'n_neighbors' : range(5,15)}
knn = KNeighborsClassifier()
clf = GridSearchCV(knn, parameters, cv=7, scoring='f1')
clf.fit(x, wdbc.target)

print('The best hyperparameters are:',clf.best_params_)
print('The best score is:',clf.best_score_)

In [None]:
# Final Model - KNN
from sklearn.pipeline import make_pipeline

knn = KNeighborsClassifier(n_neighbors= 12, weights= 'distance')

# The hyperparameters were tuned on min-max scaled features, so the
# cross-validation scales too. The scaler sits in a pipeline and is re-fitted
# on the training part of every fold.
scores = cross_val_score(make_pipeline(MinMaxScaler(), knn), wdbc.data, wdbc.target, cv=7,scoring='f1')
print(scores)
# Mean F1 over the folds with a +/- two-standard-deviation spread.
print("\nF1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Hold-out evaluation: fit on 80% of the data, report on the remaining 20%.
X_train, X_test, y_train, y_test = train_test_split(wdbc.data, wdbc.target, test_size=0.2,random_state = 2)
# Scaling the features to fit it into KNN. The scaler is created here so the
# cell does not depend on one defined in another cell, and it is fitted on
# the training split only.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
knn = knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
# Class probabilities are kept for the ROC / precision-recall / lift plots.
probas = knn.predict_proba(X_test)
print('*'*100)
print(classification_report(y_test,predictions))
print('*'*100)
print('Confusion Matrix\n',confusion_matrix(y_test, predictions))
print('*'*100)
print('Matthews Corrcoef',matthews_corrcoef(y_test,predictions))

### ROC, Lift and Precision Recall Curve

In [None]:
# One row of three panels: precision-recall, ROC and lift, all drawn from
# the hold-out labels and the predicted class probabilities.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,4))
skplt.metrics.plot_precision_recall_curve(y_test, probas, ax=ax1)
skplt.metrics.plot_roc(y_test, probas, ax=ax2)
skplt.metrics.plot_lift_curve(y_test, probas, ax=ax3)
plt.show()

### KNN Description and Conclusion

**Model perform for several different parameter values**

In the KNN, I tried to explore multiple hyperparameters calculating their individual and combined effect(cross producing using GridSearchCV). The hyperparameters which I selected are the number of neighbors and weight of the distance calculation. To access the hyperparameter performance I iterated over each hyperparameter and plotted the cross-validation mean score. The scoring is based on the f1 parameter, as we are concerned about the recall and precision.

To compare different parameters, we can refer to the plots, which show the model performance for each of their parameter values. 

**Overfitting and when is underfitting?**

From the graphs, we can see that when the KNN has very few neighbors (i.e. 1 or 2), we observe over-fitting. This makes sense: with few neighbors the classifier follows every trivial variation in the training data, which results in very poor predictions. Similarly, when there are many neighbors (i.e. 25-30), we observe under-fitting. Again, this is expected, as the KNN generalizes the sample too vaguely and fails to perform on the testing data. To determine a sweet spot, we need a high F1 score. Hence, the number of neighbors would be around 12.

However, there is a possibility that the model might perform better if we tune multiple hyperparameters together. Hence, we cross product to generate the combination using GridsearchCV to find the model with a high f1 score. I chose f1 score over accuracy as our data is related to breast cancer and sensitivity is an important factor. If we imagined a cost matrix, we need to penalize a decision where a patient has cancer and we predict not cancer higher over the other predictions. Hence, I chose f1 over accuracy to evaluate the best model. The best hyperparameters combination is {'n_neighbors': 12, 'weights': 'distance'}

**Goodness of model**

Finally, we need to evaluate the goodness of our best model. We build a model with the best hyperparameters settings and test it with the 80-20 train and test split to obtain confusion matrix, classification error, precision, recall, f-measure, and MCC score. Further, to determine the mean accuracy we run cross-validation.

We obtain a mean F1 score as 0.95 with a standard deviation of +/- 0.05. Since the data is related to a diagnostic of Breast cancer, it becomes important to have a good recall or sensitivity. From the classification report and confusion matrix, we obtain recall, precision. The above decision classifier performs well with the cancer data and accuracy might not be the best indicator as mentioned above. Finally, we calculate the Matthews correlation coefficient score to understand a balanced view about true positive and false positive rate: 0.907

**ROC, Precision-Recall and Lift Curve**

To evaluate the goodness of the final established model, I plotted the ROC, Precision-Recall and lift curve for my best model. Both ROC and Precision-Recall show very close to ideal classification with an AUC of 0.99. The lift curve generated also has a lift ratio, i.e. the model performs well with respect to random guess.

In [None]:
##************************************************************************************
## 
## SVM
## 
##************************************************************************************

# Splitting data into train and test in 80-20 ratio.
X_train, X_test, y_train, y_test = train_test_split(wdbc.data, wdbc.target, test_size=0.2)

# Min-max scale the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate hyperparameter values for the sweeps below.
C = range(1,30)
gamma = [.001,.01,.1,1,10,100]


def _roc_auc(y_true, positive_scores):
    """Return the area under the ROC curve for the given labels and scores."""
    false_positive_rate, true_positive_rate, _ = roc_curve(y_true, positive_scores)
    return auc(false_positive_rate, true_positive_rate)


def _sweep_svc(kernel, param_name, values):
    """Train one SVC per value of a single hyperparameter and collect metrics.

    Parameters:
        kernel: kernel name passed to ``SVC``.
        param_name: name of the ``SVC`` keyword argument being varied
            (``'C'`` or ``'gamma'`` in this cell).
        values: iterable of values to try for ``param_name``.

    Returns:
        A DataFrame indexed by ``values`` with the columns
        'Train accuracies', 'Test accuracies', 'AUC train results' and
        'AUC test results', computed on the module-level train/test split.
    """
    columns = {
        'Train accuracies': [],
        'Test accuracies': [],
        'AUC train results': [],
        'AUC test results': [],
    }
    for value in values:
        model = SVC(kernel=kernel, probability=True, **{param_name: value})
        # Fit once per setting and reuse the fitted model for every metric, so
        # all four numbers describe the same model and training is not repeated.
        model.fit(X_train, y_train)

        ## ******************************************************
        # We can use AUC-ROC curve to select our hyperparameter.
        # Here, ROC is a probability curve and AUC represents degree of separability.
        # It tells how much model is capable of distinguishing between True and False output.
        # Higher the AUC means the model is better at predicting 0s as 0s and 1s as 1s.
        ## ******************************************************
        columns['AUC train results'].append(
            _roc_auc(y_train, model.predict_proba(X_train)[:, 1]))
        columns['AUC test results'].append(
            _roc_auc(y_test, model.predict_proba(X_test)[:, 1]))

        columns['Train accuracies'].append(model.score(X_train, y_train))
        columns['Test accuracies'].append(model.score(X_test, y_test))
    return pd.DataFrame(columns, index=values)


# NOTE: the three result names below do not describe SVM quantities; they are
# kept unchanged so the plotting code keeps working.
#   depths   -> linear kernel, varying C
#   leafs    -> rbf kernel, varying C
#   impurity -> rbf kernel, varying gamma
depths = _sweep_svc("linear", "C", C)
leafs = _sweep_svc("rbf", "C", C)
impurity = _sweep_svc("rbf", "gamma", gamma)

## ***************************************************************************************

# 2x3 grid: top row shows accuracy (ax1, ax3, ax5), bottom row shows AUC (ax2, ax4, ax6).
fig = plt.figure(figsize=(15,8))
ax1 = fig.add_subplot(231)
ax3 = fig.add_subplot(232)
ax5 = fig.add_subplot(233)
ax2 = fig.add_subplot(234)
ax4 = fig.add_subplot(235)
ax6 = fig.add_subplot(236)

# The gamma values span several orders of magnitude, so use a log x-axis for them.
ax5.set_xscale('log')
ax6.set_xscale('log')


depths[['Train accuracies','Test accuracies']].plot(ax=ax1)
depths[['AUC train results','AUC test results']].plot(ax=ax2)
leafs[['Train accuracies','Test accuracies']].plot(ax=ax3)
leafs[['AUC train results','AUC test results']].plot(ax=ax4)
impurity[['Train accuracies','Test accuracies']].plot(ax=ax5)
impurity[['AUC train results','AUC test results']].plot(ax=ax6)
ax1.title.set_text('Accuracy vs C - Linear SVM')
ax2.title.set_text('AUC vs C - Linear SVM')
ax3.title.set_text('Accuracy vs C - Non Linear SVM')
ax4.title.set_text('AUC vs C - Non Linear SVM')
ax5.title.set_text('Accuracy vs Gamma - Non Linear SVM')
ax6.title.set_text('AUC vs Gamma - Non Linear SVM')
plt.show()

In [None]:
# GRIDSEARCH
# Min-max scale the full feature matrix once; X is reused by both searches.
# NOTE(review): the scaler is fitted on all samples before cross-validation, so
# each validation fold has influenced the scaling - confirm this is acceptable.
scaler = MinMaxScaler()
X = scaler.fit_transform(wdbc.data)

separator = '*'*100

# Each entry pairs the banner to print with the parameter grid to search.
# NOTE(review): the grid announced as "Linear SVC" also contains the 'rbf'
# kernel - confirm that is intended.
searches = (
    (
        'Using GridSearchCV fidning the best model for Linear SVC',
        {'kernel':['linear','rbf'],'C':range(1,30)},
    ),
    (
        'Using GridSearchCV fidning the best model for Non-Linear SVC',
        {'kernel':['rbf','sigmoid'],'C':range(1,30),'gamma':[.001,.01,.1,1,10,100]},
    ),
)

for banner, parameters in searches:
    print(separator,'\n')
    print(banner)
    # Exhaustive search over the grid with 7-fold CV, ranked by F1 score.
    clf = GridSearchCV(SVC(), parameters, cv=7, scoring='f1')
    clf.fit(X, wdbc.target)
    print('The best hyperparameters are:',clf.best_params_)
    print('The best score is:',clf.best_score_)

In [None]:
# Final Model
# Linear-kernel SVC with C=4; probability=True is required for predict_proba below.
clf = SVC(kernel="linear", C=4,probability=True)

# No up-front fit is needed here: cross_val_score fits clones of the estimator
# on each fold, and clf is fitted on the scaled training split further down.
# X is the min-max scaled feature matrix built in the grid-search cell.
scores = cross_val_score(clf, X, wdbc.target, cv=7,scoring='f1')
print(scores)
# The mean score and the 95% confidence interval of the score estimate are hence given by:
print("F1 Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

print('*'*100)

# Fixed random_state so the hold-out evaluation below is reproducible.
X_train, X_test, y_train, y_test = train_test_split(wdbc.data, wdbc.target, test_size=0.2,random_state = 1)

# Use a fresh scaler fitted on the training split only, rather than refitting
# whatever scaler an earlier cell left behind.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

clf = clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# Class probabilities on the test split, kept in `probas` for later use.
probas = clf.predict_proba(X_test)
print(classification_report(y_test,predictions))
print('*'*100)
print('Confusion Matrix\n',confusion_matrix(y_test, predictions))
print('*'*100)
print('Matthews Corrcoef',matthews_corrcoef(y_test,predictions))
print('*'*100)

### ROC, Lift and Precision-Recall Curves

In [None]:
# One row of three panels: precision-recall, ROC and lift curves for the
# final model's test-set probabilities.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15,4))

skplt.metrics.plot_precision_recall_curve(y_test, probas, ax=ax1)
skplt.metrics.plot_roc(y_test, probas, ax=ax2)
skplt.metrics.plot_lift_curve(y_test, probas, ax=ax3)

plt.show()

### SVC Description and Conclusion

**Model perform for several different parameter values**

In the SVC, I tried to explore multiple hyperparameters, calculating their individual and combined effect (cross product using GridSearchCV). The hyperparameters which I selected are the kernel, the regularization parameter C and the kernel coefficient gamma. To assess the hyperparameter performance I iterated over each hyperparameter and plotted the train and test accuracy and AUC. The grid search scoring is based on the F1 score, as we are concerned about the recall and precision.

To compare different parameters, we can refer to the plots, which evaluate the model performance for their parameter values.

**When is the model overfitting and when is it underfitting?**

From the graphs, we can see that when the knn has very fewer neighbors (i.e. 1 or 2), we observe over-fitting. This makes sense as when the neighbors are less the classifier will account for all trivial classification and will result in very poor predictions. Similarly, when the neighbors are more (i.e. 25-30), we can observe under-fitting. Again, this is expected as the knn would have generalized the sample more vaguely and fails to perform with testing data. To determine a sweet spot, we need a high F1 score. Hence, the neighbors would be around 12.

However, there is a possibility that the model might perform better if we tune multiple hyperparameters together. Hence, we cross product to generate the combination using GridsearchCV to find the model with a high f1 score. I chose f1 score over accuracy as our data is related to breast cancer and sensitivity is an important factor. If we imagined a cost matrix, we need to penalize a decision where a patient has cancer and we predict not cancer higher over the other predictions. Hence, I chose f1 over accuracy to evaluate the best model. The best hyperparameters combination is {'n_neighbors': 12, 'weights': 'distance'}

**Goodness of model**

Finally, we need to evaluate the goodness of our best model. We build a model with the best hyperparameter settings and test it with the 80-20 train and test split to obtain the confusion matrix, classification error, precision, recall, F-measure, and MCC score. Further, to determine the mean F1 score we run cross-validation.

We obtain a mean F1 score of 0.95 with a standard deviation of +/- 0.05. Since the data relates to the diagnosis of breast cancer, it is important to have good recall (sensitivity). From the classification report and confusion matrix, we obtain the recall and precision. The above SVM classifier performs well on the cancer data, and accuracy might not be the best indicator, as mentioned above. Finally, we calculate the Matthews correlation coefficient to get a balanced view of the true positive and false positive rates: 0.907

**ROC, Precision-Recall and Lift Curve**

To evaluate the goodness of the final model, I plotted the ROC, Precision-Recall and lift curves for my best model. Both the ROC and Precision-Recall curves show very close to ideal classification, with an AUC of 0.99. The lift curve also shows a lift ratio indicating that the model performs well compared to random guessing.

# Final Model Comparison


|Models|Best F1 Score|Mean F1 Score|Nested F1 Score|Matthews Corrcoef|AUC of tuned model|
|---|---|---|---|---|---|
|Decision Tree|0.95|0.94|0.942|.88|.95|
|Logit|0.97|0.96|0.960|0.905|.99|
|KNN|0.9779|0.95|0.9739|0.907|.99|
|SVM Linear|0.984|0.98|0.9805|0.94|.99|
|SVM Non Linear|0.986|0.98|0.9825|0.94|.99|

In conclusion, after comparing all the models on their F1 and MCC scores, we can say SVM performs best, followed by Logit and KNN. Further, the AUC of all the models is quite good. SVM has a slightly better Precision-Recall curve.