**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

**Importing the Dataset**

In [None]:
# Read the breast-cancer CSV from the relative input directory into a DataFrame.
df = pd.read_csv('../input/breast-cancer-csv/breastCancer.csv')
# Display the first five rows as this cell's output.
df.head(5)

**Exploring the Dataset**

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In the above output, it can be clearly seen that all the features except "bare_nucleoli" have int64 as their datatype, whereas the feature "bare_nucleoli" has an object datatype, indicating that it has data other than numerical values.

The column "id" plays no role in training/testing, hence it is dropped.

In [None]:
# Remove the identifier column; it carries no predictive information.
# errors='ignore' keeps the cell idempotent: re-running it after 'id' has
# already been dropped no longer raises a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Frequency of each distinct entry in 'bare_nucleoli' (shown as the cell output).
df['bare_nucleoli'].value_counts()

Upon exploring the column "bare_nucleoli", it was observed that it has 16 missing values, indicated by a "?", which explains why it had an object datatype and not int64.  

**Handling the missing data**

The data is now divided into two subsets: one containing the rows that have missing values, and the other containing all the remaining rows.

In [None]:
# Subset of rows whose 'bare_nucleoli' entry is the '?' placeholder,
# renumbered from 0 (the old index is discarded rather than kept as a column).
df_absent = df[df['bare_nucleoli'].eq('?')].reset_index(drop=True)
df_absent

In [None]:
# Subset of rows with a usable 'bare_nucleoli' entry, renumbered from 0
# and cast to float64 so every column is numeric.
df_present = (
    df[df['bare_nucleoli'].ne('?')]
    .reset_index(drop=True)
    .astype(np.float64)
)
df_present.head()

Since the dataset is small (only 699 samples), the rows with missing values are not removed. Instead, the missing values are predicted using the K-Nearest Neighbors algorithm. The subsets created previously are used as the training and testing data for predicting the missing values. Mean Absolute Error (MAE) is used as the metric for choosing the optimum value of K: the value of K is varied, and the K giving the lowest MAE is chosen.

In [None]:
# Choose K for the KNN regressor that will impute 'bare_nucleoli'.
# Features: every column of df_present except 'bare_nucleoli'; target: 'bare_nucleoli'.
df_present_temp = df_present.drop(columns=['bare_nucleoli'])
xm = df_present_temp.values
ym = df_present['bare_nucleoli'].values

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

# Hold out 20% of the complete rows to score each candidate K.
train_x, test_x, train_y, test_y = train_test_split(xm, ym, test_size=0.2, random_state=4)

k_array = list(range(2, 20))
test_MAE_array = []

for k in k_array:
    model = KNeighborsRegressor(n_neighbors=k).fit(train_x, train_y)

    y_predict = model.predict(test_x)
    y_true = test_y

    test_MAE_array.append(mean_absolute_error(y_true, y_predict))

# Pick the K with the lowest hold-out MAE directly from the collected scores.
# The previous running-minimum started from a fixed threshold (2), so if no K
# scored below it, k_min silently stayed at its initial value. argmin returns
# the first minimum, matching the old strict '<' tie-breaking.
best_position = int(np.argmin(test_MAE_array))
k_min = k_array[best_position]
MAE = test_MAE_array[best_position]

plt.plot(k_array, test_MAE_array,'r')
plt.xlabel('K (number of neighbors)')
plt.ylabel('Mean Absolute Error (hold-out)')
plt.show()
print("Best k parameter is ",k_min )

In [None]:
# Refit the imputer on ALL complete rows using the K selected by the search
# above (k_min) instead of a hard-coded value, so the choice stays in sync
# with the data and the search range.
final_model = KNeighborsRegressor(n_neighbors=k_min).fit(xm,ym)

# Feature matrix for the rows whose 'bare_nucleoli' is missing.
df_absent_temp = df_absent.drop(columns=['bare_nucleoli'])
df_absent_temp = df_absent_temp.astype(np.float64)
df_absent_temp.head()

In [None]:
# Predict 'bare_nucleoli' for the rows where it was missing.
x_am = df_absent_temp.values
y_am = final_model.predict(x_am)
# Raw (un-rounded) regressor outputs.
y_am

In [None]:
# Snap the regressor outputs to whole numbers and store them as int64.
y_am = np.round(y_am).astype(np.int64)
y_am

In [None]:
# Wrap the imputed values in a one-column frame (default 0..n-1 index) so they
# can be joined onto the feature rows.
df_pred = pd.DataFrame({'bare_nucleoli':y_am})
df_pred

After predicting the missing values, they are merged with the subset of the dataset comprising the rows that did not have any missing values.

In [None]:
# Previously-incomplete rows with the imputed 'bare_nucleoli' column attached
# (index-aligned join), cast to int64.
data_frame1 = df_absent_temp.join(df_pred).astype(np.int64)
data_frame1

In [None]:
# Complete rows: re-attach the original 'bare_nucleoli' column to the feature
# frame (index-aligned join) and cast everything to int64.
df_join_2 = df_present['bare_nucleoli']
data_frame_2 = df_present_temp.join(df_join_2).astype(np.int64)
data_frame_2.head()

In [None]:
# Stack the imputed rows on top of the complete rows.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# labels 0..15 appear twice (once from each part), which makes any later
# label-based alignment ambiguous. `dataset` is also no longer bound to a
# temporary list first.
dataset = pd.concat([data_frame1, data_frame_2], ignore_index=True)
dataset.head()

In the given dataset, the classes are labelled as 2 and 4, representing 0 and 1 respectively, where 0 states that the given sample represents a cell which is benign (not cancerous), and 1 states that the given sample is malignant (cancerous). Class labels having a value of 2 are encoded to 0 and class labels having a value of 4 are encoded to 1.

In [None]:
# Column order of the merged frame (after the joins above).
dataset.columns

In [None]:
# Fix the column order so the nine features come first and 'class' is last
# (position 9), which later positional indexing relies on.
cols = ['clump_thickness', 'size_uniformity', 'shape_uniformity',
       'marginal_adhesion', 'epithelial_size', 'bland_chromatin',
       'normal_nucleoli', 'mitoses', 'bare_nucleoli', 'class']
# .copy() makes `dataset` an independent frame, so the in-place write to the
# class column in the next step does not raise SettingWithCopyWarning.
dataset = dataset[cols].copy()
dataset.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the values of column 9 (the 'class' column after the reordering above)
# as consecutive integers starting at 0, overwriting the column in place.
labelencoder_Y = LabelEncoder()
dataset.iloc[:,9] = labelencoder_Y.fit_transform(dataset.iloc[:,9].values)
dataset.head()

**Exploratory Data Analysis**

In [None]:
# Number of samples per encoded class label.
dataset['class'].value_counts()

In [None]:
# Bar chart of how many samples fall in each class.
count_plot = dataset.iloc[:,9]
# Pass the Series explicitly as `x`: seaborn's first positional parameter is
# `data` in current releases, so a positional Series is not counted per class.
sb.countplot(x=count_plot)
plt.show()

In [None]:
# Pie chart of the class balance.
fig, ax = plt.subplots(1,1)
class_counts = dataset['class'].value_counts()
# Build the labels from the value_counts index so each wedge is always paired
# with its own class, whichever class happens to be the most frequent.
ax.pie(class_counts, autopct='%1.1f%%',
       labels=['Class = {}'.format(label) for label in class_counts.index])
# Equal aspect ratio keeps the pie circular. The previous `plt.axis = 'equal'`
# assigned a string over the pyplot function instead of calling it.
ax.axis('equal')
plt.show()

In [None]:
# One stacked bar chart per feature (columns 0-8): how often each feature
# value occurs, split by class.
for i in range(9):
    graph = pd.crosstab(dataset.iloc[:, i], dataset['class'])
    graph.plot.bar(stacked=True)
    plt.show()

In [None]:
# Distribution plot for each of the nine feature columns.
for i in range(9):
    graph = sb.displot(dataset.iloc[:, i])
    plt.show()

In [None]:
# Scatter plot for every unordered pair of feature columns, coloured by class.
hue_parameter = dataset['class']
for i in range(9):
    for j in range(i + 1, 9):
        ax = sb.scatterplot(x=dataset.iloc[:, i], y=dataset.iloc[:, j], hue=hue_parameter)
        plt.show()

In [None]:
# Pairwise correlation matrix over columns 0-9 (the nine features plus 'class').
dataset.iloc[:,0:10].corr()

In [None]:
# Heatmap of the same correlation matrix; fmt='.0%' annotates each cell as a
# whole-number percentage.
plt.figure(figsize = (10,10))
sb.heatmap(dataset.iloc[:,0:10].corr(), annot = True,fmt = '.0%')
plt.show()

In [None]:
# Reshape to long form (one row per sample/feature pair) so all features can be
# drawn side by side as box plots split by class.
data = dataset.melt(id_vars="class", var_name="features", value_name='value')
plt.figure(figsize=(10,5))
sb.boxplot(data=data, x="features", y="value", hue="class")
plt.xticks(rotation=90)

**Splitting the data into train and test data**

In [None]:
from sklearn.model_selection import train_test_split

# Target vector is the 'class' column; the feature matrix is everything else.
Y = dataset['class'].values
X = dataset.drop(columns='class').values
# 80/20 split with a fixed seed so the partition is reproducible.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.2, random_state=0)

**Training the machine using various algorithms -**

**Naive Bayes Algorithm**

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Gaussian Naive Bayes: 10-fold cross-validated accuracy on the training split,
# then a final fit on the whole training split and evaluation on the test split.
nb_classifier = GaussianNB()
num_folds = 10
kfold = KFold(n_splits=num_folds)
cv_results = cross_val_score(nb_classifier, X_Train, Y_Train, cv=kfold, scoring='accuracy')
print('Naive Bayes Accuracy on Training Data after 10 Fold Cross Validation is :',cv_results.mean())
print()
nb_classifier.fit(X_Train, Y_Train)
Y_Pred_nb = nb_classifier.predict(X_Test)
cm_nb = confusion_matrix(Y_Test, Y_Pred_nb)
print(cm_nb)
print()
# scikit-learn lays the binary matrix out as [[TN, FP], [FN, TP]] (rows = true
# label, columns = predicted label, labels sorted so class 1 is the positive
# class). The previous code read cm[0][0] as TP and cm[1][1] as TN, which
# swapped them and made the sensitivity/specificity figures below wrong.
TN_nb, FP_nb, FN_nb, TP_nb = cm_nb.ravel()

print('Success Rate = ',(TP_nb+TN_nb)/(TP_nb+TN_nb+FN_nb+FP_nb))
print('Misclassificate Rate = ',(FP_nb+FN_nb)/(TP_nb+TN_nb+FN_nb+FP_nb))
print('Sensitivity/tp_rate = ', TP_nb/(TP_nb+FN_nb))
print('Specificity/tn_rate = ', TN_nb/(TN_nb+FP_nb))
print('fp rate = ',FP_nb/(TN_nb+FP_nb))
print('fn rate = ',FN_nb/(TP_nb+FN_nb))

**K-Nearest Neighbors Algorithm**

In [None]:
# Scan K = 2..19 with a KNN regressor on the 0/1 class labels and record the
# test-split MAE for each K. KNeighborsRegressor and mean_absolute_error are
# imported in the imputation cell earlier in the notebook.
k_array = list(range(2, 20))
test_MAE_array = []

for k in k_array:
    model = KNeighborsRegressor(n_neighbors=k, metric = 'minkowski').fit(X_Train, Y_Train)
    Predict_Y = model.predict(X_Test)
    True_Y = Y_Test
    test_MAE_array.append(mean_absolute_error(True_Y, Predict_Y))

# Take the minimum directly from the recorded scores rather than comparing
# against a fixed starting threshold, which would leave k_min at its initial
# value whenever no K scored below it. argmin keeps the first minimum, as the
# old strict '<' comparison did.
best_position = int(np.argmin(test_MAE_array))
k_min = k_array[best_position]
MAE = test_MAE_array[best_position]

plt.plot(k_array, test_MAE_array,'r')
plt.xlabel('K (number of neighbors)')
plt.ylabel('Mean Absolute Error (test split)')
plt.show()
print("Best k parameter is ",k_min )

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-Nearest Neighbors classifier (K = 5): 10-fold cross-validated accuracy on
# the training split, then a final fit and evaluation on the test split.
knn_classifier = KNeighborsClassifier(n_neighbors = 5,metric='minkowski')
num_folds = 10
kfold = KFold(n_splits=num_folds)
cv_results = cross_val_score(knn_classifier, X_Train, Y_Train, cv=kfold, scoring='accuracy')
print('K-Nearest Neighbors Accuracy on Training Data after 10 Fold Cross Validation is :',cv_results.mean())
knn_classifier.fit(X_Train, Y_Train)
Y_Pred_knn = knn_classifier.predict(X_Test)
print()
cm_knn = confusion_matrix(Y_Test, Y_Pred_knn)
print(cm_knn)
print()
# Binary confusion matrix layout is [[TN, FP], [FN, TP]] with class 1 as the
# positive class; TP and TN were previously read from each other's cells.
TN_knn, FP_knn, FN_knn, TP_knn = cm_knn.ravel()

print('Success Rate = ',(TP_knn+TN_knn)/(TP_knn+TN_knn+FN_knn+FP_knn))
print('Misclassificate Rate = ',(FP_knn+FN_knn)/(TP_knn+TN_knn+FN_knn+FP_knn))
print('Sensitivity/tp_rate = ', TP_knn/(TP_knn+FN_knn))
print('Specificity/tn_rate = ', TN_knn/(TN_knn+FP_knn))
print('fp rate = ',FP_knn/(TN_knn+FP_knn))
print('fn rate = ',FN_knn/(TP_knn+FN_knn))

**Logistic Regression Algorithm**

In [None]:
import sys
# Silence library warnings for the rest of the session unless the interpreter
# was started with explicit -W options.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Standardise the features. The scaler is fitted on the training split ONLY and
# then applied to the test split; re-fitting it on the test split (as before)
# scales the two splits with different statistics and leaks test information.
sc = StandardScaler()
X_Train_Scaled = sc.fit_transform(X_Train)
X_Test_Scaled = sc.transform(X_Test)

# 'liblinear' supports both the 'l1' and 'l2' penalties in the grid; the
# default solver rejects 'l1', so those candidates previously failed to fit
# (hidden by the warning filter above) and were never really evaluated.
lr_classifier = LogisticRegression(solver='liblinear')
param_grid = {
            'penalty' : ['l2','l1'],  
            'C' : [0.001, 0.01, 0.1, 1, 10, 100]
            }

CV_lr_grid = GridSearchCV(estimator = lr_classifier, param_grid = param_grid , scoring = 'accuracy', verbose = 1, n_jobs = -1, cv=10)
CV_lr_grid.fit(X_Train_Scaled, Y_Train)
best_parameters = CV_lr_grid.best_params_
print('The best parameters for using this model is', best_parameters)
# Final model uses the same solver as the search so the chosen penalty is valid.
logistic_classifier = LogisticRegression(C = best_parameters['C'], 
                                penalty = best_parameters['penalty'], 
                                solver = 'liblinear',
                                random_state = 0)
num_folds = 10
kfold = KFold(n_splits=num_folds)
cv_results = cross_val_score(logistic_classifier, X_Train_Scaled, Y_Train, cv=kfold, scoring='accuracy')
print('Logistic Regression Accuracy on Training Data with best parameters after 10 Fold Cross Validation is :',cv_results.mean())
logistic_classifier.fit(X_Train_Scaled, Y_Train)
Y_Pred_lr = logistic_classifier.predict(X_Test_Scaled)
print()
cm_lr = confusion_matrix(Y_Test, Y_Pred_lr)
print(cm_lr)
print()
# Binary confusion matrix layout is [[TN, FP], [FN, TP]] with class 1 as the
# positive class; TP and TN were previously read from each other's cells.
TN_lr, FP_lr, FN_lr, TP_lr = cm_lr.ravel()

print('Success Rate = ',(TP_lr+TN_lr)/(TP_lr+TN_lr+FN_lr+FP_lr))
print('Misclassificate Rate = ',(FP_lr+FN_lr)/(TP_lr+TN_lr+FN_lr+FP_lr))
print('Sensitivity/tp_rate = ', TP_lr/(TP_lr+FN_lr))
print('Specificity/tn_rate = ', TN_lr/(TN_lr+FP_lr))
print('fp rate = ',FP_lr/(TN_lr+FP_lr))
print('fn rate = ',FN_lr/(TP_lr+FN_lr))

**Support Vector Machine Algorithm**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Linear-kernel SVM: grid-search C with 10-fold CV on the scaled training split.
# `gamma` is a coefficient of the rbf/poly/sigmoid kernels only and has no
# effect on a linear kernel, so it is no longer searched: that removes a 9x
# multiplication of identical fits without changing which C is selected.
svm = SVC()
param_grid = {'C': [0.001,0.01, 0.1, 1, 10,20,30,40,50,100], 
              'kernel': ['linear']} 
grid = GridSearchCV(svm, param_grid, refit=True, verbose=1, cv=10)
grid.fit(X_Train_Scaled, Y_Train)
best_parameters = grid.best_params_
print('The best parameters for using this model is', best_parameters)
# probability=True enables predict_proba, which the ROC-AUC section needs.
svm_classifier = SVC(C = best_parameters['C'], 
                                kernel = 'linear',    
                                random_state = 0,
                                probability = True)
num_folds = 10
kfold = KFold(n_splits=num_folds)
cv_results = cross_val_score(svm_classifier, X_Train_Scaled, Y_Train, cv=kfold, scoring='accuracy')
print('Support Vector Machines Accuracy on Training Data with best parameters after 10 Fold Cross Validation is :',cv_results.mean())
print()
svm_classifier.fit(X_Train_Scaled, Y_Train)
Y_Pred_svm = svm_classifier.predict(X_Test_Scaled)
cm_svm = confusion_matrix(Y_Test, Y_Pred_svm)
print(cm_svm)
print()
# Binary confusion matrix layout is [[TN, FP], [FN, TP]] with class 1 as the
# positive class; TP and TN were previously read from each other's cells.
TN_svm, FP_svm, FN_svm, TP_svm = cm_svm.ravel()

print('Success Rate = ',(TP_svm+TN_svm)/(TP_svm+TN_svm+FN_svm+FP_svm))
print('Misclassificate Rate = ',(FP_svm+FN_svm)/(TP_svm+TN_svm+FN_svm+FP_svm))
print('Sensitivity/tp_rate = ', TP_svm/(TP_svm+FN_svm))
print('Specificity/tn_rate = ', TN_svm/(TN_svm+FP_svm))
print('fp rate = ',FP_svm/(TN_svm+FP_svm))
print('fn rate = ',FN_svm/(TP_svm+FN_svm))

**Decision Trees Algorithm**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import tree

# Compare the 'entropy' and 'gini' split criteria over tree depths
# 1..len(dataset.columns), recording test-split accuracy (in percent) for each.
y_entropy = []
y_gini = []
for depth in range(len(dataset.columns)):
    classifier_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=depth+1,random_state=0)
    classifier_gini = DecisionTreeClassifier(criterion='gini', max_depth=depth+1,random_state=0)
    classifier_entropy.fit(X_Train_Scaled, Y_Train)
    classifier_gini.fit(X_Train_Scaled, Y_Train)
    y_entropy.append(classifier_entropy.score(X_Test_Scaled, Y_Test)*100)
    y_gini.append(classifier_gini.score(X_Test_Scaled, Y_Test)*100)
    
plt.figure(figsize=(12,4))
plt.plot(range(1, len(dataset.columns)+1), y_entropy)
plt.plot(range(1, len(dataset.columns)+1), y_gini)
plt.title('Gini Vs Entropy')
plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.legend(['Entropy', 'Gini'])
plt.show()

best_accuracy = np.amax(y_entropy), np.amax(y_gini)
best_criterion = ['Entropy', 'Gini']

print('Best Criterion: {}, Accuracy {:.2f}% at depth = {}'.format(best_criterion[np.argmax(best_accuracy)], 
                                                                  np.amax(best_accuracy), 
                                                                  np.argmax(y_gini)+1 if np.amax(y_gini) > np.amax(y_entropy) else np.argmax(y_entropy)+1))
# NOTE(review): criterion and depth are fixed here rather than taken from the
# sweep above - confirm they still match the printed best configuration.
dt_classifier = DecisionTreeClassifier(criterion = 'entropy', max_depth = 7, random_state= 0)
num_folds = 10
kfold = KFold(n_splits=num_folds)
cv_results = cross_val_score(dt_classifier, X_Train_Scaled, Y_Train, cv=kfold, scoring='accuracy')
print('Decision Trees Accuracy on Training Data with best parameters after 10 Fold Cross Validation is :',cv_results.mean())
print()
dt_classifier.fit(X_Train_Scaled, Y_Train)
Y_Pred_dt = dt_classifier.predict(X_Test_Scaled)
cm_dt = confusion_matrix(Y_Test, Y_Pred_dt)
print(cm_dt)
print()
# Binary confusion matrix layout is [[TN, FP], [FN, TP]] with class 1 as the
# positive class; TP and TN were previously read from each other's cells.
TN_dt, FP_dt, FN_dt, TP_dt = cm_dt.ravel()
print('Success Rate = ',(TP_dt+TN_dt)/(TP_dt+TN_dt+FN_dt+FP_dt))
print('Misclassificate Rate = ',(FP_dt+FN_dt)/(TP_dt+TN_dt+FN_dt+FP_dt))
print('Sensitivity/tp_rate = ', TP_dt/(TP_dt+FN_dt))
print('Specificity/tn_rate = ', TN_dt/(TN_dt+FP_dt))
print('fp rate = ',FP_dt/(TN_dt+FP_dt))
print('fn rate = ',FN_dt/(TP_dt+FN_dt))

# Draw the tree that was actually evaluated above. The previous code re-fitted
# a different classifier (the last gini one from the sweep) on the unscaled
# TEST split and plotted that instead.
plt.figure(figsize = (10,10))
tree.plot_tree(dt_classifier)
plt.show()

**Random Forest Algorithm**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Exhaustive scan: 1-20 trees x max depth 1-9, each scored by the mean 10-fold
# cross-validated accuracy on the scaled training split.
num_folds = 10
kfold = KFold(n_splits=num_folds)
for i in range(1, 21):
    for j in range(1, 10):
        rf = RandomForestClassifier(n_estimators=i, max_depth=j, random_state=0)
        fold_scores = cross_val_score(rf, X_Train_Scaled, Y_Train, cv=kfold, scoring='accuracy')
        score = fold_scores.mean()
        print("N_Estimators = {} : Depth = {} : Accuracy = {}".format(i, j, score))

In [None]:
# Train the forest for the best parameters found by the scan above.
# Four configurations reach the same maximum cross-validated training accuracy:
# N = 12, 14, 15, 17, each with Depth = 5.
N_Estimators = [12,14,15,17]
# Compute the test accuracy for each tied configuration to pick among them.
for i in N_Estimators:
    rfc_classifier = RandomForestClassifier(n_estimators = i, max_depth = 5, random_state = 0)                      
    rfc_classifier.fit(X_Train_Scaled, Y_Train)
    Y_Pred_rfc = rfc_classifier.predict(X_Test_Scaled)
    cm_rfc = confusion_matrix(Y_Test, Y_Pred_rfc)
    # Binary confusion matrix layout is [[TN, FP], [FN, TP]]; TP and TN were
    # previously read from each other's cells (the accuracy itself is symmetric
    # in the two, so the printed value is unchanged).
    TN_rfc, FP_rfc, FN_rfc, TP_rfc = cm_rfc.ravel()
    print('Success Rate = ',(TP_rfc+TN_rfc)/(TP_rfc+TN_rfc+FN_rfc+FP_rfc))

In [None]:
# Final random forest (15 trees, max depth 5): 10-fold cross-validated accuracy
# on the scaled training split, then a final fit and test-split evaluation.
rfc_classifier_final = RandomForestClassifier(n_estimators = 15, max_depth = 5, random_state = 0)       
num_folds = 10
kfold = KFold(n_splits=num_folds)
cv_results = cross_val_score(rfc_classifier_final, X_Train_Scaled, Y_Train, cv=kfold, scoring='accuracy')
print('Random Forest Accuracy on Training Data with best parameters after 10 Fold Cross Validation is :',cv_results.mean())
print()
rfc_classifier_final.fit(X_Train_Scaled, Y_Train)
Y_Pred_rfc = rfc_classifier_final.predict(X_Test_Scaled)
cm_rfc = confusion_matrix(Y_Test, Y_Pred_rfc)
print(cm_rfc)
# Binary confusion matrix layout is [[TN, FP], [FN, TP]] with class 1 as the
# positive class; TP and TN were previously read from each other's cells.
TN_rfc, FP_rfc, FN_rfc, TP_rfc = cm_rfc.ravel()
print()    
print('Success Rate = ',(TP_rfc+TN_rfc)/(TP_rfc+TN_rfc+FN_rfc+FP_rfc))
print('Misclassificate Rate = ',(FP_rfc+FN_rfc)/(TP_rfc+TN_rfc+FN_rfc+FP_rfc))
print('Sensitivity/tp_rate = ', TP_rfc/(TP_rfc+FN_rfc))
print('Specificity/tn_rate = ', TN_rfc/(TN_rfc+FP_rfc))
print('fp rate = ',FP_rfc/(TN_rfc+FP_rfc))
print('fn rate = ',FN_rfc/(TP_rfc+FN_rfc))
print()

**Neural Networks**

In [None]:
from sklearn.neural_network import MLPClassifier

# Scan iteration budgets (10..80 in steps of 10) against initial learning
# rates, scoring each (9, 2)-hidden-layer MLP by mean cross-validated accuracy.
# `kfold` is the splitter defined in an earlier cell.
acc = []
learning_rate = [1e-5,1e-4,1e-3,1e-2,1e-1,1]
for i in range(10, 90, 10):
    for j in learning_rate:
        nn = MLPClassifier(hidden_layer_sizes=(9,2), learning_rate_init=j,
                           max_iter=i, random_state=33)
        fold_scores = cross_val_score(nn, X_Train_Scaled, Y_Train, cv=kfold, scoring='accuracy')
        score = fold_scores.mean()
        print("Epochs/Number_Of_Iterations = {} : Learning_Rate = {} : Accuracy = {}".format(i, j, score))

In [None]:
# The best accuracy on training data is achieved when learning rate is 1e-2.
# Keeping the learning rate as 1e-2, we iterate through number of epochs/iterations to find the best testing accuracy and 
# minimum overfitting of training data. 

# `metrics` (sklearn.metrics) is imported in the decision-tree cell above.
epochs = list(range(1, 50))
training_acc = []
testing_acc = []
for i in epochs:
    model = MLPClassifier(hidden_layer_sizes=(9,2),
                      learning_rate_init = 1e-2,
                      max_iter = i,
                      random_state = 33)
    model.fit(X_Train_Scaled,Y_Train)
    prd_r = model.predict(X_Test_Scaled)
    test_acc = metrics.accuracy_score(Y_Test, prd_r) * 100.
    train_acc = model.score(X_Train_Scaled,Y_Train) *100
    testing_acc.append(test_acc)
    training_acc.append(train_acc)
testing_error = [100-x for x in testing_acc]  
training_error = [100-x for x in training_acc]

# Plot against the actual max_iter values. Plotting the lists alone used their
# positions 0..48 as x, so every point sat one epoch to the left of the
# iteration budget that produced it.
plt.title('Model Loss')
plt.ylabel('Error')
plt.xlabel('Epochs')
plt.plot(epochs, training_error,label = 'Train')
plt.plot(epochs, testing_error, label = 'Test')  
plt.legend(loc='upper right')
plt.show()    
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.plot(epochs, training_acc,label = 'Train')
plt.plot(epochs, testing_acc, label = 'Test')  
plt.legend(loc='lower right')
plt.show() 

In [None]:
# Optimum number of epochs as seen from the graph can be chosen as 20 (Same testing accuracy for epochs between 14 and 29)
MLP_classifier = MLPClassifier(hidden_layer_sizes=(9,2),
                      learning_rate_init = 1e-2,
                      max_iter = 20,
                      random_state = 33)
num_folds = 10
kfold = KFold(n_splits=num_folds)
cv_results = cross_val_score(MLP_classifier, X_Train_Scaled, Y_Train, cv=kfold, scoring='accuracy')
print('Neural Networks Accuracy on Training Data with best parameters after 10 Fold Cross Validation is :',cv_results.mean())
print()
MLP_classifier.fit(X_Train_Scaled, Y_Train)
Y_Pred_mlp = MLP_classifier.predict(X_Test_Scaled)
cm_mlp = confusion_matrix(Y_Test, Y_Pred_mlp)
print(cm_mlp)
# Binary confusion matrix layout is [[TN, FP], [FN, TP]] with class 1 as the
# positive class; TP and TN were previously read from each other's cells.
TN_mlp, FP_mlp, FN_mlp, TP_mlp = cm_mlp.ravel()
print()    
print('Success Rate = ',(TP_mlp+TN_mlp)/(TP_mlp+TN_mlp+FN_mlp+FP_mlp))
print('Misclassificate Rate = ',(FP_mlp+FN_mlp)/(TP_mlp+TN_mlp+FN_mlp+FP_mlp))
print('Sensitivity/tp_rate = ', TP_mlp/(TP_mlp+FN_mlp))
print('Specificity/tn_rate = ', TN_mlp/(TN_mlp+FP_mlp))
print('fp rate = ',FP_mlp/(TN_mlp+FP_mlp))
print('fn rate = ',FN_mlp/(TP_mlp+FN_mlp))
print()

**Classifiers as points on Receiver Operating Characteristics Curve**

In [None]:
from sklearn import metrics

# Place each fitted classifier as a single (FPR, TPR) point in ROC space.
fig = plt.figure(figsize=(10,10))
ax1 = fig.add_subplot(111)
# Chance diagonal for reference.
ax1.plot([0, 1], [0, 1], ls="--")
colors = ("blue","green","orange","red","purple","brown","pink")
classifier_names = ("Naive_Bayes", "Logistic Regression", "K Nearest Neighbors",
                    "Support Vector Machine", "Decision Trees", "Random Forest", "MLP")
confusion_matrices = (cm_nb, cm_lr, cm_knn, cm_svm, cm_dt, cm_rfc, cm_mlp)

# Derive the rates straight from the confusion matrices, whose binary layout is
# [[TN, FP], [FN, TP]] with class 1 as the positive class, so this cell does
# not depend on how the per-model TP/TN variables were assigned earlier.
tp_array = []
fp_array = []
for matrix in confusion_matrices:
    tn, fp, fn, tp = np.asarray(matrix).ravel()
    tp_array.append(tp / (tp + fn))
    fp_array.append(fp / (fp + tn))

# Label the scatter points themselves so legend colours always match the
# markers. The previous legend used separate marker-less line handles with
# swapped coordinates, and a white MLP entry for a pink point.
for x, y, color, name in zip(tp_array, fp_array, colors, classifier_names):
    ax1.scatter(y, x, c=color, label=name)
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('Classifiers on ROC curve')
ax1.set_xlim([0.0,1.0])
ax1.set_ylim([0.0,1.0])
ax1.legend(loc="lower right")
plt.show()
# Classifiers with identical rates are drawn on top of each other
# (original note: pink and red overlap).

**ROC-AUC for different Classifiers**

In [None]:
from numpy import sqrt
from numpy import argmax

# Class-probability estimates on the test split. Naive Bayes and KNN were
# fitted on the unscaled features; the other models on the scaled ones.
nb_probs = nb_classifier.predict_proba(X_Test)
knn_probs = knn_classifier.predict_proba(X_Test)
lr_probs = logistic_classifier.predict_proba(X_Test_Scaled)
svm_probs = svm_classifier.predict_proba(X_Test_Scaled)
dt_probs = dt_classifier.predict_proba(X_Test_Scaled)
rfc_probs = rfc_classifier_final.predict_proba(X_Test_Scaled)
mlp_probs = MLP_classifier.predict_proba(X_Test_Scaled)

# Keep only column 1 of each probability matrix as the ranking score.
preds_nb, preds_knn, preds_lr, preds_svm, preds_dt, preds_rfc, preds_mlp = (
    probs[:, 1]
    for probs in (nb_probs, knn_probs, lr_probs, svm_probs, dt_probs, rfc_probs, mlp_probs)
)

# FPR/TPR at every score threshold, and the area under each curve.
fpr_nb, tpr_nb, threshold_nb = metrics.roc_curve(Y_Test, preds_nb)
fpr_knn, tpr_knn, threshold_knn = metrics.roc_curve(Y_Test, preds_knn)
fpr_lr, tpr_lr, threshold_lr = metrics.roc_curve(Y_Test, preds_lr)
fpr_svm, tpr_svm, threshold_svm = metrics.roc_curve(Y_Test, preds_svm)
fpr_dt, tpr_dt, threshold_dt = metrics.roc_curve(Y_Test, preds_dt)
fpr_rfc, tpr_rfc, threshold_rfc = metrics.roc_curve(Y_Test, preds_rfc)
fpr_mlp, tpr_mlp, threshold_mlp = metrics.roc_curve(Y_Test, preds_mlp)

roc_auc_nb = metrics.auc(fpr_nb, tpr_nb)
roc_auc_knn = metrics.auc(fpr_knn, tpr_knn)
roc_auc_lr = metrics.auc(fpr_lr, tpr_lr)
roc_auc_svm = metrics.auc(fpr_svm, tpr_svm)
roc_auc_dt = metrics.auc(fpr_dt, tpr_dt)
roc_auc_rfc = metrics.auc(fpr_rfc, tpr_rfc)
roc_auc_mlp = metrics.auc(fpr_mlp, tpr_mlp)

plt.figure(figsize=(10,10))
plt.title('Receiver Operating Characterstics')
# One curve per classifier, drawn in the same order as before.
roc_series = (
    (fpr_nb, tpr_nb, 'Naive Bayes (AUC = %0.4f)', roc_auc_nb),
    (fpr_knn, tpr_knn, 'K Nearest Neighbors (AUC = %0.4f)', roc_auc_knn),
    (fpr_lr, tpr_lr, 'Logistic Regression (AUC = %0.4f)', roc_auc_lr),
    (fpr_svm, tpr_svm, 'Support Vector Machines (AUC = %0.4f)', roc_auc_svm),
    (fpr_dt, tpr_dt, 'Decision Trees (AUC = %0.4f)', roc_auc_dt),
    (fpr_rfc, tpr_rfc, 'Random Forest Classifier (AUC = %0.4f)', roc_auc_rfc),
    (fpr_mlp, tpr_mlp, 'MLP (AUC = %0.4f)', roc_auc_mlp),
)
for curve_fpr, curve_tpr, label_template, curve_auc in roc_series:
    plt.plot(curve_fpr, curve_tpr, label=label_template % (curve_auc))
# Chance diagonal.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

In [None]:
# Recompute the area under each ROC curve from the stored FPR/TPR arrays and
# print one line per classifier.
roc_auc_nb = metrics.auc(fpr_nb, tpr_nb)
roc_auc_knn = metrics.auc(fpr_knn, tpr_knn)
roc_auc_lr = metrics.auc(fpr_lr, tpr_lr)
roc_auc_svm = metrics.auc(fpr_svm, tpr_svm)
roc_auc_dt = metrics.auc(fpr_dt, tpr_dt)
roc_auc_rfc = metrics.auc(fpr_rfc, tpr_rfc)
roc_auc_mlp = metrics.auc(fpr_mlp, tpr_mlp)

auc_report = (
    ("Naive Bayes AUC = ", roc_auc_nb),
    ("K Nearest Neighbors AUC = ", roc_auc_knn),
    ("Logistic Regression AUC = ", roc_auc_lr),
    ("Support Vector Machine AUC = ", roc_auc_svm),
    ("Decision Trees AUC = ", roc_auc_dt),
    ("Random Forest AUC = ", roc_auc_rfc),
    ("MLP AUC = ", roc_auc_mlp),
)
for auc_label, auc_value in auc_report:
    print(auc_label, auc_value)