This notebook works with a breast cancer data set. It uses 10 features that were selected with feature selection techniques such as ELI5. Several explainability techniques, such as LIME, SHAP and PDP (partial dependence plots), have been employed as well. Hyperparameter optimization has been performed using GridSearchCV, RandomizedSearchCV and Bayesian optimization.

Readers are encouraged to take a look at https://github.com/kaii55 for a better understanding.

https://kaii55.github.io - My website

In [None]:
##Importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
 
#Importing Keras and Tensorflow for Deep Learning
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
import eli5
from eli5.sklearn import PermutationImportance
import tensorflow as tf
import keras 
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

#Importing other libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

#Importing libraries for model evaluation 
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import os
# Print the full path of every file found below the Kaggle input directory
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

#Reading the dataset
dataset_path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(dataset_path)

In [None]:
#Dropping unnecessary columns
# The names are kept in `columns_to_drop` rather than in a variable called
# `list`, which would shadow the built-in `list` type for the rest of the notebook.
columns_to_drop = ['Unnamed: 32', 'id']
df.drop(columns_to_drop, axis=1, inplace=True)
df.head()

In [None]:
#Count the missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Number of rows for each value of the diagnosis column
occ = df.diagnosis.value_counts()
print(occ)

# The same counts expressed as a fraction of all rows
total_rows = df.shape[0]
print(occ / total_rows)

In [None]:
#Checking the variance
# The diagnosis column is only label-encoded in a later cell, so it is still
# non-numeric here. Restrict the computation to numeric columns explicitly:
# pandas >= 2.0 no longer drops non-numeric columns silently and raises a
# TypeError instead.
print(df.var(numeric_only=True))

In [None]:
#Encoding categorical data values
from sklearn.preprocessing import LabelEncoder

# Learn the label set of the diagnosis column, then overwrite the column with
# the corresponding integer codes.
labelencoder = LabelEncoder()
labelencoder.fit(df['diagnosis'])
df['diagnosis'] = labelencoder.transform(df['diagnosis'])

In [None]:
#Correlation of every column with the target value (Diagnosis), drawn as a bar chart
target_correlation = df.corrwith(df.diagnosis)
target_correlation.plot.bar(
    figsize=(30, 10),
    title="Correlation with Target - Diagnosis",
    fontsize=20,
    rot=45,
    grid=True,
)

In [None]:
#Data selection - only the important features are kept, as they affect the models the most

# Target: the diagnosis column as a one-column DataFrame; it is then removed
# from df in place.
df_1 = pd.DataFrame(df['diagnosis'])
df.drop(columns=['diagnosis'], inplace=True)

# Inputs: the ten selected feature columns
selected_features = [
    'radius_mean', 'texture_mean', 'perimeter_mean',
    'smoothness_mean', 'area_mean', 'concavity_mean', 'compactness_mean',
    'texture_se', 'area_se', 'fractal_dimension_mean',
]
df_2 = pd.DataFrame(df[selected_features])


In [None]:

from sklearn.model_selection import train_test_split

# One seed for every random step in this cell, so that the splits (and the
# multi-run model comparison built on them) can be reproduced. 42 is the seed
# that was already used for SMOTE.
RANDOM_STATE = 42

ss = StandardScaler()

X = df_2
Y = df_1.values

#Train test split: 30% held out, class proportions preserved
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, stratify=Y, random_state=RANDOM_STATE)

#Scaling: the scaler is fitted on the training data only and the same
#transformation is then applied to the test data.
# X (df_2) already consists of exactly the ten selected feature columns, so the
# column list does not have to be spelled out again for every call.
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

#Validation split: 20% of the training data, stratified like the split above
X_train_train, X_test_test, Y_train_train, Y_test_test = train_test_split(
    X_train, Y_train, test_size=0.20, stratify=Y_train,
    random_state=RANDOM_STATE)

#Sampling the data: only the part used for fitting is oversampled; the
#validation and test parts are left untouched.
sm = SMOTE(random_state=RANDOM_STATE)

X_resampled, Y_resampled = sm.fit_resample(X_train_train, Y_train_train)

#If needed can be performed
# (kept as comments instead of a string literal, which would otherwise be
# echoed as the output of this cell)
# from sklearn.decomposition import PCA
# pca = PCA()
# X_train = pca.fit_transform(X_train)


In [None]:
# Report the dimensions of the full, split and resampled data
shape_report = [
    ("X", X),
    ("Y", Y),
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
    ("X_resampled", X_resampled),
    ("Y_resampled", Y_resampled),
]
for label, data in shape_report:
    print("Shape of {}: {}".format(label, data.shape))

In [None]:
from sklearn.metrics import f1_score

# Stop a run once the validation loss has not improved for 2 epochs and restore
# the weights of the best epoch.
callbacks = [EarlyStopping(monitor='val_loss', mode='min', patience=2,
                           restore_best_weights=True)]

# Despite its name, this list collects the test-set F1 score of every run.
results_control_accuracy = []
n_features = len(X.columns)
for i in range(0, 30):
    # Control network: two hidden layers of 64 ReLU units, each followed by
    # 50% dropout, and a single sigmoid output unit.
    model = Sequential()
    model.add(Dense(64, input_dim=n_features, kernel_initializer='he_normal', activation='relu'))
    model.add(Dropout(0.5))
    # input_dim only describes the input of the first layer, so it is not
    # repeated on this hidden layer.
    model.add(Dense(64, kernel_initializer='he_normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by current Keras releases.
    model.compile(keras.optimizers.Adam(learning_rate=0.001), 'binary_crossentropy', metrics=['accuracy'])

    # Fit on the resampled training data, validating on the held-back
    # validation part of the training set.
    history = model.fit(X_resampled, Y_resampled, callbacks=callbacks,
                        epochs=30, validation_data=(X_test_test, Y_test_test),
                        batch_size=256, verbose=0)

    # Threshold the sigmoid output at 0.5 to obtain class predictions
    y_test_pred = model.predict(X_test) > 0.5

    f1 = f1_score(Y_test, y_test_pred)

    results_control_accuracy.append(f1)

print(results_control_accuracy)

In [None]:
# Wrap the per-run scores (F1 values collected by the training loop) in a
# DataFrame and average them column-wise.
results_control_accuracy = pd.DataFrame(data=results_control_accuracy)
mean_control_accuracy = results_control_accuracy.mean(axis=0)
print(f"Mean Control Accuracy: {mean_control_accuracy}")

In [None]:
# Spread of the control scores across the runs
std_control_accuracy = results_control_accuracy.std(axis=0)
print(f"Standard Deviation of Control Accuracy Results: {std_control_accuracy}")

In [None]:
# Accuracy per epoch of the most recent training run: training data first,
# then the validation data passed to fit() (shown as 'test' in the legend).
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='lower right')
plt.show()

In [None]:
# Loss per epoch of the most recent training run: training data first, then
# the validation data passed to fit() (shown as 'test' in the legend).
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper right')
plt.show()

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix

# Compare the test labels with the predictions of the most recent run
cm = confusion_matrix(y_true=Y_test, y_pred=y_test_pred)
print(cm)

In [None]:
from sklearn.metrics import classification_report

# Text summary of the classification metrics on the test set
report = classification_report(y_true=Y_test, y_pred=y_test_pred)
print(report)

In [None]:
# Accuracy recorded in the final epoch of the most recent run; the "Test"
# figure is read from val_accuracy, i.e. the validation data passed to fit().
last_train_acc = history.history['accuracy'][-1]
last_val_acc = history.history['val_accuracy'][-1]
print('Train Accuracy: {}\nTest Accuracy:{}'.format(last_train_acc, last_val_acc))

In [None]:
#Roc curve generation
from sklearn.metrics import roc_curve

# roc_curve has to be given continuous scores: with the thresholded
# y_test_pred the curve collapses to a single operating point. `model` is the
# network left over from the last training run, the same one that produced
# y_test_pred.
y_test_score = model.predict(X_test).ravel()
fpr, tpr, thresholds = roc_curve(Y_test, y_test_score)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.plot(fpr, tpr, label='Deep Learning')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Deep Learning')
plt.legend(loc='lower right')  # without a legend the label above is never shown
plt.show()

In [None]:
from sklearn.metrics import f1_score

# Stop a run once the validation loss has not improved for 2 epochs and restore
# the weights of the best epoch.
callbacks = [EarlyStopping(monitor='val_loss', mode='min', patience=2,
                           restore_best_weights=True)]

# Despite its name, this list collects the test-set F1 score of every run.
results_experimental_accuracy = []
n_features = len(X.columns)
for i in range(0, 30):
    # Experimental network: three hidden layers of 64 ReLU units, each followed
    # by 50% dropout, and a single sigmoid output unit.
    model = Sequential()
    model.add(Dense(64, input_dim=n_features, kernel_initializer='he_normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, kernel_initializer='he_normal', activation='relu'))
    model.add(Dropout(0.5))
    # input_dim only describes the input of the first layer, so it is not
    # repeated on this hidden layer.
    model.add(Dense(64, kernel_initializer='he_normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by current Keras releases.
    model.compile(keras.optimizers.Adam(learning_rate=0.001), 'binary_crossentropy', metrics=['accuracy'])

    # Fit on the resampled training data, validating on the held-back
    # validation part of the training set.
    history = model.fit(X_resampled, Y_resampled, callbacks=callbacks,
                        epochs=30, validation_data=(X_test_test, Y_test_test),
                        batch_size=256, verbose=0)

    # Threshold the sigmoid output at 0.5 to obtain class predictions
    y_test_pred = model.predict(X_test) > 0.5

    f1 = f1_score(Y_test, y_test_pred)

    results_experimental_accuracy.append(f1)

print(results_experimental_accuracy)

In [None]:
# Turn the list of per-run scores (F1 values from the training loop) into a
# DataFrame and take the column mean.
results_experimental_accuracy = pd.DataFrame(data=results_experimental_accuracy)
mean_experimental_accuracy = results_experimental_accuracy.mean(axis=0)
print(f"Mean Experimental Accuracy: {mean_experimental_accuracy}")

In [None]:
# Spread of the experimental scores across the runs
std_experimental_accuracy = results_experimental_accuracy.std(axis=0)
print(f"Standard Deviation of Experimental Accuracy Results: {std_experimental_accuracy}")

In [None]:
# Per-epoch accuracy of the last experimental run: the training curve is drawn
# first, then the curve for the validation data given to fit() ('test' in the
# legend).
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='lower right')
plt.show()

In [None]:
# Per-epoch loss of the last experimental run: the training curve is drawn
# first, then the curve for the validation data given to fit() ('test' in the
# legend).
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper right')
plt.show()

In [None]:
#Confusion Matrix generation
from sklearn.metrics import confusion_matrix

# Test labels against the predictions of the last experimental run
cm = confusion_matrix(y_true=Y_test, y_pred=y_test_pred)
print(cm)

In [None]:
#Classification Report
from sklearn.metrics import classification_report

# Text summary of the classification metrics for the last experimental run
report = classification_report(y_true=Y_test, y_pred=y_test_pred)
print(report)

In [None]:
# Final-epoch accuracy of the last experimental run; the value printed as
# "Test" comes from val_accuracy, i.e. the validation data given to fit().
last_train_acc = history.history['accuracy'][-1]
last_val_acc = history.history['val_accuracy'][-1]
print('Train Accuracy: {}\nTest Accuracy:{}'.format(last_train_acc, last_val_acc))

In [None]:
#Roc Curve generation
from sklearn.metrics import roc_curve

# roc_curve has to be given continuous scores: with the thresholded
# y_test_pred the curve collapses to a single operating point. `model` is the
# network left over from the last training run, the same one that produced
# y_test_pred.
y_test_score = model.predict(X_test).ravel()
fpr, tpr, thresholds = roc_curve(Y_test, y_test_score)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.plot(fpr, tpr, label='Deep Learning')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Deep Learning')
plt.legend(loc='lower right')  # without a legend the label above is never shown
plt.show()

In [None]:
# Place the control and experimental score tables side by side and name the
# two resulting columns.
results_accuracy = pd.concat(
    [results_control_accuracy, results_experimental_accuracy],
    axis='columns',
)
results_accuracy.columns = ['Control', 'Experimental']

In [None]:
# Box plot of the Control and Experimental score columns
results_accuracy.boxplot()

In [None]:
# Same box plot with the outlier points hidden (showfliers=False)
results_accuracy.boxplot(showfliers=False)

In [None]:
# Box plot with the y-axis limited to the 0.9 - 1 score range
ax = results_accuracy.boxplot()
ax.set_ylim(0.9, 1)

In [None]:
# Histograms of the Control and Experimental score columns, drawn with density=True
results_accuracy.hist(density=True)

In [None]:
#Normality Testing
from scipy import stats

# Significance level used for both tests below
alpha = 0.05

# The score column (a Series) is tested rather than the one-column DataFrame,
# so `p` is a plain scalar instead of a length-1 array. This matches the way
# the significance test indexes the same tables with [0].
s, p = stats.normaltest(results_control_accuracy[0])
if p < alpha:
    print('Control data is not normal')
else:
    print('Control data is normal')

s, p = stats.normaltest(results_experimental_accuracy[0])
if p < alpha:
    print('Experimental data is not normal')
else:
    print('Experimental data is normal')

In [None]:
#Significance Testing
# Compare the score column of the control runs with that of the experimental runs.
# NOTE(review): stats.wilcoxon treats the two samples as paired - confirm that
# pairing run i of one network with run i of the other is intended.
control_scores = results_control_accuracy[0]
experimental_scores = results_experimental_accuracy[0]
s, p = stats.wilcoxon(control_scores, experimental_scores)

verdict = (
    'null hypothesis rejected, significant difference between the data-sets'
    if p < 0.05
    else 'null hypothesis accepted, no significant difference between the data-sets'
)
print(verdict)