# Goal

The goal is to get familiar with implementing a DNN classifier.

# General approach
1. Load data
2. Visualise data in a meaningful manner, cleaning it up if necessary
3. Create a basic DNN classifier
4. See if I can tune it ?

# 1. Load Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Configure Seaborn plot style:
# apply the 'dark' axes style and make 'viridis' the default colour palette
# for every plot drawn later in the notebook
sns.set_style('dark')
sns.set_palette('viridis')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found
print("Available files:")
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Breast Cancer Wisconsin CSV from the Kaggle input directory into a DataFrame
raw_data = pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")

In [None]:
# Preview the first few rows to check that the columns loaded as expected
raw_data.head()

In [None]:
# Display an overview of each column: dtype, name, and count/share of missing values
n_rows = raw_data.shape[0]
for col_name in raw_data.columns:
    column = raw_data[col_name]
    missing = column.isna().sum()
    print(f"{column.dtype} [{col_name}] ->  - {missing} NaN values ({100*missing/n_rows:.2f}%)")

**First impressions:**
* Columns that can be dropped (useless): 'id' & 'Unnamed: 32'
* Column used for labelling : 'diagnosis'
* The 30 remaining columns contain numerical data and have no missing values



In [None]:
# Separate the raw data into the feature matrix (X_0) and the label vector (y_0)
# 'id' and 'Unnamed: 32' carry no information for the classifier, so they go first
cols_to_drop = ['id', 'Unnamed: 32']
data = raw_data.drop(columns=cols_to_drop)

# 'diagnosis' is the label; every other remaining column is a feature
y_0 = data['diagnosis']
X_0 = data.drop(columns=['diagnosis'])

# 2. Data Visualisation

In [None]:
# Look for imbalance in the data labels that could skew our model
# Count each label once; .get() falls back to 0 so a missing class is
# reported as zero entries instead of raising a KeyError
label_counts = y_0.value_counts()
malignant = label_counts.get('M', 0)
benign = label_counts.get('B', 0)
total = y_0.shape[0]
print(f"Malignant entries: {malignant} ({100*malignant/total:.2f}%)")
print(f"Benign entries: {benign} ({100*benign/total:.2f}%)")
print(f"Others: {total-malignant-benign}")

# Pass the labels by keyword: recent seaborn versions reserve the first
# positional argument for `data`, so a bare positional Series is not read as `x`
ax = sns.countplot(x=y_0)

In [None]:
# Plot the density of every feature, separated by diagnosis, on a 6*5 grid
# (one subplot per feature, one curve per diagnosis value)
plot_rows = 6
plots_per_row = 5
total_plots = plot_rows * plots_per_row

fig, axs = plt.subplots(plot_rows, plots_per_row, figsize=(18, plot_rows*4))
axs = axs.ravel()  # flatten the 2-D grid so subplots can be walked in order

# zip pairs each feature with its own subplot and stops at whichever runs out first
for axis, feature in zip(axs, X_0.columns[:total_plots]):
    axis.set_title(feature)
    for d in y_0.unique():
        sns.kdeplot(ax=axis, data=data.loc[data.diagnosis == d, feature],
                    label=d,
                    fill=True)  # `fill` replaces the deprecated `shade` argument
    # The curves are labelled above; draw the legend so they can be told apart
    axis.legend()


I see at least 7 features that don't seem useful for our classification :
* 'fractal_dimension_mean'
* 'texture_se'
* 'smoothness_se'
* 'symmetry_se'
* 'fractal_dimension_se'
* 'symmetry_worst'
* 'fractal_dimension_worst'

There are most likely others, but they are less obvious, so I'll leave them in for now.

In [None]:
# Replace the drop list with the features judged uninformative from the plots
cols_to_drop = [
    'fractal_dimension_mean',
    'texture_se',
    'smoothness_se',
    'symmetry_se',
    'fractal_dimension_se',
    'symmetry_worst',
    'fractal_dimension_worst',
]

X = X_0.drop(columns=cols_to_drop)

# Dropping these 7 features leaves 23 columns
X.columns

In [None]:
# Manual label encoding: malignant ('M') -> 1, benign ('B') -> 0.
# NOTE(review): this dict is not referenced by any other cell shown in this
# notebook; the training cells encode the labels with sklearn's LabelEncoder.
encoding = {'M':1,
            'B':0}

# 3. Basic Deep Neural Network

I'll start by making a DNN with :
* 23 inputs, normalised by a BatchNormalization layer (rather than scaled to a 0 - 1 range)
* 2 hidden layers with 128 nodes each, 10% dropout, ReLU activation function
* 1 output, sigmoid activation function

This is a wild guess, I have no idea what I'm doing at this point...

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras import layers, callbacks


# Encode the string diagnosis labels as integers.
# LabelEncoder assigns codes in sorted class order, so 'B' -> 0 and 'M' -> 1.
le = LabelEncoder()
y = le.fit_transform(y_0)


# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42)

input_shape = [X.shape[1]]


# Layer configuration
# The input shape is declared with an explicit Input object instead of the
# `input_shape=` layer argument, which recent Keras versions warn against.
model = keras.Sequential([
    keras.Input(shape=input_shape),
    layers.BatchNormalization(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid')
])


# Putting an early stopper so I don't need to worry too much about epochs:
# stop once val_loss has not improved by at least 0.01 for 10 epochs in a row,
# then roll back to the weights of the best epoch
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=10,
    restore_best_weights=True)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy']
)

# 500 epochs is only an upper bound; early stopping normally ends training sooner
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=50,
    epochs=500,
    callbacks=[early_stopping]
)

In [None]:
# Collect the per-epoch training metrics recorded by fit() into a DataFrame
history_df = pd.DataFrame(history.history)

# Start the plot at epoch 10
late_epochs = history_df.loc[10:]
late_epochs[['loss', 'val_loss']].plot()
late_epochs[['binary_accuracy', 'val_binary_accuracy']].plot()

# Summarise the best validation scores over the whole run
best_val_loss = history_df['val_loss'].min()
best_val_accuracy = history_df['val_binary_accuracy'].max()
print("Best Validation Loss: {:0.4f}\nBest Validation Accuracy: {:0.4f}"
      .format(best_val_loss, best_val_accuracy))

...97% accuracy :)

# 4. Tuning

I don't think this is worth tuning, so I'll try to make it worse instead

# 4.1. Narrower, very deep DNN

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras import layers, callbacks


# Encode the string diagnosis labels as integers.
# LabelEncoder assigns codes in sorted class order, so 'B' -> 0 and 'M' -> 1.
le = LabelEncoder()
y = le.fit_transform(y_0)


# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42)

input_shape = [X.shape[1]]


# Layer configuration: four hidden layers of 64 units, each followed by 10% dropout
# The input shape is declared with an explicit Input object instead of the
# `input_shape=` layer argument, which recent Keras versions warn against.
model = keras.Sequential([
    keras.Input(shape=input_shape),
    layers.BatchNormalization(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid')
])


# Putting an early stopper so I don't need to worry too much about epochs:
# stop once val_loss has not improved by at least 0.01 for 10 epochs in a row,
# then roll back to the weights of the best epoch
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=10,
    restore_best_weights=True)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy']
)

# 500 epochs is only an upper bound; early stopping normally ends training sooner
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=50,
    epochs=500,
    callbacks=[early_stopping],
    verbose=False
)

# Collect the per-epoch training metrics recorded by fit() into a DataFrame
history_df = pd.DataFrame(history.history)
# Start the plot at epoch 0
history_df.loc[:, ['loss', 'val_loss']].plot()
history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot()

# Summarise the best validation scores over the whole run
print("Best Validation Loss: {:0.4f}\nBest Validation Accuracy: {:0.4f}"
      .format(history_df['val_loss'].min(),
              history_df['val_binary_accuracy'].max()))

This one is awful, perfect :)

# 4.2. Very wide & shallow DNN

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow import keras
from tensorflow.keras import layers, callbacks


# Encode the string diagnosis labels as integers.
# LabelEncoder assigns codes in sorted class order, so 'B' -> 0 and 'M' -> 1.
le = LabelEncoder()
y = le.fit_transform(y_0)


# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42)

input_shape = [X.shape[1]]


# Layer configuration: a single wide hidden layer of 256 units with 10% dropout
# The input shape is declared with an explicit Input object instead of the
# `input_shape=` layer argument, which recent Keras versions warn against.
model = keras.Sequential([
    keras.Input(shape=input_shape),
    layers.BatchNormalization(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid')
])


# Putting an early stopper so I don't need to worry too much about epochs:
# stop once val_loss has not improved by at least 0.01 for 10 epochs in a row,
# then roll back to the weights of the best epoch
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=10,
    restore_best_weights=True)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy']
)

# 500 epochs is only an upper bound; early stopping normally ends training sooner
history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=50,
    epochs=500,
    callbacks=[early_stopping],
    verbose=False
)

# Collect the per-epoch training metrics recorded by fit() into a DataFrame
history_df = pd.DataFrame(history.history)
# Start the plot at epoch 5
history_df.loc[5:, ['loss', 'val_loss']].plot()
history_df.loc[5:, ['binary_accuracy', 'val_binary_accuracy']].plot()

# Summarise the best validation scores over the whole run (including the first epochs)
print("Best Validation Loss: {:0.4f}\nBest Validation Accuracy: {:0.4f}"
      .format(history_df['val_loss'].min(),
              history_df['val_binary_accuracy'].max()))

# Conclusion
Don't go too deep