# Convolutional Neural Network (CNN)


## <span style="color: yellowgreen;">1. </span> Imports


In [312]:
# general actions on arrays & generate an array from a text file
import numpy as np

# Generate a nice plot of the results
import matplotlib.pyplot as plt 
%matplotlib inline

# it will split the features and labels into a train set and a test set
# This also does randomized shuffling, so we don't have to worry about the labels being sorted by accident. This will automatically shuffle them for us.
from sklearn.model_selection import train_test_split

# Converts a class vector (integers) to binary class matrix.
from keras.utils.np_utils import to_categorical

# force all the feature data to fall within a certain range this can actually help the neural network perform better
from sklearn.preprocessing import MinMaxScaler

# Calculate the statistics of our Model
from sklearn.metrics import confusion_matrix, classification_report

# Model architecture
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPool1D, Flatten

## <span style="color: yellowgreen;">2. </span> Load the data


In [313]:
# we are passing the delimiter parameter to specify that the features are separated by a comma
data = np.genfromtxt('./data/labels_features.csv', delimiter=",")
data = data[1:, :]

## <span style="color: yellowgreen;">3. </span> Split data into categories (labels and features)


In [314]:
LABELS = data[:, 1].astype(int) # only class telling real / fake
FEATURES = data[:, 2:] # only features, no class
TARGET_NAMES = ['Metastasis, class 0', 'No metastasis, class 1']

## <span style="color: yellowgreen;">4. </span> Split the dataset - training and test set


In [315]:
test_size = 0.33

dim = len(FEATURES[0])
num_labels = len(np.unique(LABELS))

# @params: X features; y labels; test size of 33%; random_state => seed to have the same shuffle every time
# why 42? => https://news.mit.edu/2019/answer-life-universe-and-everything-sum-three-cubes-mathematics-0910
x_train, x_test, y_train, y_test = train_test_split(FEATURES, LABELS, test_size=test_size, random_state=42)

## <span style="color: yellowgreen;">5. </span> Preprocessing of the data

#### <span style="color: royalblue;">a) </span> One Hot Encode

The labels are numbers (1, 2, 3, 16, etc. depending on the number of classes), so we have to translate this data to be "one hot encoded" for our CNN.

We have 2 classes:

- 0 => metastasis (_pol. "przerzut"_),
- 1 => negative class - no metastasis (_pol. "brak przerzutu"_).

Our data can be categorised as either of those classes, but with 1hot-encode it will always be represented as an array of 2 values. For class "1" we'll have [0, 1] and for class "0" this will be [1, 0].


In [316]:
# categorical versions
# @params: (labels, no_of_classes)
# 0 => metastasis; 1 => negative class - no metastasis
y_cat_test = to_categorical(y_test, num_labels)
y_cat_train = to_categorical(y_train, num_labels)

y_cat_train[0] # we know that is a "class 1" so we should see the number 1 on the index 1 after one hot encoding

array([0., 1.], dtype=float32)

#### <span style="color: royalblue;">b) </span> Processing X Data

We should normalize the X data, because they should always be between 0 and 1.


In [317]:
# create scaler object
scaler_object = MinMaxScaler()

# fit the scaler object to our training data. The function fit() finds the min and max value
# we only fit to x_train and not x_test BECAUSE we do not want the scaler_object to peek at any test data. If we would do that it is called data leakage and is essentially cheating. So we fit to the train data but transform both
scaler_object.fit(x_train)

#  transform() is transforming the given array based on the MinMax we just calculated during the fit
x_train = scaler_object.transform(x_train)
x_test = scaler_object.transform(x_test)

#### <span style="color: royalblue;">c) </span> Reshaping the Data

Capture the shape to easily change the number of features in model


In [318]:
# add artificial dimension to the x_train => (105,) -> (105, 1)
x_train = x_train.reshape(*x_train.shape, 1)
x_test = x_test.reshape(*x_test.shape, 1)

x_train_shape = x_train[0].shape
x_test_shape = x_test[0].shape
[x_train_shape, x_test_shape]

[(105, 1), (105, 1)]

## <span style="color: yellowgreen;">6. </span> Creating and compiling the model

#### <span style="color: royalblue;">a) </span> Create the model architecture - designing layers:


In [319]:
# create a model
model = Sequential()

# CONVOLUTIONAL LAYER
# we can play around with those values but the ones given here are usually a good starting point
# although we can not mess around with the input shape
model.add(Conv1D(filters=16, kernel_size=4, input_shape=x_train_shape, activation='relu'))

# POOLING LAYER
# we can experiment with the pool size
model.add(MaxPool1D(pool_size=1))

# we have to transform the convolutional and pooling layers into something that a single dense layer can understand
model.add(Flatten())

# DENSE HIDDEN LAYER
# here we have 128 neurons in a hidden layer, but we can play around with these values
model.add(Dense(128, activation='relu'))

# OUTPUT LAYER
# can not play around with; output 2 labels and specific activation function that will directly output the class that it thinks it is
model.add(Dense(num_labels, activation='softmax'))

#### <span style="color: royalblue;">b) </span> Compile the created model


In [320]:
# loss: String (name of objective function) or objective function. Configures the model for training.
# optimizer: String (name of optimizer) or optimizer instance. Configures the model for training.
# metrics: List of metrics to be evaluated by the model. Configures the model for training.
# model.compile(loss='categorical_crossentropy',
#               optimizer='rmsprop',
#               metrics=['accuracy'])


# 2 classes co binary is enough
model.compile(loss= 'binary_crossentropy',
              optimizer= 'adam',
              metrics= ['accuracy'])

#### <span style="color: royalblue;">c) </span> Check and verify the summary of compiled model


In [321]:
# Prints a string summary of the network.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_16 (Conv1D)           (None, 101, 16)           96        
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 101, 16)           0         
_________________________________________________________________
flatten_16 (Flatten)         (None, 1616)              0         
_________________________________________________________________
dense_31 (Dense)             (None, 128)               206976    
_________________________________________________________________
dense_32 (Dense)             (None, 2)                 258       
Total params: 207,330
Trainable params: 207,330
Non-trainable params: 0
_________________________________________________________________


## <span style="color: yellowgreen;">7. </span> Training the Model:

This can take a while to compute, change no of epochs if necessary.


In [322]:
model.fit(x_train, y_cat_train, epochs= 400, verbose= 2) # have to remember that it has to be a categorical data

Epoch 1/400
 - 1s - loss: 0.6592 - acc: 0.5632
Epoch 2/400
 - 0s - loss: 0.6181 - acc: 0.7126
Epoch 3/400
 - 0s - loss: 0.5953 - acc: 0.7126
Epoch 4/400
 - 0s - loss: 0.5728 - acc: 0.7126
Epoch 5/400
 - 0s - loss: 0.5629 - acc: 0.7126
Epoch 6/400
 - 0s - loss: 0.5498 - acc: 0.7126
Epoch 7/400
 - 0s - loss: 0.5390 - acc: 0.7126
Epoch 8/400
 - 0s - loss: 0.5276 - acc: 0.7126
Epoch 9/400
 - 0s - loss: 0.5177 - acc: 0.7126
Epoch 10/400
 - 0s - loss: 0.5068 - acc: 0.7241
Epoch 11/400
 - 0s - loss: 0.4928 - acc: 0.7471
Epoch 12/400
 - 0s - loss: 0.4867 - acc: 0.7356
Epoch 13/400
 - 0s - loss: 0.4759 - acc: 0.8046
Epoch 14/400
 - 0s - loss: 0.4632 - acc: 0.7816
Epoch 15/400
 - 0s - loss: 0.4538 - acc: 0.7816
Epoch 16/400
 - 0s - loss: 0.4425 - acc: 0.7701
Epoch 17/400
 - 0s - loss: 0.4354 - acc: 0.7931
Epoch 18/400
 - 0s - loss: 0.4229 - acc: 0.8161
Epoch 19/400
 - 0s - loss: 0.4141 - acc: 0.8046
Epoch 20/400
 - 0s - loss: 0.4016 - acc: 0.8161
Epoch 21/400
 - 0s - loss: 0.3944 - acc: 0.8276
E

<keras.callbacks.History at 0x1e75db3ccc0>

## <span style="color: yellowgreen;">8. </span> Model statistics

#### <span style="color: royalblue;">a) </span> Evaluate the Model:


In [323]:
print(f"Metric names:\n\t{model.metrics_names}\n") # ['loss', 'acc']
model.evaluate(x_test, y_cat_test)

Metric names:
	['loss', 'acc']



[3.240778771313754, 0.6136363636363636]

#### <span style="color: royalblue;">b) </span> Test the model on data that is has not seen before


In [324]:
# predicting on data it does not know
predictions = model.predict_classes(x_test)

# we have the answers because we have the y_test vector
conf_mat = confusion_matrix(y_test, predictions)
# [[True Negative, False Negative],
# [False Positive, True Positive]]

confusion = {
    "TruePositive": conf_mat[1][1],
    "TrueNegative": conf_mat[0][0],
    "FalsePositive": conf_mat[1][0],
    "FalseNegative": conf_mat[0][1],
}
accuracy = (confusion["TrueNegative"] + confusion["TruePositive"]) / sum(confusion.values())
accuracy = round(accuracy * 100, 2)

#### <span style="color: royalblue;">c) </span> Print the confusion matrix values


In [325]:
def printDict(obj):
    if isinstance(obj, dict):
        for k, v in obj.items():
            if hasattr(v, '__iter__'):
                print(k)
                printDict(v)
            else:
                print('%s\t:\t%s' % (k, v))
    elif isinstance(obj, list):
        for v in obj:
            if hasattr(v, '__iter__'):
                printDict(v)
            else:
                print(v)
    else:
        print(obj)

separator = "\t---------------------------------\n"
print(f'\nConfusion Matrix:\n{separator}\t|\t{confusion["TruePositive"]}\t|\t{confusion["FalsePositive"]}\t| \n{separator}\t|\t{confusion["FalseNegative"]}\t|\t{confusion["TrueNegative"]}\t|\n{separator}')
printDict(confusion)


Confusion Matrix:
	---------------------------------
	|	21	|	11	| 
	---------------------------------
	|	6	|	6	|
	---------------------------------

TruePositive	:	21
TrueNegative	:	6
FalsePositive	:	11
FalseNegative	:	6


#### <span style="color: royalblue;">d) </span> Print the Classification Report

1. **Accuracy:**

- Accuracy in classification problems is the **number of correct predictions** made by the model divided by the **total number of predictions**.
- Accuracy is useful when the target classes are well balanced, i.e. have approximately the same number of elements.
- Accuracy is **not** a good choice for **unbalanced** classes!
- Imagine we had 99 images of dogs and 1 image of a cat. If our model was simply a line that always predicted **dogs**, we would get 99% accuracy! In this situation we'll want to understand recall and precision.

2. **Precision:**

- The ability of a classification model to identify only the relevant data points.
- Precision is defined as the number of true positives divided by the number of true positives plus the number of false positives.

3. **Recall:**

- The ability of a model to find all relevant cases within a data set.
- The precise definition of recall is the number of true positives divided by the number of true positives plus the number of false negatives.

4. **Trade-offs between recall and precision:**

- There is often a trade-off between recall and precision.
- While recall expresses the ability to find all relevant instances in a data set, precision expresses the proportion of data points that our model said were relevant that were actually relevant.

5. **F1 score:**

- In cases where we want to find an optimal mix of precision and recall, we can combine the two metrics using what is called the F1 score.
- The F1 score is the harmonic mean of precision and recall taking into account both metrics in the following equation:
  $$ F_1 = 2 \cdot \frac{precision \cdot recall}{precision + recall}$$
- We use the harmonic mean instead of a simple average because it penalises extreme values.
- A classifier with a precision of 1.0 and a recall of 0.0 has a simple average of 0.5 but an F1 score of 0.

6. **Support:**

- Support is how many samples are in each class.


In [326]:
# here we are not using the categorical values and not one hot encoded because the predictions have the original format
# print(f"\n\tACCURACY = {accuracy}%\n")
report = classification_report(y_test, predictions, target_names=TARGET_NAMES)
# report[report.find("support"):report.find("support")+len("support")]
index = report.find("support")+len("support")
output = report[:index] + '    ACCURACY' + report[index:]

index = output.find(str(len(x_test)))+len(str(len(x_test)))
report = output[:index] + f'      {accuracy}%' + output[index:]

print(report)

                        precision    recall  f1-score   support    ACCURACY

   Metastasis, class 0       0.35      0.50      0.41        12
No metastasis, class 1       0.78      0.66      0.71        32

           avg / total       0.66      0.61      0.63        44      61.36%



## <span style="color: yellowgreen;">9. </span> Backup or restore the trained model

#### <span style="color: royalblue;">a) </span> Save the model:


In [327]:
# save the model
model.save('./models/cnnModel.h5')

#### <span style="color: royalblue;">b) </span> Load the model and use it to predict scaled data:


In [328]:
# load the model
# from keras.models import load_model
# newModel = load_model('./models/cnnModel.h5')

# use the loaded model to predict classes
# x_test is already after scaling!
# newModel.predict_classes(scaled_X_train)