In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import random
import imageio
import cv2
import scipy.ndimage as ndi
from keras.models import Sequential
#Import from keras_preprocessing not from keras.preprocessing
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
# Conv2D are objects to reprsent convolutions 
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
from keras.callbacks import ModelCheckpoint 

## Neural Networks
**Neural Networks** are **parametric**: the model has parameters whose values we need to optimize. Neural networks are **universal function approximators**: when neurons are combined, they can approximate any function. If a network has more than two hidden layers, it is deep. 
## Design Parameters
+ Architecture
+ Number of layers
+ Number of neurons in a layer
+ Activation functions: typically use the same type of activation function for all of the neurons in one layer. The output layer activation function is determined by the task we want to do. 


More layers and a larger number of neurons in a layer mean more model capacity: the model can handle bigger, more complex data. If the capacity is much greater than the data requires, the model will overfit. Dense layers have a weight for every connection. 
+ Weights = parameters (the strength of the connections between neurons; they are determined as a result of optimization) 
+ Hyperparameters = design parameters (e.g., number of neurons in a layer, number of layers in a network, type of activation function to use in each layer) 

## Artificial Neural Network 
We are using the Keras library to create neural networks and train these neural networks to classify images of tumor tissue. The model is the Sequential type. Outputs of one layer are provided as inputs only to the next layer.

The neural network models we are using to classify the images as having a tumor or not having a tumor, are **Sequential** type. The outputs of one layer are provided as inputs only to the next layer. 
<br>
Create a neural network with **Dense** layers, meaning that each unit in each layer is connected to all of the units in the previous layer. For example, each unit in the first layer is connected to the pixels in the input images. The Dense layer object receives the following arguments: number of units in that layer, activation function for the units, and an input_shape keyword (for the first layer in the network). <br>
There are 10 units here to start. More units would increase the complexity of the network and its capacity to represent complex inputs. To facilitate learning, we are using a **rectified linear unit (a relu) as the activation**. The input_shape keyword argument tells us how many inputs each of these units should expect. 

In [None]:
def append_ext(fn: str) -> str:
    """Return the image id ``fn`` with the ``.tif`` file extension appended."""
    extension = ".tif"
    return "".join((fn, extension))
# Read both label tables with every column as text; flow_from_dataframe with
# class_mode="categorical" expects the label column to hold strings.
traindf = pd.read_csv(
    "/kaggle/input/histopathologic-cancer-detection/train_labels.csv",
    dtype=str,
)
testdf = pd.read_csv(
    "/kaggle/input/histopathologic-cancer-detection/sample_submission.csv",
    dtype=str,
)

# Turn each id into a file name so it can be used as x_col by
# flow_from_dataframe.
for frame in (traindf, testdf):
    frame["id"] = frame["id"].apply(append_ext)

# Shared generator for training and validation: pixel values are scaled by
# 1/255 and 25% of the rows are reserved for the validation subset.
datagen = ImageDataGenerator(rescale=1.0 / 255.0, validation_split=0.25)

# Number of labelled training rows.
print(traindf.shape[0])

In [None]:
# used this tutorial: https://medium.com/@vijayabhaskar96/tutorial-on-keras-flow-from-dataframe-1fd4493d237c
# Training batches: the "training" subset of traindf, reshuffled so that the
# network does not see the images in a fixed order.
train_generator = datagen.flow_from_dataframe(
    dataframe=traindf,
    directory="/kaggle/input/histopathologic-cancer-detection/train/",
    # local alternative: directory="./train/"
    x_col="id",
    y_col="label",
    subset="training",
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode="categorical",
    target_size=(96, 96),
)

# Validation batches: the "validation" subset of traindf.
# shuffle must be False here: later cells compare the model's predictions with
# valid_generator.classes, which is stored in dataframe order. A shuffled
# generator would yield images in a different order than their labels, making
# the confusion matrix, classification report and ROC curve meaningless.
valid_generator = datagen.flow_from_dataframe(
    dataframe=traindf,
    directory="/kaggle/input/histopathologic-cancer-detection/train/",
    # local alternative: directory="./train/"
    x_col="id",
    y_col="label",
    subset="validation",
    batch_size=14,
    seed=42,
    shuffle=False,
    class_mode="categorical",
    target_size=(96, 96),
)

In [None]:
# The test images only need the same 1/255 rescaling as the training images;
# no validation split is configured on this generator.
test_datagen = ImageDataGenerator(rescale=1.0 / 255.0)

# Batches of test images, kept in dataframe order (shuffle=False).
test_generator = test_datagen.flow_from_dataframe(
    dataframe=testdf,
    directory="/kaggle/input/histopathologic-cancer-detection/test/",
    # local alternative: directory="./test/"
    x_col="id",
    y_col="label",
    target_size=(96, 96),
    class_mode="categorical",
    batch_size=2,  # previously tried: batch_size=32
    shuffle=False,
    seed=42,
)

In [None]:
# Initialize a sequential model: each layer feeds only the next one.
model = Sequential()
# Flatten each 96x96 RGB image into a vector of 96*96*3 = 27,648 values.
# The input shape is declared here because a Sequential model only uses
# input_shape when it is given on the first layer.
model.add(Flatten(input_shape=(96, 96, 3)))
# first hidden layer: 10 fully connected units with relu activation
model.add(Dense(10, activation='relu'))
# second hidden layer, also with 10 units and relu activation
model.add(Dense(10, activation='relu'))
# The output is a fully connected layer with one unit per class. Softmax turns
# the two outputs into class probabilities that sum to 1, which matches the
# one-hot labels produced by class_mode="categorical".
model.add(Dense(2, activation='softmax'))

Once the model is constructed, it needs to be compiled before it can be fit to data. Specify the optimizer that will be used to fit the model and the loss function that will be used in optimization. Optionally, you can also specify a list of metrics that the model will keep track of (see the list with 'accuracy' in the code below). The **loss function** is a measure of error: how far the prediction was from the true label. The loss function is used for training optimization. The function should be differentiable, so MSE or cross-entropy are good options for classification tasks. (Because Accuracy, TPR, and F1 are not differentiable, we typically do not optimize them directly.) 

In [None]:
# Compile the model so that it can be fit.
# Loss: categorical cross-entropy, which is appropriate for classification.
# Optimizer: adam.
# Reported metrics: accuracy, binary accuracy and AUC.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        'binary_accuracy',
        'AUC',
    ],
)

In [None]:
"""
fit the model to training data
network adjusts its weights through backpropagation
and gradient descent
3 epochs means it will go over all of the training data
3 times 
"""
# NOTE(review): batch_size is not read by any cell visible here; each
# generator carries its own batch_size.
batch_size = 14

# Whole batches available per pass over each generator (floor division, so a
# trailing partial batch is not counted).
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size
STEP_SIZE_TEST = test_generator.n // test_generator.batch_size

# Train for a single epoch (epochs=1), validating on the validation generator.
model.fit_generator(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    epochs=1,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
)

In [None]:
# Evaluate on the validation generator. The step count has to be the one
# computed from valid_generator (STEP_SIZE_VALID) so that the evaluation covers
# the validation data once; a step count computed from another generator would
# cover it a different number of times.
model.evaluate_generator(generator=valid_generator,
                         steps=STEP_SIZE_VALID)

In [None]:
# Display the labels of the values reported by the model (the loss followed
# by the metrics passed to compile), to interpret the evaluation output.
model.metrics_names

In [None]:
# Sanity check: the generator's sample count and the length of its label
# array, printed one per line.
print(valid_generator.n, len(valid_generator.classes), sep="\n")

In [None]:
# Number of batches needed to cover every validation sample. Ceiling division
# keeps a trailing partial batch, so the number of prediction rows always
# equals valid_generator.n (floor division would silently drop the remainder
# whenever n is not a multiple of the batch size).
STEP_SIZE_TEST = (
    (valid_generator.n + valid_generator.batch_size - 1)
    // valid_generator.batch_size
)

# Rewind the generator so that prediction starts from its first batch.
valid_generator.reset()
pred = model.predict_generator(valid_generator,
                               steps=STEP_SIZE_TEST,
                               verbose=1)

In [None]:
# Number of prediction rows; expected to match the validation sample count.
print(len(pred))

In [None]:
# adapted from: https://gist.github.com/RyanAkilos/3808c17f79e77c4117de35aa68447045
# Confusion matrix and classification report for the validation predictions.
from sklearn.metrics import classification_report, confusion_matrix

# Hard class decision per sample: the index of the larger of the two outputs.
# NOTE(review): the comparison below is only meaningful when the rows of pred
# are in the same order as valid_generator.classes.
y_pred = pred.argmax(axis=1)
target_names = ["NoTumor", "Tumor"]

print('Confusion Matrix')
print(confusion_matrix(valid_generator.classes, y_pred))

print('Classification Report')
print(classification_report(valid_generator.classes, y_pred,
                            target_names=target_names))

In [None]:
from sklearn.metrics import roc_curve
# Hard 0/1 decisions, kept under the same name used by the report cell.
y_pred = np.argmax(pred, axis=1)
# A ROC curve is traced by sweeping a threshold over a continuous score, so
# roc_curve must receive the predicted score of the positive class rather than
# the hard decisions (which would collapse the curve to a single operating
# point). Column 1 of pred is the score for class index 1, the class that
# roc_curve treats as positive when the true labels are 0/1.
y_score = pred[:, 1]
fpr, tpr, thresholds = roc_curve(valid_generator.classes, y_score)

In [None]:
# Import the function under an alias so that storing the result in `auc`
# (the name the plotting cell reads) does not shadow the function itself.
from sklearn.metrics import auc as compute_auc
auc = compute_auc(fpr, tpr)

In [None]:
# Draw the ROC curve, with the area under the curve shown in the legend.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.set_title('ROC Curve', fontsize=18)
ax.set_xlabel('False Positive Rate', fontsize=16)
ax.set_ylabel('True Positive Rate', fontsize=16)
ax.legend(loc=4)
plt.show()

## Evaluation
The Kaggle competition notes, "Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target."
+ tf.keras.metrics.AUC computes the approximate AUC (Area under the curve) for ROC curve via the Riemann sum. 

It would be ineffective for what we want to do, but classifying all images as not having tumor tissue would have an accuracy of about 60%. For the classifier to have any sort of functionality, it is important to pay attention to other metrics, particularly **AUC** which is how the Kaggle competition scores submissions. <br>
+ **AUC** (area under the ROC curve): the ROC is a probability curve, and AUC is a measure of the degree of separability. It tells how well the model distinguishes between the classes. 
+ **Precision** refers to how exact the predictions are. 
+ **Recall** refers to completeness. 
+ **F1 Score** combines precision and recall into a single higher-is-better metric. In practice, different types of misclassifications incur different costs. Ultimately, when diagnosing cancer, we would not expect precision and recall to have the same relative importance.  


Resources consulted about Evaluation:
+ https://neptune.ai/blog/keras-metrics
+ https://keras.io/api/metrics/accuracy_metrics/
+ https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5

### Evaluation Metrics:
+ binary_accuracy computes the mean accuracy rate across all predictions for binary classification problems.

Ultimately we want the model to be able to learn how to generalize. 

## Convolutional Neural Network (CNN)
In the first neural network constructed, each unit in the first layer had a unit connecting it separately to a pixel in the image. We know that pixels in most images are not independent from their neighbors. Natural images contain spatial correlations. How can we use these correlations to our advantage? Our own visual system uses these correlations, identifying edges at a particular location in the visual view. The convolution is the fundamental operation that Convolutional Neural Networks use to process images.
The kernel slides over the input image. At each location, the values in the window are multiplied by the values in the kernel and added up to create the value of one pixel in the resulting array. The resulting array is called a feature map. The feature map contains a map of the features in the image represented by the kernel.
The convolution of an image with a kernel summarizes a part of the image as the sum of the multiplication of that part of the image with the kernel.

One of the articles I read gives this helpful definition:
A convolution is the simple application of a filter to an input that results in an activation. Repeated application of the same filter to an input results in a map of activations called a feature map, indicating the locations and strength of a detected feature in an input, such as an image.<br>
It also provides this description of the role of a convolution in a **CNN**. 
In the context of a convolutional neural network, a convolution is a linear operation that involves the multiplication of a set of weights with the input, much like a traditional neural network. Given that the technique was designed for two-dimensional input, the multiplication is performed between an array of input data and a two-dimensional array of weights, called a filter or a kernel.
https://machinelearningmastery.com/convolutional-layers-for-deep-learning-neural-networks/

Keras has objects to represent convolutions. Instead of having every unit connected to every unit in the previous layer (like in the Dense layer), each unit is connected to the previous layer through a convolution. The output of a Convolutional Layer is the convolution of a kernel over the image input. During training of a network that has convolutions, the kernels in each unit would be adjusted using backpropagation. A convolutional layer has fewer weights than a Dense layer. Each unit in a Dense layer has one weight for each pixel in the image. Each unit in a Convolution layer has only one weight for each pixel in the kernel. 
<br>
If the kernel size is 3, the kernel of each unit has 9 pixels. If the layer has 10 units, it would have 90 parameters (for a single-channel input, not counting biases; with the 3-channel RGB images used here, each kernel has 3 × 3 × 3 = 27 weights, so 10 units have 270 weights plus 10 biases). 
<br> Flatten is a connection between convolution and densely connected layers. This takes the output of the convolutional layer that we previously referred to as a feature map and flattens it into a one-dimensional array. This is the expected input into the densely connected layer that is the output layer. <br>
The dense layer has two units because there are two classes. We are using the sigmoid activation function. 

In [None]:
# Build the CNN by handing the layer stack to the Sequential constructor.
model = Sequential([
    # Convolutional layer: 10 units with 3x3 kernels and relu activation.
    # As the first layer it also declares the 96x96 RGB input shape.
    Conv2D(10, kernel_size=3, activation='relu', input_shape=(96, 96, 3)),
    # Flatten the output of the convolutional layer; this layer translates
    # between the image processing and classification parts of the network.
    Flatten(),
    # Output layer for the 2 categories.
    # NOTE(review): sigmoid outputs are independent per unit and need not sum
    # to 1; the dense model above uses softmax with the same loss. Confirm
    # which activation is intended here.
    Dense(2, activation='sigmoid'),
])

In [None]:
# Compile the CNN so that it can be fit.
# Loss: categorical cross-entropy, which is appropriate for classification.
# Optimizer: adam.
# Reported metrics: accuracy, binary accuracy and AUC.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        'accuracy',
        'binary_accuracy',
        'AUC',
    ],
)

In [None]:
# NOTE(review): batch_size is not read by any cell visible here; each
# generator carries its own batch_size.
batch_size = 32

# Whole batches available per pass over each generator (floor division, so a
# trailing partial batch is not counted).
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size
STEP_SIZE_TEST = test_generator.n // test_generator.batch_size

# Train for three epochs, i.e. three passes over the training batches,
# validating on the validation generator.
model.fit_generator(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    epochs=3,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
)

In [None]:
# Evaluate on the validation split (the labelled data supplied as
# validation_data during fitting). The step count has to be the one computed
# from valid_generator (STEP_SIZE_VALID) so that the evaluation covers the
# validation data once; a step count computed from another generator would
# cover it a different number of times.
model.evaluate_generator(generator=valid_generator,
                         steps=STEP_SIZE_VALID)

In [None]:
# Display the labels of the values reported by the model (the loss followed
# by the metrics passed to compile), to interpret the evaluation output.
model.metrics_names

In [None]:
# Number of batches needed to cover every validation sample. Ceiling division
# keeps a trailing partial batch, so the number of prediction rows always
# equals valid_generator.n (floor division would silently drop the remainder
# whenever n is not a multiple of the batch size).
STEP_SIZE_TEST = (
    (valid_generator.n + valid_generator.batch_size - 1)
    // valid_generator.batch_size
)

# Rewind the generator so that prediction starts from its first batch.
valid_generator.reset()
pred = model.predict_generator(valid_generator,
                               steps=STEP_SIZE_TEST,
                               verbose=1)

With the previous neural network, we needed to reshape the images before feeding them to the network. Here, we want the pixels to retain their spatial relationships so we don't do that. That is why we specified the input shape when we defined the convolutional layer.

## Why CNN? What do convolution filters do?
+ Learning filters
+ Weight sharing: weights are shared across the image 
+ Computational efficiency
+ Translational invariance
+ Robust, less overfitting: since there are fewer weights, the network is less likely to overfit  