# Import Libraries!

In [None]:
import numpy as np #Importing Numpy for Mathematical Operations.

import pandas as pd #Importing Pandas for Data Analysis.

from pathlib import Path #The pathlib module simplifies working with files and folders.

                         #A Path identifies a file: an optional sequence of directory
                         #names terminated by the final file name, including its extension.
        
import os.path #This Module implements some useful functions on Pathnames.

import matplotlib.pyplot as plt #Importing for Plotting. 

from IPython.display import Image, display, Markdown #Certain Display Functionalities.

import matplotlib.cm as cm #To modify the ColorMaps.

from sklearn.model_selection import train_test_split #Sklearn model selection for splitting data arrays into two subsets: 
                                                     #For Training Data and for Testing Data. With this function,
                                                     #you don't need to divide the dataset manually. By default, 
                                                     #Sklearn train_test_split will make random partitions for the two subsets.
            
from sklearn.metrics import confusion_matrix #For Confusion Matrix

import tensorflow as tf
from time import perf_counter #The time module provides time-related functions; perf_counter is a high-resolution clock for timing code.

import seaborn as sns #Data visualization library built on top of Matplotlib.

def printmd(string):
    """Render *string* as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

In [None]:
# Root folder of the dataset.
image_dir = Path('../input/cancer-dataset/Dataset')

# Collect the path of every PNG file below image_dir.
#
# glob filters a directory tree down to the files of interest. "*" is a
# wildcard standing for whatever text does not have to match, so a pattern
# ending in "*" returns every file of a directory; "**" additionally
# descends into sub-directories. '**/*.png' therefore matches all PNG files
# at any depth below image_dir.
filepaths = list(image_dir.glob(r'**/*.png'))

# Derive the label of each image from the name of the folder containing it.
#
# os.path.split(path) returns the pair (head, tail): tail is the last path
# component and head is everything leading up to it. For example
# '/home/User/Desktop/file.txt' has the tail 'file.txt' and the head
# '/home/User/Desktop'. The tail never contains a slash: it is empty when the
# path ends with a slash, and the head is empty when the path has no slash.
# Index 0 selects the head and index 1 the tail, so the first split drops the
# file name and the second split returns the folder name.
labels = [os.path.split(os.path.split(path)[0])[1] for path in filepaths]

In [None]:
# Turn both lists into named pandas Series; the names become column headers.
# The Path objects are converted to plain strings.
filepaths = pd.Series(filepaths, name='Filepath').astype(str)
labels = pd.Series(labels, name='Label')

# axis selects the direction an operation works along (default: axis=0):
#   axis=0 acts on all the ROWS in each COLUMN,
#   axis=1 acts on all the COLUMNS in each ROW.
#
# +------------+---------+--------+
# |            |  A      |  B     |
# +------------+---------+--------+
# |      0     | 0.626386| 1.52325|----axis=1----->
# +------------+---------+--------+
#              |         |
#              | axis=0  |
#              ↓         ↓
#
# The pipeline below:
#   1. concatenates the two Series along the column axis, giving one
#      DataFrame with the columns 'Filepath' and 'Label';
#   2. shuffles it: sample() draws rows without replacement and frac=1 asks
#      for all of them, i.e. every row in random order;
#   3. renumbers the rows: drop=True stops reset_index from keeping the old
#      index as an extra column.
image_df = (
    pd.concat([filepaths, labels], axis='columns')
    .sample(frac=1)
    .reset_index(drop=True)
)

# Show the first five rows of the result.
image_df.head()

# Visualization!

In [None]:
# Display some pictures of the dataset (up to 12) with their labels.

# plt.subplots draws several plots on one figure: given the number of rows
# and columns of the grid it returns the pair (fig, axes), a single figure
# plus an array of Axes objects.
#   - nrows, ncols: number of rows/columns of the subplot grid.
#   - subplot_kw: dict of keywords forwarded to each subplot as it is created.
#   - xticks/yticks: positions at which tick marks are drawn; empty lists
#     remove the ticks so that only the image is visible.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 15),
                         subplot_kw={'xticks': [], 'yticks': []})

# axes.flat is an attribute (not a function) of the NumPy array of Axes: a
# 1-D iterator over the 2-D grid, so all panels can be filled in one loop.
# zip() stops at its shortest input, so a dataset with fewer images than
# panels leaves the remaining panels empty instead of failing on a missing
# row label.
for ax, filepath, label in zip(axes.flat, image_df['Filepath'], image_df['Label']):
    ax.imshow(plt.imread(filepath))  # Draw the image.
    ax.set_title(label)              # Use the label as the panel title.

# tight_layout adjusts the subplot parameters so the panels fit the figure.
plt.tight_layout()

plt.show()

In [None]:
# Plot how many pictures each category contains.

# value_counts() returns one entry per unique label with its number of
# occurrences; the labels form the index of the resulting Series.
vc = image_df['Label'].value_counts()

plt.figure(figsize=(20, 15))
sns.barplot(
    x=vc.index,        # x-axis: the label names.
    y=vc,              # y-axis: the number of pictures per label.
    palette="rocket",  # Seaborn colour palette used for the bars.
)
plt.title("Number of pictures of each category", fontsize=11)
plt.show()

# Load the Images with a generator!

In [None]:
def create_gen():
    """Create the image data generators and the three batch iterators.

    Takes no arguments: it reads the module-level DataFrames ``train_df`` and
    ``test_df``, which must exist before the call and provide the columns
    'Filepath' and 'Label'.

    The random augmentation options are constructor arguments of
    ImageDataGenerator and are therefore set there. flow_from_dataframe()
    does not act on them, so supplying them to that method would leave the
    images un-augmented.

    Returns:
        The tuple ``(train_generator, test_generator, train_images,
        val_images, test_images)``: the two ImageDataGenerator objects,
        followed by the iterators over the training subset, the validation
        subset (20 % of ``train_df``) and the test set.
    """
    # Pre-processing applied to every image; the MobileNetV2 variant scales
    # the pixel values to the range [-1, 1], sample-wise.
    preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

    # Generator for training/validation data, with data augmentation.
    # NOTE(review): val_images is produced by this same generator, so the
    # validation batches receive the same random augmentation as the training
    # batches. Use a second generator with the same validation_split and no
    # augmentation options if un-augmented validation is wanted.
    train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=preprocess,
        validation_split=0.2,    # Fraction of train_df reserved for validation.
        rotation_range=30,       # Random rotation, in degrees (0-180).
        zoom_range=0.15,         # Random zoom inside the picture.
        width_shift_range=0.2,   # Random horizontal shift; a float below 1 is
        height_shift_range=0.2,  # a fraction of the total width / height.
        shear_range=0.15,        # Random shearing: each point is displaced in a
                                 # fixed direction, proportionally to its signed
                                 # distance from a line parallel to it.
        horizontal_flip=True,    # Randomly flip images horizontally; suitable
                                 # when left/right orientation carries no meaning.
        fill_mode="nearest",     # How pixels created by a rotation or shift are
                                 # filled. One of:
                                 #   'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k)
                                 #   'nearest':  aaaaaaaa|abcd|dddddddd
                                 #   'reflect':  abcddcba|abcd|dcbaabcd
                                 #   'wrap':     abcdabcd|abcd|abcdabcd
    )

    # Generator for the test data: pre-processing only, no augmentation.
    test_generator = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=preprocess)

    # Options shared by all three iterators.
    flow_options = dict(
        x_col='Filepath',          # Column holding the image paths.
        y_col='Label',             # Column holding the class names.
        target_size=(224, 224),    # Every image is resized to this size.
        color_mode='rgb',          # One of "grayscale", "rgb", "rgba":
                                   # 1, 3 or 4 channels.
        class_mode='categorical',  # One-hot encoded labels, e.g. [0, 0, 1, 0],
                                   # for mutually exclusive classes.
        batch_size=16,             # Images per batch (Keras default: 32).
    )

    # flow_from_dataframe() reads the images listed in the DataFrame on the
    # fly and applies the generator's pre-processing and augmentation while
    # the model trains, so no processed copies have to be written to disk.
    train_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        shuffle=True,       # Shuffle the data (the default).
        seed=0,             # Seed for shuffling and random transformations.
        subset='training',  # Only meaningful because validation_split is set.
        **flow_options
    )

    val_images = train_generator.flow_from_dataframe(
        dataframe=train_df,
        shuffle=True,
        seed=0,
        subset='validation',
        **flow_options
    )

    # The test iterator is not shuffled, so its batches follow the row order
    # of test_df.
    test_images = test_generator.flow_from_dataframe(
        dataframe=test_df,
        shuffle=False,
        **flow_options
    )

    return train_generator, test_generator, train_images, val_images, test_images

In [None]:
# Split the data into a training part (70 %) and a testing part (the rest).
#
# random_state seeds the internal random number generator that decides which
# rows go to which part. A fixed value makes that generator produce the same
# sequence of random numbers on every run.
train_df, test_df = train_test_split(
    image_df,
    train_size=0.7,
    shuffle=True,
    random_state=1,
)

In [None]:
# Build the generators and the train/validation/test iterators.
(train_generator,
 test_generator,
 train_images,
 val_images,
 test_images) = create_gen()
print('\n')

In [None]:
from tensorflow.keras import layers #Importing layers from Keras as it 
                                    #lets you create a model layer by layer for most problems. 
    
!pip install tensorflow-addons==0.9.1
import tensorflow_addons

# TensorFlow Addons is a repository of contributions that conform 
# to well-established API patterns, but implement new functionality not available in core TensorFlow. 
# TensorFlow natively supports a large number of operators, layers, metrics, losses, and optimizers.
# However, in a fast moving field like ML, there are many interesting new developments that cannot be 
# integrated into core TensorFlow (because their broad applicability is not yet clear, or it is mostly 
# used by a smaller subset of the community).

from tensorflow_addons.metrics import F1Score, CohenKappa #Using more Metrics.

In [None]:
def _conv_block(filters, pool_size=(2, 2), **conv_kwargs):
    """Return the layers of one Conv2D -> MaxPooling2D -> BatchNormalization stage.

    Args:
        filters: Number of filters the convolution learns (the first required
            Conv2D argument).
        pool_size: Window of the max-pooling layer, which reduces the
            dimensions of the output volume.
        **conv_kwargs: Extra keyword arguments for Conv2D, e.g. ``input_shape``
            for the first layer of the model.

    Returns:
        A list with the three layers, in order.
    """
    return [
        # Kernel size: typical values are (1, 1), (3, 3), (5, 5) and (7, 7);
        # larger kernels are rare.
        # padding="same" pads the input so the output volume keeps the spatial
        # size of the input; with "valid" the volume is not zero-padded and
        # shrinks through the natural application of the convolution.
        layers.Conv2D(filters, (3, 3), padding="same", activation='relu',
                      **conv_kwargs),
        layers.MaxPooling2D(pool_size=pool_size),
        layers.BatchNormalization(),
    ]


# One output unit per class known to the training iterator, so the network
# always matches the width of the one-hot labels it is trained on.
num_classes = len(train_images.class_indices)

# The number of filters grows while the spatial size of the volume shrinks,
# a common practice in CNN design.
model = tf.keras.Sequential([
    *_conv_block(16, input_shape=(224, 224, 3)),
    *_conv_block(32),
    *_conv_block(64),
    *_conv_block(64),
    *_conv_block(128),
    *_conv_block(128),
    *_conv_block(256),
    # Seven 2x2 poolings take 224 -> 112 -> 56 -> 28 -> 14 -> 7 -> 3 -> 1, so
    # the last stage uses a 1x1 window, which leaves the volume unchanged.
    *_conv_block(256, pool_size=(1, 1)),

    # Collapse the multi-dimensional volume into one dimension.
    layers.Flatten(),

    # 1st FC layer. Dense creates a fully connected layer, in which every
    # output depends on every input.
    layers.Dense(33, activation='relu'),

    # Dropout ignores a randomly chosen set of units during training, i.e.
    # they take no part in that forward or backward pass. Use: to prevent
    # over-fitting.
    layers.Dropout(0.15),

    # 2nd FC layer: class probabilities.
    layers.Dense(num_classes, activation='softmax'),
])

# Optimizers adjust model attributes such as the weights in order to reduce
# the loss. Adam (Adaptive Moment Estimation) uses past gradients when
# computing the current update; this momentum smooths the progression of
# learning and can accelerate training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'AUC']
)

model.summary()

# Parameter count of a Conv2D layer:
# param_number = output_channel_number * (input_channel_number * kernel_height * kernel_width + 1)

In [None]:
# Train the model; epochs is the number of passes over the training iterator.
# The validation iterator is passed as validation data.
history = model.fit(
    train_images,
    validation_data=val_images,
    epochs=50,
)

In [None]:
# history.history holds one list of values per tracked quantity; as a
# DataFrame each quantity becomes a column.
history_df = pd.DataFrame(history.history)

# One figure per quantity, each comparing the training and validation curves.
for plot_title, metric_columns in (
    ("Accuracy", ['accuracy', 'val_accuracy']),
    ("Loss", ['loss', 'val_loss']),
):
    history_df[metric_columns].plot()
    plt.title(plot_title)
    plt.show()

In [None]:
# Evaluate the trained model on the test iterator.
#
# verbose selects how much progress output is shown:
#   verbose=0 shows nothing (silent),
#   verbose=1 shows an animated progress bar such as
#             [==============================]
# During training (model.fit), verbose=2 prints one line per epoch instead,
# e.g. "Epoch 1/50".
#
# These notes are comments rather than a bare string literal, so the notebook
# does not echo them as the output of this cell.
results = model.evaluate(test_images, verbose=0)

In [None]:
# results starts with the loss, followed by the metrics given to
# model.compile in their listed order, so index 1 is the accuracy.
test_loss = results[0]
test_accuracy = results[1]

printmd(" ## Test Loss: {:.5f}".format(test_loss))
printmd("## Accuracy on the test set: {:.2f}%".format(test_accuracy * 100))
print('\n')

In [None]:
# Predict the label of the test images.
class_scores = model.predict(test_images)

# Keep the index of the highest score of every prediction.
# axis selects the direction an operation works along (default: axis=0):
#   1. axis=0 acts on all the ROWS in each COLUMN,
#   2. axis=1 acts on all the COLUMNS in each ROW,
# so axis=1 yields one index per row, i.e. per image.
predicted_indices = np.argmax(class_scores, axis=1)

# class_indices maps class name -> index; invert it to index -> class name
# and translate the predicted indices back into names.
labels = {index: name for name, index in train_images.class_indices.items()}
pred = [labels[index] for index in predicted_indices]

# Display the result:
print(f'The first 5 predictions: {pred[:5]}')

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

# True labels of the test set, in the row order of test_df.
y_test = list(test_df['Label'])

# Every score is rounded to 5 decimals.
print('Accuracy:', np.round(metrics.accuracy_score(y_test, pred), 5))

# Precision, recall and F1 are averaged over the classes with
# average='weighted'. A weighted average multiplies each value by a weight
# before the final calculation, instead of giving every value an identical
# weight as a simple average does.
for caption, score_fn in (
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('F1 Score:', metrics.f1_score),
):
    print(caption, np.round(score_fn(y_test, pred, average='weighted'), 5))

print('Cohen Kappa Score:', np.round(metrics.cohen_kappa_score(y_test, pred), 5))
print(classification_report(y_test, pred))

In [None]:
# Every class that occurs among the true or the predicted labels, sorted.
# The same list fixes the row/column order of the matrix and labels the
# heatmap axes, so the tick labels always line up with the cells, even when
# the model predicts a class that is absent from y_test.
class_names = sorted(set(y_test) | set(pred))

# normalize: one of 'true', 'pred', 'all' (default None). Normalizes the
# confusion matrix over the true (rows) or predicted (columns) conditions, or
# over the whole population; with None the matrix is not normalized.
cf_matrix = confusion_matrix(y_test, pred, labels=class_names, normalize='true')

plt.figure(figsize=(25, 20))

# annot=True writes the data value into each cell.
sns.heatmap(cf_matrix, annot=True,
            xticklabels=class_names, yticklabels=class_names)

plt.title('Normalized Confusion Matrix')
plt.show()