<h1 align='center'> Inception ML models on Plant_Pathology_2020 - dataset </h1>

# Importing required packages

In [None]:
#Import Libraries
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

import os
from tqdm import tqdm # to get progress bars while running
import cv2
from sklearn.utils import shuffle

from tensorflow import keras
import tensorflow as tf
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Looping over the device list (rather than indexing [0]) keeps the notebook
# runnable on CPU-only machines, where the list is empty, and applies the same
# setting to every visible GPU.
for gpu_device in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu_device, True)
# The rest of the notebook runs in TF1-style graph mode.
tf.compat.v1.disable_eager_execution()

import warnings
warnings.filterwarnings('ignore')

# Data Loading and Pre-Processing

In [None]:
# Kaggle input locations for the Plant Pathology 2020 (FGVC7) data:
# the training CSV, the test CSV and the directory holding the images.
train_csv, test_csv, image_data = (
    '/kaggle/input/plant-pathology-2020-fgvc7/train.csv',
    '/kaggle/input/plant-pathology-2020-fgvc7/test.csv',
    '/kaggle/input/plant-pathology-2020-fgvc7/images',
)

In [None]:
# This CSV holds the classification columns for each training image; its first
# column (the image id) becomes the index, which Class_Label uses for look-ups.
data = pd.read_csv(train_csv, index_col=0)

# CLASSES: the label columns in CSV order; a class's position in this list is its integer label.
class_names = list(data.columns)
np.save('class_names',class_names) # persisted as class_names.npy so a later cell can reload it
print(class_names)

print(data.shape)
data.head()

In [None]:
# Class_Label converts between image file names, class names and integer labels.
def Class_Label(n):
    '''Translate between image file names, class names and integer labels.

    Parameters
    ----------
    n : str or int
        * a file name containing a dot (e.g. ``'Train_0.jpg'``): the text before
          the first dot is looked up in the global ``data`` frame and the
          position of the first class column whose value is 1 is returned;
        * a class name found in the global ``class_names``: its integer label
          is returned;
        * an integer label within ``range(len(class_names))``: the matching
          class name is returned.

    Returns
    -------
    int or str
        The integer label or the class name as described above, or ``-1`` when
        ``n`` cannot be resolved. A file whose row flags no class also yields
        ``-1`` (previously this case fell through and returned ``None``).

    Raises
    ------
    KeyError
        If the file name's stem is not present in the index of ``data``.
    '''
    if isinstance(n, str):
        # given a file name - return integer label
        if '.' in n:
            row = data.loc[n.split('.')[0]]
            for index, name in enumerate(class_names):
                if row[name] == 1:
                    return index
            return -1
        # given a class name - return integer label
        if n in class_names:
            # list(...) so this also works after class_names is reloaded as a NumPy array
            return list(class_names).index(n)
        return -1
    # given an integer label - return class name
    if n in range(len(class_names)):
        return class_names[n]
    return -1

#### *Primary preprocessing transformations*
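- Normalizing the images: pixel values are scaled to the range [0, 1] by dividing by 255.
- Resizing the images to 229x229. Note that InceptionV3's default input size is 299x299; 229x229 is still accepted here because the network is built with `include_top=False`, which allows other input sizes. <br>
`Resizing is done with OpenCV (cv2), because resizing every image by hand in NumPy would be far more expensive.`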

In [None]:
# Load_Data reads every image whose file name starts with `file_prefix` from the
# given folders, pre-processes it and (for labeled data) looks up its class
# label from the file name via Class_Label.
# `labeled` is the supervised/unsupervised data flag.
def Load_Data(folders, file_prefix, dim, labeled = True, seed = None):
    '''Import the images from the directories and pre-process them.

    Parameters
    ----------
    folders : iterable of str
        Directories to scan (non-recursively).
    file_prefix : str
        Only files whose name starts with this prefix are loaded.
    dim : sequence of two ints
        Target size passed to ``cv2.resize`` as ``(dim[0], dim[1])``.
    labeled : bool, default True
        When True, a label is obtained for every file with ``Class_Label`` and
        images and labels are shuffled together before being returned.
    seed : int or None, default None
        ``random_state`` for the shuffle. ``None`` keeps the historical
        behaviour of drawing a seed with ``random.randint(0, 10)``; pass an
        int for a reproducible order.

    Returns
    -------
    (images, labels) if ``labeled`` else images
        ``images`` is a float32 array of RGB images scaled to [0, 1];
        ``labels`` is an int8 array of integer class labels.

    Raises
    ------
    IOError
        If a matching file cannot be decoded by ``cv2.imread``.
    '''
    import re  # local import: only needed for the natural-order sort key

    def _natural_key(name):
        # re.split with a capturing group alternates text / digit-run pieces, so
        # odd positions are always digit runs: compare those numerically so
        # that 'Test_10.jpg' sorts after 'Test_2.jpg'.
        pieces = re.split(r'(\d+)', name)
        return [int(piece) if pos % 2 else piece for pos, piece in enumerate(pieces)]

    # final outputs:
    images = []
    labels = []

    # iterate through folders
    for folder in folders:

        # os.listdir gives no ordering guarantee. Sorting makes the result
        # deterministic; for unlabeled data the row order is all that links an
        # image to its id later on.
        # NOTE(review): presumably test.csv lists ids in this same numeric
        # order (Test_0, Test_1, Test_2, ...) - confirm against the CSV.
        matching_files = sorted(
            (name for name in os.listdir(folder) if name.startswith(file_prefix)),
            key=_natural_key,
        )

        # iterate through each matching image in folder
        for file in tqdm(matching_files):

            # get pathname of each image
            img_path = os.path.join(folder, file)

            # Open and pre-process it
            image = cv2.imread(img_path)
            if image is None:
                # imread signals failure by returning None; fail with the
                # offending path instead of an opaque error in cvtColor.
                raise IOError(f"Could not read image file: {img_path}")
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # OpenCV loads BGR; convert to RGB
            image = cv2.resize(image, (dim[0], dim[1])) # resizing
            # normalization to [0, 1]; cast first so each stored image is
            # float32 rather than a float64 intermediate twice the size
            image = image.astype('float32') / 255

            # Append the image and its corresponding label to the output
            images.append(image)
            if labeled:
                labels.append(Class_Label(file))

        # Reported once per folder (inside the loop, so every folder is
        # announced and an empty `folders` no longer hits an unbound name).
        print("Folder-Loaded:",folder,"-->",file_prefix,"over.")

    # Converting the data type of the list
    images = np.array(images, dtype = 'float32') # images in float
    labels = np.array(labels, dtype = 'int8') # labels in integer encoded

    if labeled:
        # Shuffling the order of data for a good data split
        if seed is None:
            seed = random.randint(0, 10)
        images, labels = shuffle(images, labels, random_state=seed)
        return images, labels
    return images

In [None]:
# Default Image Size for all data (also used as the model's input_shape later on)
image_size = (229,229)

# Loading Data - Train-Test images
# Both sets are read from the same directory and told apart by file-name prefix;
# only the 'Train' call is labeled, so only it returns (shuffled) labels.
train_images, train_labels = Load_Data([image_data], 'Train', image_size)
test_images = Load_Data([image_data], 'Test', image_size, False)

# Data Visualization

In [None]:
print(f'Shape of Test Data : {test_images.shape}')
print(f'Shape of Train Data : {train_images.shape}')

from collections import Counter
label_count = Counter(train_labels)
print(f' Labels and their count :\n {label_count}')

# One bar per class, in label order (0 .. n-1). Taking the Counter's own key
# order would follow first appearance in the shuffled labels, so the
# colour <-> class pairing would change from run to run and a class with no
# samples would be missing from the chart.
x = list(range(len(class_names)))
y = [label_count.get(label, 0) for label in x]

fig = plt.figure(figsize = (10,5))
plt.bar(x,y,color=['cyan','blue','magenta','maroon'],  edgecolor='black')
plt.title('Labels vs Count')
plt.xticks(x,class_names) # tick positions derived from class_names rather than hard-coded
plt.xlabel('Labels')
plt.ylabel('Count')
plt.show()

In [None]:
def dis_rand_exp(images, labels, x=2, y=3):
    '''Display an ``x`` by ``y`` grid of randomly chosen images with their class names.

    Parameters
    ----------
    images : sequence of images
        Indexable collection of images accepted by ``plt.imshow``.
    labels : sequence
        Label for each image (same indexing as ``images``); each shown label
        is passed through ``Class_Label`` to obtain the caption.
    x, y : int
        Number of grid rows and columns; ``x*y`` images are drawn (with
        replacement, so the same image may appear more than once).

    Raises
    ------
    ValueError
        If ``images`` is empty (raised by ``random.randrange``).
    '''
    fig = plt.figure(figsize = (15,15))
    fig.suptitle("Random Examples of Data-Set images", fontsize=22)
    for i in range(x*y):
        plt.subplot(x,y,i+1)
        plt.xticks([])
        plt.yticks([])
        # randrange excludes its upper bound; randint(0, len(images)) includes
        # it and could therefore index one past the last image.
        rand = random.randrange(len(images))
        plt.imshow(images[rand], cmap='gray')
        plt.xlabel(Class_Label(labels[rand]), fontsize=18)
    plt.show()

# Train and Test Split

In [None]:
#from sklearn.model_selection import train_test_split
# Splitting validation data off the training data, since the test data is unlabeled
#train_images, val_images, train_labels, val_labels = train_test_split(train_images, train_labels, test_size=.15)

In [None]:
# Fixed, position-based split of the labeled data (already shuffled by Load_Data):
#   [0, 1000)    -> training
#   [1000, 1400) -> validation
#   [1400, end)  -> held-out labeled test split
x_train, y_train = train_images[:1000], train_labels[:1000]
val_images, val_labels = train_images[1000:1400], train_labels[1000:1400]
tst_images, tst_labels = train_images[1400:], train_labels[1400:]

In [None]:
print("Training data size:", x_train.shape)
print("Test data size:", tst_images.shape)
print("Validation data size:", val_images.shape)
# The preview samples from the full labeled training set, so the message says
# so (it previously announced the test dataset).
print("\nPlotting Train Dataset")
dis_rand_exp(train_images, train_labels)

# <h1 align='center'> Deep Learning Models </h1>

# INCEPTION Architecture - Model Implementation

In [None]:
# Classification head on top of an ImageNet-pretrained InceptionV3 backbone.
# The final Dense layer has one softmax unit per class, so its width is taken
# from class_names instead of being hard-coded.
model = keras.models.Sequential([
    keras.applications.inception_v3.InceptionV3(include_top=False, weights='imagenet', pooling='avg', input_shape=(image_size[0],image_size[1],3)),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(512),
    keras.layers.LeakyReLU(alpha=0.05),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(len(class_names), activation='softmax')
], name="Inception_Classifier") # public `name` argument instead of assigning the private `_name`

# Compiling our Model
# Labels are integer-encoded, hence the sparse categorical loss.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
model.compile(optimizer=tf.optimizers.SGD(learning_rate=0.0075),loss='sparse_categorical_crossentropy',metrics=['accuracy'])
model.summary()

# Training and Results

In [None]:
# Stop training when the monitored metric has stopped improving. The monitor is
# the validation loss (Keras' default, spelled out here for clarity).
# patience: Number of epochs with no improvement after which training will be stopped.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Saving the best Trained model (best = lowest validation loss seen so far)
model_file = model.name+'_Model.h5'
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(model_file, monitor='val_loss', save_best_only=True)

# NOTE: this cell used to contain a bare `tf.keras.backend.clear_session`
# (no call parentheses), which does nothing. It is dropped rather than called:
# clearing the Keras session at this point would reset the state in which the
# model above was just built.
history = model.fit(x_train,
                    y_train,
                    epochs=15, # Max no.of epochs
                    #steps_per_epoch=100,
                    batch_size=40, # size for parallel computation, higher require more GPU/CPU-RAM
                    validation_freq=1,
                    validation_data=(val_images,val_labels),
                    callbacks=[early_stopping_cb, checkpoint_cb]
                   )

# Plotting the Accuracy and Loss of the model

In [None]:
def plot_accuracy_and_loss_of_train_and_validation_dataset(history):
    '''Draw training-vs-validation accuracy and loss curves side by side.

    ``history`` must expose a ``history`` mapping with the per-epoch series
    'accuracy', 'loss', 'val_accuracy' and 'val_loss'. The left panel shows
    the two accuracy curves, the right panel the two loss curves; the x axis
    is the zero-based epoch number.
    '''
    log = history.history
    # Read every series up front so a missing key fails before any figure is created.
    series = {key: log[key] for key in ('accuracy', 'loss', 'val_accuracy', 'val_loss')}
    epochs = list(range(len(series['accuracy'])))

    figure, axes = plt.subplots(1,2)
    figure.set_size_inches(15,8)

    # (axis, train key, val key, train fmt, val fmt, train legend, val legend, title, y label)
    panels = (
        (axes[0], 'accuracy', 'val_accuracy', 'co-', 'mo-',
         'training accuracy', 'validation accuracy',
         'Training & Validation Accuracy', "Accuracy"),
        (axes[1], 'loss', 'val_loss', 'c-o', 'm-o',
         'training loss', 'validation loss',
         'Training & Validation loss', "Training & Validation Loss"),
    )
    for axis, train_key, val_key, train_fmt, val_fmt, train_legend, val_legend, title, y_label in panels:
        axis.plot(epochs, series[train_key], train_fmt, label=train_legend)
        axis.plot(epochs, series[val_key], val_fmt, label=val_legend)
        axis.set_title(title)
        axis.legend()
        axis.set_xlabel("Epochs")
        axis.set_ylabel(y_label)

# Plots however many epochs were actually recorded, which is fewer than the
# maximum when early stopping ended training.
plot_accuracy_and_loss_of_train_and_validation_dataset(history)

# Evaluation

In [None]:
# loading the saved model of Inception Classifier
# (the file written by the ModelCheckpoint callback during training)
model_file = 'Inception_Classifier_Model.h5'
model = keras.models.load_model(model_file) # rollback to best model
# Evaluated on the training split only; this is not a generalization estimate.
loss,accuracy = model.evaluate(x_train,y_train)
print("The accuracy of train image is : ",accuracy)

### Accuracy for each class

In [None]:
# Model outputs for the held-out labeled split (one row of class scores per image)
pred_Ids_of_test = model.predict(tst_images)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Collapse each row of class scores to the index of its highest-scoring class
predIdxs = np.argmax(pred_Ids_of_test, axis=1) # to get the indices of max value in each row
# Per-class report against the true labels of the held-out split
print(classification_report(tst_labels, predIdxs,target_names=class_names))

### Confusion Matrix

In [None]:
# Confusion matrix of true labels (first argument) vs predicted labels (second argument)
CM = confusion_matrix(tst_labels, predIdxs)
# Plot confusion matrix
plt.figure(figsize = (5,5))
plt.imshow(CM,interpolation='nearest',cmap='winter')
# Write each cell's count at its centre; np.ndenumerate yields ((row, col), value),
# and plt.text takes x (column) first, then y (row).
for (i, j), z in np.ndenumerate(CM):
    plt.text(j, i, z, ha='center', va='center')
plt.xlabel("y Predict")
plt.ylabel("y Test")
plt.grid(False)
plt.show()

# Evaluating UNLABELED test data and saving the output for comparison with the ML classifiers

In [None]:
test_set = pd.read_csv(test_csv) # Reading resp test-images file-names
test_pred = model.predict(test_images, batch_size=10)

# The two frames are joined purely by row position, so a length mismatch would
# silently produce NaN-padded rows; fail loudly instead.
if len(test_set) != len(test_pred):
    raise ValueError(
        f"test.csv lists {len(test_set)} images but {len(test_pred)} predictions were produced"
    )
# NOTE(review): this also assumes test_images are in the same order as the rows
# of test.csv - confirm against the order in which the images were loaded.
df_pred = pd.concat([test_set, pd.DataFrame(test_pred, columns=class_names)], axis=1).set_index("image_id")
df_pred.to_csv("/kaggle/working/inception_results.csv") # saving prediction test-images in each class into a .csv file
print(df_pred.idxmax(axis=1))
df_pred

In [None]:
# Creating test labels for evaluating standard performance measures on upcoming ML classifiers
# NOTE: these are pseudo-labels - for each test image, the class to which the
# Inception model's own prediction (df_pred) assigns the highest score.
df_pred.columns = np.arange(len(df_pred.columns)) # rename class columns to 0..n-1 so idxmax yields integer labels
test_labels = np.array(df_pred.idxmax(axis=1),'int8')
print("Test-Labels:",test_labels)

#Saving Train-Test-Valid Datasets
# The arrays are passed positionally, so np.savez_compressed stores them under
# the keys arr_0 ... arr_5 in exactly this order.
np.savez_compressed('pre-processed-datasets',x_train,y_train,test_images,test_labels,val_images,val_labels)

print("\nPloting Test Dataset")
dis_rand_exp(test_images, test_labels)

# <h1 align='center'>Transfer Learning into ML Classifiers</h1>

### Importing required packages

In [None]:
#Import Libraries
import numpy as np
import matplotlib.pyplot as plt

from tensorflow import keras
import tensorflow as tf
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Looping over the device list (rather than indexing [0]) keeps the notebook
# runnable on CPU-only machines, where the list is empty, and applies the same
# setting to every visible GPU.
for gpu_device in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu_device, True)
# The rest of the notebook runs in TF1-style graph mode.
tf.compat.v1.disable_eager_execution()

import warnings
warnings.filterwarnings('ignore')

# INCEPTION Feature Extractor - using Pre-Trained Classifier

In [None]:
# loading class-names
class_names = np.load('class_names.npy')

# loading Pre-Processed Data-Sets
# The archive is opened once, inside a context manager, so the file handle is
# always closed. Its keys are unpacked in stored order, which mirrors the order
# the arrays were saved in: train images, train labels, test images,
# test labels, validation images, validation labels.
# The handle is deliberately not called `data`, so the label DataFrame of that
# name (used by Class_Label) is not overwritten.
with np.load('pre-processed-datasets.npz') as datasets:
    (train_x_key, train_y_key,
     test_x_key, test_y_key,
     val_x_key, val_y_key) = datasets.files

    # we no longer need validation data, so we combine it with train data
    train_images = np.concatenate((datasets[train_x_key], datasets[val_x_key]), axis=0)
    train_labels = np.concatenate((datasets[train_y_key], datasets[val_y_key]), axis=0)
    test_images = datasets[test_x_key]
    test_labels = datasets[test_y_key]
print("Training data size:", train_images.shape)
print("Test data size:", test_images.shape)

# loading the saved model of Inception Classifier
model_file = 'Inception_Classifier_Model.h5'
model = keras.models.load_model(model_file)

print("\n\nPrinting layers of Model -",model.name,":")
for layer in model.layers:
    print(layer)

# The InceptionV3 backbone is a single nested layer inside the saved model;
# it is fetched by name to serve as the feature extractor.
print("\nFeature extraction from the model:")
feature_layer = model.get_layer('inception_v3')
print(feature_layer)

## Alternative method: build our own model that ends at the feature layer (its output becomes the model output), which can then be re-trained

In [None]:
# Wrap the backbone's own inputs/outputs in a standalone model. The name is
# set through the public `name` argument rather than the private `_name`.
feature_model = keras.Model(inputs = feature_layer.inputs, outputs = feature_layer.outputs, name="Inception_Feature_Extractor")

# Showing Inception Model Architecture
print("Inception Model Architecture")
keras.utils.plot_model(feature_model,'Inception.png',show_shapes=True, show_layer_names=False, rankdir='TB', expand_nested=True, dpi=75)
# rankdir='TB' -> top to bottom
#feature_model.summary()

# Extracting the deep features of our Dataset

In [None]:
# Run both image sets through the feature extractor; the resulting arrays are
# what the scikit-learn classifiers below are fitted and evaluated on.
train_features = feature_model.predict(train_images)
test_features = feature_model.predict(test_images)
print("Training data size:", train_features.shape)
print("Test data size:", test_features.shape)

# ML Model Implementation

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Classical classifiers to compare on the extracted features; the plotting cell
# draws one confusion-matrix panel per entry of this list.
classifiers = [LogisticRegression(), GaussianNB(), DecisionTreeClassifier(), KNeighborsClassifier(), SVC(kernel='linear')]

## Fitting and Plotting results

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit every classifier on the extracted training features, report its scores on
# the test features and draw its confusion matrix in its own subplot.
# NOTE(review): test_labels come from the saved archive; where that archive is
# written they are derived from the Inception model's own predictions, so
# "accuracy" here presumably measures agreement with that model - confirm.
nof_clsfrs = len(classifiers)
# Grid sized from the number of classifiers (three panels per row).
n_cols = 3
n_rows = max(1, -(-nof_clsfrs // n_cols)) # ceiling division
fig = plt.figure(figsize=(20,10))
Accuracies = {}

# NOTE: a bare `tf.keras.backend.clear_session` (no call parentheses) used to
# sit inside this loop; it did nothing and no Keras state is created here, so
# it has been removed.
for clf_idx, classifier in enumerate(classifiers):
    plt.subplot(n_rows, n_cols, clf_idx+1)

    print(f'\n\nClassifier: {str(classifier)}')
    classifier.fit(train_features, train_labels)
    test_pred = classifier.predict(test_features)

    # Dictionary key / panel title: the first 20 characters of the estimator's repr
    short_name = str(classifier)[0:20]+'...'
    acc = accuracy_score(test_labels,test_pred) *100
    Accuracies[short_name] = acc
    print(f'Accuracy  is {acc}%, \nReport:')
    print(classification_report(test_labels, test_pred))

    plt.gca().set_title(short_name)
    CM = confusion_matrix(test_labels, test_pred)
    plt.imshow(CM,interpolation='nearest',cmap='summer')
    # Distinct loop names: the cell loop no longer reuses the classifier index.
    for (row, col), count in np.ndenumerate(CM):
        plt.text(col, row, count, ha='center', va='center')
    plt.xticks(np.arange(len(class_names)),class_names, fontsize=8)
    plt.yticks(np.arange(len(class_names)))
    plt.grid(False)
# Adjust spacing once all panels exist (it has no effect on an empty figure).
fig.tight_layout()
plt.show()

## Getting the best ML model 

In [None]:
import pandas as pd
# One row per classifier: its shortened name and its accuracy in percent.
result = pd.DataFrame({'Classifier':list(Accuracies.keys()),
                 'Accuracy':list(Accuracies.values())})
print("Results:")
print(result)
print("\n\nClassifier for which max Accuracy is obtained:")
# idxmax returns an index *label*, so it is looked up with label-based .loc;
# positional .iloc only gave the same row because the index is the default 0..n-1.
print(result.loc[result["Accuracy"].idxmax()])