# **Import Libraries**

In [None]:
import os
import warnings
from glob import glob

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

%matplotlib inline

warnings.filterwarnings('ignore')

# **Load dataset**

In [None]:
# Collect the path of every training image (one sub-folder per class) and every test image.
# glob() returns paths in arbitrary, file-system dependent order, so the lists are sorted
# to keep the later seeded splits and the submission order reproducible across machines.
trainFiles = sorted(glob('../input/plant-seedlings-classification/train/*/*.*'))
testFiles = sorted(glob('../input/plant-seedlings-classification/test/*.*'))

In [None]:
# Containers that the loading cells fill in.
trainImg = []        # resized RGB training images
trainLabel = []      # class name of each training image
numTrain = len(trainFiles)

testImg = []         # resized RGB test images
testFileName = []    # file name of each test image
numTest = len(testFiles)

# `num` used to be assigned twice, which silently discarded the training count.
# It keeps its previous final value (the number of test files) for backward compatibility.
num = numTest

In [None]:
# Start from empty containers so that re-running this cell does not append duplicates.
trainImg = []
trainLabel = []

for img in trainFiles:
    imgBgr = cv2.imread(img)
    if imgBgr is None:
        # cv2.imread returns None for unreadable files; fail with a clear message
        # instead of an obscure error inside cvtColor.
        raise ValueError("Could not read training image: {0}".format(img))
    imgRgb = cv2.cvtColor(imgBgr, cv2.COLOR_BGR2RGB)      # OpenCV loads images as BGR
    trainImg.append(cv2.resize(imgRgb, (128, 128)))
    # The label is the name of the image's parent directory (portable across path separators).
    trainLabel.append(os.path.basename(os.path.dirname(img)))

trainX = np.asarray(trainImg)
trainY = pd.DataFrame(trainLabel, columns=["Label"])
print(trainX.shape)
print(trainY.shape)

In [None]:
# Start from empty containers so that re-running this cell does not append duplicates.
testImg = []
testFileName = []

for img in testFiles:
    imgBgr = cv2.imread(img)
    if imgBgr is None:
        # cv2.imread returns None for unreadable files; fail with a clear message
        # instead of an obscure error inside cvtColor.
        raise ValueError("Could not read test image: {0}".format(img))
    imgRgb = cv2.cvtColor(imgBgr, cv2.COLOR_BGR2RGB)      # OpenCV loads images as BGR
    testImg.append(cv2.resize(imgRgb, (128, 128)))
    # Keep only the file name (portable across path separators) for the submission file.
    testFileName.append(os.path.basename(img))

testDataX = np.asarray(testImg)
print(testDataX.shape)

In [None]:
# Peek at 10 randomly chosen rows of the dependent (label) feature
trainY.sample(n = 10)

In [None]:
# Show the first 10 images (raw pixel arrays) of the independent feature
trainX[:10]

# **Shape of dataset**

In [None]:
# Report the shape of the image array, then of the label frame
for dataset in (trainX, trainY):
    print(dataset.shape)

# **Visualize first 9 images of dataset**

In [None]:
# Visualize the first 9 images of the training dataset on a 3 x 3 grid
plt.figure(figsize=(15, 15))
for cell in range(1, 10):
    plt.subplot(3, 3, cell)
    plt.imshow(trainX[cell - 1], cmap = plt.get_cmap('RdYlGn'))
plt.show()

# **Data Preprocessing**

## **Normalization**

In [None]:
# Cast both image sets to float32 and rescale the pixel values by dividing by 255
trainX = trainX.astype('float32') / 255.0
testDataX = testDataX.astype('float32') / 255.0

## **Gaussian Blurring**

In [None]:
# Smooth every training image in place with a 5x5 Gaussian kernel to reduce noise and detail
for idx in range(len(trainX)):
  trainX[idx] = cv2.GaussianBlur(trainX[idx], (5, 5), 0)

In [None]:
# Smooth every test image in place with the same 5x5 Gaussian kernel used for training
for idx in range(len(testDataX)):
  testDataX[idx] = cv2.GaussianBlur(testDataX[idx], (5, 5), 0)

In [None]:
# Check shape of training dataset after normalization and blurring
trainX.shape

## **Visualize data after pre-processing**

In [None]:
# Show the first 9 training images again, now normalized and Gaussian-blurred
plt.figure(figsize=(15, 15))
for cell in range(1, 10):
    plt.subplot(3, 3, cell)
    plt.imshow(trainX[cell - 1], cmap = plt.get_cmap('RdYlGn'))
plt.show()

In [None]:
# Count how many training images belong to each plant seedling category
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(data = trainY, x = 'Label', ax = ax);
plt.xticks(rotation = 45);

**Insight:**
The images in the dataset are not equally distributed across the plant seedling categories.

# **Make data compatible**

## **Convert labels to one-hot vectors**

In [None]:
# Import Libraries
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder

# Create LabelEncoder object
lblEncoding = LabelEncoder()

# Encode the class names as integer ids.
# The raw `trainLabel` list is used as the source instead of `trainY`: this cell overwrites
# `trainY` with the one-hot matrix, so reading from `trainY` made the cell fail when re-run.
# It also gives LabelEncoder the 1-D input it expects rather than a one-column DataFrame.
trainYEncoded = lblEncoding.fit_transform(trainLabel)

# Convert the integer ids into one-hot vectors (12 seedling classes, matching the model output)
trainY = tf.keras.utils.to_categorical(trainYEncoded, num_classes = 12)

## **Data Augmentation of Images**

In [None]:
# Augmentation pipeline: random horizontal/vertical flips followed by random rotations
data_augmentation = tf.keras.Sequential([
  tf.keras.layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"),
  tf.keras.layers.experimental.preprocessing.RandomRotation(0.2),
])

# Preview 9 random augmentations of the first training image.
# Only that image is passed through the layers (as a batch of one) instead of the whole
# training array, which was augmented 9 times just to display a single picture.
# training=True makes sure the random transforms are applied outside of model.fit().
sampleImage = trainX[:1]
plt.figure(figsize=(10, 10))
for i in range(9):
  augmented_image = data_augmentation(sampleImage, training=True)
  ax = plt.subplot(3, 3, i + 1)
  plt.imshow(augmented_image[0])
  plt.axis("off")

## **Split the dataset into training, testing and validation sets**

### **Splitting training dataset into training and testing dataset**

In [None]:
# Hold out 30% of the samples; the remaining 70% stay in the training set
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(
    trainX,
    trainY,
    random_state = 1,
    test_size = 0.3,
)

### **Splitting testing dataset into testing and validation dataset**

In [None]:
# Split the held-out data into a test set (70%) and a validation set (30%).
# The results must be assigned to testX/testY: assigning them to trainX/trainY replaced the
# real training set with a subset of the test data, so the model was trained on the very
# samples it was later evaluated on.
testX, valX, testY, valY = train_test_split(testX, testY, test_size = 0.3, random_state = 1)

## **Check the shape of data**

In [None]:
# Print shape of training, testing and validation independent and dependent dataset
for setName, dataset in (
    ("trainX", trainX), ("testX", testX), ("valX", valX),
    ("trainY", trainY), ("testY", testY), ("valY", valY),
):
    print("Shape of {0} set : {1}" .format(setName, dataset.shape))

## **Insight: Data is already compatible with Keras models**

# **Building CNN**

## **Define CNN Layers**

In [None]:
# Import Keras classes from tensorflow.keras so that they come from the same package as the
# tf.keras `data_augmentation` layers added to the model below (mixing the standalone
# `keras` package with tf.keras layers can be rejected by Sequential.add).
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, GlobalMaxPooling2D, InputLayer, BatchNormalization, GlobalMaxPool2D, GlobalAveragePooling2D
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam

# Create CNN model architecture: augmentation, four Conv -> MaxPool -> BatchNorm stages,
# then a dense classifier head with dropout.
model = Sequential()                                                                            # Create Sequential object
model.add(InputLayer(input_shape=(128,128,3,)))                                                 # Input layer: 128x128 RGB images
model.add(data_augmentation)                                                                    # Data augmentation layer (random flips and rotations)
model.add(Conv2D(256, kernel_size=(5,5), padding = "same", activation='relu'))                  # 2D Convolution layer
model.add(MaxPool2D(pool_size = (2,2)))                                                         # Max Pool layer
model.add(BatchNormalization())                                                                 # Normalization layer

model.add(Conv2D(128, kernel_size=(3,3), padding = "same", activation='relu'))                  # 2D Convolution layer
model.add(MaxPool2D(pool_size = (2,2)))                                                         # Max Pool layer
model.add(BatchNormalization())                                                                 # Normalization layer

model.add(Conv2D(256, kernel_size=(5,5), padding = "same", activation='relu'))                  # 2D Convolution layer
model.add(MaxPool2D(pool_size = (2,2)))                                                         # Max Pool layer
model.add(BatchNormalization())                                                                 # Normalization layer

model.add(Conv2D(128, kernel_size=(3,3), padding = "same", activation='relu'))                  # 2D Convolution layer
model.add(MaxPool2D(pool_size = (2,2)))                                                         # Max Pool layer
model.add(BatchNormalization())                                                                 # Normalization layer

model.add(Flatten())                                                                            # Flattening the data
model.add(Dense(256, activation='relu'))                                                        # Dense Layer
model.add(Dropout(0.5))                                                                         # Drop half of the dense activations during training
model.add(Dense(12, activation='softmax'))                                                      # Output layer: one probability per class (12 classes)
model.summary()

## **Set optimizer as Adam and loss function as categorical cross-entropy**

In [None]:
# Compile the model using Adam optimizer and loss as categorical_crossentropy.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer = optimizer, loss = "categorical_crossentropy", metrics = ["accuracy"])

# **Fit and evaluate model and print confusion matrix**

## **Fit the model on training dataset**

In [None]:
# Train for 30 epochs in mini-batches of 32, tracking loss/accuracy on the validation set
history = model.fit(
    x = trainX,
    y = trainY,
    batch_size = 32,
    epochs = 30,
    verbose = 1,
    validation_data = (valX, valY),
)

## **Plot validation and training accuracy and loss value over different epochs to check for overfitting**

In [None]:
# Plot the training history: loss in the top panel, accuracy in the bottom panel.
# A growing gap between the training and validation curves indicates overfitting.
fig, ax = plt.subplots(2, 1, figsize=(22, 7))

# Loss per epoch (the stray `axes=` keyword was dropped: the line is already drawn on ax[0])
ax[0].plot(history.history['loss'], color='b', label="Training loss")
ax[0].plot(history.history['val_loss'], color='r', label="validation loss")
ax[0].set_title("Loss per epoch")
ax[0].set_ylabel("Loss")
legend = ax[0].legend(loc='best', shadow=True)

# Accuracy per epoch
ax[1].plot(history.history['accuracy'], color='b', label="Training accuracy")
ax[1].plot(history.history['val_accuracy'], color='r',label="Validation accuracy")
ax[1].set_title("Accuracy per epoch")
ax[1].set_xlabel("Epoch")
ax[1].set_ylabel("Accuracy")
legend = ax[1].legend(loc='best', shadow=True)

fig.tight_layout()

## **Evaluate the model**

In [None]:
# Measure accuracy on the data the model was trained on
_, acc = model.evaluate(x = trainX, y = trainY, verbose = 1)
trainAccuracyPct = acc * 100.0
print ("Training Accuracy : {0}" .format(trainAccuracyPct))

In [None]:
# Measure accuracy on the held-out test split
_, acc = model.evaluate(x = testX, y = testY, verbose = 1)
testAccuracyPct = acc * 100.0
print ("Test Accuracy : {0}" .format(testAccuracyPct))

## **Confusion Matrix**

In [None]:
# Shape of the unlabeled competition test images (not the labeled testX split)
testDataX.shape

In [None]:
# Predict testing dataset: take the most probable class index for every sample
y_pred = np.argmax(model.predict(testX), axis = 1)

# Convert the one-hot test labels to class indices.
# The dimensionality guard keeps the cell re-runnable: once testY is already 1-D,
# np.argmax(testY, axis = 1) would raise because axis 1 no longer exists.
if np.ndim(testY) > 1:
    testY = np.argmax(testY, axis = 1)

In [None]:
# Build the confusion matrix (rows: true class, columns: predicted class) and draw it as a heatmap
from sklearn.metrics import confusion_matrix, classification_report

confusion_mtx = tf.math.confusion_matrix(testY, y_pred)

plt.figure(figsize=(20, 15))
sns.heatmap(confusion_mtx, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')

In [None]:
# Create classification report to output different metrics of classification
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred). With the arguments swapped, precision and
# recall were reported under each other's name and support counted predictions, not truths.
# `labels`/`target_names` make the rows show the seedling class names instead of bare indices.
classLabels = np.arange(len(lblEncoding.classes_))
print(classification_report(testY, y_pred, labels = classLabels, target_names = lblEncoding.classes_))

In [None]:
# Print Label Encoding classes: position i holds the class name encoded as integer i
lblEncoding.classes_

## **Predict and Visualize testDataX**

In [None]:
# Show the first 9 competition test images, each titled with the species the model predicts
plt.figure(figsize = (10, 10))
for cell in range(1, 10):
    idx = cell - 1
    plt.subplot(3, 3, cell)
    plt.imshow(testDataX[idx], cmap = plt.get_cmap('RdYlGn_r'))
    # The model expects a batch, so wrap the single image as shape (1, 128, 128, 3)
    img = testDataX[idx].reshape((1, 128, 128, 3))
    classProbabilities = model.predict(img)
    predictOutput = np.argmax(classProbabilities, axis=-1)
    predictedSpecies = lblEncoding.inverse_transform(predictOutput)
    plt.title(predictedSpecies[0])
plt.show()

In [None]:
# Predict the species of every competition test image.
# A single batched model.predict call replaces one call per image, which paid the
# per-call overhead hundreds of times for the same result.
predictOutput = np.argmax(model.predict(testDataX), axis=-1)

# Map the predicted class indices back to species names
testPrediction = list(lblEncoding.inverse_transform(predictOutput))

In [None]:
# Assemble the submission table: one row per test image with its predicted species
df = pd.DataFrame({
    'file': testFileName,
    'species': testPrediction,
})

df.sample(n = 20)

In [None]:
# Write the submission file (column names as the header row, no index column)
df.to_csv(
    './sample_submission.csv',
    header=df.columns,
    index=False,
)