### **Plant Seedlings Classification**

In this project, we import images of various plant seedlings, train models on them and then predict each seedling's species. First, we predict with a convolutional neural network (CNN) and then compare it with a classical supervised learning classifier (KNN) and a fully connected neural network.

The dataset comprises images of 12 plant species. Source: https://www.kaggle.com/c/plant-seedlings-classification/data

Let us create a classifier capable of determining a plant's species from a photo

In [None]:
# Mount Google Drive so that files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# After mounting the drive, report the notebook's current working directory
import os


def current_path():
    """Print a heading, the current working directory and a blank line."""
    report_lines = ("Current working directory before", os.getcwd(), "")
    # One print call; the newline separator reproduces one line per item
    print(*report_lines, sep="\n")

In [None]:
# The image files are saved in the following Google Drive path; list its contents.
# NOTE(review): `! cd` runs in its own subshell, so it presumably does not change
# the notebook's working directory (`%cd` would) — confirm which was intended.
! cd /content/drive/My\ Drive/PlantClassification/
! ls /content/drive/My\ Drive/PlantClassification/

***Data Import & CNN***

In [None]:
#Importing the basic necessary packages. The remaining packages are imported later, as they are needed
import os        
import numpy as np # linear algebra
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import PIL
import PIL.Image
from tensorflow import keras
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt

In [None]:
# Categories of seedlings available in the training folder; the images of each
# category are stored in a sub-folder named after it.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the category order deterministic, and only directories are kept so that stray
# files are not mistaken for categories.
train_root = '/content/drive/MyDrive/PlantClassification/train'
plant_category = sorted(
    entry for entry in os.listdir(train_root)
    if os.path.isdir(os.path.join(train_root, entry))
)
plant_category

In [None]:
# Build the image generators and load the first training batch

from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2

# Both generators scale pixel values by 1/255. The training generator also
# reserves 20% of the training folder for validation; test images live in a
# separate folder.
test_datagen = ImageDataGenerator(rescale=1./255)
train_datagen = ImageDataGenerator(validation_split=0.2, rescale=1./255)

# Training subset (80%) read from one sub-folder per category
train_seedlings = train_datagen.flow_from_directory(
    directory='/content/drive/MyDrive/PlantClassification/train',
    subset='training',
    class_mode='categorical',
    seed=50,
    batch_size=64,
    target_size=(224, 224),  # resizes images
)

# Pull one batch of images and one-hot labels
x_train, y_train = next(train_seedlings)

In [None]:
# Validation subset (the held-out 20%) drawn from the same training folder
validation_seedlings = train_datagen.flow_from_directory(
    directory='/content/drive/MyDrive/PlantClassification/train',
    subset='validation',
    class_mode='categorical',
    batch_size=64,
    target_size=(224, 224),
)

# Pull one batch of validation images and one-hot labels
x_val, y_val = next(validation_seedlings)

In [None]:
# Test images are unlabelled, hence class_mode=None.
# shuffle=False keeps the batches in the same order as test_seedlings.filenames,
# so that predictions made from this generator can be matched back to their files
# (flow_from_directory shuffles by default).
# The path uses "MyDrive", consistent with the other paths in this notebook.
test_seedlings = test_datagen.flow_from_directory(
    '/content/drive/MyDrive/PlantClassification/test/',
    target_size=(224, 224),
    batch_size=64,
    class_mode=None,
    shuffle=False,
    )

# First batch of test images
x_test = next(test_seedlings)

In [None]:
# Number of training images available in each category, drawn as a bar chart.
# np.unique returns the distinct class ids and how many images carry each id.
unique, counts = np.unique(train_seedlings.classes, return_counts=True)
# Pair every class name with its count.
# NOTE(review): assumes each class has at least one image in this subset;
# otherwise names and counts would be misaligned — confirm.
dict1 = dict(zip(train_seedlings.class_indices, counts))

keys = dict1.keys()
values = dict1.values()

# Vertical tick labels keep the category names readable
plt.xticks(rotation='vertical')
bar = plt.bar(keys, values)

In [None]:
# Plotting a few of the images
from mpl_toolkits.axes_grid1 import ImageGrid


def show_grid(image_list, nrows, ncols, label_list=None, show_labels=False, figsize=(10,10)):
    """Display the first ``nrows * ncols`` images of ``image_list`` in a grid.

    Args:
        image_list: Indexable, sized collection of images accepted by ``imshow``.
        nrows: Number of grid rows.
        ncols: Number of grid columns.
        label_list: Optional labels, parallel to ``image_list``.
        show_labels: When true and ``label_list`` is given, each image is titled
            with ``str()`` of its label. Without ``label_list`` nothing is shown.
        figsize: Figure size handed to ``plt.figure``.

    If fewer images than grid cells are supplied, the remaining cells stay empty
    instead of raising ``IndexError``.
    """
    fig = plt.figure(None, figsize, frameon=False)
    grid = ImageGrid(fig, 111,
                     nrows_ncols=(nrows, ncols),
                     axes_pad=0.2,
                     share_all=True,
                     )
    n_cells = nrows * ncols
    n_images = min(n_cells, len(image_list))
    for i in range(n_cells):
        ax = grid[i]
        ax.axis('off')
        if i >= n_images:
            # No image left for this cell; leave it blank
            continue
        ax.imshow(image_list[i], cmap='Greys_r')
        if show_labels and label_list is not None and i < len(label_list):
            ax.set_title(str(label_list[i]))

In [None]:
# Show the first 8 images of the training batch in a 2x4 grid.
# NOTE(review): show_labels=True is passed without a label_list — confirm whether
# labels were meant to be displayed.
show_grid(x_train,2,4,show_labels=True,figsize=(10,10))

In [None]:
from keras.layers import Conv2D,MaxPooling2D,GlobalMaxPool2D
from keras.layers import BatchNormalization, Activation
from keras.layers import Dropout, Flatten, Dense

##### ***CNN Model***

In [None]:
# CNN: three convolution / pooling / dropout stages followed by a dense head
model = tf.keras.models.Sequential()

# The first convolution; the input shape is taken from the training generator
model.add(tf.keras.layers.Conv2D(32, (3, 3), activation='relu',
                                 input_shape=train_seedlings.image_shape))
model.add(tf.keras.layers.MaxPooling2D(2, 2))
model.add(tf.keras.layers.Dropout(rate=0.15))

# The second convolution
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))
model.add(tf.keras.layers.Dropout(rate=0.10))

# The third convolution
model.add(tf.keras.layers.Conv2D(64, (3, 3), activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(2, 2))
model.add(tf.keras.layers.Dropout(rate=0.15))

# Flatten the feature maps so they can feed the dense layers
model.add(tf.keras.layers.Flatten())

# Hidden layer with batch normalization and dropout
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(rate=0.10))

# 12 output neurons, one per seedling class
model.add(tf.keras.layers.Dense(12, activation='softmax'))

from tensorflow.keras.optimizers import RMSprop

# Plain SGD optimizer; accuracy is tracked under the key 'acc'
model.compile(optimizer="sgd",
              loss='categorical_crossentropy',
              metrics=['acc'])

In [None]:
## Train the CNN for 20 epochs, 50 training batches per epoch

from tensorflow.keras import datasets, layers, models

# Fit on the training generator; 10 validation batches are evaluated after each epoch
history = model.fit(
      train_seedlings,
      steps_per_epoch=50,
      epochs=20,
      validation_data = validation_seedlings,
      validation_steps = 10,
      verbose=1)

# Per-epoch metric values recorded during training
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

In [None]:
# Report the accuracy reached in the final epoch
print("Training Accuracy:")
print(history.history['acc'][-1])
print("Validation Accuracy:")
print(history.history['val_acc'][-1])

import matplotlib.pyplot as plt

# Learning curves: accuracy in the first panel, loss in the second
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
ax = ax.ravel()
for panel, met in zip(ax, ['acc', 'loss']):
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title('Model {}'.format(met))
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])

It can be seen that as the number of epochs increases, the accuracy increases and the loss decreases for the training data, but the validation data does not follow the same pattern as the training data. After validating the model, let us predict the seedling species for the test data.

In [None]:
# Run the trained CNN on the test generator; the result is later reduced with
# argmax over axis 1 to obtain one class index per image
prediction=model.predict(test_seedlings)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map predicted class indices back to species names.
# The encoder is fitted on the category folder names; its classes_ array is then
# indexed with the predicted class index of every test image.
# NOTE(review): assumes encoder.classes_ is ordered the same way as the class
# indices the model was trained with — verify against train_seedlings.class_indices.
encoder = LabelEncoder()
encoder.fit(plant_category)
# Most probable class per image (previously this was computed twice)
preds = np.argmax(prediction, axis=1)
pred_cat = encoder.classes_[preds]

In [None]:
# Final prediction: test file names paired with the predicted species.
# NOTE(review): this pairing assumes the rows of pred_cat are in the same order as
# test_seedlings.filenames, which requires the test generator to be created with
# shuffle=False — verify.
final_predictions = {'file':test_seedlings.filenames, 'species':pred_cat}
final_predictions

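It can be seen that the CNN reaches around 99% accuracy on the training data and around 67% on the validation data (the test images are unlabelled, so no test accuracy can be computed).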

 **Supervised Learning**

Let's import the data for supervised learning. The dataset imported earlier could also be reused.

In [None]:
# Dataset root on Google Drive, followed by the test and training image folders
data_dir = '/content/drive/MyDrive/PlantClassification/'
test_path = '/content/drive/MyDrive/PlantClassification/test/unknown/'
train_path = '/content/drive/MyDrive/PlantClassification/train/'

In [None]:
# Re-create the generators with a larger batch for the KNN experiment

from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2

# Pixel values are scaled by 1/255; 20% of the training folder is reserved
# for validation
test_datagen = ImageDataGenerator(rescale=1./255)
train_datagen = ImageDataGenerator(validation_split=0.2, rescale=1./255)

# Training subset, 512 images per batch
train_seedlings = train_datagen.flow_from_directory(
    directory='/content/drive/MyDrive/PlantClassification/train',
    subset='training',
    class_mode='categorical',
    #color_mode="grayscale",
    seed=50,
    batch_size=512,
    target_size=(224, 224),  # resizes images
)

# Only the first batch is used as the KNN training set
Kx_train, Ky_train = next(train_seedlings)


In [None]:
# Validation subset, 512 images per batch
validation_seedlings = train_datagen.flow_from_directory(
    directory='/content/drive/MyDrive/PlantClassification/train',
    subset='validation',
    class_mode='categorical',
    batch_size=512,
    target_size=(224, 224),
)

# Only the first batch is used as the KNN validation set
Kx_val, Ky_val = next(validation_seedlings)

In [None]:
# Inspect the shape of the KNN training batch
Kx_train.shape

In [None]:
# Inspect the shape of the KNN validation batch
Kx_val.shape

In [None]:
# Flatten every training image into one feature row (one row per image)
Kx_train = Kx_train.reshape(len(Kx_train), -1)
Kx_train.shape

In [None]:
# Flatten every validation image into one feature row (one row per image)
Kx_val = Kx_val.reshape(len(Kx_val), -1)
Kx_val.shape

In [None]:
import matplotlib.pyplot as plt

# First nine images (and their labels) of the earlier training batch
images, labels = x_train[:9], y_train[:9]

# Visualize them in a 3x3 grid, filled row by row
fig, axes = plt.subplots(3, 3, figsize=(2*3,2*3))
for i, ax in enumerate(axes.flat):
    ax.imshow(images[i], cmap='gray')
plt.show()

In [None]:
# No further scaling is applied here: Kx_train and Kx_val come from generators
# created with ImageDataGenerator(rescale=1./255), so the pixel values were
# already multiplied by 1/255 when the batches were loaded. Dividing by 255 a
# second time (as this cell previously did) squeezed the features into
# [0, 1/255] without any benefit for the distance-based KNN model.

In [None]:
#importing libraries
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import zscore
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# K-Nearest-Neighbours on the flattened images, scored on the validation batch
Klist = [3, 5, 7, 9, 11]
Kscore = []  # validation accuracy for each k in Klist

# The generators yield one-hot labels (class_mode='categorical'). Passed as-is,
# KNeighborsClassifier treats them as a multi-output target and accuracy_score
# only counts rows where all columns match, which depresses the score and can
# produce all-zero predictions. Convert them to integer class indices first.
Ky_train_labels = np.argmax(Ky_train, axis=1) if np.ndim(Ky_train) > 1 else np.asarray(Ky_train)
Ky_val_labels = np.argmax(Ky_val, axis=1) if np.ndim(Ky_val) > 1 else np.asarray(Ky_val)

# Fit and score one model per candidate k
for k in Klist:
    KNN = KNeighborsClassifier(n_neighbors=k)
    KNN.fit(Kx_train, Ky_train_labels)
    K_predict = KNN.predict(Kx_val)
    score = accuracy_score(Ky_val_labels, K_predict)
    Kscore.append(score)

# Misclassification error (1 - accuracy) per k, used to pick the optimal k.
# The variable name MSE is kept for compatibility; it is not a mean squared error.
MSE = [1 - x for x in Kscore]

optimalk = Klist[MSE.index(min(MSE))]
print("Optimal K for this dataset is : %d" %optimalk)

# Visualising k against the misclassification error
plt.plot(Klist, MSE)


KNN with k = 3 gives about 10% accuracy for this problem. Also, from the classification report warning, we can see that a few labels were not predicted at all.

In [None]:
# Validation accuracy of the first candidate in Klist (k = 3)
print(Kscore[0])

**Neural Network**

We can proceed to build Neural Network model for better prediction

In [None]:
# Loading train data: build a table with one row per training image holding its
# path relative to data_dir, its file name, a numeric category id and the
# category name. (The former `if not train_data` guard was always true right
# after `train_data = []`, so its else branch could never run.)
rows = []
for category_id, category in enumerate(plant_category):
    for file in os.listdir(os.path.join(train_path, category)):
        rows.append(['train/{}/{}'.format(category, file), file, category_id, category])
train_data = pd.DataFrame(rows, columns=['file', 'filename', 'category_id', 'category'])

In [None]:
# One-hot encode the target variable (12 seedling classes).
# to_categorical is imported from tensorflow.keras.utils because the
# keras.utils.np_utils module is no longer available in recent Keras releases.
from tensorflow.keras.utils import to_categorical
categorical_labels = to_categorical(train_data.category_id, num_classes=12)

In [None]:
# Target image height, width and channel count for the fully connected network
img_rows, img_cols, num_channel = 128, 128, 3

In [None]:
# Read every training image from disk, resize it and collect it with its label

from tqdm import tqdm
import cv2

x_feature = []  # resized images
y_feature = []  # one-hot labels, parallel to x_feature

# Walk the file paths and their one-hot labels in lockstep
for f, label in tqdm(zip(train_data.file, categorical_labels), total=len(train_data)):
    image_path = data_dir + '{}'.format(f)
    train_img = cv2.imread(image_path)
    if train_img is None:
        # cv2.imread signals an unreadable or missing file by returning None
        raise FileNotFoundError('Could not read image: {}'.format(image_path))
    # cv2.resize expects the target size as (width, height)
    train_img_resize = cv2.resize(train_img, (img_cols, img_rows), interpolation=cv2.INTER_LINEAR)
    x_feature.append(train_img_resize)
    y_feature.append(label)


In [None]:
# Validation subset resized to 128x128, 512 images per batch
validation_seedlings = train_datagen.flow_from_directory(
    directory='/content/drive/MyDrive/PlantClassification/train',
    subset='validation',
    class_mode='categorical',
    batch_size=512,
    target_size=(128, 128),
)

# First batch of validation images and one-hot labels
x_val, y_val = next(validation_seedlings)

In [None]:
# Stack the resized images into one float32 array and scale pixel values by 1/255
x_train_data = np.array(x_feature, dtype=np.float32) / 255.
print(x_train_data.shape)

In [None]:
# Stack the one-hot labels into a single array
y_train_data = np.asarray(y_feature)
y_train_data.shape

In [None]:
# Hold out 20% of the images as the validation set (fixed random_state)
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(
    x_train_data,
    y_train_data,
    random_state=2,
    test_size=0.2,
)
print(x_train.shape)
print(x_val.shape)

In [None]:
# Augmentation generator: random rotations, shifts, zooms and flips
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=[0.4, 1.5],
)

In [None]:
# NOTE(review): datagen was created without any featurewise or ZCA option, so
# this fit call is presumably unnecessary — confirm against the Keras docs.
datagen.fit(x_train)
# Prints the generator object itself, not any augmented data
print(datagen)

In [None]:
from tensorflow.keras.layers import Flatten, InputLayer
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential

In [None]:
# Fully connected network for the seedling data: flattened 128x128x3 input,
# four hidden dense layers with dropout, and a 12-way softmax output
nn_model = Sequential([
    Flatten(input_shape=(128, 128, 3)),
    Dense(units=840, activation='relu'),
    Dropout(0.2),
    Dense(units=210, activation='relu'),
    Dense(units=210, activation='relu'),
    Dropout(0.6),
    Dense(units=105, activation='relu'),
    Dense(units=12, activation='softmax'),
])
nn_model.summary()

In [None]:
# `lr` is a deprecated alias; `learning_rate` is the supported argument name
adam = Adam(learning_rate=0.0001)
nn_model.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics = ['accuracy'])
# Early stopping is currently disabled. To enable it, uncomment the line below
# (the module is imported as `tf` in this notebook) and pass the callback to fit.
#callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10, min_delta=0.001)

In [None]:
# Fitting the NN model on augmented batches.
# steps_per_epoch is left unset so that one epoch is one full pass over the
# augmented training iterator, and validation_steps is left unset so that the
# whole validation set is evaluated after each epoch (with array validation data
# a fixed validation_steps=10 only scored the first 10 batches).
with tf.device('/device:GPU:0'):
  history = nn_model.fit(datagen.flow(x_train, y_train),
          epochs=44,
          validation_data=(x_val,y_val),
          #callbacks = [callback],
          verbose = 1)

In [None]:
# Evaluating the model on the held-out validation split.
# NOTE: despite its name, nn_prediction holds the return value of evaluate()
# (presumably the loss followed by the compiled metric), not predictions.
nn_prediction=nn_model.evaluate(x_val,y_val)
print(nn_prediction)

### **Conclusion**

* The dataset has 12 categories of plant seedlings
* Images are resized to 224×224 for the CNN and KNN models and to 128×128 for the fully connected neural network
* For the classical supervised algorithm, K-Nearest Neighbours was used and the accuracy was around 10%
* The fully connected neural network achieved 29% validation accuracy
* The CNN achieved 65% validation accuracy
* The CNN gives the highest accuracy compared with the supervised (KNN) and fully connected neural network models
* Transfer learning could be a further improvement for the CNN