# CZ/CE 4041 Machine Learning
## Plant Seedling Classification [Kaggle] 
### Approach 1: k-means Clustering Algorithm

### Team
* Dwivedee Lakshyajeet
* Gupta Jay
* Bansal Aditya
* Mantri Raghav
* Bhatia Ritik

> **Warning:** This notebook was created on the Kaggle platform and reads its data from Kaggle's input directories. It will not run as-is in a local Jupyter Notebook unless the dataset paths are updated.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Fetch Data from Kaggle's Directory

In [None]:
import os

# Root folder of the competition dataset (relative to the notebook's working directory)
data_directory = "../input/plant-seedlings-classification/"

# The "train" and "test" sub-folders live directly under the dataset root
train_data_directory, test_data_directory = (
    os.path.join(data_directory, split_name) for split_name in ("train", "test")
)

## Data Preprocessing using OpenCV

In [None]:
import cv2
import numpy as np

# References
#   - Gábor Vecsei (Kaggle)
#   - OpenCV Docs (https://docs.opencv.org/3.4/d7/d37/tutorial_mat_mask_operations.html)

"""
Creating a mask to extract the relevant features from the plant images.

    Args:
        - image: OpenCV Image 
"""
def create_mask_for_plant(image):
    """Build a binary mask selecting the pixels inside a fixed HSV colour band.

    The image is converted from BGR to HSV, thresholded to the band
    hue 60 +/- 35, saturation 100-255, value 50-255, and the resulting
    mask is morphologically closed with an 11 x 11 elliptical kernel.

        Args:
            - image: OpenCV image in BGR channel order

        Returns:
            - the mask produced by cv2.inRange after the closing operation
    """
    hue_center = 60
    hue_tolerance = 35

    # Bounds are given in (Hue, Saturation, Value) order
    hsv_lower_bound = np.array([hue_center - hue_tolerance, 100, 50])
    hsv_upper_bound = np.array([hue_center + hue_tolerance, 255, 255])

    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    in_band = cv2.inRange(hsv_image, hsv_lower_bound, hsv_upper_bound)

    # Closing (dilate, then erode) fills small gaps inside the selected regions
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (11, 11))
    return cv2.morphologyEx(in_band, cv2.MORPH_CLOSE, closing_kernel)

"""
Applying the mask on the image.

    Args:
        - image: OpenCV Image
"""
def segment_plant(image):
    """Return a copy of the image in which only the masked pixels are kept.

    The mask comes from create_mask_for_plant; the image is AND-ed with
    itself under that mask via cv2.bitwise_and.

        Args:
            - image: OpenCV image in BGR channel order

        Returns:
            - the masked image returned by cv2.bitwise_and
    """
    plant_mask = create_mask_for_plant(image)
    return cv2.bitwise_and(image, image, mask=plant_mask)

In [None]:
from glob import glob

images = []
labels = []

# Load the training data folder by folder.
# Every folder contains the images of one type of plant seedling, and the
# folder name is used as the label. Each image is read as BGR, masked with
# segment_plant, converted to grayscale and flattened to a 45 * 45 = 2025 vector.
# Folders and files are visited in sorted order so that the row order of
# `images` / `labels` is the same on every run.
for class_folder_name in sorted(os.listdir(train_data_directory)):
    class_folder_path = os.path.join(train_data_directory, class_folder_name)

    # Ignore stray files that sit next to the class folders
    if not os.path.isdir(class_folder_path):
        continue

    for image_path in sorted(glob(os.path.join(class_folder_path, "*.png"))):
        # Read the image (cv2.imread returns None instead of raising on failure)
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            print("Skipping unreadable image:", image_path)
            continue

        # Resize the image to 150 x 150
        image = cv2.resize(image, (150, 150))
        # Apply mask on the image
        image = segment_plant(image)
        # Convert image to grayscale
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Resize the image to 45 x 45
        image = cv2.resize(image, (45, 45))
        # Flatten to a 1-D feature vector
        image = image.flatten()

        # Append image to the list of all images
        images.append(image)
        # Append label to the list of all labels
        labels.append(class_folder_name)

# Array of flattened images, one row per image
images = np.array(images)

# Array of label names, aligned row-for-row with `images`
labels = np.array(labels)

In [None]:
import pprint as pp

# Label name -> integer ID; IDs follow the order in which np.unique returns the names
unique_label_names = np.unique(labels)
label_to_id_dict = dict(zip(unique_label_names, range(len(unique_label_names))))

# Integer ID -> label name (the inverse of the mapping above)
id_to_label_dict = dict(zip(label_to_id_dict.values(), label_to_id_dict.keys()))

print("Label Legend:", "-------------", sep="\n")
pp.pprint(label_to_id_dict)

In [None]:
import matplotlib.pyplot as plt

# Show one example from the loaded data together with its label name and ID
sample_index = 734
sample_label = labels[sample_index]

# Each row of `images` is a flattened 45 x 45 grayscale picture
plt.imshow(images[sample_index].reshape(45, 45), cmap="gray")
print("Label:", sample_label)
print("Label ID:", label_to_id_dict[sample_label])

In [None]:
from sklearn.preprocessing import StandardScaler

# Translate every label name into its integer ID via the lookup dictionary
label_ids = np.array([label_to_id_dict[label_name] for label_name in labels])

# Standardize each pixel feature to zero mean and unit variance before clustering
images_scaled = StandardScaler().fit(images).transform(images)

print("Number of Examples, Size of Exampe: ", images_scaled.shape, sep="")

## k-means Clustering 

### Training

In [None]:
from sklearn.cluster import KMeans

# Fit k-means with one cluster per distinct plant label, using random centroid initialisation
kmeans = KMeans(init='random', n_clusters=len(label_to_id_dict)).fit(images_scaled)

### Testing

In [None]:
# References:
#    - S Joel Franklin (https://medium.com/@joel_34096/k-means-clustering-for-image-classification-a648f28bdc47)

"""
As the k-means clustering algorithm is an unsupervised training method,
we do not know the correct labels corresponding to each cluster.
So we approximate the labels by first grouping all the images
of the same cluster and then taking the most frequent ground-truth label among them.

    Args:
        - cluster_labels: labels of the k-means cluster
        - y_train: ground truth labels
"""
def infer_cluster_label(cluster_labels, y_train):
    """Map each cluster ID to the ground-truth label that occurs most often in it.

    Only the arguments are used: the cluster IDs are taken from
    `cluster_labels` itself, so the function works for any clustering and
    does not require the IDs to be contiguous.

        Args:
            - cluster_labels: cluster ID assigned to each sample
            - y_train: ground-truth label ID of each sample (non-negative
              integers, aligned element-for-element with cluster_labels)

        Returns:
            - dict mapping cluster ID -> most frequent ground-truth label ID
              (on a tie, the smallest label ID wins)
    """
    cluster_labels = np.asarray(cluster_labels)
    y_train = np.asarray(y_train)

    ref_labels = {}

    # Iterate over the cluster IDs that actually occur, so every visited
    # cluster has at least one member and bincount never sees an empty array
    for cluster_id in np.unique(cluster_labels):
        member_labels = y_train[cluster_labels == cluster_id]
        # bincount counts occurrences per label ID; argmax picks the majority
        ref_labels[int(cluster_id)] = int(np.bincount(member_labels).argmax())

    return ref_labels

In [None]:
# Majority ground-truth label of every k-means cluster
ref_labels = infer_cluster_label(kmeans.labels_, label_ids)

# Replace each sample's cluster ID with that cluster's majority label.
# Building the array directly keeps the labels integer-typed and avoids
# allocating a placeholder filled with random floats.
predicted_labels = np.array([ref_labels[cluster_id] for cluster_id in kmeans.labels_])

## Accuracy

In [None]:
# Preview the first ten predicted label IDs next to the first ten true label IDs
print("Predicted Labels: ", predicted_labels[:10].astype('int'), sep="")

print("Actual Labels: ", label_ids[:10], sep="")

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score takes (y_true, y_pred): ground truth first, predictions second
print("Accuracy: ", end="")
print(accuracy_score(label_ids, predicted_labels))