# Preprocess and Feature Extraction


In [4]:
pip install mahotas




In [28]:
import os
import cv2
import numpy as np
import pandas as pd
import mahotas as mt
from matplotlib import pyplot as plt
from tqdm import tqdm
from imutils import paths
%matplotlib inline

In [20]:
ds_path = "Medicinal Plants"
img_files = os.listdir(ds_path)

In [29]:
img_files = list(paths.list_images("Medicinal Plants"))
len(img_files)

14868

In [30]:
print(img_files)

['Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-001.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-002.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-003.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-004.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-005.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-006.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-007.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-008.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-009.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-010.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-011.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-012.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-013.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-014.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-015.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-016.jpg', 'Medicinal Plants\\Alpinia Galanga (Rasna)\\AG-S-017.jp

In [31]:
def create_dataset():
    names = ['area','perimeter','physiological_length','physiological_width','aspect_ratio','rectangularity','circularity', \
             'mean_r','mean_g','mean_b','stddev_r','stddev_g','stddev_b', \
             'contrast','correlation','inverse_difference_moments','entropy'
            ]
    df = pd.DataFrame([], columns=names)
    for file in img_files:
        label = file.split(os.path.sep)[-2]
        main_img = cv2.imread(file)
        #print("yes")
        if main_img is not None:
            #Preprocessing
            #print("yes")
            img = cv2.cvtColor(main_img, cv2.COLOR_BGR2RGB)
            gs = cv2.cvtColor(img,cv2.COLOR_RGB2GRAY)
            blur = cv2.GaussianBlur(gs, (25,25),0)
            ret_otsu,im_bw_otsu = cv2.threshold(blur,0,255,cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)
            kernel = np.ones((50,50),np.uint8)
            closing = cv2.morphologyEx(im_bw_otsu, cv2.MORPH_CLOSE, kernel)

            #Shape features
            #image, contours, hierachy  = cv2.findContours(closing,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
            contours, hierachy  = cv2.findContours(closing,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
            cnt = contours[0]
            M = cv2.moments(cnt)
            area = cv2.contourArea(cnt)
            if area==0:
                continue
            perimeter = cv2.arcLength(cnt,True)
            x,y,w,h = cv2.boundingRect(cnt)
            aspect_ratio = float(w)/h
            rectangularity = w*h/area
            circularity = ((perimeter)**2)/area
            

            #Color features
            red_channel = img[:,:,0]
            green_channel = img[:,:,1]
            blue_channel = img[:,:,2]
            blue_channel[blue_channel == 255] = 0
            green_channel[green_channel == 255] = 0
            red_channel[red_channel == 255] = 0

            red_mean = np.mean(red_channel)
            green_mean = np.mean(green_channel)
            blue_mean = np.mean(blue_channel)

            red_std = np.std(red_channel)
            green_std = np.std(green_channel)
            blue_std = np.std(blue_channel)

            #Texture features
            textures = mt.features.haralick(gs)
            ht_mean = textures.mean(axis=0)
            contrast = ht_mean[1]
            correlation = ht_mean[2]
            inverse_diff_moments = ht_mean[4]
            entropy = ht_mean[8]

            vector = [area,perimeter,w,h,aspect_ratio,rectangularity,circularity,\
                      red_mean,green_mean,blue_mean,red_std,green_std,blue_std,\
                      contrast,correlation,inverse_diff_moments,entropy
                     ]

            df_temp = pd.DataFrame([vector],columns=names)
            df = df.append(df_temp)
            print(file)
    return df
plantdata = create_dataset()

Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-001.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-002.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-003.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-004.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-005.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-006.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-007.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-008.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-009.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-010.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-011.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-012.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-013.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-014.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-015.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-016.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-017.jpg
Medicinal Plants\Alpinia Galanga (Rasna)\AG-S-018.jpg
Medicinal Plants\Alpinia Gal

In [32]:
plantdata.shape

(14780, 17)

In [33]:
type(plantdata)

pandas.core.frame.DataFrame

In [34]:
plantdata.to_csv("Medicinal_Plant_Data.csv")