In [33]:
import pandas as pd
import numpy as np
import cv2
import os
from glob import glob
import mahotas
from tqdm import notebook
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

### Load the dataset

In [2]:
df = pd.read_csv('plant-pathology-2020-fgvc7/train.csv')
df.head(2)

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Train_0,0,0,0,1
1,Train_1,0,1,0,0


In [3]:
df.columns = [df.columns[0], 0,1,2,3]
df.head(2)

Unnamed: 0,image_id,0,1,2,3
0,Train_0,0,0,0,1
1,Train_1,0,1,0,0


In [4]:
temp = df.loc[:, df.columns[1:]]
y = temp.idxmax(axis=1).to_list()
y = np.array(y)

### Feature Extraction 

In [5]:
def fd_hu_moments(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    feature = cv2.HuMoments(cv2.moments(image)).flatten()
    return feature

def fd_haralick(image):    # convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # compute the haralick texture feature vector
    haralick = mahotas.features.haralick(gray).mean(axis=0)
    return haralick
 
def fd_histogram(image, mask=None):
    # convert the image to HSV color-space
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # compute the color histogram
    hist  = cv2.calcHist([image], [0, 1, 2], None, [15, 15, 15], [0, 256, 0, 256, 0, 256])
    # normalize the histogram
    cv2.normalize(hist, hist)
    return hist.flatten()


In [6]:
image_paths = [f'plant-pathology-2020-fgvc7/images/Train_{x}.jpg' for x in range(len(y))]

X = []
for path in notebook.tqdm(image_paths):
    img = cv2.imread(path)
    global_feature = np.hstack([fd_histogram(img), fd_haralick(img), fd_hu_moments(img)])
    X.append(global_feature)

X = np.stack(X)

HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




In [7]:
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_features = scaler.fit_transform(X)

In [8]:
pickle.dump(rescaled_features, open('train_features.pkl', 'wb'))

### Train Test Split 

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(rescaled_features, y, test_size=0.25, random_state=42)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(rescaled_features, y, test_size=0.25, random_state=42)

### SVM 

In [21]:
from sklearn.svm import SVC
clf = SVC(C=0.5, random_state=7)
clf.fit(X_train, y_train)

SVC(C=0.5, random_state=7)

In [36]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf.fit(X_train, y_train)

GridSearchCV(estimator=SVC(class_weight='balanced'),
             param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]})

In [37]:
clf.score(X_test, y_test)

0.6557017543859649

In [38]:
y_pred = clf.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[ 86,   1,  18,  21],
       [  7,   5,   2,   8],
       [ 29,   1, 106,  19],
       [ 30,   5,  16, 102]])

### Logistic 

In [39]:
from sklearn.linear_model import LogisticRegressionCV

clf = LogisticRegressionCV(max_iter=1000)
clf.fit(X_train, y_train)

LogisticRegressionCV(max_iter=1000)

In [40]:
clf.score(X_test, y_test)

0.6293859649122807

In [41]:
y_pred = clf.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[ 74,   0,  21,  31],
       [  1,   4,   5,  12],
       [ 26,   2, 104,  23],
       [ 26,   2,  20, 105]])