In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
import os
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import time
from skimage.feature import blob_doh
from statistics import mean

# Wall-clock reference point; the final cell prints the elapsed seconds since this moment.
start = time.time()

# Image directories: training uses a pre-resized copy of the dataset for efficiency,
# while test images are read from the competition directory.
train_image_path = '../input/resized-plant2021/img_sz_256/'
test_image_path = '../input/plant-pathology-2021-fgvc8/test_images/'

# CSV files listing the image file names (train.csv also carries the labels).
train_df_path, test_df_path = (
    '../input/plant-pathology-2021-fgvc8/train.csv',
    '../input/plant-pathology-2021-fgvc8/sample_submission.csv',
)

train_df = pd.read_csv(filepath_or_buffer=train_df_path)

In [None]:
# Per-image feature accumulators, one group of six lists per colour channel:
# channel min / max / mean, number of detected blobs, and the max / mean of the
# blobs' third column. Each name is bound to its own fresh, empty list.
(blue_minimums, blue_maximums, blue_means,
 n_blue_blob, blue_max_blob, blue_mean_blob) = ([] for _ in range(6))

(green_minimums, green_maximums, green_means,
 n_green_blob, green_max_blob, green_mean_blob) = ([] for _ in range(6))

(red_minimums, red_maximums, red_means,
 n_red_blob, red_max_blob, red_mean_blob) = ([] for _ in range(6))

def get_max_by_col(li, col):
    """Return the largest value found in one column of a 2-D collection of rows.

    Parameters
    ----------
    li : array-like
        Rows of equal length, e.g. the array returned by ``blob_doh`` or a
        plain list of lists. May be empty.
    col : int
        1-based column number (``col - 1`` is the zero-based index, so the
        caller does not have to think in zero-based terms).

    Returns
    -------
    The maximum of the requested column, or ``0`` when ``li`` has no rows.
    """
    rows = np.asarray(li)
    # Test for emptiness by size: ``.any()`` is a value test and is not
    # available on plain Python lists.
    if rows.size == 0:
        return 0
    return rows[:, col - 1].max()

def get_mean(li, col=3):
    """Return the arithmetic mean of one column of a 2-D collection of rows.

    Parameters
    ----------
    li : array-like
        Rows of equal length, e.g. the array returned by ``blob_doh`` or a
        plain list of lists. May be empty.
    col : int, optional
        1-based column number, matching the convention of ``get_max_by_col``.
        Defaults to 3 (zero-based index 2), the column this function has
        always averaged.

    Returns
    -------
    The mean of the requested column, or ``0`` when ``li`` has no rows.
    """
    rows = np.asarray(li)
    # Test for emptiness by size: ``.any()`` is a value test and is not
    # available on plain Python lists.
    if rows.size == 0:
        return 0
    return rows[:, col - 1].mean()

# Cap on how many training images are processed; only the first rows of train_df are used.
max_train_images = 3000

# For each BGR channel index: the six accumulator lists that channel feeds, in the order
# (minimums, maximums, means, blob count, max blob value, mean blob value).
channel_accumulators = (
    (0, blue_minimums, blue_maximums, blue_means, n_blue_blob, blue_max_blob, blue_mean_blob),
    (1, green_minimums, green_maximums, green_means, n_green_blob, green_max_blob, green_mean_blob),
    (2, red_minimums, red_maximums, red_means, n_red_blob, red_max_blob, red_mean_blob),
)

for image_path in train_df['image'].tolist()[:max_train_images]:
    img = cv2.imread(train_image_path + image_path) #cv2 reads image into numpy array
    if img is None:
        # cv2.imread reports an unreadable or missing file by returning None, not by raising
        raise FileNotFoundError('Could not read training image: ' + train_image_path + image_path)

    #adapted from https://www.kaggle.com/shikhar0051/plant-disease-detection
    # Crop the image to the bounding box of its Canny edges.
    edges = cv2.Canny(img, 100, 200)
    edge_rows, edge_cols = np.nonzero(edges)

    if edge_rows.size:
        # Slice upper bounds are exclusive, exactly as before, so the features stay
        # comparable with those extracted from the test images.
        new_img = img[edge_rows.min():edge_rows.max(), edge_cols.min():edge_cols.max()]
    else:
        new_img = img  # no edges detected: nothing to crop to
    if new_img.size == 0:
        # All edges lie on a single row or column, so the crop is empty; fall back to
        # the full image rather than failing in np.min / np.max below.
        new_img = img

    #openCV uses BGR image formatting, so index 0 is blue, 1 is green, 2 is red
    for (channel_idx, minimums, maximums, means,
         blob_counts, max_blobs, mean_blobs) in channel_accumulators:
        channel = new_img[:, :, channel_idx]
        blobs = blob_doh(channel)

        #extract features
        minimums.append(np.min(channel))
        maximums.append(np.max(channel).astype(np.int16))
        means.append(np.mean(channel))
        blob_counts.append(len(blobs))
        max_blobs.append(get_max_by_col(blobs, 3))
        mean_blobs.append(get_mean(blobs))

In [None]:
# Assemble the per-image training features into one frame; the column order here must
# match the frame built for the test images, since the model is fit on these columns.
colour_features = pd.DataFrame(
    {     
    'blue_minimums' : blue_minimums,
    'blue_maximums' : blue_maximums,
    'blue_means' : blue_means,
    'n_blue_blob':  n_blue_blob,
    'blue_max_blob': blue_max_blob,
    'blue_mean_blob' : blue_mean_blob,
    
    'green_minimums' : green_minimums,
    'green_maximums' : green_maximums,
    'green_means' : green_means,
    'n_green_blob':  n_green_blob,
    'green_max_blob': green_max_blob,
    'green_mean_blob' : green_mean_blob,

    'red_minimums' : red_minimums,
    'red_maximums' : red_maximums,
    'red_means' : red_means,
    'n_red_blob':  n_red_blob,
    'red_max_blob': red_max_blob,
    'red_mean_blob' : red_mean_blob
    })

# Features were extracted for the leading rows of train_df, in order, so take the same
# number of leading labels positionally instead of repeating a hard-coded row count.
y_train = train_df['labels'].astype('category').iloc[:len(colour_features)]
assert len(y_train) == len(colour_features), 'features and labels are misaligned'

# NOTE: despite the variable name, this is a logistic-regression classifier.
linreg = LogisticRegression(solver='liblinear')
linreg.fit(colour_features, y_train)

In [None]:
test_df = pd.read_csv(test_df_path)

# Reset the accumulators so they hold test-image features only (they were previously
# filled with the training features). Each name gets its own fresh list.
(blue_minimums, blue_maximums, blue_means,
 n_blue_blob, blue_max_blob, blue_mean_blob) = ([] for _ in range(6))

(green_minimums, green_maximums, green_means,
 n_green_blob, green_max_blob, green_mean_blob) = ([] for _ in range(6))

(red_minimums, red_maximums, red_means,
 n_red_blob, red_max_blob, red_mean_blob) = ([] for _ in range(6))

# For each BGR channel index: the six accumulator lists that channel feeds, in the order
# (minimums, maximums, means, blob count, max blob value, mean blob value).
channel_accumulators = (
    (0, blue_minimums, blue_maximums, blue_means, n_blue_blob, blue_max_blob, blue_mean_blob),
    (1, green_minimums, green_maximums, green_means, n_green_blob, green_max_blob, green_mean_blob),
    (2, red_minimums, red_maximums, red_means, n_red_blob, red_max_blob, red_mean_blob),
)

for image_path in test_df['image'].tolist():
    print(image_path)
    img = cv2.imread(test_image_path + image_path) #cv2 reads image into numpy array
    if img is None:
        # cv2.imread reports an unreadable or missing file by returning None, not by raising
        raise FileNotFoundError('Could not read test image: ' + test_image_path + image_path)

    # Test images are full-size; shrink them to a fixed (width, height) before extraction.
    img = cv2.resize(img, (256,171))

    #adapted from https://www.kaggle.com/shikhar0051/plant-disease-detection
    # Crop the image to the bounding box of its Canny edges.
    edges = cv2.Canny(img, 100, 200)
    edge_rows, edge_cols = np.nonzero(edges)

    if edge_rows.size:
        # Slice upper bounds are exclusive, exactly as before, so the features stay
        # comparable with those the model was trained on.
        new_img = img[edge_rows.min():edge_rows.max(), edge_cols.min():edge_cols.max()]
    else:
        new_img = img  # no edges detected: nothing to crop to
    if new_img.size == 0:
        # All edges lie on a single row or column, so the crop is empty; fall back to
        # the full image rather than failing in np.min / np.max below.
        new_img = img

    #openCV uses BGR image formatting, so index 0 is blue, 1 is green, 2 is red
    for (channel_idx, minimums, maximums, means,
         blob_counts, max_blobs, mean_blobs) in channel_accumulators:
        channel = new_img[:, :, channel_idx]
        blobs = blob_doh(channel)

        #extract features
        minimums.append(np.min(channel))
        maximums.append(np.max(channel).astype(np.int16))
        means.append(np.mean(channel))
        blob_counts.append(len(blobs))
        max_blobs.append(get_max_by_col(blobs, 3))
        mean_blobs.append(get_mean(blobs))

# NOTE: this rebinds `colour_features` (previously the training features) to the test
# features; the prediction cell reads this name. Column order matches the training frame.
colour_features = pd.DataFrame(
    {     
    'blue_minimums' : blue_minimums,
    'blue_maximums' : blue_maximums,
    'blue_means' : blue_means,
    'n_blue_blob':  n_blue_blob,
    'blue_max_blob': blue_max_blob,
    'blue_mean_blob' : blue_mean_blob,
    
    'green_minimums' : green_minimums,
    'green_maximums' : green_maximums,
    'green_means' : green_means,
    'n_green_blob':  n_green_blob,
    'green_max_blob': green_max_blob,
    'green_mean_blob' : green_mean_blob,

    'red_minimums' : red_minimums,
    'red_maximums' : red_maximums,
    'red_means' : red_means,
    'n_red_blob':  n_red_blob,
    'red_max_blob': red_max_blob,
    'red_mean_blob' : red_mean_blob
    })

In [None]:
# Predict a label for every test image from its colour/blob features and store the
# predictions in the 'labels' column of the submission frame.
preds = linreg.predict(colour_features)
test_df['labels'] = preds

# Write the submission file without the index column.
test_df.to_csv(path_or_buf='submission.csv', index=False)

# Report total wall-clock seconds since `start` was recorded.
end = time.time()
print(end - start)