In [None]:
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
import cv2
from tqdm import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Register tqdm's `progress_apply` on pandas objects; the image-loading cell
# below relies on it to show a progress bar.
tqdm.pandas()

In [None]:
# Locations of the plant-pathology-2020-fgvc7 input files.
TRAIN_PATH = "../input/plant-pathology-2020-fgvc7/train.csv"
TEST_PATH = "../input/plant-pathology-2020-fgvc7/test.csv"
SUB_PATH = "../input/plant-pathology-2020-fgvc7/sample_submission.csv"
# Directory prefix: "<image_id>.jpg" is appended to it when an image is read.
IMAGE_PATH = "../input/plant-pathology-2020-fgvc7/images/"

# Read the three CSV tables: training rows, test rows, and the submission
# template that is filled in with predictions later on.
train_data = pd.read_csv(TRAIN_PATH)
test_data = pd.read_csv(TEST_PATH)
sub = pd.read_csv(SUB_PATH)

In [None]:
IMAGE_SIZE = (40, 40)  # target size handed to cv2.resize as (width, height)


def load_image(image_id):
    """Load one image and return it as a flat float32 feature vector.

    The file ``IMAGE_PATH + image_id + ".jpg"`` is read with OpenCV,
    converted from BGR to RGB channel order, resized to ``IMAGE_SIZE`` and
    flattened into a single dimension.

    Args:
        image_id: Image file name without the ``.jpg`` extension.

    Returns:
        1-D ``np.float32`` array holding the flattened pixel values.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image file.
    """
    file_path = IMAGE_PATH + image_id + ".jpg"
    image = cv2.imread(file_path)
    # cv2.imread reports a missing or undecodable file by returning None
    # rather than raising; fail here with the offending path instead of
    # with an opaque OpenCV error inside cvtColor.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {file_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, IMAGE_SIZE)
    return image.flatten().astype(np.float32)

# Turn every image id into its flattened pixel vector. `progress_apply`
# is the tqdm-registered variant of `apply` and shows a progress bar.
train_images = train_data.loc[:, "image_id"].progress_apply(load_image)
test_images = test_data.loc[:, "image_id"].progress_apply(load_image)

In [None]:
# One-hot target columns; a column's position in this list is its class id.
label_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']

# Feature matrix: the Series of 1-D pixel vectors stacked into a 2-D array,
# one row per training image.
X = np.stack(train_images.to_numpy())

y_onehot = train_data[label_columns].to_numpy()
# The integer encoding is only meaningful when every row marks exactly one
# class, so verify that instead of silently producing out-of-range labels.
assert ((y_onehot == 0) | (y_onehot == 1)).all() and (y_onehot.sum(axis=1) == 1).all(), \
    "each training row must have exactly one label column set to 1"

# Class id per image: 0=healthy, 1=multiple_diseases, 2=rust, 3=scab.
y = y_onehot.argmax(axis=1)

In [None]:
# Depth-limited decision tree fitted on the raw pixel features.
# scikit-learn permutes the candidate features at every split, so equally
# good splits can be chosen differently from run to run; pinning
# random_state makes the fitted tree (and the submission) reproducible.
model = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X, y)

In [None]:
def eval_accuracy(X, y, clf=None):
    """Return the fraction of rows in ``X`` whose predicted class equals ``y``.

    Args:
        X: Feature matrix accepted by the classifier's ``predict_proba``.
        y: 1-D sequence of true class labels, one per row of ``X``.
        clf: Fitted classifier exposing ``predict_proba`` and ``classes_``.
            Defaults to the notebook-level ``model``.

    Returns:
        Accuracy as a float between 0 and 1.
    """
    if clf is None:
        clf = model
    y = np.asarray(y)
    proba = clf.predict_proba(X)
    # The columns of predict_proba follow clf.classes_, so translate the
    # winning column index back into a label instead of assuming that the
    # labels are exactly 0..k-1.
    pred = np.asarray(clf.classes_)[np.argmax(proba, axis=1)]
    right = np.count_nonzero(pred == y)
    return right / y.shape[0]
# Accuracy on the same X, y the model was fitted on, i.e. a training-set
# score rather than a held-out estimate.
eval_accuracy(X, y)

In [None]:
# Submission columns in class-id order (0..3), matching the label encoding
# used for training.
label_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']

X_test = np.stack(test_images.to_numpy())
test_pr = model.predict_proba(X_test)

# One probability row per submission row and one column per label; anything
# else means the model did not see every class or the tables do not line up.
assert test_pr.shape == (len(sub), len(label_columns)), (
    f"prediction shape {test_pr.shape} does not match "
    f"{len(sub)} submission rows x {len(label_columns)} label columns"
)

# Name the target columns explicitly rather than slicing 'healthy': so the
# assignment does not depend on what follows 'healthy' in the CSV.
# NOTE(review): assumes sample_submission.csv lists images in the same order
# as test.csv -- confirm.
sub.loc[:, label_columns] = test_pr
sub.to_csv('submission.csv', index=False)
sub.head()

In [None]:
from sklearn import tree

# Names of the target classes, indexed by the integer label the model was
# trained on (0=healthy, 1=multiple_diseases, 2=rust, 3=scab).
label_names = ['healthy', 'multiple_diseases', 'rust', 'scab']

# Label each node with its majority class and colour it accordingly so the
# figure can be read on its own. Names are looked up through model.classes_
# so they stay correct even if a class was absent from the training data.
# The trailing semicolon keeps the notebook from echoing the list of
# annotation objects that plot_tree returns.
tree.plot_tree(
    model,
    class_names=[label_names[int(c)] for c in model.classes_],
    filled=True,
);