Kaggle Plant Pathology 2020 - FGVC7 (Detecting the category of foliar diseases in apple trees)

In [50]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import cv2
from PIL import Image
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
tqdm.pandas()

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

In [51]:
# Locations of the competition files inside the Kaggle input directory.
MAIN_PATH = "../input/plant-pathology-2020-fgvc7"
SUB_PATH = f"{MAIN_PATH}/sample_submission.csv"
IMAGE_PATH = f"{MAIN_PATH}/images/"
print(os.listdir(MAIN_PATH))

# Training table (image ids plus label columns) and test table (image ids).
train_df = pd.read_csv(f"{MAIN_PATH}/train.csv")
test_df = pd.read_csv(f"{MAIN_PATH}/test.csv")

['sample_submission.csv', 'images', 'test.csv', 'train.csv']


In [52]:
# Check the number of images and the shape of each CSV table.
print('train data shape: ', train_df.shape)
print('Total images in train set: ', train_df['image_id'].count())
print('[train_csv example]\n', train_df.head(3))
print('---------------------------------------------------')
print('test data shape: ', test_df.shape)
# Count the ids of the test table itself; this line used to count train_df,
# which would silently report the wrong number if the splits differed in size.
print('Total images in test set: ', test_df['image_id'].count())
print('[test_csv example]\n', test_df.head(3))

train data shape:  (1821, 5)
Total images in train set:  1821
[train_csv example]
   image_id  healthy  multiple_diseases  rust  scab
0  Train_0        0                  0     0     1
1  Train_1        0                  1     0     0
2  Train_2        1                  0     0     0
---------------------------------------------------
test data shape:  (1821, 1)
Total images in test set:  1821
[test_csv example]
   image_id
0   Test_0
1   Test_1
2   Test_2


In [53]:
# Check for missing (unlabelled) entries: info() lists dtypes and non-null
# counts for every column.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
print('train set')
train_df.info()
print('------------------------------------------')
print('test set')
test_df.info()

train set
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1821 entries, 0 to 1820
Data columns (total 5 columns):
image_id             1821 non-null object
healthy              1821 non-null int64
multiple_diseases    1821 non-null int64
rust                 1821 non-null int64
scab                 1821 non-null int64
dtypes: int64(4), object(1)
memory usage: 71.3+ KB
None
------------------------------------------
test set
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1821 entries, 0 to 1820
Data columns (total 1 columns):
image_id    1821 non-null object
dtypes: object(1)
memory usage: 14.4+ KB
None


In [54]:
# Extract the classes: map index -> class name using every column of train_df
# except 'image_id', in column order.
temp = list(train_df.columns)
temp.remove('image_id')
classes = dict(enumerate(temp))

In [55]:
# Check the number of samples per class (sum of each label column).
for label in classes.values():
    print(f"#{label} samples: {train_df[label].sum()}")

#healthy samples: 516
#multiple_diseases samples: 91
#rust samples: 622
#scab samples: 592


multiple_diseases 클래스의 샘플 수(91개)가 다른 클래스(516~622개)에 비해 현저히 적음 (클래스 불균형)

In [56]:
# Check for duplicated image ids within each split and across the two splits.
train_id = set(train_df.image_id.values)
print(f"#Unique train images: {len(train_id)}")
test_id = set(test_df.image_id.values)
# This count describes the test split (the label used to say "train").
print(f"#Unique test images: {len(test_id)}")
# Ids present in both splits would mean the test set leaks into training.
both_images = train_id.intersection(test_id)
print(f"#Images in both train set and test set: {len(both_images)}")

#Unique train images: 1821
#Unique train images: 1821
#Images in both train set and test set: 0


image_id 기준으로 train set과 test set 각각의 내부에 중복된 데이터는 존재하지 않음
또한, train set과 test set에 동시에 포함된 image_id도 존재하지 않음 (이미지 내용 자체의 중복 여부는 확인하지 않음)

In [57]:
def load_image(image_id, size=(40, 40)):
    """Load one image as a flat float32 RGB feature vector.

    Parameters
    ----------
    image_id : str
        File stem of the image; ``IMAGE_PATH + image_id + '.jpg'`` is read.
    size : tuple of int, optional
        Target ``(width, height)`` handed to ``cv2.resize``. The default
        ``(40, 40)`` yields 40 * 40 * 3 = 4800 features per image.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array holding the resized RGB pixels. Values are
        not rescaled here.

    Raises
    ------
    FileNotFoundError
        If OpenCV could not read the image file.
    """
    path = IMAGE_PATH + image_id + '.jpg'
    image = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {path}")
    # OpenCV decodes to BGR channel order; convert to RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, size)
    return image.flatten().astype(np.float32)

# Load every image (with a progress bar) and stack the per-image vectors
# into one 2-D array per split: one row per image.
train_X_flatten = np.stack(
    train_df['image_id'].progress_apply(load_image).to_numpy()
)
test_X_flatten = np.stack(
    test_df['image_id'].progress_apply(load_image).to_numpy()
)

100%|██████████| 1821/1821 [01:12<00:00, 25.18it/s]
100%|██████████| 1821/1821 [01:21<00:00, 22.23it/s]


In [59]:
# Rescale the pixel values by 1/255.
# NOTE: the arrays are reassigned under the same names, so re-running only
# this cell divides again; re-run the image-loading cell first.
train_X_flatten = train_X_flatten / 255.
test_X_flatten = test_X_flatten / 255.

# Standardize every feature using statistics fitted on the training set only,
# then apply the same transform to the test set. The KNN cells below consume
# train_X_std / test_X_std, which were previously never defined in the
# notebook (it failed with NameError on a fresh kernel).
scaler = StandardScaler()
train_X_std = scaler.fit_transform(train_X_flatten)
test_X_std = scaler.transform(test_X_flatten)

# Convert the one-hot label columns into class indices 0..3, in the same
# column order the submission file uses.
label_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
one_hot_labels = train_df[label_columns].to_numpy()
# argmax is only a valid decoding when each row has exactly one positive label.
assert (one_hot_labels.sum(axis=1) == 1).all(), \
    "every training image must have exactly one label"
train_Y = one_hot_labels.argmax(axis=1)

print(f"Train set(flatten) shape: {train_X_flatten.shape}")
print(f"Test set(flatten) shape: {test_X_flatten.shape}")


Train set(flatten) shape: (1821, 4800)
Test set(flatten) shape: (1821, 4800)


In [60]:
# Fit a 5-nearest-neighbours classifier on the standardized training features.
# fit() returns the estimator itself, so construction and fitting are chained;
# the trailing bare name keeps the estimator repr as the cell output.
neigh = KNeighborsClassifier(n_neighbors=5).fit(train_X_std, train_Y)
neigh

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [61]:
# Mean accuracy of the fitted classifier, measured on the same samples it was
# fitted on (an in-sample figure, not a held-out validation score).
score = neigh.score(X=train_X_std, y=train_Y)
print(f"Model accuracy: {score}")

Model accuracy: 0.5744096650192202


In [62]:
# Start from the sample submission so the file keeps its expected layout.
sub = pd.read_csv(SUB_PATH)

# Per-class probabilities for every test image; they overwrite all columns
# from 'healthy' onwards.
# NOTE(review): assumes the sample submission rows are in the same order as
# test_df and that the probability columns follow the label column order.
Y_hat = neigh.predict_proba(test_X_std)
sub.loc[:, 'healthy':] = Y_hat

sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Test_0,0.4,0.2,0.0,0.4
1,Test_1,0.2,0.2,0.4,0.2
2,Test_2,0.4,0.2,0.2,0.2
3,Test_3,0.4,0.0,0.2,0.4
4,Test_4,0.0,0.0,0.6,0.4
