In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory (referenced below by absolute /kaggle/input/ paths).
# The installs below pin fastprogress 0.2.3 and fastai 1.0.61; the commented-out lines are the unpinned git installs.
# !pip install git+https://github.com/fastai/fastai --upgrade
# !pip install git+https://github.com/fastai/fastprogress --upgrade
!pip install fastprogress==0.2.3
!pip install fastai==1.0.61
from fastai import *
from fastai.vision import *
from sklearn.metrics import roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import scipy
import skimage
import skimage.io
# Any results you write to the current directory are saved as output.

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Shell escape: list the contents of this dataset's images directory.
# NOTE(review): the loading code further down reads from
# 'plant-pathology-2020-different-size-images-crop', not from this directory — confirm
# that both datasets are attached and that this is the one you meant to inspect.
!ls /kaggle/input/plant-pathology-2020-different-size-images/images

In [None]:
# Root directory of the competition data; the CSV tables are read relative to it.
path = Path('/kaggle/input/plant-pathology-2020-fgvc7/')

In [None]:
# Load the three competition tables found under `path`:
# the training rows, the test rows, and the sample submission.
train_df = pd.read_csv(path/'train.csv')
test_df = pd.read_csv(path/'test.csv')
sample_df = pd.read_csv(path/'sample_submission.csv')

In [None]:
# Append the '.jpg' extension so that image_id can be used directly as a file name
# when images are read from disk.
# The endswith guard makes the cell safe to re-run: without it a second execution
# would turn 'xxx.jpg' into 'xxx.jpg.jpg' and every image lookup would fail.
for _frame in (test_df, train_df):
    _frame['image_id'] = [
        name if name.endswith('.jpg') else name + '.jpg'
        for name in _frame['image_id']
    ]

In [None]:
# Display the first rows of the training table.
train_df.head()

In [None]:
def get_label(row):
    """Collapse one row of per-class indicator columns into a class-name string.

    The `healthy`, `rust` and `scab` attributes of `row` are inspected in that
    order and the first one equal to 1 decides the label. A row in which none
    of the three equals 1 is labelled 'multiple_diseases'.
    """
    for class_name in ('healthy', 'rust', 'scab'):
        if getattr(row, class_name) == 1:
            return class_name
    return 'multiple_diseases'

In [None]:
# Add a single string `label` column by applying get_label to every row (axis=1).
train_df['label'] = train_df.apply(get_label, axis=1)

In [None]:
# Keep only the file name and the derived label; the per-class columns are dropped here.
# NOTE: get_label reads those dropped columns, so the labelling cell cannot be re-run
# after this one without reloading train_df first.
train_df = train_df[['image_id', 'label']]

In [None]:
# Display the first rows again to check the reduced (image_id, label) table.
train_df.head()

In [None]:
# `c` is a 2-tuple: (number of rows per label, total number of training rows).
# `Counter` is not imported explicitly in this notebook — presumably it is provided by
# the fastai star imports; TODO confirm.
c = Counter(train_df.label), len(train_df)
c

In [None]:
# Deterministic, per-class train/validation split expressed as positional row indices
# into train_df. For every class, (train + val) rows are drawn at random; the first
# `val` draws go to validation and the last `train` draws go to training.
random.seed(100)
train_sample_per_class = {'scab': 532, 'multiple_diseases': 71, 'healthy': 476, 'rust': 542}
val_sample_per_class = {'scab': 60, 'multiple_diseases': 20, 'healthy': 40, 'rust': 80}
id_label = list(enumerate(train_df.label.tolist()))


def chose(k):
    """Randomly draw train+val many row positions whose label equals `k`."""
    candidates = [pair for pair in id_label if pair[1] == k]
    n_wanted = train_sample_per_class[k] + val_sample_per_class[k]
    return [position for position, _ in random.sample(candidates, n_wanted)]


# The order of these four draws matters: each one advances the seeded RNG.
scab_chosen = chose('scab')
multiple_diseases_chosen = chose('multiple_diseases')
healthy_chosen = chose('healthy')
rust_chosen = chose('rust')

train_idx, val_idx = [], []
for class_name, picked in (('scab', scab_chosen),
                           ('multiple_diseases', multiple_diseases_chosen),
                           ('healthy', healthy_chosen),
                           ('rust', rust_chosen)):
    train_idx += picked[-train_sample_per_class[class_name]:]
    val_idx += picked[:val_sample_per_class[class_name]]

# Mix the classes within each split (train first, then validation, as the RNG is shared).
random.shuffle(train_idx)
random.shuffle(val_idx)

In [None]:
# Load the predictions of an earlier submission file; the following cells turn each
# row's class scores into a hard label and append those rows to the training table.
# NOTE(review): presumably these rows are the test-set images — confirm.
psuedo_label_submission = pd.read_csv('/kaggle/input/plant-pathology-fgvc7-2020-submissions/971968_966_964.csv')

In [None]:
# Display the first rows of the loaded submission.
psuedo_label_submission.head()

In [None]:
def get_pseudo_label(row):
    """Return the name of the class with the highest score in `row`.

    The `healthy`, `multiple_diseases`, `rust` and `scab` attributes are read in
    that order and np.argmax picks the winner, so a tie resolves to the class
    that comes first in that order.
    """
    class_names = ('healthy', 'multiple_diseases', 'rust', 'scab')
    scores = [getattr(row, class_name) for class_name in class_names]
    return class_names[int(np.argmax(scores))]

In [None]:
# Turn each row's class scores into one hard label via get_pseudo_label (row-wise, axis=1).
psuedo_label_submission['label'] = psuedo_label_submission.apply(get_pseudo_label, axis=1)

In [None]:
# Append the '.jpg' extension so image_id has the same form as in train_df.
# The endswith guard keeps the cell safe to re-run: without it a second execution
# would produce 'xxx.jpg.jpg'.
psuedo_label_submission['image_id'] = [
    name if name.endswith('.jpg') else name + '.jpg'
    for name in psuedo_label_submission['image_id']
]

In [None]:
# Reduce to the same two columns as train_df so the two tables can be stacked.
# NOTE: get_pseudo_label reads the score columns dropped here, so the labelling cell
# cannot be re-run after this one without reloading the CSV.
psuedo_label_submission = psuedo_label_submission[['image_id', 'label']]

In [None]:
# Stack the labelled training rows and the pseudo-labelled rows into one table.
# ignore_index=True renumbers the result 0..N-1; without it both inputs keep their own
# index labels (duplicates), which disagrees with the positional indices used to define
# the train/validation split over this table.
train_df_with_pl = pd.concat([train_df, psuedo_label_submission], ignore_index=True)

In [None]:
# Training positions = the sampled training rows plus every pseudo-labelled row.
# The pseudo-labelled rows occupy positions len(train_df) .. len(train_df)+len(psuedo_label_submission)-1
# because they are concatenated after train_df.
train_idx_with_pl = train_idx + list(range(len(train_df), len(train_df) + len(psuedo_label_submission)))

In [None]:
def plot_image(image_id, img_size, axis):
    """Read one image from the `images-<img_size>` folder and draw it on `axis`.

    Args:
        image_id: File name of the image inside the folder (including extension).
        img_size: Suffix selecting the `images-<img_size>` folder to read from.
        axis: Matplotlib axes object the image is drawn on.

    Returns:
        None. The only effect is the image drawn on `axis`.
    """
    images_path = '/kaggle/input/plant-pathology-2020-different-size-images-crop/images/images-' + str(img_size)
    img = skimage.io.imread(images_path + '/' + image_id)
    axis.imshow(img)
    # Deliberately no plt.show() here. The axis belongs to the caller's figure; flushing
    # the figure from inside this helper renders it after the first image when several
    # images are drawn into one grid. In a notebook the figure is displayed when the
    # cell finishes; elsewhere the caller decides when to call plt.show().

In [None]:
def get_img_id_from_idx(idx):
    """Look up the image_id stored at positional row `idx` of train_df."""
    image_ids = train_df['image_id']
    return image_ids.iloc[idx]

In [None]:
def plot_ten_images(images, title, img_size):
    """Draw up to ten training images in a 2 x 5 grid under a common title.

    Args:
        images: Sequence of positional row indices into train_df; only the first
            ten are used. Fewer than ten are accepted (the unused grid cells are
            left blank instead of raising IndexError).
        title: Figure title.
        img_size: Suffix of the `images-<img_size>` folder, forwarded to plot_image.
    """
    n_rows, n_cols = 2, 5
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(40, 20))
    fig.suptitle(title, fontsize=48)

    shown = list(images[:n_rows * n_cols])
    grid = list(axes.flat)  # row-major order: same cell as axes[i // 5][i % 5]
    for ax, idx in zip(grid, shown):
        plot_image(get_img_id_from_idx(idx), img_size, ax)
    # Hide the frames of grid cells that received no image.
    for ax in grid[len(shown):]:
        ax.axis('off')
    fig.tight_layout()

In [None]:
# plot_ten_images(multiple_diseases_chosen[10:], 'multiple_diseases', 128)

In [None]:
# fig = plt.figure(figsize=(20, 20))
# ax = fig.add_subplot(111)
# plot_image(get_img_id_from_idx(healthy_chosen[5]), 1024, ax)

In [None]:
# fig = plt.figure(figsize=(20, 20))
# ax = fig.add_subplot(111)
# plot_image(get_img_id_from_idx(rust_chosen[4]), 1024, ax)

In [None]:
# fig = plt.figure(figsize=(20, 20))
# ax = fig.add_subplot(111)
# plot_image(get_img_id_from_idx(scab_chosen[4]), 1024, ax)

In [None]:
# fig = plt.figure(figsize=(20, 20))
# ax = fig.add_subplot(111)
# plot_image(get_img_id_from_idx(multiple_diseases_chosen[4]), 1024, ax)

In [None]:
# random.seed(21)
# train_chosen = rust_chosen[-10:] + healthy_chosen[-10:] + scab_chosen[-10:] + multiple_diseases_chosen[-10:]
# random.shuffle(train_chosen)

In [None]:
# len(train_chosen)

In [None]:
# preds = []

In [None]:
# i = 0

In [None]:
# fig = plt.figure(figsize=(20, 20))
# ax = fig.add_subplot(111)
# plot_image(get_img_id_from_idx(train_chosen[i]), 1024, ax)
# prediction = input()
# preds.append(prediction)

In [None]:
# i += 1
# i, len(preds)

In [None]:
# actual = [train_df.iloc[idx].label for idx in train_chosen]

In [None]:
# actual = [l[0] for l in actual]

In [None]:
# sum([preds[i] == actual[i] for i in range(len(preds))])

In [None]:
# l_to_i = {'h': 0, 'r': 1, 's': 2, 'm': 3}

In [None]:
# preds = [l_to_i[l] for l in preds]
# actual = [l_to_i[l] for l in actual]

In [None]:
# confusion_matrix(actual, preds)

In [None]:
# for i, idx in enumerate(train_chosen):
#     print(i, train_df.iloc[idx].label)

In [None]:
def create_databunch_from_img_size(img_size, bs):
    """Build the data bunch for one `images-<img_size>` folder.

    Training items come from `train_df_with_pl`, split into train/validation by the
    positional indices `train_idx_with_pl` / `val_idx`; the images listed in
    `test_df` are attached as the test set.

    Args:
        img_size: Suffix selecting the `images-<img_size>` folder to read from.
        bs: Batch size passed to `.databunch`.

    Returns:
        The data bunch, normalized with `imagenet_stats`.
    """
    # Base probability for the optional augmentations; the lighting ones use twice this.
    p = 0.25
    # Augmentations are listed explicitly (no get_transforms defaults are used).
    train_tfms = [
        dihedral(),
        # jitter(magnitude=0.01, p=p),  # disabled
        brightness(change=(0.25, 0.75), p=2*p),
        contrast(scale=(0.80, 1.25), p=2*p),
        rotate(degrees=(-45.0, 45.0), p=p),
        skew(direction=(0, 7), magnitude=2.0, p=p),
        symmetric_warp(magnitude=(-0.3, 0.3), p=p),
        # cutout(n_holes=(16, 32), length=(img_size / 32, img_size / 16), p=p),  # disabled
        squish(scale=(0.75, 2.0), p=p),
        zoom(scale=(0.90, 1.10), p=p),
    ]
    # Second element is the validation transform list: empty, i.e. no augmentation.
    tfms = (train_tfms, [])

    images_path = '/kaggle/input/plant-pathology-2020-different-size-images-crop/images/images-' + str(img_size)

    test_data = ImageList.from_df(test_df, images_path)

    src = (ImageList.from_df(train_df_with_pl, images_path)
           .split_by_idxs(train_idx_with_pl, val_idx)
           .label_from_df()
           .add_test(test_data))

    train_data = (src
                  .transform(tfms, padding_mode='zeros')
                  .databunch(bs=bs, num_workers=4)
                  .normalize(imagenet_stats))
    return train_data

In [None]:
# Build the data bunch from the 'images-1024' folder with batch size 32.
img_size = 1024
train_data = create_databunch_from_img_size(img_size, 32)

In [None]:
# Display the data bunch summary as the cell output.
train_data

In [None]:
# Visual sanity check: show a batch of images from the data bunch.
train_data.show_batch()

In [None]:
# train_data.show_batch(ds_type=DatasetType.Valid)

In [None]:
class ROCAUCScore(Callback):
    """Metric callback reporting the one-vs-rest ROC AUC over a whole epoch.

    Targets and softmax probabilities are collected batch by batch and scored once
    at the end of the epoch, because ROC AUC cannot be averaged across batches.
    """

    def on_epoch_begin(self, **kwargs):
        # Start every epoch with empty accumulators.
        self.targets, self.preds = [], []

    def on_batch_end(self, last_output, last_target, **kwargs):
        # Import the submodule explicitly: a bare `import scipy` is not guaranteed to
        # expose `scipy.special` unless some other library happened to load it.
        from scipy.special import softmax

        self.targets.extend(last_target.tolist())
        # One vectorized softmax over the class axis instead of a Python call per row.
        # Converted to float64 first, which is what the per-row path computed in.
        logits = last_output.detach().cpu().double().numpy()
        self.preds.extend(softmax(logits, axis=1).tolist())

    def on_epoch_end(self, last_metrics, **kwargs):
        # multi_class='ovr' scores each class against the rest.
        sc = roc_auc_score(self.targets, self.preds, multi_class='ovr')
        return add_metrics(last_metrics, sc)

In [None]:
class StopTrainingAtEpoch(Callback):
    """Save the learner and request a stop once epoch number `n` has finished.

    Args:
        learn: Learner whose `save` method is called for the checkpoint.
        n: Epoch number (as passed to `on_epoch_end`) at which to save and stop.
    """

    def __init__(self, learn, n):
        self.learn, self.n = learn, n

    def on_epoch_end(self, epoch, **kwargs):
        # Every other epoch: nothing to report back.
        if epoch != self.n:
            return None
        checkpoint_name = 'model-' + str(self.n)
        self.learn.save(checkpoint_name)
        return {'stop_training': True}

In [None]:
# Per-class weights for the cross-entropy loss. Earlier candidates are kept for reference:
# ce_weight = torch.tensor([1.75, 4.0, 2.0, 1.75]).to(device)
# ce_weight = torch.tensor([3, 20, 3.5, 3]).to(device)
ce_weight = torch.tensor([1.0, 4.0, 1.0, 1.0]).to(device)
# ce_weight /= ce_weight.sum()
# Weighting is currently switched off: this assignment overrides the tensor above.
ce_weight = None
print(ce_weight)

learner_metrics = [
    ROCAUCScore(),
    accuracy,
    error_rate,
    Precision(average='macro'),
    Recall(average='macro'),
    FBeta(beta=1.0, average='macro'),
]
learn = cnn_learner(
    train_data,
    models.resnet18,
    metrics=learner_metrics,
    loss_func=CrossEntropyFlat(reduction='mean', weight=ce_weight),
    ps=[0.5, 0.5, 0.5],
    wd=0.01,
    path='/kaggle/working',
    callback_fns=[ShowGraph],  # , partial(AccumulateScheduler, n_step=128)]
)
# learn = mixup(learn, alpha=0.4)

In [None]:
# Start from the previously trained 'stage-05' weights shipped as an input dataset,
# then unfreeze the model for fine-tuning.
learn.load('/kaggle/input/plant-pathology-2020-fgvc7-best-single-model/stage-05');
learn.unfreeze()

In [None]:
# Rebuild the data bunch with batch size 16 (it was built with 32 above) and attach it
# to the learner for the training run below.
img_size = 1024
train_data = create_databunch_from_img_size(img_size, 16)
learn.data = train_data

In [None]:
# Number of training batches, read via the learner and via the data bunch; both values
# should agree because learn.data was just set to train_data.
len(learn.data.train_dl), len(train_data.train_dl)

In [None]:
# learn.lr_find(start_lr=1e-7, end_lr=1e1,
#               num_it=20, #len(learn.data.train_dl)+1,
#               stop_div=True)

In [None]:
# learn.recorder.plot(suggestion=True, skip_end=0, skip_start=0)

In [None]:
# Save the loaded starting point under the name 'stage-05' (the learner was created
# with path='/kaggle/working').
learn.save('stage-05')

In [None]:
# One-cycle training for 20 epochs with learning rates given as slice(1e-7, 1e-3).
# StopTrainingAtEpoch(learn, 19) saves 'model-19' and requests a stop when epoch 19 ends.
# NOTE(review): presumably epochs are zero-indexed, making 19 the last of the 20 — confirm.
learn.fit_one_cycle(20, slice(1e-7, 1e-2 / 10), start_epoch=0, callbacks=[StopTrainingAtEpoch(learn, 19)])

In [None]:
# Save the fine-tuned weights as 'stage-06'.
learn.save('stage-06')

In [None]:
# Reload the 'stage-06' checkpoint saved in the previous cell (trailing ';' hides the output).
learn.load('stage-06');

In [None]:
# Evaluate the loss and metrics on the training data loader.
# NOTE(review): presumably this loader still applies the training augmentations and
# shuffling, so the numbers are not directly comparable to validation — confirm.
learn.validate(learn.data.train_dl)

In [None]:
# Evaluate the loss and metrics on the validation data loader.
learn.validate(learn.data.valid_dl)

In [None]:
# Plot the learning-rate schedule recorded during training.
learn.recorder.plot_lr()

In [None]:
# Build the interpretation helper from the learner and fetch the per-item losses with
# their indices, as returned by top_losses().
interp = ClassificationInterpretation.from_learner(learn)
losses,idxs = interp.top_losses()

In [None]:
# Show the 16 highest-loss examples, without the heatmap overlay.
interp.plot_top_losses(16, figsize=(25,25), heatmap=False)

In [None]:
# Same 16 highest-loss examples, this time with the heatmap overlay enabled.
interp.plot_top_losses(16, figsize=(25,25), heatmap=True)

In [None]:
# Plot the confusion matrix from the interpretation object.
interp.plot_confusion_matrix(figsize=(12, 12), dpi=60)

In [None]:
# List the class pairs the model confuses most often.
interp.most_confused()

In [None]:
# Test-time-augmented predictions with scale=1.10; also returns targets and losses.
# NOTE(review): no ds_type is passed, so presumably this runs on the validation set,
# as the variable name suggests — confirm. This rebinds `losses` from the
# interpretation cell above.
valid_tta_preds, y, losses = learn.TTA(scale=1.10, with_loss=True)

In [None]:
# Accuracy of the TTA predictions against their targets.
accuracy(valid_tta_preds, y)

In [None]:
# Confusion matrix of the TTA predictions: targets first, then the argmax class over dim 1.
confusion_matrix(y.cpu().numpy(), valid_tta_preds.argmax(1).cpu().numpy())

In [None]:
# train_preds, y, losses = learn.get_preds(DatasetType.Train, with_loss=True)
# roc_auc_score(y.cpu().numpy(), train_preds.cpu().numpy(), multi_class='ovr')

In [None]:
# plt.hist(losses, bins=24);

In [None]:
# val_preds, y, losses = learn.get_preds(DatasetType.Valid, with_loss=True)
# roc_auc_score(y.cpu().numpy(), val_preds.cpu().numpy(), multi_class='ovr')

In [None]:
# plt.hist(losses, bins=24);

In [None]:
# Export the learner to 'model.pkl' for later inference.
learn.export('model.pkl')

In [None]:
# Predictions for the test set attached to the data bunch. This rebinds `y`.
# NOTE(review): the test items have no real labels, so presumably `y` is a placeholder here — confirm.
preds, y = learn.get_preds(DatasetType.Test) 

In [None]:
# Write the predicted class probabilities into the sample submission and save it.
# Columns are addressed by class name rather than by position, so a difference between
# the learner's class order and the CSV's column order cannot silently swap classes.
class_names = list(learn.data.classes)
assert set(class_names) == set(sample_df.columns[1:]), \
    "learner classes do not match the submission columns"
assert len(preds) == len(sample_df), \
    "number of predictions does not match the number of submission rows"
sample_df[class_names] = preds.numpy()
sample_df.to_csv('submission.csv', index=False)

In [None]:
# Display the first rows of the filled-in submission.
sample_df.head()