In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Shell escape: list the contents of the competition's input directory.
!ls /kaggle/input/plant-pathology-2020-fgvc7

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from fastai.vision.all import *
# Kaggle input folder holding the plant-pathology-2020-fgvc7 dataset.
path = Path('/kaggle/input') / 'plant-pathology-2020-fgvc7'

In [None]:
# Read the dataset's two CSV tables (train.csv first, then test.csv).
train, test = (pd.read_csv(path/csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Target columns to collapse into a single space-joined `label` string
# (the same names the submission files use).
# They are listed explicitly instead of scanning every column of `train`,
# which keeps this cell safe to re-run once `fold` has been added: a row
# assigned to fold 1 would otherwise satisfy `fold == 1` and get "fold"
# appended to its label.
class_cols = ['healthy', 'multiple_diseases', 'rust', 'scab']
train['label'] = train[class_cols].apply(
    lambda x: " ".join([cat for cat in class_cols if x[cat] == 1]), axis=1)

In [None]:
# Show how many training rows carry each distinct label string.
train['label'].value_counts()

In [None]:
from sklearn.model_selection import StratifiedKFold
# A fixed random_state makes the shuffled fold assignment identical on every
# Restart & Run All; without it each run trains and validates on different
# splits. n_splits is spelled out (5 is also the scikit-learn default)
# because the training and ensembling cells loop over exactly five folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# -1 marks "not yet assigned"; the loop below should overwrite every row.
train['fold'] = -1
fold_col = train.columns.get_loc('fold')
for i, (train_index, val_index) in enumerate(skf.split(train, train['label'])):
    print(train_index.shape, val_index.shape)
    # split() yields positional row numbers, so write through .iloc; .loc
    # would treat them as index labels and only works for a default RangeIndex.
    train.iloc[val_index, fold_col] = i

# Every row must have been a validation row in one of the folds.
assert (train['fold'] >= 0).all(), "some rows were not assigned to a fold"

In [None]:
# Save the training table to the current directory, omitting the row index.
train.to_csv('my_train_with_folds.csv', index=False)

In [None]:
# Cross-validated training: for each fold, fit a ResNet-50 with that fold held
# out for validation, save the weights, and write test-set predictions to
# submission_{fold}.csv.
for fold in range(0,5):
    print(fold)
    # Rows whose `fold` value equals the current fold are passed to
    # IndexSplitter as the validation indices.
    plant = DataBlock(blocks=(ImageBlock, CategoryBlock),
                      splitter=IndexSplitter(train[train['fold'] == fold].index),
                      get_x=ColReader('image_id', pref=path/"images", suff='.jpg'),
                      get_y=ColReader('label'),
                      item_tfms=Resize(480),
                      batch_tfms=aug_transforms(size=224, min_scale=0.75))

    dls = plant.dataloaders(train)
    # Training history goes to a per-fold CSV via the CSVLogger callback.
    learn = cnn_learner(dls,
                        resnet50,
                        loss_func=LabelSmoothingCrossEntropy(),
                        metrics=RocAuc(),
                        cbs=CSVLogger(f'history_{fold}'))

    learn.fine_tune(5, base_lr=3e-3)
    learn.save(f'model_{fold}')

    tst_dl = dls.test_dl(test)
    res = learn.get_preds(dl=tst_dl)

    # Name the prediction columns from the model's own class vocabulary so a
    # column can never be paired with the wrong class; a hard-coded list is
    # only correct while it happens to match the vocabulary order.
    cols = list(dls.vocab)
    # Convert the prediction tensor to a NumPy array explicitly rather than
    # relying on how pandas interprets a torch tensor.
    preds = pd.DataFrame(res[0].numpy(), columns=cols)
    pd.concat([test, preds], axis=1).to_csv(f'submission_{fold}.csv', index=False)

In [None]:
from scipy.stats import rankdata

# Rank-sum ensemble of the five per-fold submissions.
targets = ['healthy','multiple_diseases','rust','scab']

# one prediction table per fold, keyed by fold number and indexed by image_id
my_df = {fold: pd.read_csv(f'submission_{fold}.csv').set_index('image_id')
         for fold in range(0, 5)}

for x in targets:
    # replace each fold's scores for this target with their ranks
    # (method='min': tied values share the lowest rank of the group)
    for fold in range(0, 5):
        my_df[fold][x] = rankdata(my_df[fold][x], method='min')
    # accumulate the ranks of folds 1-4 into fold 0's table
    for fold in range(1, 5):
        my_df[0][x] = my_df[0][x] + my_df[fold][x]

# fold 0's table now holds the summed ranks; write it out as the submission
my_df[0].reset_index().to_csv('submission.csv', index=False)