# Training Data

In [0]:
import catalyst
from pathlib import Path
from catalyst.dl import utils

# Single seed for the whole notebook; reused below wherever a random_state
# argument is accepted.
SEED = 42
# Seed catalyst's global RNG state up front. Which libraries this covers is
# decided inside catalyst and is not visible from this notebook.
utils.set_global_seed(SEED)

In [0]:
# Dataset root, relative to the working directory. The trailing slash matters:
# later cells build paths by plain string concatenation (f"{ROOT}train/...").
ROOT = "ICLR/"

In [28]:
from catalyst.utils import (
    create_dataset, create_dataframe, get_dataset_labeling, map_dataframe
)
# Collect the training files matched by the glob and flatten them into a
# two-column frame: the class tag and the path of each file.
dataset = create_dataset(dirs=f"{ROOT}train/train/*", extension="*")
df = create_dataframe(dataset, columns=["class", "filepath"])

# Mapping from class tag to integer label; class_names holds the tags ordered
# by their label id (stable sort, so ties keep the mapping's own order).
tag_to_label = get_dataset_labeling(df, "class")
class_names = sorted(tag_to_label, key=tag_to_label.get)

# Add the numeric "label" column derived from the "class" tag.
df_with_labels = map_dataframe(
    df,
    tag_column="class",
    class_column="label",
    tag2class=tag_to_label,
    verbose=False,
)
df_with_labels.head()

Unnamed: 0,class,filepath,label
0,healthy_wheat,ICLR/train/train/healthy_wheat/03TD19.jfif,0
1,healthy_wheat,ICLR/train/train/healthy_wheat/0LBIWV.jpg,0
2,healthy_wheat,ICLR/train/train/healthy_wheat/0O5BON.jfif,0
3,healthy_wheat,ICLR/train/train/healthy_wheat/0PFX47.jpg,0
4,healthy_wheat,ICLR/train/train/healthy_wheat/1LNMUQ.jfif,0


In [29]:
# Row/column count of the labelled frame before any class balancing.
df_with_labels.shape

(875, 3)

In [30]:
# Balance the classes by upsampling, seeded for repeatability. The recorded
# outputs show the frame growing from 875 to 1128 rows, so some rows are
# repeated.
# NOTE(review): this happens before the fold split further down, so copies of
# the same file can later land in different folds.
df_with_labels = catalyst.utils.pandas.balance_classes(
    df_with_labels,
    class_column='label',
    how='upsampling',
    random_state=SEED,
)
print(df_with_labels.shape)
df_with_labels.head()

(1128, 3)


Unnamed: 0,class,filepath,label
0,healthy_wheat,ICLR/train/train/healthy_wheat/03TD19.jfif,0
1,healthy_wheat,ICLR/train/train/healthy_wheat/0LBIWV.jpg,0
2,healthy_wheat,ICLR/train/train/healthy_wheat/0O5BON.jfif,0
3,healthy_wheat,ICLR/train/train/healthy_wheat/0PFX47.jpg,0
4,healthy_wheat,ICLR/train/train/healthy_wheat/1LNMUQ.jfif,0


In [31]:
# Create the fold column with a -1 placeholder; rows still at -1 after the
# split below were never assigned to a validation fold.
df_with_labels['kfold'] = -1
df_with_labels.head()

Unnamed: 0,class,filepath,label,kfold
0,healthy_wheat,ICLR/train/train/healthy_wheat/03TD19.jfif,0,-1
1,healthy_wheat,ICLR/train/train/healthy_wheat/0LBIWV.jpg,0,-1
2,healthy_wheat,ICLR/train/train/healthy_wheat/0O5BON.jfif,0,-1
3,healthy_wheat,ICLR/train/train/healthy_wheat/0PFX47.jpg,0,-1
4,healthy_wheat,ICLR/train/train/healthy_wheat/1LNMUQ.jfif,0,-1


In [32]:
# Shuffle all rows and rebuild a clean 0..n-1 index.
# random_state is passed explicitly (same SEED as the balancing step) so the
# resulting order -- and therefore the fold assignment computed from it --
# does not depend on whatever global RNG state earlier cells left behind.
df_with_labels = (
    df_with_labels
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
df_with_labels.head()

Unnamed: 0,class,filepath,label,kfold
0,stem_rust,ICLR/train/train/stem_rust/1CQ95E.JPG,2,-1
1,stem_rust,ICLR/train/train/stem_rust/7QBIPY.jpg,2,-1
2,leaf_rust,ICLR/train/train/leaf_rust/1Z2P44.jpg,1,-1
3,healthy_wheat,ICLR/train/train/healthy_wheat/ZB9CAK.jpg,0,-1
4,stem_rust,ICLR/train/train/stem_rust/4PS17R.jpg,2,-1


In [0]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Assign every row to exactly one of 5 stratified validation folds.
X = df_with_labels.filepath.values
y = df_with_labels.label.values

skf = StratifiedKFold(n_splits=5)

# split() yields *positional* row indices, so write through iloc rather than
# label-based loc; this stays correct even if the frame's index is not a
# fresh 0..n-1 range.
kfold_col = df_with_labels.columns.get_loc('kfold')

# NOTE(review): the frame was upsampled before this split, so duplicated
# copies of one file can fall into different folds (train/validation leakage).
# Consider splitting first and balancing only the training part of each fold.
# The train indices are unused here; `_` is avoided because IPython uses it
# for the last cell output.
for fold, (_train_index, val_index) in enumerate(skf.split(X, y)):
    df_with_labels.iloc[val_index, kfold_col] = fold

In [34]:
# Sanity check: number of rows per fold; no row should remain at the -1 placeholder.
df_with_labels.kfold.value_counts()

2    226
1    226
0    226
4    225
3    225
Name: kfold, dtype: int64

In [35]:
# Preview the frame now that the kfold column has been filled in.
df_with_labels.head()

Unnamed: 0,class,filepath,label,kfold
0,stem_rust,ICLR/train/train/stem_rust/1CQ95E.JPG,2,0
1,stem_rust,ICLR/train/train/stem_rust/7QBIPY.jpg,2,0
2,leaf_rust,ICLR/train/train/leaf_rust/1Z2P44.jpg,1,0
3,healthy_wheat,ICLR/train/train/healthy_wheat/ZB9CAK.jpg,0,0
4,stem_rust,ICLR/train/train/stem_rust/4PS17R.jpg,2,0


In [36]:
# Final shape check before the frame is written to disk.
df_with_labels.shape

(1128, 4)

In [0]:
# Persist the training frame (class, filepath, label, kfold) to the working
# directory; index=False keeps the row index out of the CSV.
df_with_labels.to_csv('train.csv', index=False)

# Test Data

In [38]:
from catalyst.utils import (
    create_dataset, create_dataframe, get_dataset_labeling, map_dataframe
)
# Build the test frame the same way as the training one: collect the files,
# then flatten them into (class, filepath) rows.
test_dataset = create_dataset(dirs=f"{ROOT}test/test/", extension="*")
test_df = create_dataframe(test_dataset, columns=["class", "filepath"])

# This cell keeps only the file path for the test set, so the "class" column
# is dropped. Use the explicit `columns=` keyword: passing the axis
# positionally (`drop('class', 1)`) is deprecated in pandas 1.x and raises
# TypeError in pandas 2.x.
test_df = test_df.drop(columns='class')
test_df.head()

Unnamed: 0,filepath
0,ICLR/test/test/008FWT.JPG
1,ICLR/test/test/00AQXY.JPG
2,ICLR/test/test/01OJZX.JPG
3,ICLR/test/test/07OXKK.jfif
4,ICLR/test/test/085IEC.jpg


In [0]:
# Persist the test file list to the working directory; index=False keeps the
# row index out of the CSV.
test_df.to_csv('test.csv', index=False)