In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# [iterative-stratification](https://github.com/trent-b/iterative-stratification)

iterative-stratification is a project that provides scikit-learn-compatible cross-validators with stratification for multilabel data.

scikit-learn provides modules for splitting a dataset with stratification, but it offers no option for stratifying a multilabel dataset.

iterative-stratification offers the `MultilabelStratifiedKFold`, `RepeatedMultilabelStratifiedKFold`, and `MultilabelStratifiedShuffleSplit` cross-validators, which are suitable for multilabel datasets.

In [None]:
pip install iterative-stratification

In [None]:
# Read the training CSV into `df` and preview its first rows.
train_csv_path = '/kaggle/input/plant-pathology-2021-fgvc8/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Number of rows in `df`.
df.shape[0]

In [None]:
import collections

# Count how often each raw value of the 'labels' column occurs
# (the full string is the key; it is not split here).
labels = df['labels'].tolist()
c = collections.Counter(labels)
c

In [None]:
# Split every label string on single spaces and flatten the pieces into one
# list, then count how often each individual tag occurs.
sep_labels = [tag for row_labels in labels for tag in row_labels.split(' ')]
cs = collections.Counter(sep_labels)
cs

In [None]:
# Map each distinct tag to an integer id, following the iteration order of `cs`.
key = dict(zip(cs.keys(), range(len(cs))))
key

In [None]:
# Build one 0/1 indicator column per tag found in `cs`.
#
# Each row's label string is split on single spaces and a tag is matched
# against the resulting tokens, so it only counts when it appears as a whole
# token. (A plain substring test on the raw string would also fire for any
# tag that happens to be contained inside a longer tag.)
# Splitting every row once up front also avoids a per-row `iterrows()` loop.
label_tokens = [row_labels.split(' ') for row_labels in df['labels']]

# tag -> list of 0/1 flags, one flag per row of `df`, in row order.
text_to_category = {
    label: [int(label in tokens) for tokens in label_tokens]
    for label in cs.keys()
}

# Attach the indicator lists to `df` as new columns named after the tags.
for label in text_to_category:
    df[label] = text_to_category[label]

In [None]:
# Show the first five rows of `df`.
df.head(n=5)

In [None]:
import plotly.express as px

In [None]:
# Parallel-categories plot over the six target columns, coloured by "healthy".
target_frame = df[["healthy", "scab", "frog_eye_leaf_spot", "complex", "rust", "powdery_mildew"]]
fig = px.parallel_categories(
    target_frame,
    color="healthy",
    color_continuous_scale="sunset",
    title="Parallel categories plot of targets",
)
fig.show()

In [None]:
# X: values of the 'image' column; Y: the six target columns as a float32 array.
label_columns = ["healthy", "scab", "frog_eye_leaf_spot", "complex", "rust", "powdery_mildew"]
X = df['image'].to_numpy()
Y = df[label_columns].to_numpy(dtype=np.float32)

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# 5-fold multilabel-stratified splitter with shuffling and a fixed random_state.
msss = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

# Column names given to the label arrays of each fold.
target_columns = ["healthy", "scab", "frog_eye_leaf_spot", "complex", "rust", "powdery_mildew"]

# Only the first fold is inspected: the loop body ends with `break`.
for train_index, test_index in msss.split(X, Y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]

    # Wrap the label arrays in DataFrames so the plots can use the column names.
    kfold_train_df = pd.DataFrame(y_train, columns=target_columns)
    kfold_test_df = pd.DataFrame(y_test, columns=target_columns)

    fig_train = px.parallel_categories(
        kfold_train_df[target_columns],
        color="healthy",
        color_continuous_scale="sunset",
        title="categories plot of y_train",
    )
    fig_test = px.parallel_categories(
        kfold_test_df[target_columns],
        color="healthy",
        color_continuous_scale="sunset",
        title="categories plot of y_test",
    )

    fig_train.show()
    fig_test.show()

    break