## Main Idea

* Stratified kfolds based on **categorical features** and **206 multiple labels**
* Group kfolds based on **drug_id** 
* Make sure that the different labels and categorical features are spread evenly across the folds
* Make sure that each group (drug) appears in exactly one fold, so the same drug never ends up in both the training and the validation part of a split

## Params

In [None]:
# params
# Random seed intended to make the fold assignment reproducible.
seed = 42
# Number of folds; passed as n_splits to the splitter further down.
nfold = 5
# When True, rows whose cp_type is 'ctl_vehicle' are removed before splitting
# and the result is written to a separate '_no_v' CSV.
drop_vehicle = False

## Imports

In [None]:
import numpy as np 
import pandas as pd 
import os
from sklearn.cluster import KMeans
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from collections import Counter, defaultdict
from sklearn.utils import check_random_state

In [None]:
# Load the input tables: per-sample features, the scored target columns,
# and the table that supplies drug_id (used later as the grouping key).
train = pd.read_csv('../input/lish-moa/train_features.csv')
train_tar = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
drug = pd.read_csv('../input/lish-moa/train_drug.csv')

In [None]:
if drop_vehicle:
    # Keep only the rows that are not vehicle controls.
    not_vehicle = train['cp_type'] != 'ctl_vehicle'
    train = train[not_vehicle]
    # Pick the same row labels from the targets (relies on both frames
    # sharing the same index), then renumber both frames from 0.
    train_tar = train_tar.loc[train.index].reset_index(drop=True)
    train = train.reset_index(drop=True)

In [None]:
# Join features and scored targets on sig_id; the inner join keeps only
# the ids that are present in both tables.
combined = train.merge(train_tar, how='inner', on='sig_id')

## RSGKF Util

In [None]:
class RepeatedStratifiedGroupKFold():
    """Repeated K-fold splitter that keeps groups whole and balances labels.

    Every group ends up in exactly one test fold per repetition. Groups are
    placed greedily: each one goes to the fold where adding it leaves the
    per-label share of samples most even across the folds.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds produced per repetition.
    n_repeats : int, default=1
        Number of times the whole assignment is redone with a new shuffle.
    random_state : int, RandomState instance or None, default=None
        Handed to ``check_random_state``; the resulting generator shuffles
        the groups, which decides the order among groups whose label counts
        have the same standard deviation.
    """

    def __init__(self, n_splits=5, n_repeats=1, random_state=None):
        self.n_splits = n_splits
        self.n_repeats = n_repeats
        self.random_state = random_state

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` for every fold of every repeat.

        Parameters
        ----------
        X : object
            Not used by the algorithm; kept as the first positional argument.
        y : sequence of int
            One non-negative integer label per sample. Labels are used as
            array indices, so they should be encoded as ``0..n_classes-1``.
        groups : sequence of hashable
            One group id per sample. All samples of a group share a fold.

        Yields
        ------
        train_indices, test_indices : list of int
            Positional sample indices. ``n_splits * n_repeats`` pairs are
            produced in total.

        Raises
        ------
        ValueError
            If ``y`` or ``groups`` is missing, or their lengths differ.
        """
        if y is None or groups is None:
            raise ValueError("Both 'y' and 'groups' must be provided.")

        # `groups` is iterated several times; materialise it once so that
        # one-shot iterables work too.
        groups = list(groups)
        if len(y) != len(groups):
            raise ValueError(
                "'y' and 'groups' must have the same length "
                "(got %d and %d)." % (len(y), len(groups))
            )

        k = self.n_splits
        labels_num = int(np.max(y)) + 1

        # Label histogram per group and over the whole data set. These do not
        # depend on the repetition, so they are computed only once.
        y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
        y_distr = np.zeros(labels_num)
        for label, g in zip(y, groups):
            y_counts_per_group[g][label] += 1
            y_distr[label] += 1

        # A label id that never occurs has a total of 0. Dividing by it would
        # turn every score into NaN (and send every group to fold 0), so use 1
        # instead: such a label has count 0 in every fold and contributes a
        # spread of exactly 0, which leaves the ranking of folds untouched.
        safe_distr = np.where(y_distr > 0, y_distr, 1.0)

        def eval_y_counts_per_fold(y_counts, fold):
            # Tentatively add the group to `fold`, measure how unevenly each
            # label is shared between the folds, then undo the addition.
            y_counts_per_fold[fold] += y_counts
            std_per_label = np.std(y_counts_per_fold / safe_distr, axis=0)
            y_counts_per_fold[fold] -= y_counts
            return np.mean(std_per_label)

        all_groups = set(groups)
        rnd = check_random_state(self.random_state)
        for _ in range(self.n_repeats):
            y_counts_per_fold = np.zeros((k, labels_num))
            groups_per_fold = defaultdict(set)

            groups_and_y_counts = list(y_counts_per_group.items())
            rnd.shuffle(groups_and_y_counts)

            # Place the most label-imbalanced groups first. `sorted` is
            # stable, so the shuffle above orders groups with equal std.
            ordered = sorted(groups_and_y_counts, key=lambda item: -np.std(item[1]))
            for g, y_counts in ordered:
                best_fold = None
                min_eval = None
                for fold in range(k):
                    fold_eval = eval_y_counts_per_fold(y_counts, fold)
                    if min_eval is None or fold_eval < min_eval:
                        min_eval = fold_eval
                        best_fold = fold
                y_counts_per_fold[best_fold] += y_counts
                groups_per_fold[best_fold].add(g)

            for fold in range(k):
                test_groups = groups_per_fold[fold]
                train_groups = all_groups - test_groups

                train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
                test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

                yield train_indices, test_indices

## Split Folds

First, concatenate all labels of a row into a single string, e.g. **0001000010000100...48D1** (the trailing characters are the row's `cp_time` and `cp_dose` values).

In [None]:
# Build one string per row out of all target flags, e.g. 000100100100000.....
target_cols = train_tar.columns.drop('sig_id').tolist()
targets = combined[target_cols].astype(str)
# Accumulate the per-column digits in a local series first
label_string = pd.Series('', index=combined.index)
for col in tqdm(target_cols):
    label_string = label_string + targets[col]
# Append the categorical features (cp_time, then cp_dose) to the label digits
label_string = label_string + combined['cp_time'].astype(str) + combined['cp_dose']
# Store the result in a new column
combined['combined_tar'] = label_string
# Show the five most frequent combinations
combined['combined_tar'].value_counts().head(5)

In [None]:
# Replace each distinct combined_tar string by an integer id.
# RepeatedStratifiedGroupKFold uses the labels as array indices,
# so it needs numerical (integer) targets.
combined['combined_tar'] = LabelEncoder().fit_transform(combined['combined_tar'])

In [None]:
# Attach drug_id by matching on sig_id instead of by row index. A plain
# `combined['drug_id'] = drug.drug_id` aligns on the index, which pairs rows
# with the wrong drug as soon as `combined` has been filtered and renumbered
# (drop_vehicle=True) or is ordered differently from `drug`.
# NOTE(review): assumes train_drug.csv has a `sig_id` column with one row per
# id - confirm against the data description.
combined['drug_id'] = combined['sig_id'].map(drug.set_index('sig_id')['drug_id'])

In [None]:
# Record, for every row, the fold in which it is held out (-1 = not assigned).
combined['fold'] = -1
# Use the notebook-level `seed` so that the params cell really controls the split.
rskf = RepeatedStratifiedGroupKFold(n_splits=nfold, random_state=seed)
fold_col = combined.columns.get_loc('fold')
for fold, (_, valid_idx) in enumerate(rskf.split(combined, combined.combined_tar, combined.drug_id)):
    # split() yields positional indices, so write through iloc; label-based
    # loc would only be correct while the index happens to be 0..n-1.
    combined.iloc[valid_idx, fold_col] = fold
combined['fold'] = combined['fold'].astype(int)

## Sanity Check

In [None]:
# Stratification check: frequency of each encoded label inside fold 0
combined.loc[combined['fold'] == 0, 'combined_tar'].value_counts()

In [None]:
# Stratification check: frequency of each encoded label inside fold 1
combined.loc[combined['fold'] == 1, 'combined_tar'].value_counts()

In [None]:
# Group check: drug ids present in both fold 0 and fold 1 (expected to be empty)
np.intersect1d(
    combined.loc[combined['fold'] == 0, 'drug_id'].unique(),
    combined.loc[combined['fold'] == 1, 'drug_id'].unique(),
)

In [None]:
# Group check: drug ids present in both fold 1 and fold 2 (expected to be empty)
np.intersect1d(
    combined.loc[combined['fold'] == 1, 'drug_id'].unique(),
    combined.loc[combined['fold'] == 2, 'drug_id'].unique(),
)

## Output

In [None]:
# Preview the first rows of the final table, including the added
# combined_tar, drug_id and fold columns.
combined.head()

In [None]:
# Pick the file name once: the '_no_v' variant marks a run without vehicle rows.
output_name = 'RSGKF_combined_no_v.csv' if drop_vehicle else 'RSGKF_combined.csv'
combined.to_csv(output_name, index=False)