## Params

In [None]:
# Random seed for the notebook (presumably intended for the fold split — verify it is actually passed on).
seed = 42
# Number of cross-validation folds; passed as n_splits when the splitter is built.
nfold = 5

## Imports

In [None]:
import numpy as np 
import pandas as pd 
import os
from sklearn.cluster import KMeans
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from collections import Counter, defaultdict
from sklearn.utils import check_random_state

In [None]:
# Load the training table; later cells rely on its StudyInstanceUID and PatientID
# columns and treat every other column as a label.
train = pd.read_csv('../input/ranzcr-clip-catheter-line-classification/train.csv')
# Display the first rows (notebook output only).
train.head()

## Basic EDA

In [None]:
# Number of distinct values in each column (notebook output only).
train.nunique()

As we can see, we only have 3255 patients. We want to make sure that each patient's images do not appear in multiple folds to avoid data leakage.

In [None]:
# Number of rows per PatientID, most frequent first (notebook output only).
train.PatientID.value_counts()

Some patients have as many as 172 images, while others have only 1.

## Utils

In [None]:
class RepeatedStratifiedGroupKFold():
    """Repeated K-fold splitter that balances labels while keeping groups whole.

    Every group (e.g. a patient) is placed in exactly one test fold per
    repetition. Groups are assigned greedily: they are shuffled, ordered by
    decreasing standard deviation of their per-label counts, and each one is
    put into the fold that minimises the mean, over labels, of the standard
    deviation across folds of that label's share.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds per repetition.
    n_repeats : int, default=1
        Number of times the whole assignment is repeated with a new shuffle.
    random_state : int, RandomState instance or None, default=None
        Passed to ``sklearn.utils.check_random_state`` to drive the shuffles.
    """

    def __init__(self, n_splits=5, n_repeats=1, random_state=None):
        self.n_splits = n_splits
        self.n_repeats = n_repeats
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the total number of (train, test) pairs ``split`` yields.

        The arguments are accepted only for compatibility with the
        scikit-learn splitter interface and are ignored.
        """
        return self.n_splits * self.n_repeats

    @staticmethod
    def _imbalance(fold_counts, label_totals, present_labels):
        """Mean over labels of the std across folds of each label's share."""
        spreads = [
            np.std(fold_counts[:, label] / label_totals[label])
            for label in present_labels
        ]
        return np.mean(spreads)

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` lists of positional indices.

        Parameters
        ----------
        X : object
            Unused; present for interface compatibility.
        y : iterable of int
            Non-negative integer class label per sample (used as array index).
        groups : iterable of hashable
            Group id per sample; samples sharing an id stay in the same fold.

        Raises
        ------
        TypeError
            If ``y`` or ``groups`` is None.
        ValueError
            If ``y`` is empty or its length differs from ``groups``.
        """
        if y is None or groups is None:
            raise TypeError("both 'y' and 'groups' must be provided")
        # Materialise once: both sequences are traversed several times below,
        # which would silently exhaust a one-shot iterable.
        y = list(y)
        groups = list(groups)
        if not y:
            raise ValueError("'y' must contain at least one sample")
        if len(y) != len(groups):
            raise ValueError("'y' and 'groups' must have the same length")

        k = self.n_splits
        labels_num = int(np.max(y)) + 1

        # Label counts per group and overall; independent of the repetition,
        # so computed a single time.
        y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
        label_totals = np.zeros(labels_num)
        for label, g in zip(y, groups):
            y_counts_per_group[g][label] += 1
            label_totals[label] += 1
        # Label ids in 0..max(y) that never occur have a zero total; dividing
        # by it would turn every fold score into NaN and send all groups to
        # fold 0, so they are left out of the score.
        present_labels = np.flatnonzero(label_totals)

        rnd = check_random_state(self.random_state)
        for _ in range(self.n_repeats):
            fold_counts = np.zeros((k, labels_num))
            groups_per_fold = [set() for _ in range(k)]

            # Fresh list in insertion order before each shuffle so every
            # repetition consumes the RNG the same way.
            groups_and_y_counts = list(y_counts_per_group.items())
            rnd.shuffle(groups_and_y_counts)

            # sorted() is stable, so the shuffle decides ties between groups
            # with equal spread.
            ordered = sorted(groups_and_y_counts, key=lambda item: -np.std(item[1]))
            for g, y_counts in ordered:
                best_fold = None
                min_eval = None
                for fold in range(k):
                    # Tentatively add the group, score, then undo.
                    fold_counts[fold] += y_counts
                    fold_eval = self._imbalance(
                        fold_counts, label_totals, present_labels
                    )
                    fold_counts[fold] -= y_counts
                    if min_eval is None or fold_eval < min_eval:
                        min_eval = fold_eval
                        best_fold = fold
                fold_counts[best_fold] += y_counts
                groups_per_fold[best_fold].add(g)

            for test_groups in groups_per_fold:
                train_indices = [
                    idx for idx, g in enumerate(groups) if g not in test_groups
                ]
                test_indices = [
                    idx for idx, g in enumerate(groups) if g in test_groups
                ]
                yield train_indices, test_indices

## Split folds
Ideas:
1. Make sure that the labels are stratified across folds.
2. Keep all of a patient's images together in a single fold.

In [None]:
# Build one stratification key per row by concatenating all label columns
# into a single string, e.g. 00000000010.
# Identifier columns and the columns this notebook derives later are left out,
# so re-running the cell does not mix 'combined_tar' or 'fold' into the key.
non_target_cols = {'StudyInstanceUID', 'PatientID', 'combined_tar', 'fold'}
target_cols = [col for col in train.columns if col not in non_target_cols]
targets = train[target_cols].astype(str)
# create a new col to store the label
train['combined_tar'] = ''
for i in tqdm(range(targets.shape[1])):
    train['combined_tar'] += targets.iloc[:, i]
# take a look at it
train.combined_tar.value_counts()

In [None]:
# Replace each concatenated label string with an integer class id; the splitter
# uses these ids to index its per-label count arrays.
train['combined_tar'] = LabelEncoder().fit_transform(train['combined_tar'])

In [None]:
# Tag every row with the index of the fold in which it is held out;
# -1 marks rows that were never placed in a validation fold.
train['fold'] = -1
# Take the seed from the notebook parameters rather than repeating the literal.
rskf = RepeatedStratifiedGroupKFold(n_splits=nfold, random_state=seed)
for fold_id, (train_idx, valid_idx) in enumerate(
    rskf.split(train, train.combined_tar, train.PatientID)  # (df, targets, group)
):
    # split() yields positional indices; translate them to index labels so the
    # assignment does not depend on train having a default 0..n-1 index.
    train.loc[train.index[valid_idx], 'fold'] = fold_id

## Sanity Check
You want to make sure this split makes sense. We can do that by checking the stratification and the groups.

In [None]:
# Counts of each encoded label combination among the rows assigned to fold 0.
train.query('fold==0').combined_tar.value_counts()

In [None]:
# Counts of each encoded label combination among the rows assigned to fold 1.
train.query('fold==1').combined_tar.value_counts()

In [None]:
# Counts of each encoded label combination among the rows assigned to fold 2.
train.query('fold==2').combined_tar.value_counts()

It seems that the labels are very nicely stratified. Now let's check the groups.

In [None]:
# PatientIDs that occur in both fold 0 and fold 1; an empty array means no overlap.
np.intersect1d(train.query('fold==0').PatientID.unique(), train.query('fold==1').PatientID.unique())

In [None]:
# PatientIDs that occur in both fold 1 and fold 2; an empty array means no overlap.
np.intersect1d(train.query('fold==1').PatientID.unique(), train.query('fold==2').PatientID.unique())

In [None]:
# PatientIDs that occur in both fold 2 and fold 3; an empty array means no overlap.
np.intersect1d(train.query('fold==2').PatientID.unique(), train.query('fold==3').PatientID.unique())

None of the checked fold pairs (0–1, 1–2, 2–3) share a patient.

## Save final CSV

In [None]:
# Preview the table without the helper column; drop() returns a copy, so train is unchanged.
train.drop('combined_tar', axis=1)

In [None]:
# Write the table without the helper combined_tar column to train_folds.csv,
# omitting the DataFrame index.
train.drop('combined_tar', axis=1).to_csv('train_folds.csv', index=False)