In [None]:
!pip install -q /kaggle/input/iterative-stratification/iterative-stratification-master/

In [None]:
import pandas as pd
import numpy as np
from fastai.vision.all import *
import pickle
import os
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Load the combined metadata table and shuffle its rows with a fixed seed.
# The original index labels are kept (only their order changes).
df = pd.read_csv('../input/knn-dataset/combined.csv').sample(frac=1, random_state=42)

In [None]:
# Class ids are the strings '0'..'18'; `Label` holds '|'-separated class ids.
labels = [str(i) for i in range(19)]

# Split each row's label string once, instead of once per (row, label) pair.
label_parts = df['Label'].str.split('|')

# One 0/1 indicator column per class: 1 when the class id is among the row's labels.
for lbl in labels:
    df[lbl] = label_parts.apply(lambda parts, lbl=lbl: int(lbl in parts))

In [None]:
# Rows with cluster == -1 get their own id, computed from the row's index label,
# so each of them forms a group of its own.
# NOTE(review): 7933 is presumably an offset past the largest real cluster id,
# which would keep the new ids from colliding with existing ones -- confirm.
#
# A single boolean-mask .loc assignment is used instead of the chained
# `df['cluster'].loc[i] = ...`, which writes through a temporary object and is
# not guaranteed to modify `df` (it does not under pandas Copy-on-Write).
unclustered = (df['cluster'] == -1).to_numpy()
df.loc[unclustered, 'cluster'] = df.index.to_numpy()[unclustered] + 7933

In [None]:
# Independent copy of the cluster column (still indexed like `df`).
df_copy = df.loc[:, 'cluster'].copy()

In [None]:
# Row counts of the full table and of the cluster column.
len(df.index), len(df_copy.index)

In [None]:
# Keep only the first occurrence of every cluster id, then show how many remain.
df_copy = df_copy[~df_copy.duplicated()]
len(df_copy.index)

In [None]:
# Replace the de-duplicated cluster Series with the full rows of `df` that
# carry the same index labels (one row per cluster id).
df_copy = df.loc[df_copy.index, :]

In [None]:
def get_counts(dfs, label_names=None):
    """Summarise how often each label occurs in ``dfs['Label']``.

    Parameters
    ----------
    dfs : DataFrame
        Must have a ``Label`` column of '|'-separated label strings.
    label_names : iterable of str, optional
        Labels to count. Defaults to the notebook-level ``labels`` list.

    Returns
    -------
    DataFrame
        Two rows, ``full_count`` (rows whose label list contains the label)
        and ``unique_count`` (rows whose ``Label`` is exactly that label),
        with one column per label, ordered by descending ``full_count``
        (ties keep the order of ``label_names``). Counts are integers.
    """
    if label_names is None:
        label_names = labels
    label_names = list(label_names)

    row_labels = dfs['Label']
    # Split every row once rather than once per label.
    split_rows = [row_label.split('|') for row_label in row_labels]

    records = []
    for lbl in label_names:
        full_count = sum(lbl in parts for parts in split_rows)
        unique_count = int((row_labels == lbl).sum())
        records.append((lbl, full_count, unique_count))

    # list.sort is stable, so equal full counts stay in label order.
    records.sort(key=lambda rec: -rec[1])

    # Build the frame straight from the tuples: routing them through np.array
    # would coerce the mixed str/int records to strings.
    counts = pd.DataFrame(records, columns=['label', 'full_count', 'unique_count'])
    return counts.set_index('label').T

In [None]:
# Label distribution over the one-row-per-cluster table.
get_counts(dfs=df_copy)

In [None]:
# Assign every representative row (one per cluster) to one of `nfold` folds,
# stratifying on the multi-hot label columns.
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfs = df_copy.reset_index(drop=True)
nfold = 5
seed = 42

y = dfs[labels].values
X = dfs['ID'].values

# shuffle=True is needed for the seed to be accepted: recent scikit-learn
# versions raise ValueError when random_state is set while shuffle is False.
mskf = MultilabelStratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)

# Collect fold ids in a plain integer array (-1 = not yet assigned) rather than
# writing into "the last column" by position.
fold_ids = np.full(len(dfs), -1, dtype=int)
for fold, (_, test_index) in enumerate(mskf.split(X, y)):
    fold_ids[test_index] = fold

assert (fold_ids >= 0).all(), 'every row must be assigned to a fold'
dfs['fold'] = fold_ids

In [None]:
# Distinct (cluster, fold) pairs, followed by how many there are.
df_folds = dfs.loc[:, ['cluster', 'fold']].drop_duplicates()
len(df_folds.index)

In [None]:
# Attach each row's fold by looking up its cluster id; all rows of `df` are kept.
df = df.merge(df_folds, on='cluster', how='left')

In [None]:
# Number of rows per fold.
df['fold'].value_counts()

In [None]:
# Fold 0: select its rows, record the cluster ids it contains, show label counts.
dfs1 = df.loc[df['fold'] == 0]
c1 = pd.unique(dfs1['cluster']).tolist()
get_counts(dfs1)

In [None]:
# Fold 1: select its rows, record the cluster ids it contains, show label counts.
dfs1 = df.loc[df['fold'] == 1]
c2 = pd.unique(dfs1['cluster']).tolist()
get_counts(dfs1)

In [None]:
# Fold 2: select its rows, record the cluster ids it contains, show label counts.
dfs1 = df.loc[df['fold'] == 2]
c3 = pd.unique(dfs1['cluster']).tolist()
get_counts(dfs1)

In [None]:
# Fold 3: select its rows, record the cluster ids it contains, show label counts.
dfs1 = df.loc[df['fold'] == 3]
c4 = pd.unique(dfs1['cluster']).tolist()
get_counts(dfs1)

In [None]:
# Fold 4: select its rows, record the cluster ids it contains, show label counts.
dfs1 = df.loc[df['fold'] == 4]
c5 = pd.unique(dfs1['cluster']).tolist()
get_counts(dfs1)

In [None]:
# No cluster id may appear in more than one fold: the concatenated per-fold
# lists must contain no repeats.
all_fold_clusters = c1 + c2 + c3 + c4 + c5
assert len(set(all_fold_clusters)) == len(all_fold_clusters)

In [None]:
# Keep only the identifier, label, dataset and fold columns, then display the table.
output_columns = ['ID', 'Label', 'dataset', 'fold']
df = df.loc[:, output_columns]
df

In [None]:
# Write the fold table to disk without the index column.
df.to_csv(path_or_buf='hpa_folds_v1.csv', index=False)