# Overview
In this kernel, I'm going to demonstrate how to build stratified validation splits, while doing some preliminary EDA on both the train and test sets.

# EDA and Observations

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test tables from the read-only input directory.
train, test = (
    pd.read_csv('../input/rsna-str-pulmonary-embolism-detection/train.csv'),
    pd.read_csv('../input/rsna-str-pulmonary-embolism-detection/test.csv'),
)

# Report (rows, columns) of each table.
print('train.shape', train.shape, 'test.shape', test.shape)

In [None]:
# Preview the first five rows of the train table (5 is head()'s default).
train.head()

In [None]:
# For train and test: the largest number of distinct SeriesInstanceUID values
# found under any single StudyInstanceUID. Displayed as a (train, test) tuple.
max_series_per_study = tuple(
    table.groupby('StudyInstanceUID')['SeriesInstanceUID'].nunique().max()
    for table in (train, test)
)
max_series_per_study

> StudyInstanceUID and SeriesInstanceUID map 1-to-1 in both the train and test sets

In [None]:
# StudyInstanceUID values that occur in both train and test (empty array if none).
train_study_ids = train['StudyInstanceUID'].unique()
test_study_ids = test['StudyInstanceUID'].unique()
np.intersect1d(train_study_ids, test_study_ids)

> No repeated StudyInstanceUID between train and test

### **From the above 2 observations, it is important to split the train and validation sets by StudyInstanceUID/SeriesInstanceUID to simulate the train-test split**

In [None]:
# Number of distinct SOPInstanceUID values under each StudyInstanceUID,
# computed the same way for the train and the test table.
train_image_num_per_patient, test_image_num_per_patient = (
    table.groupby('StudyInstanceUID')['SOPInstanceUID'].nunique()
    for table in (train, test)
)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the number of
# distinct SOPInstanceUID values per StudyInstanceUID in the train table.
train_image_num_per_patient.describe()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the number of
# distinct SOPInstanceUID values per StudyInstanceUID in the test table.
test_image_num_per_patient.describe()

In [None]:
import matplotlib.pyplot as plt

# Compare the train and test distributions of images per patient.
# Both histograms share one set of bin edges (computed over the combined data)
# so that bar heights are directly comparable, and are drawn semi-transparent
# so the histogram plotted second does not hide the first.
shared_bin_edges = np.histogram_bin_edges(
    np.concatenate([train_image_num_per_patient.values, test_image_num_per_patient.values]),
    bins=100,
)

fig, ax = plt.subplots()
ax.set_title('image_num_per_patient')
ax.hist(train_image_num_per_patient, bins=shared_bin_edges, alpha=0.5, label='train', density=True)
ax.hist(test_image_num_per_patient, bins=shared_bin_edges, alpha=0.5, label='test', density=True)
ax.set_xlabel('images per patient')
ax.set_ylabel('density')
ax.legend()
plt.show()

> As we can see, the distributions of the number of images per patient in train and test are pretty close, except for a slight right shift in the train set (compare the 25%/50%/75% quantiles from the describe() outputs above)

### **From the above, it is better to also do a *stratified split* based on the *number of images per patient* to simulate the train-test split**

### We will do validation splits based on
1. Patient: all images of the same patient should be in the same validation split
2. Number of images per patient: the distribution should be similar across all validation splits

# Create Stratified Validation Splits

In [None]:
# Number of stratified folds to create.
FOLD_NUM = 20
# Every column of `train` after the first three (positions 0, 1, 2) is treated as a target.
target_cols = list(train.columns[3:])

In [None]:
# build summary of image num and target variables for each patient
train_per_patient_char = pd.DataFrame(index=train_image_num_per_patient.index, columns=['image_per_patient'], data=train_image_num_per_patient.values.copy())
for t in target_cols:
    train_per_patient_char[t] = train_per_patient_char.index.map(train.groupby('StudyInstanceUID')[t].mean())

train_per_patient_char.head(10)

> Only **pe_present_on_image** is an image-level label; every other target is constant within a patient, which is why it is the only column that takes fractional values after averaging per patient

In [None]:
# make image_per_patient and pe_present_on_image into bins
bin_counts = [40] #, 20]
digitize_cols = ['image_per_patient'] #, 'pe_present_on_image']
non_digitize_cols = [c for c in train_per_patient_char.columns if c not in digitize_cols]

In [None]:
# Bucket each configured column into percentile-based bins and store the bin
# index in a new '<column>_digitize' column.
for col_pos, col_name in enumerate(digitize_cols):
    n_bins = bin_counts[col_pos]
    col_values = train_per_patient_char[col_name]

    # Bin edges are the 0th, (100/n_bins)th, 2*(100/n_bins)th, ... percentiles.
    bin_edges = np.percentile(col_values, q=np.arange(n_bins) / n_bins * 100.)

    # Raw value frequencies, for reference.
    print(col_values.value_counts())

    digitized_name = col_name + '_digitize'
    train_per_patient_char[digitized_name] = np.digitize(col_values, bin_edges, right=False)

    # Number of patients that landed in each bin, as a table and as a histogram.
    print(train_per_patient_char[digitized_name].value_counts())
    plt.hist(train_per_patient_char[digitized_name], bins=n_bins)
    plt.show()

In [None]:
# Build the stratification key: the string bin indices of all digitized
# columns, joined with '_' in the order given by digitize_cols.
digitized_as_str = [
    train_per_patient_char[col_name + '_digitize'].apply(str)
    for col_name in digitize_cols
]
stratify_key = digitized_as_str[0]
for key_part in digitized_as_str[1:]:
    stratify_key = stratify_key + '_' + key_part
train_per_patient_char['key'] = stratify_key

# Number of patients per key.
train_per_patient_char['key'].value_counts()

In [None]:
from sklearn.model_selection import StratifiedKFold

# Split patients into FOLD_NUM folds, stratified on the 'key' column.
folds = FOLD_NUM
kfolder = StratifiedKFold(n_splits=folds, shuffle=True, random_state=719)
stratify_labels = train_per_patient_char['key']
# Keep only the held-out (validation) positions of each split.
val_indices = [held_out for _, held_out in kfolder.split(stratify_labels, stratify_labels)]

# Record, for every patient, the index of the fold in which it is held out.
train_per_patient_char['fold'] = -1
for fold_id, held_out in enumerate(val_indices):
    held_out_patients = train_per_patient_char.index[held_out]
    train_per_patient_char.loc[held_out_patients, 'fold'] = fold_id

# Number of patients per fold.
train_per_patient_char['fold'].value_counts()

In [None]:
# Check, fold by fold, the distribution of each digitized column
# (the number of images per patient) among the held-out patients.
for col in digitize_cols:
    # Round the column count UP so there is always at least one axis per fold,
    # even when the number of folds is not a multiple of 4 (or is below 4).
    n_rows = 4
    n_cols = int(np.ceil(folds / n_rows))
    fig, axs = plt.subplots(
        nrows=n_rows,
        ncols=n_cols,
        constrained_layout=False,
        sharex=True,
        sharey=True,
        squeeze=False,  # always return a 2-D array of axes
    )
    fig.set_figheight(10)
    fig.set_figwidth(20)
    axes = axs.ravel()

    # Same x-range in every panel so the folds can be compared visually.
    value_range = (train_per_patient_char[col].min(), train_per_patient_char[col].max())
    for i, vi in enumerate(val_indices):
        patients = train_per_patient_char.index[vi]
        axes[i].set_title(col+' fold_'+str(i))
        axes[i].hist(train_per_patient_char.loc[patients, col], bins=20, range=value_range)

    # Hide any spare panels left over when the grid is larger than the fold count.
    for spare_ax in axes[len(val_indices):]:
        spare_ax.set_visible(False)
    plt.show()

In [None]:
# Check, fold by fold, the distribution of each non-digitized column
# (the per-patient target means) among the held-out patients.
for col in non_digitize_cols:
    # Round the column count UP so there is always at least one axis per fold,
    # even when the number of folds is not a multiple of 4 (or is below 4).
    n_rows = 4
    n_cols = int(np.ceil(folds / n_rows))
    fig, axs = plt.subplots(
        nrows=n_rows,
        ncols=n_cols,
        constrained_layout=False,
        sharex=True,
        sharey=True,
        squeeze=False,  # always return a 2-D array of axes
    )
    fig.set_figheight(10)
    fig.set_figwidth(20)
    axes = axs.ravel()

    # Same x-range in every panel so the folds can be compared visually.
    value_range = (train_per_patient_char[col].min(), train_per_patient_char[col].max())
    for i, vi in enumerate(val_indices):
        patients = train_per_patient_char.index[vi]
        axes[i].set_title(col+' fold_'+str(i))
        axes[i].hist(train_per_patient_char.loc[patients, col], bins=20, range=value_range)

    # Hide any spare panels left over when the grid is larger than the fold count.
    for spare_ax in axes[len(val_indices):]:
        spare_ax.set_visible(False)
    plt.show()

In [None]:
# Save the per-patient table (including the 'key' and 'fold' columns) to the
# working directory; the file name carries the number of folds.
splits_csv_path = 'rsna_train_splits_fold_{}.csv'.format(FOLD_NUM)
train_per_patient_char.to_csv(splits_csv_path)

## Each fold looks similar in the distribution of the number of images per patient now

**Further usage of this kernel:**
* You could use the output csv directly to do patient-level subsampling (e.g. select fold=1-5 to do 5-fold cross-validation)
* You could modify FOLD_NUM above to create a different number of stratified folds yourself
* You could modify bin_counts and digitize_cols above to digitize additional columns with designated bin counts; these will be further incorporated into the new "key" used for the stratified validation splits