# Competition Data

Jupyter notebook for the competition data lifecycle

## Sections

### Download the Dataset

### Make Dataframes

### Feature Engineering

### Read Dataset



In [5]:
# Autoreload extensions: pick up edits to imported project modules without restarting the kernel
%load_ext autoreload
%autoreload 2

# Display all: show the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Cell #1: Imports and Constants
- Import the libs
- Define constants for this file, such as default values for the parameters

In [7]:
import glob
import os

import pandas as pd

import src.data.utils
from src.data.comp.config import (
    # Competition specific config
    DATASET_NAME, LABELS, LABEL_COLS, SPLIT, RENAME_MAP, 
    # Mostly constants
    HOLDOUT_PERCENTAGE, NUM_FOLDS, RANDOM_STATE, 
    # Paths for the dataset
    RAW_DATA_PATH, INTERIM_DATA_PATH, PROCESSED_DATA_PATH, 
)

# Default values for the parameters
# In this stage the readers below default to the raw data folder and
# build_dataframes writes its pickles to the interim data folder.
INPUT_FOLDER = RAW_DATA_PATH 
OUTPUT_FOLDER = INTERIM_DATA_PATH

In [8]:
""" 
Cell #2: Read input dataframes from the dataset
"""
def merge_input_dataframes(train_img, train_study):
    """
    Join the image-level and study-level training tables on the study id.

    The literal ``_image`` / ``_study`` suffixes are removed from the raw ``id``
    columns, the id columns are renamed to ``img_id`` / ``study_id``, and the
    two tables are merged on ``study_id``. The input dataframes are not
    modified.

    Args:
        train_img (pd.DataFrame): image-level table with ``id`` and
            ``StudyInstanceUID`` columns
        train_study (pd.DataFrame): study-level table with an ``id`` column

    Returns:
        pd.DataFrame: one row per image with the study-level columns attached
    """
    image_level_rename_map = {'StudyInstanceUID': 'study_id', 'id': 'img_id'}
    study_level_rename_map = {'id': 'study_id'}

    # assign() builds new frames, so the caller's dataframes keep their raw ids.
    # regex=False: the suffixes are plain text, not patterns.
    train_img = train_img.assign(
        id=train_img['id'].str.replace('_image', '', regex=False)
    ).rename(columns=image_level_rename_map)
    train_study = train_study.assign(
        id=train_study['id'].str.replace('_study', '', regex=False)
    ).rename(columns=study_level_rename_map)

    return train_img.merge(train_study, on='study_id')

def read_raw_train(input_folder=INPUT_FOLDER):
    """
    Load the raw training tables and merge them into one dataframe.

    Reads ``train_study_level.csv`` and ``train_image_level.csv`` from
    `input_folder` and combines them with `merge_input_dataframes`.
    """
    study_level = pd.read_csv(input_folder / 'train_study_level.csv')
    image_level = pd.read_csv(input_folder / 'train_image_level.csv')
    return merge_input_dataframes(image_level, study_level)

def get_path_components(path):
    """Return the components of `path` after normalising it with ``os.path.normpath``."""
    return os.path.normpath(path).split(os.sep)

def read_raw_test(input_folder=INPUT_FOLDER):
    """
    Build the test dataframe from the DICOM files found under ``<input_folder>/test``.

    Args:
        input_folder (Path): folder that contains the ``test`` directory

    Returns:
        pd.DataFrame: columns ``img_path`` (file path), ``img_id`` (file name
        without its extension) and ``study_id`` (name of the directory two
        levels above the file), ordered by path.
    """
    # glob returns files in arbitrary, filesystem-dependent order; sort so the
    # test dataframe has the same row order on every run and machine.
    filepaths = sorted(glob.glob(str(input_folder / 'test/**/*dcm'), recursive=True))
    components = [get_path_components(filepath) for filepath in filepaths]

    test = pd.DataFrame({ 'img_path': filepaths })
    # Strip only the trailing extension of the file name.
    test['img_id'] = [os.path.splitext(parts[-1])[0] for parts in components]
    # The study id is a directory name, so it is used as-is.
    test['study_id'] = [parts[-3] for parts in components]
    return test

def read_raw_sample_sub(input_folder=INPUT_FOLDER):
    """Load ``sample_submission.csv`` from `input_folder` as a dataframe."""
    return pd.read_csv(input_folder / 'sample_submission.csv')


#%%
# Jupyter: Test for the cell 
# Load the three raw tables with the default INPUT_FOLDER ...
train = read_raw_train()
test = read_raw_test()
sample_sub = read_raw_sample_sub()

# ... and preview the first rows of the merged train table and of the test table
train.head(3)
test.head(3)

' \nCell #2: Read input dataframes from the dataset\n'

Unnamed: 0,img_id,boxes,label,study_id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
0,000a312787f2,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,0,1,0,0
1,000c3a3f293f,,none 1 0 0 1 1,ff0879eb20ed,1,0,0,0
2,0012ff7358bc,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,0,1,0,0


Unnamed: 0,img_path,img_id,study_id
0,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,435bc0fcb0ab,0d7e69753505
1,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,75b2c9f1f232,149c8d66e874
2,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,5f65421ff6fd,1c716d133c0c


In [12]:
""" 
Cell #3: Standardize, Sample and Split
Standardize train - add columns like group, stratify, etc
Sample - sample 1, 5, 20 and 100% of data
"""
import numpy as np

def standardize_train(train): 
    """
    Add the bookkeeping columns used for sampling and splitting.

    Works on a copy, so the dataframe passed in is left unchanged.

    Added / replaced columns:
        one_hot:  per-row array holding the values of LABEL_COLS
        label:    position of the largest entry of ``one_hot`` (replaces any
                  existing ``label`` column)
        stratify: string form of ``one_hot``
        group:    string form of ``study_id``

    Args:
        train (pd.DataFrame): merged train table containing LABEL_COLS and ``study_id``

    Returns:
        pd.DataFrame: copy of `train` with the columns above
    """
    # Copy first: adding columns to the caller's frame would silently change
    # what earlier cells displayed and make this step non-idempotent.
    train = train.copy()

    # One hot encode and add labels
    train['one_hot'] = train[LABEL_COLS].apply(lambda row: row.values, axis='columns')
    train['label'] = train.one_hot.apply(lambda array: np.argmax(array))

    # Add stratify column and group column
    train['stratify'] = train['one_hot'].apply(str)
    train['group'] = train['study_id'].apply(str)
    
    return train

#%%
# Jupyter: testing and validation that fold_dfs are similar to each other
train = standardize_train(train)

# Per-class counts divided by NUM_FOLDS: the counts each fold would hold if classes were spread evenly
print('expected values')
train.stratify.value_counts() / NUM_FOLDS
# Folds are built from the 'group' column (the study id string added by standardize_train);
# NOTE(review): the exact splitting strategy lives in src.data.utils.get_fold_dfs — confirm there
fold_dfs = src.data.utils.get_fold_dfs(train, 'group', NUM_FOLDS)
for fold_i, fold_df in enumerate(fold_dfs): 
    print(f'--- FOLD {fold_i} ---')
    fold_df.stratify.value_counts()

' \nCell #3: Standardize, Sample and Split\nStandardize train - add columns like group, stratify, etc\nSample - sample 1, 5, 20 and 100% of data\n'

expected values


[0 1 0 0]    751.75
[1 0 0 0]    434.00
[0 0 1 0]    277.00
[0 0 0 1]    120.75
Name: stratify, dtype: float64

--- FOLD 0 ---


[0 1 0 0]    766
[1 0 0 0]    425
[0 0 1 0]    275
[0 0 0 1]    118
Name: stratify, dtype: int64

--- FOLD 1 ---


[0 1 0 0]    731
[1 0 0 0]    447
[0 0 1 0]    285
[0 0 0 1]    121
Name: stratify, dtype: int64

--- FOLD 2 ---


[0 1 0 0]    758
[1 0 0 0]    430
[0 0 1 0]    261
[0 0 0 1]    134
Name: stratify, dtype: int64

--- FOLD 3 ---


[0 1 0 0]    752
[1 0 0 0]    434
[0 0 1 0]    287
[0 0 0 1]    110
Name: stratify, dtype: int64

In [10]:
def sample_train_func(train, num_values_to_sample, random_state=RANDOM_STATE):
    """Draw `num_values_to_sample` rows from `train`, seeded with `random_state`."""
    sampled = train.sample(n=num_values_to_sample, random_state=random_state)
    return sampled


#%%
# Jupyter: Test sample function
# Ask the shared utils how many rows the 'twenty' debug size corresponds to, then preview a seeded sample
num_values_to_sample = src.data.utils.get_num_values_to_sample(train, 'twenty') 
sample_train_func(train, num_values_to_sample, RANDOM_STATE).head(3)

Unnamed: 0,img_id,boxes,label,study_id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,one_hot,stratify,group
1357,32e5cba874ee,"[{'x': 2599.54486, 'y': 329.96207, 'width': 11...",1,18a530e6a802,0,1,0,0,"[0, 1, 0, 0]",[0 1 0 0],18a530e6a802
2080,4f0f03727fad,"[{'x': 729.18486, 'y': 696.88445, 'width': 424...",2,09d8a69657ab,0,0,1,0,"[0, 0, 1, 0]",[0 0 1 0],09d8a69657ab
2718,6813965b522a,"[{'x': 1689.78928, 'y': 536.67926, 'width': 79...",1,cc7cafd867f1,0,1,0,0,"[0, 1, 0, 0]",[0 1 0 0],cc7cafd867f1


In [13]:
def build_dataframes(input_folder=INPUT_FOLDER, output_folder=OUTPUT_FOLDER): 
    """
    Main function to build the dataframes and the folder

    Reads the raw train / test / sample-submission tables, standardizes train,
    splits off a holdout set, pickles ``train_full``, ``test``, ``holdout`` and
    ``sample_sub`` directly under `output_folder`, and finally hands the
    remaining train data to ``src.data.utils.save_dataframes`` together with
    the sampling and fold-building callbacks.

    Args:
        input_folder (Path): files read from here
        output_folder (Path): dataframes saved here
    """
    # Read input dataframes
    train = read_raw_train(input_folder)
    test = read_raw_test(input_folder)
    sample_sub = read_raw_sample_sub(input_folder)
    
    # Standardize train
    train = standardize_train(train)
    
    # Split train into holdout    
    train, holdout = src.data.utils.get_holdout(train, HOLDOUT_PERCENTAGE)

    # Make sure the destination exists before the first pickle is written
    os.makedirs(output_folder, exist_ok=True)

    # Explicit name -> dataframe mapping instead of eval() on local variable names
    top_level_frames = {
        'train_full': train,
        'test': test,
        'holdout': holdout,
        'sample_sub': sample_sub,
    }
    for df_name, df in top_level_frames.items():
        pd.to_pickle(df, output_folder / (df_name+'.pkl'))
    
    # Save the dataframes
    def build_fold_dfs(df):
        """Split `df` into NUM_FOLDS folds using the 'group' column."""
        return src.data.utils.get_fold_dfs(df, 'group', NUM_FOLDS)

    src.data.utils.save_dataframes(train, sample_train_func, build_fold_dfs, NUM_FOLDS, RANDOM_STATE, output_folder)
    
    
# Build all dataframes with the default folders when this cell / module runs as the main program
if __name__ == '__main__':
    build_dataframes()

dataframes created for folder C:\Users\sarth\Desktop\kaggle-v2\data\interim\siim-covid19-detection\fold_0\one
dataframes created for folder C:\Users\sarth\Desktop\kaggle-v2\data\interim\siim-covid19-detection\fold_0\five
dataframes created for folder C:\Users\sarth\Desktop\kaggle-v2\data\interim\siim-covid19-detection\fold_0\twenty
dataframes created for folder C:\Users\sarth\Desktop\kaggle-v2\data\interim\siim-covid19-detection\fold_0\full
dataframes created for folder C:\Users\sarth\Desktop\kaggle-v2\data\interim\siim-covid19-detection\fold_1\one
dataframes created for folder C:\Users\sarth\Desktop\kaggle-v2\data\interim\siim-covid19-detection\fold_1\five
dataframes created for folder C:\Users\sarth\Desktop\kaggle-v2\data\interim\siim-covid19-detection\fold_1\twenty
dataframes created for folder C:\Users\sarth\Desktop\kaggle-v2\data\interim\siim-covid19-detection\fold_1\full
dataframes created for folder C:\Users\sarth\Desktop\kaggle-v2\data\interim\siim-covid19-detection\fold_2\one


# Feature Engineering

In [14]:
import pandas as pd
import glob
import os 

from src.data.utils import feature_col
from src.data.comp.config import (
    # Competition specific config
    DATASET_NAME, LABELS, LABEL_COLS, SPLIT, RENAME_MAP, 
    # Mostly constants
    HOLDOUT_PERCENTAGE, NUM_FOLDS, RANDOM_STATE, 
    # Paths for the dataset
    RAW_DATA_PATH, INTERIM_DATA_PATH, PROCESSED_DATA_PATH, 
)
import src.data.comp.make_dataset

# Default values for the parameters
# The feature-engineering stage reads the interim dataframes and writes to the processed folder.
# These rebind the INPUT_FOLDER / OUTPUT_FOLDER names used by the make-dataset cells earlier in the notebook.
INPUT_FOLDER = INTERIM_DATA_PATH
OUTPUT_FOLDER = PROCESSED_DATA_PATH

# Main functions for feature engineering
def add_file_path(df, input_folder): 
    """
    Attach the DICOM path that belongs to each row of `df`.

    All ``*dcm`` files below `input_folder` are listed once and handed to the
    `feature_col`-decorated helper, which returns the first path that contains
    both the image id and the study id, or None when nothing matches.

    NOTE(review): `feature_col` comes from src.data.utils; presumably it calls
    the helper per row, fills `img_id` / `study_id` from the same-named
    columns and stores the result in a `file_path` column — confirm there.
    """
    @feature_col
    def file_path(filepaths, img_id, study_id): 
        matching = (
            candidate for candidate in filepaths
            if img_id in candidate and study_id in candidate
        )
        return next(matching, None)

    dicom_pattern = str(input_folder/ '**/*dcm')
    dicom_files = glob.glob(dicom_pattern, recursive=True)
    return file_path(df, filepaths=dicom_files)

# Feature Engineering Pipeline
def common_feature_engineering_pipeline(df): 
    """Feature steps shared by the train and test pipelines; currently returns `df` unchanged."""
    unchanged = df
    return unchanged

def train_pipeline(train, **kwargs): 
    """
    Feature pipeline for train-like dataframes.

    Runs the common pipeline, adds file paths looked up under
    ``RAW_DATA_PATH / 'train'`` and resets the index. Extra keyword arguments
    are accepted but not used.
    """
    shared = common_feature_engineering_pipeline(train)
    raw_train_folder = RAW_DATA_PATH / 'train'
    with_paths = add_file_path(shared, input_folder=raw_train_folder)
    return with_paths.reset_index(drop=True)

def test_pipeline(test, **kwargs): 
    """Feature pipeline for the test dataframe: only the common steps are applied; extra keyword arguments are ignored."""
    return common_feature_engineering_pipeline(test)

def read_all_pkl_files(input_folder): 
    """Return the paths (as strings) of every ``.pkl`` file in `input_folder` or any of its subfolders."""
    pkl_pattern = str(input_folder / '**/*.pkl')
    return glob.glob(pkl_pattern, recursive=True)

def get_output_path(input_path, input_folder, output_folder): 
    """
    Get output path for the dataframe by replacing input folder in the input path by output path

    Only the first occurrence of the input folder is replaced, so a file or
    sub-directory name that happens to contain the same text is left alone.

    Args:
        input_path (str or Path): path of the file inside `input_folder`
        input_folder (str or Path): folder the file was read from
        output_folder (str or Path): folder the file should be written to

    Returns:
        str: `input_path` with the input folder swapped for the output folder
    """
    # count=1: swap just the leading folder, not every later match
    output_path = str(input_path).replace(str(input_folder), str(output_folder), 1)
    return output_path

def apply_train_feature_engineering_pipeline(input_folder=INPUT_FOLDER, output_folder=OUTPUT_FOLDER, train_pipeline=train_pipeline): 
    """
    Run the train feature pipeline on every train/valid pickle under `input_folder`.

    Each pickle whose file name contains ``train`` or ``valid`` is loaded,
    passed through `train_pipeline` and written to the mirrored location under
    `output_folder`; all other pickles are skipped.

    Args:
        input_folder (Path): root folder that is searched recursively for ``.pkl`` files
        output_folder (Path): root folder the processed pickles are written to
        train_pipeline (callable): takes a dataframe and returns the processed dataframe
    """
    for pkl_path in read_all_pkl_files(input_folder): 
        # Decide from the file name only: matching against the whole path would
        # select every pickle as soon as a parent directory contains
        # 'train' or 'valid'.
        file_name = os.path.basename(pkl_path)
        if 'train' not in file_name and 'valid' not in file_name:
            continue

        df = train_pipeline(pd.read_pickle(pkl_path))

        output_path = get_output_path(pkl_path, input_folder, output_folder)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df.to_pickle(output_path)


def apply_test_feature_engineering_pipeline(raw_input_folder=RAW_DATA_PATH, test_pipeline=test_pipeline):
    """Read the raw test dataframe from `raw_input_folder` and return it after applying `test_pipeline`."""
    raw_test = src.data.comp.make_dataset.read_raw_test(raw_input_folder)
    return test_pipeline(raw_test)

def save_test(output_folder=OUTPUT_FOLDER):
    """
    Build the processed test dataframe and pickle it as ``test.pkl`` in `output_folder`.

    Args:
        output_folder (Path): folder the pickle is written to; created if missing
    """
    test = apply_test_feature_engineering_pipeline()
    # Create the destination like the train pipeline does, so this also works
    # before any train pickle has been written.
    os.makedirs(output_folder, exist_ok=True)
    save_path = output_folder / 'test.pkl'
    test.to_pickle(save_path)
    print(f'test saved at {save_path}')
    

# Process every train/valid pickle with the default folders when run as the main program
if __name__ == '__main__': 
    apply_train_feature_engineering_pipeline()

In [16]:
# Main functions for feature engineering
def add_file_path(df, input_folder): 
    """
    Attach the DICOM path that belongs to each row of `df`.

    All ``*dcm`` files below `input_folder` are listed once and handed to the
    `feature_col`-decorated helper, which returns the first path that contains
    both the image id and the study id, or None when nothing matches.

    NOTE(review): `feature_col` comes from src.data.utils; presumably it calls
    the helper per row, fills `img_id` / `study_id` from the same-named
    columns and stores the result in a `file_path` column — confirm there.
    """
    @feature_col
    def file_path(filepaths, img_id, study_id): 
        matching = (
            candidate for candidate in filepaths
            if img_id in candidate and study_id in candidate
        )
        return next(matching, None)

    dicom_pattern = str(input_folder/ '**/*dcm')
    dicom_files = glob.glob(dicom_pattern, recursive=True)
    return file_path(df, filepaths=dicom_files)

# Feature Engineering Pipeline
def common_feature_engineering_pipeline(df): 
    """Feature steps shared by the train and test pipelines; currently returns `df` unchanged."""
    unchanged = df
    return unchanged

def train_pipeline(train, **kwargs): 
    """
    Feature pipeline for train-like dataframes.

    Runs the common pipeline, adds file paths looked up under
    ``RAW_DATA_PATH / 'train'`` and resets the index. Extra keyword arguments
    are accepted but not used.
    """
    shared = common_feature_engineering_pipeline(train)
    raw_train_folder = RAW_DATA_PATH / 'train'
    with_paths = add_file_path(shared, input_folder=raw_train_folder)
    return with_paths.reset_index(drop=True)

def test_pipeline(test, **kwargs): 
    """Feature pipeline for the test dataframe: only the common steps are applied; extra keyword arguments are ignored."""
    return common_feature_engineering_pipeline(test)
 

def read_all_pkl_files(input_folder): 
    """Return the paths (as strings) of every ``.pkl`` file in `input_folder` or any of its subfolders."""
    pkl_pattern = str(input_folder / '**/*.pkl')
    return glob.glob(pkl_pattern, recursive=True)

def get_output_path(input_path, input_folder, output_folder): 
    """
    Get output path for the dataframe by replacing input folder in the input path by output path

    Only the first occurrence of the input folder is replaced, so a file or
    sub-directory name that happens to contain the same text is left alone.

    Args:
        input_path (str or Path): path of the file inside `input_folder`
        input_folder (str or Path): folder the file was read from
        output_folder (str or Path): folder the file should be written to

    Returns:
        str: `input_path` with the input folder swapped for the output folder
    """
    # count=1: swap just the leading folder, not every later match
    output_path = str(input_path).replace(str(input_folder), str(output_folder), 1)
    return output_path

def apply_train_feature_engineering_pipeline(input_folder=INPUT_FOLDER, output_folder=OUTPUT_FOLDER, train_pipeline=train_pipeline): 
    """
    Run the train feature pipeline on every train/valid pickle under `input_folder`.

    Each pickle whose file name contains ``train`` or ``valid`` is loaded,
    passed through `train_pipeline` and written to the mirrored location under
    `output_folder`; all other pickles are skipped.

    Args:
        input_folder (Path): root folder that is searched recursively for ``.pkl`` files
        output_folder (Path): root folder the processed pickles are written to
        train_pipeline (callable): takes a dataframe and returns the processed dataframe
    """
    for pkl_path in read_all_pkl_files(input_folder): 
        # Decide from the file name only: matching against the whole path would
        # select every pickle as soon as a parent directory contains
        # 'train' or 'valid'.
        file_name = os.path.basename(pkl_path)
        if 'train' not in file_name and 'valid' not in file_name:
            continue

        df = train_pipeline(pd.read_pickle(pkl_path))

        output_path = get_output_path(pkl_path, input_folder, output_folder)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df.to_pickle(output_path)


def apply_test_feature_engineering_pipeline(raw_input_folder=RAW_DATA_PATH, test_pipeline=test_pipeline):
    """Read the raw test dataframe from `raw_input_folder` and return it after applying `test_pipeline`."""
    raw_test = src.data.comp.make_dataset.read_raw_test(raw_input_folder)
    return test_pipeline(raw_test)


# Process every train/valid pickle with the default folders when run as the main program
if __name__ == '__main__': 
    apply_train_feature_engineering_pipeline()

file_path took 0.1046602725982666 seconds to execute for df of length 6270
file_path took 0.09667801856994629 seconds to execute for df of length 6270
file_path took 0.003986358642578125 seconds to execute for df of length 235
file_path took 0.002990245819091797 seconds to execute for df of length 79
file_path took 0.029905319213867188 seconds to execute for df of length 1584
file_path took 0.07574462890625 seconds to execute for df of length 4750
file_path took 0.07674312591552734 seconds to execute for df of length 4702
file_path took 0.023920774459838867 seconds to execute for df of length 1568
file_path took 0.0259096622467041 seconds to execute for df of length 1584
file_path took 0.09170150756835938 seconds to execute for df of length 4750
file_path took 0.001995086669921875 seconds to execute for df of length 47
file_path took 0.0009970664978027344 seconds to execute for df of length 16
file_path took 0.02691030502319336 seconds to execute for df of length 1584
file_path took 0.

In [17]:
# Spot-check one processed dataframe: the full-size train split of fold 3 under OUTPUT_FOLDER
train_path = OUTPUT_FOLDER / 'fold_3' / 'full' / 'train.pkl'
train = pd.read_pickle(train_path)
train.head(3)

Unnamed: 0,img_id,boxes,label,study_id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,one_hot,stratify,group,img_path,fold,file_path
0,a344911414ce,,0,07c985cdc2e3,1,0,0,0,"[1, 0, 0, 0]","[1, 0, 0, 0]",07c985cdc2e3,,1,
1,bcb1d52e4eab,"[{'x': 1558.82736, 'y': 938.61835, 'width': 63...",1,76122cbd97f4,0,1,0,0,"[0, 1, 0, 0]","[0, 1, 0, 0]",76122cbd97f4,,0,
2,5af8dfe95d88,"[{'x': 2944.08004, 'y': 556.8, 'width': 1061.4...",1,022f9b3cfd91,0,1,0,0,"[0, 1, 0, 0]","[0, 1, 0, 0]",022f9b3cfd91,,2,


# READ DATASET

In [25]:

import pandas as pd
import glob
import os 

from src.data.utils import feature_col
from src.data.comp.config import (
    # Competition specific config
    DATASET_NAME, LABELS, LABEL_COLS, SPLIT, RENAME_MAP, 
    # Mostly constants
    HOLDOUT_PERCENTAGE, NUM_FOLDS, RANDOM_STATE, 
    # Paths for the dataset
    RAW_DATA_PATH, INTERIM_DATA_PATH, PROCESSED_DATA_PATH, 
)
import src.data.comp.build_features
import src.data.comp.make_dataset

# Constants to be imported 
# 'file_path' is the column produced by add_file_path; 'label' is the class index set by standardize_train
FEATURE_COLS = ['file_path'] # X for the model
TARGET_COL = 'label' # y for the model

def read_dataframe(df_name, fold=0, debug_percentage='full', input_folder=PROCESSED_DATA_PATH): 
    """
    Load a single pickled dataframe for one fold and debug size.

    The file is read from ``<input_folder>/fold_<fold>/<debug_percentage>/<df_name>.pkl``.
    """
    fold_folder = input_folder / f'fold_{fold}' / debug_percentage
    return pd.read_pickle(fold_folder / f'{df_name}.pkl')

def read_dataframes(fold=0, debug_percentage='full', input_folder=PROCESSED_DATA_PATH):
    """
    Load every dataframe needed for one fold into a dict.

    Keys, in order: ``train_full``, ``test``, ``holdout`` (read from
    `input_folder`), ``train``, ``valid``, ``valid_75``, ``valid_25`` (read
    from ``fold_<fold>/<debug_percentage>``), plus the small previews ``tr``
    (first 10 train rows), ``te`` (first 5 test rows) and ``val`` (first 5
    valid rows).
    """
    inner_folder = input_folder / f'fold_{fold}' / debug_percentage
    frames = {}

    # files stored directly in the outer folder
    for name in ('train_full', 'test', 'holdout'):
        frames[name] = pd.read_pickle(input_folder / f'{name}.pkl')

    # files stored per fold / debug size
    for name in ('train', 'valid', 'valid_75', 'valid_25'):
        frames[name] = pd.read_pickle(inner_folder / f'{name}.pkl')

    # small slices for quick tests
    frames.update(
        tr=frames['train'].head(10),
        te=frames['test'].head(5),
        val=frames['valid'].head(5),
    )
    return frames
    

def read_test(input_folder=RAW_DATA_PATH): 
    """
    Return the feature-engineered test dataframe built from the raw files in `input_folder`.

    The feature pipeline reads the raw test set itself, so it is not read
    separately here (the earlier extra read was discarded and only doubled the
    file-system scan).

    Args:
        input_folder (Path): raw data folder passed on to the test feature pipeline

    Returns:
        The dataframe returned by
        ``src.data.comp.build_features.apply_test_feature_engineering_pipeline``.
    """
    return src.data.comp.build_features.apply_test_feature_engineering_pipeline(input_folder)


def read_file(file_path):
    # NOTE(review): `read_x` is not defined anywhere in the visible notebook and
    # `file_path` is unused, so calling this looks like it would raise NameError —
    # presumably an unfinished stub; confirm the intended reader before using it.
    return read_x 


# Smoke test: build the test dataframe, load all dataframes for the default fold, then report success
if __name__ == '__main__':
    read_test()
    read_dataframes().keys()
    print('working!')

Unnamed: 0,img_path,img_id,study_id
0,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,435bc0fcb0ab,0d7e69753505
1,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,75b2c9f1f232,149c8d66e874
2,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,5f65421ff6fd,1c716d133c0c
3,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,84dd9eff2ecf,2ebd6459c760
4,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,6fd7971538df,39a02fb99c60
5,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,31c07523a69a,81c860c6efe8
6,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,aba653aebd55,a134c7f3e533
7,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,09443dcb865f,a134c7f3e533
8,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,89426c0c18a8,a150ce575fd8
9,C:\Users\sarth\Desktop\kaggle-v2\data\raw\siim...,1c1c48cb66e4,b647d5c8422e


dict_keys(['train_full', 'test', 'holdout', 'train', 'valid', 'valid_75', 'valid_25', 'tr', 'te', 'val'])

working!


In [21]:
# List every pickle below the processed data folder (glob.glob already returns a list)
glob.glob(str(PROCESSED_DATA_PATH / '**/*pkl'), recursive=True)

['C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\processed\\siim-covid19-detection\\train.pkl',
 'C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\processed\\siim-covid19-detection\\train_full.pkl',
 'C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\processed\\siim-covid19-detection\\fold_0\\five\\train.pkl',
 'C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\processed\\siim-covid19-detection\\fold_0\\five\\valid.pkl',
 'C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\processed\\siim-covid19-detection\\fold_0\\five\\valid_25.pkl',
 'C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\processed\\siim-covid19-detection\\fold_0\\five\\valid_75.pkl',
 'C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\processed\\siim-covid19-detection\\fold_0\\full\\train.pkl',
 'C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\processed\\siim-covid19-detection\\fold_0\\full\\valid.pkl',
 'C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\processed\\siim-covid19-detection\\fold_0\\full\\valid_25.pkl',
 'C:\\Users\\sarth\\Desktop\\kaggle-v2\\data\\proce