In [1]:
import os
os.chdir('../../')
!pwd

/root/python/myenv/medical-coding-reproducibility-main


In [2]:
import logging
import random
from pathlib import Path
import multiprocessing
import pandas as pd

from src.settings import (
    DATA_DIRECTORY_MIMICIV_ICD10,
    ID_COLUMN,
    SUBJECT_ID_COLUMN,
    TARGET_COLUMN,
)

from src.utils.stratify_function import (
    iterative_stratification,
    kl_divergence,  
    labels_not_in_split,
)


# Generate splits
def generate_split(dataset: pd.DataFrame, split_path: Path):
    processlog=[]
    splits = dataset[[SUBJECT_ID_COLUMN, ID_COLUMN]]
    subject_series = dataset.groupby(SUBJECT_ID_COLUMN)[TARGET_COLUMN].sum()
    subject_ids = subject_series.index.to_list()
    codes = subject_series.to_list()
    subject_ids_train, subject_ids_test = iterative_stratification(
        subject_ids, codes, [1 - TEST_SIZE, TEST_SIZE]
    )
    codes_train = [
        codes[subject_ids.index(subject_id)] for subject_id in subject_ids_train
    ]
    val_size = VAL_SIZE / (1 - TEST_SIZE)
    subject_ids_train, subject_ids_val = iterative_stratification(
        subject_ids_train, codes_train, [1 - val_size, val_size]
    )

    codes_train = [
        codes[subject_ids.index(subject_id)] for subject_id in subject_ids_train
    ]
    codes_val = [codes[subject_ids.index(subject_id)] for subject_id in subject_ids_val]
    codes_test = [
        codes[subject_ids.index(subject_id)] for subject_id in subject_ids_test
    ]

    splits.loc[splits[SUBJECT_ID_COLUMN].isin(subject_ids_train), "split"] = "train"
    splits.loc[splits[SUBJECT_ID_COLUMN].isin(subject_ids_val), "split"] = "val"
    splits.loc[splits[SUBJECT_ID_COLUMN].isin(subject_ids_test), "split"] = "test"

    processlog.append(logging.info("------------- Splits Statistics -------------"))
    processlog.append(logging.info(
        f"Labels missing in the test set: {labels_not_in_split(codes, codes_test)}"
    ))
    processlog.append(logging.info(
        f"Labels missing in the val set: {labels_not_in_split(codes, codes_val)} %"
    ))
    processlog.append(logging.info(
        f"Labels missing in the train set: {labels_not_in_split(codes, codes_train)} %"
    ))
    processlog.append(logging.info(f"Test: KL divergence: {kl_divergence(codes, codes_test)}"))
    processlog.append(logging.info(f"Val: KL divergence: {kl_divergence(codes, codes_val)}"))
    processlog.append(logging.info(f"Train: KL divergence: {kl_divergence(codes, codes_train)}"))
    processlog.append(logging.info(f"Test Size: {len(codes_test) / len(codes)}"))
    processlog.append(logging.info(f"Val Size: {len(codes_val) / len(codes)}"))
    processlog.append(logging.info(f"Train Size: {len(codes_train) / len(codes)}"))

    splits = splits[[ID_COLUMN, "split"]].reset_index(drop=True)
    splits.to_feather(split_path)
    processlog.append(logging.info(
        "Splits generated and saved. Now making subsplits used to analyse the performance of the models when trained on less data."
    ))
    return processlog

def generate_training_subset(
    dataset: pd.DataFrame,
    splits: pd.DataFrame,
    number_of_training_examples: int,
    split_path: Path,
):
    processlog = []
    # Merge the dataset with splits
    dataset = pd.merge(dataset, splits, on=ID_COLUMN)
    
    # Separate the dataset into train, validation, and test sets
    
    training_set = dataset[dataset["split"] == "train"]
    val_set = dataset[dataset["split"] == "val"]
    test_set = dataset[dataset["split"] == "test"]

    
    # Calculate the size for stratification
    size = number_of_training_examples / len(training_set)

    # Group by subject ID and compute target column sums
    subject_series = training_set.groupby(SUBJECT_ID_COLUMN)[TARGET_COLUMN].sum()
    subject_ids = subject_series.index.to_list()
    codes = subject_series.to_list()

    # Perform iterative stratification
    _, subject_ids_train_subset = iterative_stratification(
        subject_ids, codes.copy(), [1 - size, size]
    )
    
    # Retrieve the stratified training subset
    codes_train_subset = [
        codes[subject_ids.index(subject_id)] for subject_id in subject_ids_train_subset
    ]
    
    # Append logging information individually
    processlog.append(logging.info(f"------------- Splits Statistics {number_of_training_examples}-------------"))
    processlog.append(logging.info(
        f"Labels missing in the training subset: {labels_not_in_split(codes, codes_train_subset)} %"
    ))
    processlog.append(logging.info(
        f"Train subset: KL divergence: {kl_divergence(codes, codes_train_subset)}"
    ))
    processlog.append(logging.info(f"Train subset size: {len(codes_train_subset) / len(codes)}"))

    # Filter the training set with the stratified subset of subject IDs
    training_set = training_set[
        training_set[SUBJECT_ID_COLUMN].isin(subject_ids_train_subset)
    ]
    
    # Concatenate the updated training set with validation and test sets
    dataset = pd.concat([training_set, val_set, test_set])[
        [ID_COLUMN, "split"]
    ].reset_index(drop=True)

    # Save the resulting dataset
    dataset.to_feather(split_path)

    return processlog

KL - Entropy น้อยๆ คือ กระจายทั่วถึง

In [3]:
TEST_SIZE = 0.15  # Test split ratios
VAL_SIZE = 0.1
STEP_SIZE = 0.2  # Step size for the iterative stratification

random.seed(10)

logging.basicConfig(level=logging.INFO)

output_dir_icd10 = Path(DATA_DIRECTORY_MIMICIV_ICD10)


mimic_icd10 = pd.read_feather(output_dir_icd10 / "mimiciv_icd10.feather")
mimic_icd10[TARGET_COLUMN] = mimic_icd10[TARGET_COLUMN].apply(lambda x: list(x))


In [4]:
tasks = [
    (mimic_icd10, output_dir_icd10 / "mimiciv_icd10_split.feather")
]

with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
    results = pool.starmap(generate_split, tasks)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  splits.loc[splits[SUBJECT_ID_COLUMN].isin(subject_ids_train), "split"] = "train"
INFO:root:------------- Splits Statistics -------------
INFO:root:Labels missing in the test set: 0.08813900780659784
INFO:root:Labels missing in the val set: 0.45328632586250317 %
INFO:root:Labels missing in the train set: 0.03777386048854193 %
INFO:root:Test: KL divergence: 0.0043161456458808475
INFO:root:Val: KL divergence: 0.00741076150597404
INFO:root:Train: KL divergence: 0.00042690198364618505
INFO:root:Test Size: 0.15981297023972707
INFO:root:Val Size: 0.10798379481555942
INFO:root:Train Size: 0.7322032349447135
INFO:root:Splits generated and saved. Now making subsplits used to analyse the performance of the models when trained on less da

In [5]:
mimic_icd10_splits = pd.read_feather(output_dir_icd10 / "mimiciv_icd10_split.feather")

In [6]:
print("Done")

Done
