In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas deprecation/future warnings — confirm that is intended.
warnings.simplefilter("ignore")

from pathlib import Path

# Display settings: None removes the cap on the number of columns shown and
# leaves the output width to pandas instead of a fixed character count.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

# import data

Required in each of the `train`, `test` and `eval` folders:  
    input.csv - preprocessed spectra with 'lot_id' as index (specimen level)  
    label.csv - trainable labels with 'lot_id' as index (specimen level)  
    meta.csv - metadata with 'lot_id' as index, containing 'lot_name', 'specimen_id', 'date_scanned', 'analyser_id' (specimen level)

In [None]:
data_dir = Path("../data/processed") # change your path as needed


def _read_split(folder):
    """Read input.csv, label.csv and meta.csv from `folder` (first column as index).

    Asserts that the three tables have the same number of rows, prints their
    shapes and returns them as an (X, y, meta) tuple.
    """
    X = pd.read_csv(folder / "input.csv", index_col=0)
    y = pd.read_csv(folder / "label.csv", index_col=0)
    meta = pd.read_csv(folder / "meta.csv", index_col=0)

    n_rows = X.shape[0]
    assert (n_rows == y.shape[0]) and (n_rows == meta.shape[0]), "Number of rows do not match!"
    print(X.shape, y.shape, meta.shape)

    return X, y, meta


# Each split lives in its own sub-folder of data_dir.
train_folder = data_dir / "train" # change your path as needed
X_train, y_train, meta_train = _read_split(train_folder)

test_folder = data_dir / "test" # change your path as needed
X_test, y_test, meta_test = _read_split(test_folder)

eval_folder = data_dir / "eval" # change your path as needed
X_eval, y_eval, meta_eval = _read_split(eval_folder)


(276, 191) (276, 1) (276, 5)


# Ensure samples are sorted by date (sort them if they are not)

In [3]:
def sort_all_by_date(X, y, meta):
    """
    Return X, y and meta ordered so that meta['date_scanned'] is non-decreasing.

    The three frames are treated as row-aligned (row i of each one describes the
    same sample). One positional permutation, derived from meta['date_scanned'],
    is applied to all of them, so every frame keeps its own index labels and
    index name. Rows with equal dates keep their original relative order.

    Parameters
    ----------
    X, y, meta : pd.DataFrame
        Row-aligned frames. `meta` must contain a 'date_scanned' column.

    Returns
    -------
    (X, y, meta)
        The input objects themselves when `meta` is already sorted, otherwise
        re-ordered copies.

    Raises
    ------
    ValueError
        If sorting is needed and the frames do not have the same number of rows.
    AssertionError
        If the sorted dates are still not monotonic (e.g. missing dates) or the
        sorted frames do not share the same index.
    """
    # NOTE(review): values are compared with whatever dtype the column has; if the
    # dates are plain strings the order is lexicographic — confirm the format
    # sorts chronologically (e.g. ISO 8601) or parse the column beforehand.
    dates = meta['date_scanned']

    # Plain `if` rather than try/assert/except: an assert is removed under
    # `python -O`, which would make unsorted data be reported as already sorted.
    if dates.is_monotonic_increasing:
        print("Samples already sorted by date.")
        return X, y, meta

    print("Sorting samples by date...")

    if not (len(X) == len(y) == len(meta)):
        raise ValueError(
            f"X, y and meta must have the same number of rows, "
            f"got {len(X)}, {len(y)} and {len(meta)}"
        )

    # Positional order of the rows after a stable sort of the dates; dropping the
    # index first makes the resulting labels plain row positions.
    order = dates.reset_index(drop=True).sort_values(kind="stable").index.to_numpy()

    X_sorted = X.iloc[order]
    y_sorted = y.iloc[order]
    meta_sorted = meta.iloc[order]

    assert meta_sorted['date_scanned'].is_monotonic_increasing
    assert y_sorted.index.equals(meta_sorted.index) and X_sorted.index.equals(meta_sorted.index)

    print("Samples sorted by date.")

    return X_sorted, y_sorted, meta_sorted

In [None]:
# Put each split into date order; the three frames of a split are re-ordered together.
X_train, y_train, meta_train = sort_all_by_date(X=X_train, y=y_train, meta=meta_train)
X_test, y_test, meta_test = sort_all_by_date(X=X_test, y=y_test, meta=meta_test)
X_eval, y_eval, meta_eval = sort_all_by_date(X=X_eval, y=y_eval, meta=meta_eval)

Samples already sorted by date.


# Utils

In [5]:
def create_disjoint_splits(meta, split_ratio=0.5):
    """
    Split the lot names found in `meta` into two disjoint, consecutive groups.

    Lots are taken in order of first appearance in meta['lot_name'], so the
    split is chronological only when `meta` is already sorted by date.

    Parameters
    ----------
    meta : pd.DataFrame
        Must contain a 'lot_name' column.
    split_ratio : float, default 0.5
        Fraction of the unique lots (rounded down) placed in the first group.
        Must lie in [0, 1].

    Returns
    -------
    (train_lot_names, test_lot_names)
        Arrays with the lot names of the first and the second group.

    Raises
    ------
    ValueError
        If `split_ratio` is outside [0, 1] (including NaN).
    """
    # A negative ratio would silently slice from the end and a ratio above 1 would
    # silently empty the second group, so reject both up front.
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    unique_lot_names = meta['lot_name'].drop_duplicates()
    num_lots_total = len(unique_lot_names)
    split_point = int(num_lots_total * split_ratio)

    # First group is used for retraining, second group for retesting
    train_lot_names = unique_lot_names.iloc[:split_point].values
    test_lot_names = unique_lot_names.iloc[split_point:].values

    assert set(train_lot_names).isdisjoint(set(test_lot_names)), "Train and Test lots are not disjoint!"

    return train_lot_names, test_lot_names


def get_samples_by_lots(X, y, meta, lot_names):
    """
    Extract the rows whose meta['lot_name'] is one of `lot_names`.

    The three frames are treated as row-aligned; the same positional row mask is
    applied to each of them, so original row order, index labels and index name
    are preserved regardless of how the index is called.

    Parameters
    ----------
    X, y, meta : pd.DataFrame
        Row-aligned frames. `meta` must contain a 'lot_name' column.
    lot_names : list-like
        Lot names to keep.

    Returns
    -------
    (X_taken, y_taken, meta_taken)
        The selected rows of each frame.

    Raises
    ------
    ValueError
        If the frames do not have the same number of rows.
    AssertionError
        If the selected frames do not share the same index.
    """
    # The mask below is positional, so a row-count mismatch would select the
    # wrong rows (or fail obscurely); check it explicitly.
    if not (len(X) == len(y) == len(meta)):
        raise ValueError(
            f"X, y and meta must have the same number of rows, "
            f"got {len(X)}, {len(y)} and {len(meta)}"
        )

    # Plain boolean array: applied by position, independent of index labels.
    rows_to_take = meta['lot_name'].isin(lot_names).to_numpy()

    X_taken = X.loc[rows_to_take]
    y_taken = y.loc[rows_to_take]
    meta_taken = meta.loc[rows_to_take]

    assert X_taken.index.equals(y_taken.index) and X_taken.index.equals(meta_taken.index)

    return X_taken, y_taken, meta_taken

In [None]:
# # Optional: create toy data for testing by carving test and eval splits out of the training set (uncomment this cell to use)

# train_lots_from_train, test_lots_from_train = create_disjoint_splits(meta_train, split_ratio=0.5)

# X_test_temp, y_test_temp, meta_test_temp = get_samples_by_lots(
#     X_train, y_train, meta_train, test_lots_from_train
# )

# X_train, y_train, meta_train = get_samples_by_lots(
#     X_train, y_train, meta_train, train_lots_from_train
# )

# test_lots_from_test, eval_lots_from_test = create_disjoint_splits(meta_test_temp, split_ratio=0.5)

# X_test, y_test, meta_test = get_samples_by_lots(
#     X_test_temp, y_test_temp, meta_test_temp, test_lots_from_test
# )

# X_eval, y_eval, meta_eval = get_samples_by_lots(
#     X_test_temp, y_test_temp, meta_test_temp, eval_lots_from_test
# )

# print(f"train shapes: {X_train.shape}, {y_train.shape}, {meta_train.shape}")
# print(f"test shapes: {X_test.shape}, {y_test.shape}, {meta_test.shape}")
# print(f"eval shapes: {X_eval.shape}, {y_eval.shape}, {meta_eval.shape}")

# assert set(X_train.index).isdisjoint(set(X_test.index)), "TRAIN and TEST sets are not disjoint!"
# assert set(X_train.index).isdisjoint(set(X_eval.index)), "TRAIN and EVAL sets are not disjoint!"
# assert set(X_test.index).isdisjoint(set(X_eval.index)), "TEST and EVAL  sets are not disjoint!"

train shapes: (142, 191), (142, 1), (142, 5)
test shapes: (66, 191), (66, 1), (66, 5)
eval shapes: (68, 191), (68, 1), (68, 5)


# Step 1 SAFE

In [7]:
def prepare_SAFE_set_disjoint(
        X_test,
        y_test,
        meta_test,
        X_eval,
        y_eval,
        meta_eval,
        split_ratio = 0.5
        ):
    """
    Build the SAFE evaluation set: the second group of test lots followed by the
    whole eval set.

    The test lots are divided with create_disjoint_splits(meta_test, split_ratio);
    only the samples of the second group are kept and stacked on top of the
    eval frames. Returns (X_SAFE_eval, y_SAFE_eval, meta_SAFE_eval).
    """
    print(f"Original shapes of eval set: {X_eval.shape}, {y_eval.shape}, {meta_eval.shape}")

    # The first group of test lots is not used here; only the second one is.
    _, later_test_lots = create_disjoint_splits(meta_test, split_ratio)
    test_parts = get_samples_by_lots(X_test, y_test, meta_test, later_test_lots)
    eval_parts = (X_eval, y_eval, meta_eval)

    # Stack test rows first, eval rows second, for X, y and meta in turn.
    X_SAFE_eval, y_SAFE_eval, meta_SAFE_eval = (
        pd.concat([from_test, from_eval], axis=0)
        for from_test, from_eval in zip(test_parts, eval_parts)
    )

    same_as_y = X_SAFE_eval.index.equals(y_SAFE_eval.index)
    assert same_as_y and X_SAFE_eval.index.equals(meta_SAFE_eval.index)
    print(f"Shapes of SAFE eval set: {X_SAFE_eval.shape}, {y_SAFE_eval.shape}, {meta_SAFE_eval.shape}")

    return X_SAFE_eval, y_SAFE_eval, meta_SAFE_eval

In [8]:
# SAFE evaluation set = second group of test lots + the full eval set (default split_ratio)
X_SAFE_eval, y_SAFE_eval, meta_SAFE_eval = prepare_SAFE_set_disjoint(
    X_test=X_test,
    y_test=y_test,
    meta_test=meta_test,
    X_eval=X_eval,
    y_eval=y_eval,
    meta_eval=meta_eval,
)

Original shapes of eval set: (68, 191), (68, 1), (68, 5)
Shapes of SAFE eval set: (102, 191), (102, 1), (102, 5)


In [9]:
# export data
output_dir = data_dir / "SAFE_split"
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): labels are written as "target.csv" while the loading cell reads
# "label.csv" — confirm that downstream consumers expect this file name.
for frame, file_name in (
    (X_SAFE_eval, "input.csv"),
    (y_SAFE_eval, "target.csv"),
    (meta_SAFE_eval, "meta.csv"),
):
    # index=True keeps the sample index as the first CSV column
    frame.to_csv(output_dir / file_name, index=True)

# Step 2 Retrain and retest sets

In [17]:
def prepare_new_train_disjoint(
        X_train,
        y_train,
        meta_train,
        X_test,
        y_test,
        meta_test,
        X_eval,
        y_eval,
        meta_eval,
        split_ratio=0.5,
        ):
    """
    Build the retrain set: all of train + first group of test lots + first group
    of eval lots.

    Parameters
    ----------
    X_train, y_train, meta_train : pd.DataFrame
        Original training split, used in full.
    X_test, y_test, meta_test : pd.DataFrame
        Original test split; only its first group of lots is taken.
    X_eval, y_eval, meta_eval : pd.DataFrame
        Original eval split; only its first group of lots is taken.
    split_ratio : float, default 0.5
        Fraction of lots of test and of eval that goes into the retrain set
        (passed to create_disjoint_splits). The retest set has to be built with
        the same ratio for the two sets to stay lot-disjoint.

    Returns
    -------
    (X_retrain, y_retrain, meta_retrain)
        Row-wise concatenation in the order train, test part, eval part.

    Raises
    ------
    AssertionError
        If the concatenated frames do not share the same index.
    """
    # First group of lots from the test split
    retrain_lots_from_test, _ = create_disjoint_splits(meta_test, split_ratio=split_ratio)
    X_test_taken, y_test_taken, meta_test_taken = get_samples_by_lots(
        X_test, y_test, meta_test, retrain_lots_from_test
    )

    # First group of lots from the eval split
    retrain_lots_from_eval, _ = create_disjoint_splits(meta_eval, split_ratio=split_ratio)
    X_eval_taken, y_eval_taken, meta_eval_taken = get_samples_by_lots(
        X_eval, y_eval, meta_eval, retrain_lots_from_eval
    )

    X_retrain = pd.concat([X_train, X_test_taken, X_eval_taken], axis=0)
    y_retrain = pd.concat([y_train, y_test_taken, y_eval_taken], axis=0)
    meta_retrain = pd.concat([meta_train, meta_test_taken, meta_eval_taken], axis=0)

    assert X_retrain.index.equals(y_retrain.index) and X_retrain.index.equals(meta_retrain.index)
    print(f"Shapes of RETRAIN set: {X_retrain.shape}, {y_retrain.shape}, {meta_retrain.shape}")

    return X_retrain, y_retrain, meta_retrain

In [18]:
def prepare_new_test_disjoint(
        X_test,
        y_test,
        meta_test,
        X_eval,
        y_eval,
        meta_eval,
        split_ratio=0.5,
        ):
    """
    Build the retest set: second group of test lots + second group of eval lots.

    No training data is included.

    Parameters
    ----------
    X_test, y_test, meta_test : pd.DataFrame
        Original test split; only its second group of lots is taken.
    X_eval, y_eval, meta_eval : pd.DataFrame
        Original eval split; only its second group of lots is taken.
    split_ratio : float, default 0.5
        Fraction of lots of test and of eval that is left out of the retest set
        (passed to create_disjoint_splits); the remaining lots form the retest
        set. The retrain set has to be built with the same ratio for the two
        sets to stay lot-disjoint.

    Returns
    -------
    (X_retest, y_retest, meta_retest)
        Row-wise concatenation in the order test part, eval part.

    Raises
    ------
    AssertionError
        If the concatenated frames do not share the same index.
    """
    # Second group of lots from the test split
    _, retest_lots_from_test = create_disjoint_splits(meta_test, split_ratio=split_ratio)
    X_test_taken, y_test_taken, meta_test_taken = get_samples_by_lots(
        X_test, y_test, meta_test, retest_lots_from_test
    )

    # Second group of lots from the eval split
    _, retest_lots_from_eval = create_disjoint_splits(meta_eval, split_ratio=split_ratio)
    X_eval_taken, y_eval_taken, meta_eval_taken = get_samples_by_lots(
        X_eval, y_eval, meta_eval, retest_lots_from_eval
    )

    X_retest = pd.concat([X_test_taken, X_eval_taken], axis=0)
    y_retest = pd.concat([y_test_taken, y_eval_taken], axis=0)
    meta_retest = pd.concat([meta_test_taken, meta_eval_taken], axis=0)

    # Same alignment check as for the retrain set
    assert X_retest.index.equals(y_retest.index) and X_retest.index.equals(meta_retest.index)
    print(f"Shapes of RETEST set: {X_retest.shape}, {y_retest.shape}, {meta_retest.shape}")

    return X_retest, y_retest, meta_retest

In [25]:
# Sizes of the original splits, for comparison with the recombined sets below
print(f"Shape of original train set: {X_train.shape}, {y_train.shape}, {meta_train.shape}")
print(f"Shape of original test set: {X_test.shape}, {y_test.shape}, {meta_test.shape}")
print(f"Shape of original eval set: {X_eval.shape}, {y_eval.shape}, {meta_eval.shape}")
print("\n")

# Retrain set: full train split plus the first group of lots of test and of eval
X_retrain, y_retrain, meta_retrain = prepare_new_train_disjoint(
    X_train=X_train, y_train=y_train, meta_train=meta_train,
    X_test=X_test, y_test=y_test, meta_test=meta_test,
    X_eval=X_eval, y_eval=y_eval, meta_eval=meta_eval,
)

# Retest set: the second group of lots of test and of eval
X_retest, y_retest, meta_retest = prepare_new_test_disjoint(
    X_test=X_test, y_test=y_test, meta_test=meta_test,
    X_eval=X_eval, y_eval=y_eval, meta_eval=meta_eval,
)

# No index label may appear in both sets
shared_labels = set(X_retrain.index) & set(X_retest.index)
assert not shared_labels, "RETRAIN and RETEST sets are not disjoint!"

Shape of original train set: (142, 191), (142, 1), (142, 5)
Shape of original test set: (66, 191), (66, 1), (66, 5)
Shape of original eval set: (68, 191), (68, 1), (68, 5)


Shapes of RETRAIN set: (208, 191), (208, 1), (208, 5)
Shapes of RETEST set: (68, 191), (68, 1), (68, 5)


In [22]:
# export data
# NOTE(review): labels are written as "target.csv" while the loading cell reads
# "label.csv" — confirm that downstream consumers expect this file name.
for split_name, (X_out, y_out, meta_out) in (
    ("retrain", (X_retrain, y_retrain, meta_retrain)),
    ("retest", (X_retest, y_retest, meta_retest)),
):
    # One sub-folder of data_dir per split, created if missing
    output_dir = data_dir / split_name
    output_dir.mkdir(parents=True, exist_ok=True)
    X_out.to_csv(output_dir / "input.csv", index=True)
    y_out.to_csv(output_dir / "target.csv", index=True)
    meta_out.to_csv(output_dir / "meta.csv", index=True)