In [1]:
# sample data from scanb_malmo with philips and xr1/2

# criteria: 100% foreground, macenko_blur=500





In [1]:
import pandas as pd
import numpy as np


pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 0)   # show all columns
pd.set_option('display.width', 0)         # no line wrapping


In [5]:

def stratified_split_one_df(df, var, n_train=10000, n_test=2000, random_state=42, split_col="split"):
    """
    Returns one DataFrame containing only the selected rows, with a `split` column.
    - var: column to stratify on (e.g., 'scanner_model')
    - n_train, n_test: per-stratum targets
    - If a stratum has fewer rows than requested, it will take as many as possible (train first).
    """
    rng = 12
    df_shuf = df.sample(frac=1, random_state=rng)  # shuffle once for reproducibility

    labels = []  # collect (index, split_label)

    for level, g in df_shuf.groupby(var, group_keys=False):
        need_train = n_train
        need_test  = n_test
        total = len(g)

        n_train_take = min(need_train, total)
        g_train = g.sample(n=n_train_take, random_state=rng)

        g_rest = g.drop(g_train.index)
        n_test_take = min(need_test, len(g_rest))
        g_test = g_rest.sample(n=n_test_take, random_state=rng)

        if total < (need_train + need_test):
            print(f"[warn] '{level}': only {total} rows; took train={n_train_take}, test={n_test_take}.")

        labels.extend([(idx, "train") for idx in g_train.index])
        labels.extend([(idx, "test")  for idx in g_test.index])

    # Build output DF of selected rows only, with split column
    selected_idx = [i for i, _ in labels]
    split_map = dict(labels)

    out = df.loc[selected_idx].copy()
    out[split_col] = out.index.map(split_map)

    # sanity prints
    print("\nPer-stratum counts:")
    print(out.groupby([var, split_col]).size().unstack(fill_value=0))

    # ensure no overlap & only requested rows
    assert set(out[split_col].unique()) <= {"train","test"}
    return out





In [6]:
# batch_1 crude tiles with macenko_blur
file_path = "/mnt/chime-preprocessed_data/SCANB_Malmo/SCANB_Malmo_224x224/processed-data/SCANB_Malmo_pil_10.4_224px/batches/Batch_1/dataframes/all_tiles_blur.csv"
batch_1 = pd.read_csv(file_path)

# wsi list with scanner_model
df_wsi = pd.read_csv("/mnt/chime-preprocessed_data/SCANB_Malmo/SCANB_Malmo_224x224/df_batch_SCANB_Malmo_224x224.csv", index_col=False)
len(df_wsi)
df_wsi = df_wsi.rename(columns={"slide_filename": "file_name"})

# merge with tile list
batch_1= pd.merge(batch_1, df_wsi[['file_name', 'scanner_model']], on = 'file_name', how='left')

#mask = (batch_1['tissue_proportion']==1) & (batch_1['macenko_blur']> 500) & (batch_1['scanner_model'].isin(['XR2', 'PHILIPS', 'XR1']))
mask = batch_1['scanner_model'].isin(['XR2', 'PHILIPS', 'XR1']) # make it general to use all tiles 
batch_1_select = batch_1[mask]

print(len(batch_1))
print(len(batch_1_select))


print(batch_1_select.scanner_model.value_counts(normalize=True))
batch_1_select['scanner_model_new'] = np.where(
    batch_1_select['scanner_model'].isin(['XR1', 'XR2']),
    'XR',
    'PHILIPS')

14338543
12494647
scanner_model
PHILIPS    0.522985
XR1        0.273281
XR2        0.203733
Name: proportion, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch_1_select['scanner_model_new'] = np.where(


In [7]:
df_out = stratified_split_one_df(batch_1_select, var='scanner_model_new')  # or your column name
# Keep only rows NOT selected in df_out
mask = ~batch_1_select["crude_tile_path"].isin(df_out["crude_tile_path"])
df_remain = batch_1_select.loc[mask].copy()

# Second stratified split for test set
df_test = stratified_split_one_df(
    df_remain,
    var="scanner_model_new",
    n_train=0,
    n_test=2000
)


Per-stratum counts:
split              test  train
scanner_model_new             
PHILIPS            2000  10000
XR                 2000  10000

Per-stratum counts:
split              test
scanner_model_new      
PHILIPS            2000
XR                 2000


In [None]:
df_out['folder_name'] = (
    '/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/' + df_out['file_name'] + '/' + df_out['file_name'] + '/')

df_out['crude_tile_path'] = df_out['folder_name'] + df_out['tile_name']
df_out.to_csv("/mnt/ssd/bojing/Image-Adaptive-3DLUT/dataframes/scanb_malmo_philips_xr_train.csv")



In [8]:
df_test['folder_name'] = (
    '/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/' + df_test['file_name'] + '/' + df_test['file_name'] + '/')

df_test['crude_tile_path'] = df_test['folder_name'] + df_test['tile_name']
df_test.to_csv("/mnt/ssd/bojing/Image-Adaptive-3DLUT/dataframes/scanb_malmo_philips_xr_test.csv")



In [2]:
df_test =  pd.read_csv("/mnt/ssd/bojing/Image-Adaptive-3DLUT/dataframes/scanb_malmo_philips_xr_test.csv")

In [3]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,file_name,file_format,tile_name,top_source,bottom_source,left_source,right_source,slide_h,slide_w,mpp,mpp_source,lvl,tissue_proportion,blur,crude_tile_path,macenko_blur,scanner_model,scanner_model_new,split,folder_name
0,12253251,SCANB_MALMO_RiDn1512_slide_873506_WSI_clean,isyntax,SCANB_MALMO_RiDn1512_slide_873506_WSI_clean_9744_10149_71050_71455.jpg,9744,10149,71050,71455,86018,136194,0.4536,0.25,0,1.0,380.278458,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn1512_slide_873506_WSI_clean/SCANB_MALMO_RiDn1512_slide_873506_WSI_clean/SCANB_MALMO_RiDn1512_slide_873506_WSI_clean_9744_10149_71050_71455.jpg,653.791017,PHILIPS,PHILIPS,test,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn1512_slide_873506_WSI_clean/SCANB_MALMO_RiDn1512_slide_873506_WSI_clean/
1,6323420,SCANB_MALMO_RiDn4046_slide_349814_WSI_clean,isyntax,SCANB_MALMO_RiDn4046_slide_349814_WSI_clean_24360_24765_35728_36133.jpg,24360,24765,35728,36133,82434,81410,0.4536,0.25,0,1.0,894.207286,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn4046_slide_349814_WSI_clean/SCANB_MALMO_RiDn4046_slide_349814_WSI_clean/SCANB_MALMO_RiDn4046_slide_349814_WSI_clean_24360_24765_35728_36133.jpg,1610.923578,PHILIPS,PHILIPS,test,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn4046_slide_349814_WSI_clean/SCANB_MALMO_RiDn4046_slide_349814_WSI_clean/
2,9441004,SCANB_MALMO_RiDn0419_slide_734327_WSI_clean,isyntax,SCANB_MALMO_RiDn0419_slide_734327_WSI_clean_44254_44659_112462_112867.jpg,44254,44659,112462,112867,93698,125442,0.4536,0.25,0,1.0,46.223649,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn0419_slide_734327_WSI_clean/SCANB_MALMO_RiDn0419_slide_734327_WSI_clean/SCANB_MALMO_RiDn0419_slide_734327_WSI_clean_44254_44659_112462_112867.jpg,223.705154,PHILIPS,PHILIPS,test,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn0419_slide_734327_WSI_clean/SCANB_MALMO_RiDn0419_slide_734327_WSI_clean/
3,8906921,SCANB_MALMO_RiDn5596_slide_170885_WSI_clean,isyntax,SCANB_MALMO_RiDn5596_slide_170885_WSI_clean_77952_78357_88508_88913.jpg,77952,78357,88508,88913,90626,129538,0.4536,0.25,0,1.0,4.139309,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn5596_slide_170885_WSI_clean/SCANB_MALMO_RiDn5596_slide_170885_WSI_clean/SCANB_MALMO_RiDn5596_slide_170885_WSI_clean_77952_78357_88508_88913.jpg,12.151148,PHILIPS,PHILIPS,test,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn5596_slide_170885_WSI_clean/SCANB_MALMO_RiDn5596_slide_170885_WSI_clean/
4,5989844,SCANB_MALMO_RiDn7233_slide_698314_WSI_clean,isyntax,SCANB_MALMO_RiDn7233_slide_698314_WSI_clean_76328_76733_53998_54403.jpg,76328,76733,53998,54403,100866,121346,0.4536,0.25,0,1.0,1812.207103,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn7233_slide_698314_WSI_clean/SCANB_MALMO_RiDn7233_slide_698314_WSI_clean/SCANB_MALMO_RiDn7233_slide_698314_WSI_clean_76328_76733_53998_54403.jpg,3297.763672,PHILIPS,PHILIPS,test,/mnt/ssd/bojing/Image-Adaptive-3DLUT/data/Batch_1/tiles/SCANB_MALMO_RiDn7233_slide_698314_WSI_clean/SCANB_MALMO_RiDn7233_slide_698314_WSI_clean/
