In [3]:
import os
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import KFold

In [4]:
def binary_bach_data(data_path, neg_labels, pos_labels):
    """_summary_

    Args:
        data_path (_type_): _description_
        neg_labels (_type_): _description_
        pos_labels (_type_): _description_

    Returns:
        _type_: _description_
    """
    neg_slides = list()
    pos_slides = list()

    for i in neg_labels:
        for neg in tf.io.gfile.listdir(os.path.join(data_path, i)):
            neg_slides.append(neg)
    
    for j in pos_labels:
        for pos in tf.io.gfile.listdir(os.path.join(data_path, j)):
            pos_slides.append(pos)
    
    return neg_slides, pos_slides

In [5]:
data_path = "/home/quincy/data/BACH/"
neg_labels = ["Normal", "Benign"]
pos_labels = ["InSitu", "Invasive"]

In [6]:
neg_slides, pos_slides = binary_bach_data(data_path, neg_labels, pos_labels)

In [9]:
neg_slides[0:2], pos_slides[0:2]

(['n065.tif', 'n044.tif'], ['is037.tif', 'is033.tif'])

In [12]:
len(neg_slides), len(pos_slides)

(200, 200)

In [10]:
def select_test_data(neg_slides, pos_slides, test_ratio=0.2):
    """_summary_

    Args:
        neg_slides (_type_): _description_
        pos_slides (_type_): _description_
        test_ratio (float, optional): _description_. Defaults to 0.2.

    Returns:
        _type_: _description_
    """
    import random

    neg_test_slides = random.choices(
        neg_slides, 
        k=int(len(neg_slides) * test_ratio)
    )
    
    pos_test_slides = random.choices(
        pos_slides, 
        k=int(len(pos_slides) * test_ratio)
    )

    return neg_test_slides, pos_test_slides

In [11]:
neg_test_slides, pos_test_slides = select_test_data(neg_slides, pos_slides, test_ratio=0.2)

In [13]:
neg_test_slides[0:2], pos_test_slides[0:2]

(['b082.tif', 'n002.tif'], ['iv015.tif', 'iv077.tif'])

In [14]:
len(neg_test_slides), len(pos_test_slides)

(40, 40)

In [16]:
def cross_val_data(neg_slides, pos_slides, test_ratio=0.0, n_folds=5, kf_shuffle=False, kf_rs=None):
    """_summary_

    Args:
        neg_slides (_type_): _description_
        pos_slides (_type_): _description_
        kf_csv_path (_type_): _description_
        test_ratio (float, optional): _description_. Defaults to 0.0.
        n_folds (int, optional): _description_. Defaults to 5.
        kf_shuffle (bool, optional): _description_. Defaults to False.
        kf_rs (_type_, optional): _description_. Defaults to None.
    """
    if test_ratio != 0.0:
        neg_test_slides, pos_test_slides = select_test_data(
            neg_slides=neg_slides, 
            pos_slides=pos_slides, 
            test_ratio=test_ratio
        )
        test_df = pd.DataFrame(
            {"UUID": neg_test_slides + pos_test_slides}
        )

        ## return slides uuids for training and validation from the negative and positive class samples
        neg_slides = list(set(neg_slides) - set(neg_test_slides))
        pos_slides = list(set(pos_slides) - set(pos_test_slides))

    ## initiate k-fold cross validation, check sklearn KFold documentation via 
    ## https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html for 
    ## more information in details
    kf_cv = KFold(n_splits=n_folds, shuffle=kf_shuffle, random_state=kf_rs)

    ## K-Fold Cross Validation Data Split in Negative Class Samples
    neg_train_index = list()
    neg_val_index = list()
    for nt_idx, nv_idx in kf_cv.split(neg_slides):
        neg_train_index.append(nt_idx)
        neg_val_index.append(nv_idx)
    
    neg_train_slides = list()
    for fold_nt_idx in neg_train_index: 
        neg_train_slides.append([neg_slides[i] for i in fold_nt_idx])
    
    neg_val_slides = list()
    for fold_nv_idx in neg_val_index: 
        neg_val_slides.append([neg_slides[i] for i in fold_nv_idx])
    
    ## K-Fold Cross Validation Data Split in Postive Class Samples
    pos_train_index = list()
    pos_val_index = list()
    for pt_idx, pv_idx in kf_cv.split(pos_slides):
        pos_train_index.append(pt_idx)
        pos_val_index.append(pv_idx)
    
    pos_train_slides = list()
    for fold_pt_idx in pos_train_index: 
        pos_train_slides.append([pos_slides[i] for i in fold_pt_idx])
    
    pos_val_slides = list()
    for fold_pv_idx in pos_val_index: 
        pos_val_slides.append([pos_slides[i] for i in fold_pv_idx])
    
    ## Combine K-Fold Cross Validation Data Split in Negative and Positive Class Samples
    for f in range(n_folds):
        train_fold_slides = neg_train_slides[f] + pos_train_slides[f]
        val_fold_slides = neg_val_slides[f] + pos_val_slides[f]
        train_fold_df = pd.DataFrame(
            {"UUID": train_fold_slides}
        )
        val_fold_df = pd.DataFrame(
            {"UUID": val_fold_slides}
        )
    
    return train_fold_df, val_fold_df, test_df

In [17]:
train_fold_df, val_fold_df, test_df = cross_val_data(neg_slides, pos_slides, test_ratio=0.2, n_folds=5, kf_shuffle=True, kf_rs=13)

In [18]:
train_fold_df

Unnamed: 0,UUID
0,n043.tif
1,n010.tif
2,b052.tif
3,b027.tif
4,n095.tif
...,...
257,is031.tif
258,is050.tif
259,is033.tif
260,iv059.tif


In [19]:
val_fold_df

Unnamed: 0,UUID
0,n007.tif
1,b080.tif
2,n052.tif
3,n009.tif
4,b071.tif
...,...
59,is099.tif
60,iv009.tif
61,is018.tif
62,is001.tif


In [20]:
test_df

Unnamed: 0,UUID
0,n056.tif
1,b065.tif
2,b040.tif
3,n081.tif
4,n100.tif
...,...
75,is059.tif
76,is004.tif
77,iv004.tif
78,is078.tif
