In [1]:
import pandas as pd
import re

# Locations of the three "Stratified Data" split CSVs.
train_split_csv = "/Applications/Projects/Bladder Research/Data/Stratified Data/train_split.csv"
valid_split_csv = "/Applications/Projects/Bladder Research/Data/Stratified Data/valid_split.csv"
test_split_csv = "/Applications/Projects/Bladder Research/Data/Stratified Data/test_split.csv"

# Load each split into its own DataFrame (train, then validation, then test).
train_df = pd.read_csv(train_split_csv)
valid_df = pd.read_csv(valid_split_csv)
test_df = pd.read_csv(test_split_csv)

In [2]:
def extract_case_id(path):
    """
    Extract the 'case_XXX_pt_YYY' prefix from an image file name or path.

    Expected file name layout:
        case_XXX_pt_YYY_frame_ZZZZ.png

    Parameters
    ----------
    path : str
        A bare file name or a full path. Both forward slashes and
        backslashes are accepted as directory separators.

    Returns
    -------
    str or None
        The leading 'case_<digits>_pt_<digits>' part of the file name, or
        None when `path` is not a string (for example NaN from an empty
        CSV cell) or the file name does not start with that pattern.
    """
    # Empty CSV cells arrive as float NaN, which has no .split(); return None
    # so that a caller's .dropna() can discard them instead of crashing.
    if not isinstance(path, str):
        return None
    # Keep only the last path component, whichever separator was used.
    filename = re.split(r"[\\/]", path)[-1]
    match = re.match(r"(case_\d+_pt_\d+)", filename)
    return match.group(1) if match else None

In [3]:
# Map every "HLY" entry of each split to its case ID; entries that yield no
# ID (None) are removed by dropna() before the distinct IDs are collected.
train_case_ids = train_df["HLY"].apply(extract_case_id).dropna()
valid_case_ids = valid_df["HLY"].apply(extract_case_id).dropna()
test_case_ids = test_df["HLY"].apply(extract_case_id).dropna()

train_cases = set(train_case_ids)
valid_cases = set(valid_case_ids)
test_cases = set(test_case_ids)

In [4]:
# Case IDs shared by each pair of splits (set intersection via the & operator).
train_valid_overlap = train_cases & valid_cases
train_test_overlap = train_cases & test_cases
valid_test_overlap = valid_cases & test_cases

In [5]:
def print_overlap(split_name, overlap_set):
    """
    Print a short report of the case IDs shared by two splits.

    Parameters
    ----------
    split_name : str
        Label for the pair of splits being compared, e.g. "Train & Test".
    overlap_set : set
        Case IDs present in both splits. An empty set means no overlap.

    Returns
    -------
    None
        The report is written to standard output, followed by a blank line.
    """
    if not overlap_set:
        print(f"✅ No overlap between {split_name}\n")
        return

    print(f"❌ Overlap found between {split_name}")
    print(f"   Number of overlapping cases: {len(overlap_set)}")
    # Sort so the listing is stable from run to run (set order is arbitrary).
    for case in sorted(overlap_set):
        print(f"   - {case}")
    print()

# Report each pairwise comparison in turn: train/valid, train/test, valid/test.
for pair_label, shared_cases in (
    ("Train & Validation", train_valid_overlap),
    ("Train & Test", train_test_overlap),
    ("Validation & Test", valid_test_overlap),
):
    print_overlap(pair_label, shared_cases)

❌ Overlap found between Train & Validation
   Number of overlapping cases: 68
   - case_002_pt_001
   - case_002_pt_003
   - case_002_pt_004
   - case_004_pt_001
   - case_004_pt_002
   - case_004_pt_003
   - case_004_pt_005
   - case_005_pt_001
   - case_005_pt_002
   - case_005_pt_004
   - case_005_pt_005
   - case_005_pt_008
   - case_005_pt_010
   - case_005_pt_011
   - case_005_pt_012
   - case_006_pt_001
   - case_006_pt_004
   - case_006_pt_005
   - case_008_pt_001
   - case_008_pt_002
   - case_008_pt_003
   - case_008_pt_004
   - case_009_pt_002
   - case_009_pt_003
   - case_010_pt_001
   - case_010_pt_003
   - case_010_pt_005
   - case_010_pt_006
   - case_010_pt_007
   - case_011_pt_002
   - case_011_pt_004
   - case_012_pt_004
   - case_012_pt_006
   - case_013_pt_001
   - case_013_pt_002
   - case_013_pt_005
   - case_013_pt_006
   - case_013_pt_007
   - case_014_pt_001
   - case_014_pt_002
   - case_014_pt_003
   - case_016_pt_001
   - case_016_pt_002
   - case_016_pt_

In [6]:
# The split counts as case-wise clean only when no case ID is shared by any
# pair of splits; a non-empty overlap set is truthy.
if train_valid_overlap or train_test_overlap or valid_test_overlap:
    print("\n⚠️ IMAGE-WISE SPLIT (patient overlap detected)")
else:
    print("\n🎉 CASE-WISE CLEAN SPLIT (patient-independent)")


⚠️ IMAGE-WISE SPLIT (patient overlap detected)


In [7]:
# Second dataset (the "data" directory): load each split CSV and collect the
# distinct case IDs extracted from its "HLY" column.
train_csv_1 = "/Applications/Projects/Bladder Research/Data/data/train.csv"
test_csv_1 = "/Applications/Projects/Bladder Research/Data/data/test.csv"
valid_csv_1 = "/Applications/Projects/Bladder Research/Data/data/valid.csv"

train_df_1 = pd.read_csv(train_csv_1)
train_case_ids_1 = train_df_1["HLY"].apply(extract_case_id).dropna()
train_cases_1 = set(train_case_ids_1)

test_df_1 = pd.read_csv(test_csv_1)
test_case_ids_1 = test_df_1["HLY"].apply(extract_case_id).dropna()
test_cases_1 = set(test_case_ids_1)

valid_df_1 = pd.read_csv(valid_csv_1)
valid_case_ids_1 = valid_df_1["HLY"].apply(extract_case_id).dropna()
valid_cases_1 = set(valid_case_ids_1)

In [8]:
# Case IDs shared by each pair of splits in the second dataset.
train_valid_overlap_1 = train_cases_1 & valid_cases_1
train_test_overlap_1 = train_cases_1 & test_cases_1
valid_test_overlap_1 = valid_cases_1 & test_cases_1

In [9]:
# Report the three pairwise comparisons for the second dataset.
for pair_label, shared_cases in (
    ("Train & Validation (data)", train_valid_overlap_1),
    ("Train & Test (data)", train_test_overlap_1),
    ("Validation & Test (data)", valid_test_overlap_1),
):
    print_overlap(pair_label, shared_cases)

❌ Overlap found between Train & Validation (data)
   Number of overlapping cases: 61
   - case_002_pt_001
   - case_002_pt_004
   - case_004_pt_001
   - case_004_pt_002
   - case_004_pt_003
   - case_004_pt_004
   - case_005_pt_001
   - case_005_pt_002
   - case_005_pt_004
   - case_005_pt_005
   - case_005_pt_007
   - case_005_pt_008
   - case_005_pt_010
   - case_005_pt_011
   - case_005_pt_012
   - case_005_pt_013
   - case_005_pt_014
   - case_007_pt_002
   - case_008_pt_001
   - case_008_pt_002
   - case_008_pt_003
   - case_008_pt_004
   - case_009_pt_001
   - case_009_pt_003
   - case_009_pt_004
   - case_010_pt_001
   - case_010_pt_002
   - case_010_pt_003
   - case_010_pt_004
   - case_010_pt_005
   - case_010_pt_006
   - case_010_pt_007
   - case_012_pt_004
   - case_012_pt_006
   - case_013_pt_001
   - case_013_pt_002
   - case_013_pt_004
   - case_013_pt_005
   - case_013_pt_006
   - case_014_pt_002
   - case_014_pt_003
   - case_016_pt_001
   - case_016_pt_002
   - case_