In [1]:
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys, os, time, random, pickle, warnings
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind, chi2_contingency


from pydicom import dcmread
from utils.image_viz import *

# ============= #
# Library Setup #
# ============= #
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# None removes the column limit, so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

SEED = 2025
def seed_everything(seed=None):
    # ==================================================
    # Seed the random number generators used in this notebook.
    #
    # Input(s)
    # - seed[int or None] : Seed value. None (default) falls back to the
    #                       module-level SEED, read at call time.
    #
    # Effect(s)
    # - Seeds Python's `random` module and NumPy's legacy global RNG.
    # - Exports PYTHONHASHSEED. The interpreter reads this variable only at
    #   start-up, so it does not change hashing in the running kernel; it is
    #   only inherited by processes launched afterwards.
    # ==================================================
    if seed is None:
        seed = SEED

    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

# ==================== #
# Predefined Variables #
# ==================== #
# File Paths (relative to the notebook's working directory)
DICOM_DIR                   = 'dataset/ALL/T2SAG_SLICES=7'           # one sub-folder per patient ID
LABEL_FILENAME              = 'dataset/ALL/Labels_Final.csv'         # label table (CSV)
EXCLUDE_ID_FILENAME         = 'dataset/ExcludeID.txt'                # blacklist, one patient ID per line
VALID_SEQ_FILENAME          = 'dataset/ALL/ValidSequences_Final.csv'

# Pickled patient-ID lists produced by the Train/Valid - Test split
PATIENT_ID_TVAL_FILENAME    = 'dataset/ALL/patient_id_tval.pkl'
PATIENT_ID_TEST_FILENAME    = 'dataset/ALL/patient_id_test.pkl'

# Train/Test Split Parameter(s)
TEST_PROPORTION             = 0.1                                    # fraction of patients held out as Test

### **Generate a List of Patient IDs**

In [2]:
def ExcludeList(filename):
    # ==================================================
    # Read the blacklist file and collect every line that parses as an integer.
    # Lines that are not integers (blank lines, comments, text) are skipped.
    #
    # Input(s)
    # - filename[string] : "ExcludeID_{SUFFIX}.txt"
    #
    # Output(s)
    # - exclude_list : List of patient IDs to exclude (strings, in file order,
    #                  with surrounding whitespace removed)
    # ==================================================
    exclude_list = []

    # 'utf-8-sig' reads plain UTF-8 as well, but also strips a leading BOM that
    # would otherwise make the first ID fail the integer check and be dropped.
    with open(filename, 'r', encoding='utf-8-sig') as f:
        for line in f:
            # int() tolerates whitespace on both sides, so strip both sides too;
            # otherwise an indented ID is stored as '  123' and never matches.
            patient_id = line.strip()
            try:
                int(patient_id)
            except ValueError:
                continue
            exclude_list.append(patient_id)

    return exclude_list


def LoadLabelCSV(filename, exclude_list):
    # ==================================================
    # Load the label CSV and keep only the usable rows: rows with no missing
    # value, not on the blacklist, and with a matching folder under DICOM_DIR.
    #
    # Input(s)
    # - filename[string]    : "Label_{SUFFIX}.csv"
    # - exclude_list        : List of patient IDs to exclude
    #                         (compared against the 8-digit zero-padded index)
    #
    # Output(s)
    # - df[DataFrame]       : DataFrame object containing labels, indexed by
    #                         'patient_id' (8-digit zero-padded string)
    # ==================================================
    columns = ['patient_id', 'sex', 'age', 'date_xr', 'date_diff', 'label',
               'x_top_lt', 'y_top_lt', 'x_bot_rt', 'y_bot_rt']

    df = pd.read_csv(filename, encoding='utf-8')

    # Rearrange columns and drop every row with a missing value in any of them.
    # This must run BEFORE the ID formatting below: a missing patient_id turns
    # the column into floats, and formatting a float with ':08d' raises ValueError.
    df = df[columns].dropna(axis=0)

    # String IDs (zero-padded to 8 digits), integer labels, patient_id as index
    df = df.assign(
        patient_id=df['patient_id'].astype('int64').map(lambda x: f'{x:08d}'),
        label=df['label'].astype(int),
    ).set_index('patient_id')

    # 1) Exclude ID from blacklist (IDs absent from the table are simply ignored)
    df = df[~df.index.isin(exclude_list)]

    # 2) Exclude ID without labels
    #    Nothing to do here: dropna() above already removed rows with a missing label.

    # 3) Exclude ID without DICOM file
    #    Build one boolean mask instead of dropping rows one at a time.
    has_dicom = np.array(
        [os.path.exists(f'{DICOM_DIR}/{patient_id}') for patient_id in tqdm(df.index)],
        dtype=bool,
    )
    df = df.loc[has_dicom]

    return df

In [3]:
# Build the blacklist, then load the label table filtered down to usable patients.
exclude_list = ExcludeList(EXCLUDE_ID_FILENAME)
df_label     = LoadLabelCSV(LABEL_FILENAME, exclude_list)

# NOTE(review): this path is the same file LABEL_FILENAME points to, so the filtered
# table overwrites the input it was just read from — confirm this is intended.
df_label.to_csv('dataset/ALL/Labels_Final.csv')
df_label

100%|██████████| 7645/7645 [00:00<00:00, 923890.11it/s]




Unnamed: 0_level_0,sex,age,date_xr,date_diff,label,x_top_lt,y_top_lt,x_bot_rt,y_bot_rt
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
35746667,M,44,20070123,4,1,179.0,8.0,317.0,284.0
35870452,F,76,20070124,18,0,465.0,234.0,752.0,808.0
20085353,M,49,20070122,0,1,219.0,150.0,362.0,436.0
28227555,F,63,20070125,41,1,224.0,106.0,377.0,412.0
21645107,F,49,20200410,4,1,163.0,89.0,278.0,319.0
...,...,...,...,...,...,...,...,...,...
50020155,F,67,20221230,3,1,211.0,127.0,349.0,403.0
57193009,F,62,20221228,1,1,253.0,96.0,491.0,572.0
50183621,M,77,20221227,45,1,179.0,66.0,337.0,382.0
14154463,F,83,20221230,0,1,200.0,176.0,428.0,632.0


### **Train/Valid - Test Split**

In [4]:
# ======================== #
# Train/Valid - Test Split #
# ======================== #
# Split at patient level: every ID in the label index lands in exactly one subset.
patient_id_full = list(df_label.index)
patient_id_tval, patient_id_test = train_test_split(
    patient_id_full,
    test_size=TEST_PROPORTION,
    random_state=SEED,          # fixed seed -> same split on every run
)

# Subset sizes plus the first ten IDs of each subset as a quick sanity check
print('===== Index Count =====')
print(f'Train/Valid : {len(patient_id_tval):04d} | {patient_id_tval[:10]}...')
print(f'Test        : {len(patient_id_test):04d} | {patient_id_test[:10]}...')
print()


# ====================== #
# Statistical Evaluation #
# ====================== #
# Compare the Train/Valid and Test subsets on age, date difference, sex and label
# to check that the random split did not produce visibly different groups.

def _report_ttest(title, values_tval, values_test):
    # Run an independent two-sample t-test on a numeric variable, print the summary, return the result.
    result = ttest_ind(values_tval, values_test)

    print(f'===== T-Test Results ({title}) =====')
    print(f'Mean+/-STD | {values_tval.mean():.2f}+/-{values_tval.std():.2f} vs {values_test.mean():.2f}+/-{values_test.std():.2f}')
    print(f'Statistic  | {result.statistic:.5f}')
    print(f'P-value    | {result.pvalue:.5f}')
    print()
    return result


def _report_chi2(title, values_tval, values_test, categories):
    # Build the (subset x category) count table, run a chi-square test on it, print and return both.
    # Counting with a comparison (instead of value_counts()[key]) yields 0 rather than a
    # KeyError when a category is missing from a subset; int() keeps the printed table plain.
    freq_obs = [
        [int((values == category).sum()) for category in categories]
        for values in (values_tval, values_test)
    ]
    # With default arguments, SciPy applies Yates' continuity correction to a 2x2 table.
    result = chi2_contingency(freq_obs)

    print(f'===== Chi Square Test Results ({title}) =====')
    print(freq_obs)
    print(f'Statistic  | {result.statistic:.5f}')
    print(f'P-value    | {result.pvalue:.5f}')
    print()
    return freq_obs, result


# Age : Independent T-Test
age_tval = df_label.loc[patient_id_tval, 'age']
age_test = df_label.loc[patient_id_test, 'age']
result = _report_ttest('Age', age_tval, age_test)

# Date Difference : Independent T-Test
ddif_tval = df_label.loc[patient_id_tval, 'date_diff']
ddif_test = df_label.loc[patient_id_test, 'date_diff']
result = _report_ttest('Date Difference', ddif_tval, ddif_test)

# Sex : Chi2 Contingency Test
sex_tval = df_label.loc[patient_id_tval, 'sex']
sex_test = df_label.loc[patient_id_test, 'sex']
freq_obs, result = _report_chi2('Sex', sex_tval, sex_test, ('M', 'F'))

# Label : Chi2 Contingency Test
label_tval = df_label.loc[patient_id_tval, 'label']
label_test = df_label.loc[patient_id_test, 'label']
freq_obs, result = _report_chi2('Label', label_tval, label_test, (0, 1))

# ================ #
# Save Patient IDs #
# ================ #
# Persist both ID lists so the same split can be reloaded later.
# The `with` blocks close (and flush) each file even if pickling fails.
with open(PATIENT_ID_TVAL_FILENAME, 'wb') as f:
    pickle.dump(patient_id_tval, f)

with open(PATIENT_ID_TEST_FILENAME, 'wb') as f:
    pickle.dump(patient_id_test, f)

===== Index Count =====
Train/Valid : 6880 | ['25702026', '36791691', '26824462', '51560500', '42782173', '26242907', '44661188', '13107387', '45830660', '50025097']...
Test        : 0765 | ['40004086', '40513650', '34931703', '49674026', '37548186', '42404101', '56339080', '41384266', '53454308', '25515277']...

===== T-Test Results (Age) =====
Mean+/-STD | 56.03+/-14.25 vs 56.47+/-14.43
Statistic  | -0.80283
P-value    | 0.42210

===== T-Test Results (Date Difference) =====
Mean+/-STD | 22.04+/-35.48 vs 21.85+/-33.89
Statistic  | 0.13932
P-value    | 0.88920

===== Chi Square Test Results (Sex) =====
[[3480, 3400], [386, 379]]
Statistic  | 0.00072
P-value    | 0.97854

===== Chi Square Test Results (Label) =====
[[2800, 4080], [331, 434]]
Statistic  | 1.77604
P-value    | 0.18264

