# Dependencies:

In [None]:
# ── Standard Library ───────────────────────────────────────────────────────────
import os
from collections import Counter
from pathlib import Path

# ── Data Handling & ML ───────────────────────────────────────────────────
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import pydicom

# Directory

In [None]:
# Switch the working directory to the dataset folder so the relative file
# names used further down (e.g. 'Basic.csv') resolve against it.
dataset_dir = "Dataset Directory"
os.chdir(dataset_dir)

In [None]:
# Read the pre-assembled table (combined targets and image paths) into `df`.
csv_file_path = 'Basic.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Stratification of Folds

In [None]:
"""
Assign 5-fold CV labels while

1. grouping first by ID_Physio for rows where Non-RRD labels == 1.0
2. grouping the rest by PN

"""

# ──  aliases ────────────────────────────────
LABEL_COL   = "Oct_stage"      
GROUP_COL_1 = "ID_Non_RRD"
GROUP_COL_2 = "PN"


# Description of original labels to true stages.
df = df.rename(columns={
    "OCT staging (1-5) 1:1,2:2,3:3a, 4:3b,5:4,6:5": LABEL_COL
})

# ── initialise 'fold' ──
df["fold"] = -1

# ── PASS 1 ─────────────────────────────────────────────────────────────────────
mask_pass1 = (df["Non_RRD"] == 1.0) & df[GROUP_COL_1].notna()
filtered_df = df.loc[mask_pass1]

id_summary = (
    filtered_df
      .groupby(GROUP_COL_1, sort=False)
      .agg(count=("fold", "size"), label=(LABEL_COL, "first"))
      .reset_index()
)

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (_, test_idx) in enumerate(skf.split(id_summary, id_summary["label"])):
    ids_in_fold = id_summary.loc[test_idx, GROUP_COL_1]
    df.loc[df[GROUP_COL_1].isin(ids_in_fold), "fold"] = fold

# ── PASS 2  ─────────────────────────────────────────
remaining_df = df.loc[df["fold"] == -1]

pn_summary = (
    remaining_df
      .groupby(GROUP_COL_2, sort=False)
      .agg(count=("fold", "size"), label=(LABEL_COL, "first"))
      .reset_index()
)

for fold, (_, test_idx) in enumerate(skf.split(pn_summary, pn_summary["label"])):
    pns_in_fold = pn_summary.loc[test_idx, GROUP_COL_2]
    df.loc[df[GROUP_COL_2].isin(pns_in_fold) & (df["fold"] == -1), "fold"] = fold
    
if (df["fold"] == -1).any():
    raise RuntimeError(
        f"{(df['fold'] == -1).sum()} rows never received a fold assignment."
)

# Extract DICOM Shape, Filter for Shape

In [None]:
# ── 1. helper ──────────────────────────────────────────────────────────────────
def get_dicom_shape(fp: Path | str) -> tuple[int, ...] | str:
    try:
        dcm = pydicom.dcmread(fp, force=True)          
        return dcm.pixel_array.shape if hasattr(dcm, "pixel_array") else "No Pixel Array"
    except Exception as exc:                           
        return str(exc)

# ── 2. announce work ───────────────────────────────────────────────────────────
total_rows = len(df)
print(f"Starting scan… {total_rows} paths to inspect")

# ── 3. scan every path ─────────────────────────────────────────────────────────
# Each entry of `shapes` is either the pixel-array shape (a tuple) or a string
# describing why no shape could be obtained.
progress_step = max(1, total_rows // 100)             # print every ~1 %
shapes: list[tuple[int, ...] | str] = []

for idx, path_str in enumerate(df["path3D"], start=1):
    if isinstance(path_str, (str, os.PathLike)):
        path = Path(path_str)
        shapes.append(get_dicom_shape(path) if path.exists() else "File Not Found")
    else:
        # A missing path cell (e.g. NaN) cannot be turned into a Path;
        # record it like any other unreadable entry instead of crashing.
        shapes.append("File Not Found")

    if idx % progress_step == 0 or idx == total_rows:
        pct = (idx / total_rows) * 100
        print(f"  {pct:5.1f}%  ({idx}/{total_rows})")
df["shape"] = shapes

# ── 4. keep only the target shape ──────────────────────────────────────────────
TARGET_SHAPE = (256, 992, 512)
# Compare element by element so the tuple is always treated as one value and
# never broadcast against the column.
shape_matches = df["shape"].map(lambda shape: shape == TARGET_SHAPE).astype(bool)
# .copy() makes the filtered table independent of `df`, so columns can be
# added to it later without chained-assignment warnings.
df_filtered = df[shape_matches].copy()

# ── 5. stats & output ──────────────────────────────────────────────────────────
shape_counts = Counter(df_filtered["shape"])
shape_summary = (
    pd.DataFrame(shape_counts.items(), columns=["Shape", "Count"])
        .sort_values("Count", ascending=False)
        .reset_index(drop=True)
)

print("\nFiltered unique shapes and counts")
print(shape_summary)

csv_out = Path("PLOSONE_DF.csv")
df_filtered.to_csv(csv_out, index=False)
df = df_filtered
print(f"\nFiltered dataframe saved to {csv_out}. Remaining rows: {len(df)}")

In [None]:
# Work on an independent copy of the shape-filtered rows so the new column is
# not written onto a slice of the original table. Both names keep pointing at
# the same table, as before.
df_filtered = df_filtered.copy()
df = df_filtered
# Consecutive integer id per remaining row (0 … len(df) - 1). Uses range()
# because numpy is not imported in this notebook.
# NOTE(review): PLOSONE_DF.csv was written before this column existed and is
# re-read from disk further down, so image_id is not part of that file —
# confirm whether it should be saved.
df["image_id"] = range(len(df))
df_with_folds = df

In [None]:
# Report which columns of `df_with_folds` contain missing values.
rows_with_nan = df_with_folds.isna().any(axis=1)
df_with_nans = df_with_folds[rows_with_nan]
if not df_with_nans.empty:
    print("Columns in the DataFrame with NaN values:")
    # List only the columns that actually hold a NaN, not every column.
    print(df_with_nans.columns[df_with_nans.isna().any(axis=0)].tolist())
else:
    print("No NaN values found in the DataFrame.")

In [None]:
# Re-open the shape-filtered table, turn backslashes in both path columns
# into forward slashes, cast PN to str, and write the result back to the
# same file.
input_csv_path_with_folds = 'PLOSONE_DF.csv'
output_final_csv_path = 'PLOSONE_DF.csv'

df_with_folds = pd.read_csv(input_csv_path_with_folds)

for path_column in ('Path', 'path3D'):
    normalised = df_with_folds[path_column].str.replace(r'\\', '/', regex=True)
    df_with_folds[path_column] = normalised

df_with_folds['PN'] = df_with_folds['PN'].astype(str)

df_with_folds.to_csv(output_final_csv_path, index=False)
print(f"Final CSV file with required columns and normalized image paths saved to {output_final_csv_path}.")

# Label Management for MLA Analysis

## Preparation for MLA: RRD Stages - 2D


In [None]:
# Start the stage-label preparation from the table saved on disk.
inputafterfilter = 'PLOSONE_DF.csv'
df = pd.read_csv(filepath_or_buffer=inputafterfilter)

In [None]:
# One-hot encode the raw OCT stage code into class_0 … class_5.
#
# The fold-assignment step renames the verbose staging header to "Oct_stage"
# before PLOSONE_DF.csv is written, so the reloaded table may carry either
# name. Use whichever is present instead of failing with a KeyError.
_verbose_stage_header = 'OCT staging (1-5) 1:1,2:2,3:3a, 4:3b,5:4,6:5'
oct_staging_col = (
    _verbose_stage_header if _verbose_stage_header in df.columns else 'Oct_stage'
)

# Raw stage code -> class index. Codes 5 and 6 both land in class_5.
# NOTE(review): the header text reads "5:4,6:5"; confirm that merging code 5
# into class_5 (rather than class_4) is intended.
STAGE_TO_CLASS = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 5}

for class_idx in range(6):
    df[f'class_{class_idx}'] = 0


def update_labels(row):
    """Set the ``class_<k>`` flag that matches this row's raw stage code.

    Rows whose stage code is not a key of ``STAGE_TO_CLASS`` (including
    missing values) are returned with all class flags left at 0.
    """
    target_class = STAGE_TO_CLASS.get(row[oct_staging_col])
    if target_class is not None:
        row[f'class_{target_class}'] = 1
    return row


df = df.apply(update_labels, axis=1)

In [None]:
# --- settings ---------------------------------------------------------------
oct_staging_col = 'OCT staging (1-5) 1:1,2:2,3:3a, 4:3b,5:4,6:5'
class_cols      = [f'class_{i}' for i in range(6)]

# --- first positive row of every class (classes without one are skipped) ----
rows = [
    first_hit
    for first_hit in (df.loc[df[col] == 1].head(1) for col in class_cols)
    if not first_hit.empty
]

# --- concatenate ---------------------------------------------------
sample_per_class = pd.concat(rows, ignore_index=True)
print(sample_per_class)
sample_per_class

In [None]:
# Number of positive rows per class column.
columns_of_interest = [f'class_{k}' for k in range(6)]
sum_of_classes = df.loc[:, columns_of_interest].sum()
print(sum_of_classes)

In [None]:
# Count the rows flagged in at least one of class_1 … class_5.
columns_of_interest = [f'class_{k}' for k in range(1, 6)]
row_has_positive = df[columns_of_interest].eq(1).any(axis=1)
instances_with_ones = row_has_positive.sum()
print("Number of instances with at least one '1' in class_1 to class_5:", instances_with_ones)

In [None]:
# Save the stage-labelled table. The file name carries the .csv extension
# like the other exports written by this notebook.
df.to_csv('PLOSONE_DF-Stages.csv', index=False)

## Preparation for MLA: Macular off vs. Macular on vs. Non-RRD - 3D

In [None]:
# Start the macular-status preparation from the table saved on disk.
inputafterfilter = 'PLOSONE_DF.csv'
df = pd.read_csv(filepath_or_buffer=inputafterfilter)

In [None]:
# Force the macular-status column to integers: anything that cannot be parsed
# as a number, and any missing value, becomes 0.
status_as_number = pd.to_numeric(df['Macular status? On:1, Off:2'], errors='coerce')
df['Macular status? On:1, Off:2'] = status_as_number.fillna(0).astype(int)

In [None]:
# One-hot encode the macular status: code 0 -> class_0, 1 -> class_1,
# 2 -> class_2.
class_columns = ('class_0', 'class_1', 'class_2')
for class_column in class_columns:
    df[class_column] = 0
for status_code, class_column in enumerate(class_columns):
    has_status = df["Macular status? On:1, Off:2"] == status_code
    df.loc[has_status, class_column] = 1

In [None]:
# Report how many rows fall into each macular-status class.
count_class_0, count_class_1, count_class_2 = [
    df[class_column].sum() for class_column in ('class_0', 'class_1', 'class_2')
]
print(f"there is a {count_class_0} in class_0, a {count_class_1} in class_1, and a {count_class_2} in class_2.")

In [None]:
# Write the macular-status table to disk without the index column.
df.to_csv(path_or_buf='PLOSONE_DF-MacularStatus.csv', index=False)

## Preparation for MLA: Duration Estimation - 3D

In [None]:
# Start the duration preparation from the table saved on disk.
df = pd.read_csv(filepath_or_buffer='PLOSONE_DF.csv')

In [None]:
# Frequency of every distinct macula-off duration value.
duration_values = df['Duration of macular off in days']
unique_counts = duration_values.value_counts()
print(unique_counts)

In [None]:
# Number of rows with a non-missing duration.
duration_values = df["Duration of macular off in days"]
total_count = duration_values.count()
print(total_count)

In [None]:
# Convert the duration column to nullable integers: keep only the text before
# the first '.', treat 'nan' as empty, and let unparsable entries become <NA>.
duration_text = df['Duration of macular off in days'].astype(str)
df['Duration of macular off in days'] = duration_text
whole_days_text = df['Duration of macular off in days'].map(
    lambda text: '' if text == 'nan' else text.split('.')[0]
)
df['Duration of macular off in days'] = whole_days_text
whole_days = pd.to_numeric(df['Duration of macular off in days'], errors='coerce')
df['Duration of macular off in days'] = whole_days.astype('Int64')

print(df['Duration of macular off in days'].unique())
df

In [None]:
# Count the rows whose macular status is 2 (off). The column is spelt
# "Macular status? On:1, Off:2" everywhere else in this notebook; the previous
# "Makula …" spelling raised a KeyError.
count_value = (df["Macular status? On:1, Off:2"] == 2).sum()
print(count_value)

In [None]:
# class_1: macular status 2 (off) with a duration of at most 3 days.
# NOTE(review): the status column is compared to the integer 2 without the
# numeric coercion applied in the macular-status section — confirm it is
# numeric after reloading from CSV.
is_macula_off = df["Macular status? On:1, Off:2"] == 2
at_most_three_days = df["Duration of macular off in days"] <= 3
df["class_1"] = 0
df.loc[is_macula_off & at_most_three_days, "class_1"] = 1
print(df["class_1"].sum())

In [None]:
# class_2: macular status 1 (on) OR a duration of at least 4 days.
# NOTE(review): this is an OR with status 1, not "status 2 AND >= 4 days" —
# confirm that on-cases are meant to share this class.
is_macula_on = df["Macular status? On:1, Off:2"] == 1
at_least_four_days = df["Duration of macular off in days"] >= 4
df["class_2"] = 0
df.loc[is_macula_on | at_least_four_days, "class_2"] = 1
print(df["class_2"].sum())

In [None]:
# class_0: 1 for rows flagged as Non-RRD, 0 otherwise.
# The flag column is referred to as "Non_RRD" in the fold-assignment step and
# as "Non-RRD" here; accept whichever spelling the table actually has.
# Plain pandas is used because numpy is not imported in this notebook.
non_rrd_col = "Non_RRD" if "Non_RRD" in df.columns else "Non-RRD"
df["class_0"] = (df[non_rrd_col] == 1.0).astype(int)

In [None]:
# Print the number of flagged rows per class column (labels are German:
# "Summe von" = "sum of").
total_class_0 = df['class_0'].sum()
total_class_1 = df['class_1'].sum()
total_class_2 = df['class_2'].sum()

for caption, total in (
    ("Summe von class_0", total_class_0),
    ("Summe von class_1:", total_class_1),
    ("Summe von class_2:", total_class_2),
):
    print(caption, total)

In [None]:
# Add 1 to class_0 on every row where class_2 is set.
# NOTE(review): a row that already has class_0 == 1 and class_2 == 1 ends up
# with class_0 == 2 — confirm this is intended.
in_class_2 = df["class_2"] == 1
df.loc[in_class_2, "class_0"] = df.loc[in_class_2, "class_0"] + 1
print(df)

In [None]:
# Print the per-class totals again after the class_0 adjustment (labels are
# German: "Summe von" = "sum of").
class_totals = {
    class_column: df[class_column].sum()
    for class_column in ('class_0', 'class_1', 'class_2')
}

print("Summe von class_0", class_totals['class_0'])
print("Summe von class_1:", class_totals['class_1'])
print("Summe von class_2:", class_totals['class_2'])

In [None]:
# Save the table that carries the duration classes computed above. The
# previous code wrote `df_filtered`, the shape-filtered table from the DICOM
# scan, which does not contain these class columns.
df.to_csv('PLOSONE_DF-Time.csv', index=False)