In [10]:
# %%
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import os

# --- Read the raw metadata table (every column is loaded as a string) ---
working_dir = "/data/lodhar2/milan"
os.chdir(working_dir)  # the relative "Images/..." paths built below resolve against this directory
data = pd.read_csv(f"{working_dir}/data.csv", dtype=str)

# --- Derive each row's image path: Images/<patient>_<date>_<image>.png ---
print("adding image path")
patient_part = data['Patient ID'].str.zfill(4)  # zero-pad to at least 4 characters
image_part = data['Image ID'].str.zfill(2)      # zero-pad to at least 2 characters
img_path = "Images/" + patient_part + "_" + data['Date'] + "_" + image_part + ".png"
# Force every entry to a Python string before the filesystem check below.
data['img_path'] = img_path.astype(str)

# --- Keep only the rows whose image file is present on disk ---
print("removing rows with missing images")
image_found = data['img_path'].apply(os.path.exists)
data = data[image_found].reset_index(drop=True)
class_counts = data['Class'].value_counts()
print(f"\n{class_counts}")

# --- Remove columns that are not used in the rest of this pipeline ---
print("cleaning dataframe")
unused_columns = ['Patient Name', 'MRN', 'Nephrectomy Approach', 'Path note', 'Notes']
data = data.drop(columns=unused_columns)

# --- One vs Rest predictor ---
# (currently disabled) exclude the 'Hybrid' class before building the predictor:
# print("removing 'Hybrid' class")
# data = data[data['Class'] != "Hybrid"]
data['Class'].value_counts()

adding image path
removing rows with missing images

Class
Clear_cell        149
Papillary          94
Hybrid             81
Oncocytoma         43
Chromophobe        42
Angiomyolipoma     31
Name: count, dtype: int64
cleaning dataframe


Class
Clear_cell        149
Papillary          94
Hybrid             81
Oncocytoma         43
Chromophobe        42
Angiomyolipoma     31
Name: count, dtype: int64

In [11]:
# Class of interest for the one-vs-rest setup; every other label is collapsed to "Other".
hoi = "Chromophobe"
print(f"converting non-{hoi} classes to 'Other'")

# Fail fast when the target label is absent. This happens on a typo, or when this
# cell is re-run with a different `hoi` after `data['Class']` has already been
# collapsed to two values; without the check every row would silently become "Other".
if not data['Class'].eq(hoi).any():
    raise ValueError(
        f"class {hoi!r} not found in data['Class'] "
        f"(available: {sorted(data['Class'].dropna().unique())}); "
        "reload the raw data before changing `hoi`"
    )

# Binarize the label but keep missing labels missing, so the `notna()` filter
# below can actually drop unlabeled rows instead of training on them as "Other".
label_missing = data['Class'].isna()
data['Class'] = data['Class'].where(label_missing | data['Class'].eq(hoi), "Other")

# === Filter out rare classes (<10 instances) ===
print("filtering rare classes")
value_counts = data['Class'].value_counts()
low_count_types = value_counts[value_counts < 10].index.tolist()
filtered_data = data[~data['Class'].isin(low_count_types)].drop(columns=['Pathology']).reset_index(drop=True)

# Drop rows without a label or whose image file is not on disk.
filtered_data_cleaned = filtered_data[
    filtered_data['Class'].notna() &
    filtered_data['img_path'].apply(os.path.exists)
]

# === Stratified train/val split ===
# 50/50 split that preserves the class proportions in both halves; fixed seed for reproducibility.
print("stratifying dataset")
train_df, val_df = train_test_split(
    filtered_data_cleaned,
    test_size=0.5,
    stratify=filtered_data_cleaned["Class"],
    random_state=42
)

converting non-Chromophobe classes to 'Other'
filtering rare classes
stratifying dataset


In [12]:
# === Upsample training set to balance classes ===
# Every class is brought up to the size of the largest class. Only classes that
# are smaller than that target are resampled: bootstrapping the largest class
# with replacement would not change its size but would discard some of its
# unique rows and duplicate others.
print("upsampling minority classes")
df_list = []
max_class_size = train_df["Class"].value_counts().max()

for label in train_df["Class"].unique():
    subset = train_df[train_df["Class"] == label]
    if len(subset) < max_class_size:
        # Draw with replacement until this class matches the largest one.
        subset = resample(
            subset,
            replace=True,
            n_samples=max_class_size,
            random_state=42
        )
    df_list.append(subset)

# === Add 'split' column and combine ===
# `.assign` returns new frames instead of writing into `val_df` in place, which
# avoids SettingWithCopyWarning and keeps this cell safe to re-run.
print("adding train-validation labels")
train_upsampled = pd.concat(df_list).assign(split="train")
val_df = val_df.assign(split="val")
final_df = pd.concat([train_upsampled, val_df]).reset_index(drop=True)

# === Save to CSV ===
print("saving balanced, filtered dataframe")
output_path = Path(working_dir) / "data" / "split_balanced_dataset.csv"
# Create the output directory if needed so the save does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)

upsampling minority classes
adding train-validation labels
saving balanced, filtered dataframe
