## Step 1: Start by splitting your dataset into two separate CSV files:

- x_metadata.csv — contains metadata for each image (including the image filename, age_bin, and tumor_class)

- y_labels.csv — contains the corresponding labels for classification (with filename and label)

Make sure each image has a unique filename, and all images are saved in a dedicated folder (e.g., saved_images/).

In [1]:
import pandas as pd

# Read the image metadata table from the CSV produced in Step 1.
x_metadata = pd.read_csv(filepath_or_buffer='x_metadata.csv')

# Preview the first ten rows as this cell's output.
x_metadata.head(n=10)

Unnamed: 0,filename,age_bin,tumor_class
0,img_0000.png,1,2
1,img_0001.png,1,2
2,img_0002.png,0,2
3,img_0003.png,1,2
4,img_0004.png,1,2
5,img_0005.png,0,2
6,img_0006.png,1,2
7,img_0007.png,1,2
8,img_0008.png,1,2
9,img_0009.png,1,2


In [12]:
# Drop 'filename' column so only the feature columns remain.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
x_metadata = x_metadata.drop(columns=['filename'], errors='ignore')

# Print caption for clarity
print("🧾 Captions:")
print("• age_bin: 0 = Age < 40, 1 = Age ≥ 40")
print("• tumor_class: 0 = None, 1 = Mass, 2 = Calcification, 3 = Both")

# Print unique values per column
for col in x_metadata.columns:
    print(f"\nUnique values in '{col}': {x_metadata[col].unique()}")


🧾 Captions:
• age_bin: 0 = Age < 40, 1 = Age ≥ 40
• tumor_class: 0 = None, 1 = Mass, 2 = Calcification, 3 = Both

Unique values in 'age_bin': [1 0]

Unique values in 'tumor_class': [2 1]


In [17]:
import pandas as pd

# Read the classification labels table from the CSV produced in Step 1.
y_labels = pd.read_csv(filepath_or_buffer='y_labels.csv')

# Preview the first ten rows as this cell's output.
y_labels.head(n=10)


Unnamed: 0,filename,label
0,img_0000.png,0
1,img_0001.png,0
2,img_0002.png,0
3,img_0003.png,0
4,img_0004.png,0
5,img_0005.png,0
6,img_0006.png,0
7,img_0007.png,0
8,img_0008.png,0
9,img_0009.png,0


In [18]:
# Drop filename so only the label column remains.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
y_labels = y_labels.drop(columns=['filename'], errors='ignore')

# Print label meaning
print("🏷️ Label Captions:")
print("0 = Benign")
print("1 = HER2")
print("2 = LumA")
print("3 = LumB")
print("4 = TN (Triple Negative)")

# Show unique label values
print("\nUnique label values:", y_labels['label'].unique())

🏷️ Label Captions:
0 = Benign
1 = HER2
2 = LumA
3 = LumB
4 = TN (Triple Negative)

Unique label values: [0 3 2 1 4]


## Step 2: Once you’ve created the two CSV files, you can proceed to load the dataset using them.








In [3]:
import os
import warnings

import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

# === 1. Set paths ===
files_path = '.'  # Change to your actual folder
img_dir = os.path.join(files_path, 'saved_images')

# === 2. Load metadata and labels CSVs ===
metadata_df = pd.read_csv(os.path.join(files_path, 'x_metadata.csv'))
labels_df = pd.read_csv(os.path.join(files_path, 'y_labels.csv'))

# === 3. Merge metadata and labels on filename ===
# validate='one_to_one' enforces the "unique filename per image" requirement:
# a duplicated filename in either CSV raises pandas.errors.MergeError instead
# of silently multiplying rows (which could also leak an image across splits).
df = pd.merge(metadata_df, labels_df, on='filename', validate='one_to_one')

# === 4. Split into train, val, test ===
# First hold out 20% for test, then take 20% of the remainder for validation.
# Both splits are stratified on the label and use a fixed seed.
train_val_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=42
)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.2, stratify=train_val_df['label'], random_state=42
)

# === 5. Helper function to load images and metadata ===
def load_data(subset_df):
    """Load the images, metadata features and labels for one dataset split.

    Parameters
    ----------
    subset_df : pandas.DataFrame
        Rows to load. Must provide the 'filename', 'age_bin', 'tumor_class'
        and 'label' columns; 'filename' is resolved relative to ``img_dir``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(images, metadata, labels)``, aligned row by row.
        ``images`` holds the RGB pixels as float32 divided by 255,
        ``metadata`` holds one ``[age_bin, tumor_class]`` pair per loaded
        image, and ``labels`` holds the matching 'label' values.

    Warns
    -----
    UserWarning
        If any row's image file does not exist. Those rows are left out of
        all three returned arrays, so the result can be smaller than
        ``subset_df``.
    """
    images = []
    metadata = []
    labels = []
    missing = []

    columns = (
        subset_df['filename'],
        subset_df['age_bin'],
        subset_df['tumor_class'],
        subset_df['label'],
    )
    for filename, age_bin, tumor_class, label in zip(*columns):
        img_path = os.path.join(img_dir, filename)
        if not os.path.isfile(img_path):
            # Keep the best-effort behaviour (skip the row) but remember it,
            # so the caller is told the split shrank.
            missing.append(filename)
            continue

        # The context manager closes the file handle once the pixels are copied.
        with Image.open(img_path) as img:
            pixels = np.asarray(img.convert('RGB'), dtype=np.float32) / 255.0

        images.append(pixels)
        metadata.append([age_bin, tumor_class])
        labels.append(label)

    if missing:
        warnings.warn(
            f"{len(missing)} of {len(subset_df)} image files were not found in "
            f"'{img_dir}' and were skipped (first missing: {missing[0]})",
            stacklevel=2,
        )

    return np.array(images), np.array(metadata), np.array(labels)

# === 6. Load each split ===
# Every call returns (images, metadata, labels) for the given split frame.
X_train_images, X_train_metadata, y_train = load_data(train_df)
X_val_images, X_val_metadata, y_val = load_data(val_df)
X_test_images, X_test_metadata, y_test = load_data(test_df)

# === 7. Show shapes ===
# Table of (caption, array) pairs, grouped as images -> metadata -> labels,
# printed in that order.
shape_report = [
    ("Train images:", X_train_images),
    ("Val images:  ", X_val_images),
    ("Test images: ", X_test_images),
    ("Train metadata:", X_train_metadata),
    ("Val metadata:  ", X_val_metadata),
    ("Test metadata: ", X_test_metadata),
    ("Train labels:", y_train),
    ("Val labels:  ", y_val),
    ("Test labels: ", y_test),
]

print("✅ Loaded from CSV + PNGs:")
for caption, array in shape_report:
    print(caption, array.shape)


✅ Loaded from CSV + PNGs:
Train images: (4, 2294, 1914, 3)
Val images:   (6, 2294, 1914, 3)
Test images:  (1, 2294, 1914, 3)
Train metadata: (4, 2)
Val metadata:   (6, 2)
Test metadata:  (1, 2)
Train labels: (4,)
Val labels:   (6,)
Test labels:  (1,)


## Make sure you have completed both steps above, then ping me for a review before moving on to the next steps.