# Derma Scanner — EDA & Preprocessing

> Explore HAM10000, verify splits, visualize samples, and review preprocessing steps.

**Note:** Run this notebook from the project root (paths below are relative to it), after preparing the splits with `python src/data_prepare.py`.

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import random, os
from collections import Counter

%matplotlib inline
PROJECT = Path('.')
RAW = PROJECT/'data/raw'
PROC = PROJECT/'data/processed'
assert PROC.exists(), 'Run: python src/data_prepare.py'

## Class distribution in processed splits

In [None]:
def count_images(split_dir, pattern='*.jpg'):
    """Count the image files in each class sub-directory of a split.

    Args:
        split_dir: Directory (``str`` or ``Path``) whose immediate
            sub-directories are treated as classes.
        pattern: Glob pattern selecting the entries to count inside each
            class directory. Defaults to ``'*.jpg'``.

    Returns:
        A dict mapping each sub-directory name to the number of entries
        matching ``pattern`` directly inside it. Keys are inserted in
        sorted name order; non-directory entries of ``split_dir`` are
        ignored.

    Raises:
        FileNotFoundError: If ``split_dir`` does not exist.
    """
    # Coerce so that plain string paths work as well as Path objects.
    split_dir = Path(split_dir)
    class_dirs = sorted(
        (entry for entry in split_dir.iterdir() if entry.is_dir()),
        key=lambda entry: entry.name,
    )
    # Count lazily instead of materialising a list of paths per class.
    return {d.name: sum(1 for _ in d.glob(pattern)) for d in class_dirs}

# Print the per-class image counts of every processed split, one line each.
for split_name in ('train', 'val', 'test'):
    print(split_name, count_images(PROC / split_name))

## Visualize a few samples per class (from `data/processed/train`)

In [None]:
import math

# Sort the class names: directory iteration order is arbitrary, so sorting
# keeps the grid layout reproducible and alphabetically ordered.
classes = sorted(d.name for d in (PROC/'train').iterdir() if d.is_dir())

# Pick one random image per class; classes without any .jpg are skipped.
samples = []
for cls in classes:
    files = list((PROC/'train'/cls).glob('*.jpg'))
    if files:
        samples.append((cls, random.choice(files)))

if not samples:
    # With no samples `rows` would be 0, giving a zero-height figure that
    # matplotlib rejects, so report the situation instead of plotting.
    print('No .jpg images found under', PROC/'train')
else:
    cols = 4
    rows = math.ceil(len(samples)/cols)
    plt.figure(figsize=(4*cols, 3*rows))
    for i, (cls, fp) in enumerate(samples, 1):
        plt.subplot(rows, cols, i)
        # Close the file handle once drawn; imshow converts the image to an
        # array right away, so the open file is not needed afterwards.
        with Image.open(fp) as img:
            plt.imshow(img)
        plt.title(cls)
        plt.axis('off')
    plt.tight_layout()
    plt.show()

## Preprocessing used in the pipeline
We apply the following steps:
- **Resize** to 224×224
- **Normalize** to ImageNet stats (`mean=[0.485, 0.456, 0.406]`, `std=[0.229, 0.224, 0.225]`)
- **Data augmentation (train only):** random horizontal/vertical flips and light color jitter

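These are implemented in `src/train.py` via torchvision `transforms`. The cell below rebuilds the same pipeline so you can experiment with it here; changes made in this notebook do not affect training unless you also apply them in `src/train.py`.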

In [None]:
from torchvision import transforms
# Training-time pipeline: resize, random flips, colour jitter, then tensor
# conversion and normalisation with the ImageNet statistics quoted above.
train_transforms = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(
        brightness=0.2,
        contrast=0.2,
        saturation=0.2,
        hue=0.1,
    ),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])
print(train_transforms)