In [5]:
%pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.17.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl (8.1 MB)
Using cached joblib-1.5.3-py3-none-any.whl (309 kB)
Downloading scipy-1.17.0-cp312-cp312-macosx_14_0_arm64.whl (20.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.1/20.1 MB[0m [31m11.1 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0m
[?25hUsing cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0

In [7]:
%pip install torchvision

Collecting torchvision
  Downloading torchvision-0.25.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.4 kB)
Downloading torchvision-0.25.0-cp312-cp312-macosx_11_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.7 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: torchvision
Successfully installed torchvision-0.25.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import os
import torch
import csv
import sklearn
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset

To save space, the CSVs contain only the features needed to filter the images. Because each image's filename and its `midas_path` value (which serves as the label) are stored in the CSV, a custom `Dataset` class is needed so that the images can be preprocessed and then loaded through PyTorch's `DataLoader`.

In [9]:
class SkinImageDataset(Dataset):
    """Dataset of images listed in a CSV file, paired with integer labels.

    Every CSV row must provide a ``midas_file_name`` column (the image file
    name) and a ``midas_path`` column (the string label).

    Args:
        file: path of the CSV file; it is read into a DataFrame.
        preprocess: callable run on each loaded PIL image; its return value
            is the first element of every item.
        mapping: dictionary that maps the string label to a number.
        image_dir: directory containing the image files. When omitted the
            images are assumed to be in ``data/images``.
    """

    def __init__(self, file, preprocess, mapping, image_dir=None):
        self.images = pd.read_csv(file)
        self.preprocessFunction = preprocess
        self.mapping = mapping
        if image_dir is None:
            image_dir = os.path.join("data", "images")
        self.image_dir = image_dir

    def __len__(self):
        """Return the number of rows (images) listed in the CSV."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(preprocessed image, numeric label)`` for row ``idx``.

        Raises:
            KeyError: if the row's ``midas_path`` value is not in ``mapping``.
        """
        item_details = self.images.iloc[idx]
        image_path = os.path.join(self.image_dir, item_details["midas_file_name"])
        # Image.open is lazy and keeps the file handle open; the context
        # manager closes it, while convert() returns a fully loaded copy.
        # Forcing RGB gives 3-channel preprocessing (e.g. a mean/std
        # normalisation with three values) a consistent input even for
        # grayscale, palette or RGBA files.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        image_tensor = self.preprocessFunction(image)
        return image_tensor, self.mapping[item_details["midas_path"]]

Because the lesions are guaranteed to be centered in the images, each image is padded and then center-cropped.

In [10]:
# Preprocessing pipeline applied to each PIL image: resize, reflect-pad,
# center-crop to 224x224, convert to a tensor, then normalise per channel.
# NOTE(review): after Resize(224) both sides should already be >= 224, so the
# 16 px reflect border looks like it is cropped away again by CenterCrop(224)
# -- confirm whether a larger resize/crop combination was intended.
preprocess = transforms.Compose(
    [
        transforms.Resize(size=224),
        transforms.Pad(16, padding_mode="reflect"),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(
            [0.485, 0.456, 0.406],  # per-channel mean
            [0.229, 0.224, 0.225],  # per-channel std
        ),
    ]
)

`create_train_test_split` splits the images listed in the given CSV file into train and test sets and writes each set to its own CSV.
It returns the paths to the train and test CSVs and two dictionaries: one mapping string labels to integers and one mapping integers back to string labels.

In [None]:

def create_train_test_split(filtered_path=None, train_file=None, test_file=None):
    """Split the rows of a CSV into stratified train and test CSV files.

    The rows are split 80/20, stratified on the ``midas_path`` column, with a
    fixed random seed (42) so repeated calls produce the same split. Both
    output files are written inside the ``data`` directory.

    Args:
        filtered_path: CSV to split. Defaults to ``data/filtered_midas.csv``.
        train_file: file name of the train CSV. Defaults to ``train_data.csv``.
        test_file: file name of the test CSV. Defaults to ``test_data.csv``.

    Returns:
        A 4-tuple ``(train_file_path, test_file_path, label_to_int,
        int_to_label)``: the two output paths, a dictionary mapping each
        string label to an integer, and the inverse dictionary.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` is not
    # guaranteed to expose ``sklearn.model_selection`` as an attribute.
    from sklearn.model_selection import train_test_split

    if filtered_path is None:
        filtered_path = os.path.join("data", "filtered_midas.csv")
    if train_file is None:
        train_file = "train_data.csv"
    if test_file is None:
        test_file = "test_data.csv"

    df = pd.read_csv(filtered_path)
    # midas_path is whether it's benign/malignant
    image_labels = df["midas_path"].astype("category")
    # Integer codes follow the order of the categorical's categories.
    label_map = dict(enumerate(image_labels.cat.categories))
    train_df, test_df = train_test_split(
        df, test_size=0.2, stratify=df["midas_path"], random_state=42)

    train_file_path = os.path.join("data", train_file)
    test_file_path = os.path.join("data", test_file)
    # The input CSV may live elsewhere, so make sure the output directory
    # exists before writing instead of failing inside ``to_csv``.
    for out_path in (train_file_path, test_file_path):
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
    train_df.to_csv(train_file_path, index=False)
    test_df.to_csv(test_file_path, index=False)
    return train_file_path, test_file_path, {v: k for k, v in label_map.items()}, label_map

In [None]:
# Write the train/test CSVs and get the label <-> integer lookup tables.
train_csv, test_csv, label_to_int, int_to_label = create_train_test_split()

train_dataset = SkinImageDataset(train_csv, preprocess=preprocess, mapping=label_to_int)
# The test dataset must read the held-out CSV; building it from train_csv
# would evaluate the model on the data it was trained on.
test_dataset = SkinImageDataset(test_csv, preprocess=preprocess, mapping=label_to_int)