In [1]:
import math
import itertools

import pandas as pd

## Metadata


In [2]:
# Load the training metadata table (one row per image).
train_df = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [3]:
# Preview the first five metadata rows.
train_df.head(n=5)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


In [4]:
# Rows labelled target == 0.
benign = train_df.loc[train_df["target"].eq(0)]

In [5]:
# (rows, columns) of the benign subset.
benign.shape

(32542, 8)

In [6]:
# Rows labelled target == 1.
malignant = train_df.loc[train_df["target"].eq(1)]

In [7]:
# (rows, columns) of the malignant subset.
malignant.shape

(584, 8)

In [8]:
# Image identifiers for each class, taken from the "image_name" column.
benign_files = benign.loc[:, "image_name"]
malignant_files = malignant.loc[:, "image_name"]

In [9]:
# Show the benign image names (pandas abbreviates the long Series when rendering).
benign_files

0        ISIC_2637011
1        ISIC_0015719
2        ISIC_0052212
3        ISIC_0068279
4        ISIC_0074268
             ...     
33121    ISIC_9999134
33122    ISIC_9999320
33123    ISIC_9999515
33124    ISIC_9999666
33125    ISIC_9999806
Name: image_name, Length: 32542, dtype: object

In [10]:
# Unordered benign/benign (positive) pairs: n choose 2 over the benign images.
num_benign_comb = math.comb(benign_files.size, 2)
num_benign_comb

529474611

In [11]:
# Unordered malignant/malignant (positive) pairs: n choose 2 over the malignant images.
num_malignant_comb = math.comb(malignant_files.size, 2)

In [12]:
# Benign/malignant (negative) pairs: every benign image paired with every malignant image.
benign_files.size * malignant_files.size

19004528

## Data Splitting


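We split the benign and malignant images separately (i.e. stratified by `target`), so each split keeps the original class ratio. Within each class, a random 80% is kept for training and the remaining 20% becomes the test set; the same 80/20 split is then applied again to the training portion to carve out a validation set.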


In [13]:
def split_sub_population(
    df: pd.DataFrame,
    id_name: str,
    frac: float = 0.8,
    random_state: int = 42,
) -> tuple:
    """Randomly split ``df`` into a sampled part and the remainder.

    Args:
        df: Rows to split.
        id_name: Column whose values identify a row; used to work out which
            rows were not sampled.
        frac: Fraction of rows drawn into the first returned frame.
            Defaults to 0.8.
        random_state: Seed passed to ``DataFrame.sample`` so the split is
            reproducible. Defaults to 42.

    Returns:
        ``(df_train, df_test)``: the sampled rows, and the rows of ``df``
        whose ``id_name`` value does not occur in the sample.
    """
    df_train = df.sample(frac=frac, random_state=random_state)
    # Membership is decided on the id column, not the index: if ids repeat,
    # every row sharing an id with a sampled row is left out of df_test.
    is_sampled = df[id_name].isin(df_train[id_name])
    df_test = df[~is_sampled]

    return df_train, df_test

In [14]:
# First hold out a test split from each class (the helper keeps a random 80%
# as its first return value and hands back the rest as the second).
benign_train, benign_test = split_sub_population(df=benign, id_name="image_name")
malignant_train, malignant_test = split_sub_population(df=malignant, id_name="image_name")

# Then split the retained training rows once more to obtain validation sets.
benign_train, benign_val = split_sub_population(df=benign_train, id_name="image_name")
malignant_train, malignant_val = split_sub_population(df=malignant_train, id_name="image_name")

In [15]:
# Verify numbers add up
print(len(benign_train), len(benign_val), len(benign_test))
print(len(benign), len(benign_train) + len(benign_val) + len(benign_test))

# Fail loudly, instead of relying on someone eyeballing the printout, if the
# split lost or duplicated any row. Checked for both classes.
assert len(benign_train) + len(benign_val) + len(benign_test) == len(benign)
assert len(malignant_train) + len(malignant_val) + len(malignant_test) == len(malignant)

20827 5207 6508
32542 32542


In [16]:
def summarise_num_pairs(benign_df: pd.DataFrame, malignant_df: pd.DataFrame) -> None:
    """Print how many image pairs can be formed from the two class subsets.

    Positive pairs are unordered pairs drawn from within one class
    (n choose 2); negative pairs combine one benign row with one malignant row.

    Args:
        benign_df: Rows of the benign class.
        malignant_df: Rows of the malignant class.
    """
    n_benign, n_malignant = len(benign_df), len(malignant_df)
    report_lines = [
        f"Num positive benign pairs {math.comb(n_benign, 2)}",
        f"Num positive malignant pairs {math.comb(n_malignant, 2)}",
        f"Num negative pairs {n_benign * n_malignant}",
    ]
    print("\n".join(report_lines))

In [17]:
# Pair counts available within the training split.
summarise_num_pairs(benign_df=benign_train, malignant_df=malignant_train)

Num positive benign pairs 216871551
Num positive malignant pairs 69751
Num negative pairs 7789298


In [19]:
# Peek at the first five benign training pairs. combinations() is lazy, so
# islice stops after five items instead of walking all ~217M pairs.
for pair in itertools.islice(itertools.combinations(benign_train["image_name"], 2), 5):
    print(pair)

('ISIC_0436994', 'ISIC_2791755')
('ISIC_0436994', 'ISIC_6652681')
('ISIC_0436994', 'ISIC_6901611')
('ISIC_0436994', 'ISIC_1851709')
('ISIC_0436994', 'ISIC_6704623')


In [20]:
# Materialise every unordered pair of malignant training image names.
malignant_pair_iter = itertools.combinations(malignant_train["image_name"], 2)
pairs = [*malignant_pair_iter]

In [21]:
# Number of malignant training pairs held in `pairs`.
len(pairs)

69751

## Test dataset


## Image data


In [1]:
from torchvision import io

In [4]:
# Load a single training image from disk for inspection.
# NOTE(review): `io` is torchvision's io module, imported in this section
# rather than with the other imports at the top of the notebook.
image = io.read_image("../data/train/ISIC_0015719.jpg")

In [5]:
# Dimensions of the loaded image tensor.
# Presumably (channels, height, width) — TODO confirm against torchvision docs.
image.shape

torch.Size([3, 4000, 6000])