In [1]:
# imports
import pandas as pd
import numpy as np

# data visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

# image visualisation
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg


# Preparing ISIC 2020 Data for ML

In [2]:
# Attempt to read the file with different encodings
try:
    df_isic = pd.read_csv("data/ISIC_2020_Training_GroundTruth_v2.csv", encoding='utf-8')
except UnicodeDecodeError:
    try:
        df_isic = pd.read_csv("data/ISIC_2020_Training_GroundTruth_v2.csv", encoding='latin-1')
    except UnicodeDecodeError:
        df_isic = pd.read_csv("data/ISIC_2020_Training_GroundTruth_v2.csv", encoding='cp1252')

# Print data types of variables
print('Data Types')
print(df_isic.info())

# Display summary statistics
print('\nSummary Statistics')
print(df_isic.describe())

# Check for null values
print('\nCheck for Null')
print(df_isic.isna().any())

# View data frames
df_isic

Data Types
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33126 entries, 0 to 33125
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   image_name                     33126 non-null  object 
 1   patient_id                     33126 non-null  object 
 2   lesion_id                      33126 non-null  object 
 3   sex                            33061 non-null  object 
 4   age_approx                     33058 non-null  float64
 5   anatom_site_general_challenge  32599 non-null  object 
 6   diagnosis                      33126 non-null  object 
 7   benign_malignant               33126 non-null  object 
 8   target                         33126 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 2.3+ MB
None

Summary Statistics
         age_approx        target
count  33058.000000  33126.000000
mean      48.870016      0.017630
std       14.380360      0.13160

Unnamed: 0,image_name,patient_id,lesion_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0
...,...,...,...,...,...,...,...,...,...
33121,ISIC_9999134,IP_6526534,IL_2076932,male,50.0,torso,unknown,benign,0
33122,ISIC_9999320,IP_3650745,IL_6891604,male,65.0,torso,unknown,benign,0
33123,ISIC_9999515,IP_2026598,IL_6364820,male,20.0,lower extremity,unknown,benign,0
33124,ISIC_9999666,IP_7702038,IL_6048457,male,50.0,lower extremity,unknown,benign,0


In [3]:
#drop
df_cleaned_isic = df_isic.drop(columns=['benign_malignant','target' ,'patient_id' , 'lesion_id',])

# drop unknown diagnosis
df_cleaned_isic = df_cleaned_isic[df_cleaned_isic['diagnosis'] != 'unknown']

# Remove rows with missing data
df_cleaned_isic = df_cleaned_isic.dropna()

# encode categorical data - sex, localisation
one_hot_encoded_anatom = pd.get_dummies(df_cleaned_isic['anatom_site_general_challenge'])
df_cleaned_isic = pd.concat([df_cleaned_isic, one_hot_encoded_anatom], axis=1)

one_hot_encoded_sex = pd.get_dummies(df_cleaned_isic['sex'])
df_cleaned_isic = pd.concat([df_cleaned_isic, one_hot_encoded_sex], axis=1)

df_cleaned_isic = df_cleaned_isic.drop(columns=['sex', 'anatom_site_general_challenge'])

# Replace values in the 'diagnosis' column using the mapping dictionary
mapping = {
    'atypical melanocytic proliferation': 'aimp',
    'cafe-au-lait macule': 'cal',
    'lentigo NOS': 'lnos',
    'lichenoid keratosis': 'lk',
    'melanoma': 'mel',
    'nevus': 'nv',
    'solar lentigo': 'sl'
}

df_cleaned_isic['diagnosis'] = df_cleaned_isic['diagnosis'].replace(mapping)

# call diagnosis column "target" and put target at the end
df_cleaned_isic = df_cleaned_isic.rename(columns={'diagnosis': 'target'})

columns = list(df_cleaned_isic.columns)
columns.remove('target')
columns.append('target')

df_cleaned_isic = df_cleaned_isic[columns]

df_cleaned_isic

Unnamed: 0,image_name,age_approx,head/neck,lower extremity,oral/genital,palms/soles,torso,upper extremity,female,male,target
2,ISIC_0052212,50.0,0,1,0,0,0,0,1,0,nv
12,ISIC_0076995,55.0,0,0,0,0,1,0,1,0,nv
26,ISIC_0084086,60.0,0,1,0,0,0,0,0,1,nv
27,ISIC_0084270,40.0,0,1,0,0,0,0,0,1,nv
28,ISIC_0084395,45.0,0,0,0,0,1,0,1,0,nv
...,...,...,...,...,...,...,...,...,...,...,...
33108,ISIC_9995691,50.0,0,0,0,0,0,1,0,1,nv
33113,ISIC_9997614,50.0,0,0,0,0,0,1,1,0,nv
33117,ISIC_9998682,60.0,1,0,0,0,0,0,0,1,mel
33118,ISIC_9998937,40.0,1,0,0,0,0,0,0,1,nv


In [4]:
# turn image_name into image_id for easy concatenation
df_cleaned_isic = df_cleaned_isic.rename(columns={'image_name': 'image_id'})

# turn age_approx into age for easy concatenation
df_cleaned_isic = df_cleaned_isic.rename(columns={'age_approx': 'age'})

df_cleaned_isic

Unnamed: 0,image_id,age,head/neck,lower extremity,oral/genital,palms/soles,torso,upper extremity,female,male,target
2,ISIC_0052212,50.0,0,1,0,0,0,0,1,0,nv
12,ISIC_0076995,55.0,0,0,0,0,1,0,1,0,nv
26,ISIC_0084086,60.0,0,1,0,0,0,0,0,1,nv
27,ISIC_0084270,40.0,0,1,0,0,0,0,0,1,nv
28,ISIC_0084395,45.0,0,0,0,0,1,0,1,0,nv
...,...,...,...,...,...,...,...,...,...,...,...
33108,ISIC_9995691,50.0,0,0,0,0,0,1,0,1,nv
33113,ISIC_9997614,50.0,0,0,0,0,0,1,1,0,nv
33117,ISIC_9998682,60.0,1,0,0,0,0,0,0,1,mel
33118,ISIC_9998937,40.0,1,0,0,0,0,0,0,1,nv


In [5]:
#save df to csv
df_cleaned_isic.to_csv('data/ISIC_2020_Training_GroundTruth_v2_prepared.csv', index=False)

# Preparing HAM 10000 Data for ML

In [6]:
# Attempt to read the file with different encodings
try:
    df_ham = pd.read_csv("data/HAM 10000/HAM10000_metadata.csv", encoding='utf-8')
except UnicodeDecodeError:
    try:
        df_ham = pd.read_csv("data/HAM 10000/HAM10000_metadata.csv", encoding='latin-1')
    except UnicodeDecodeError:
        df_ham = pd.read_csv("data/HAM 10000/HAM10000_metadata.csv", encoding='cp1252')

# Print data types of variables
print('Data Types')
print(df_ham.info())

# Display summary statistics
print('\nSummary Statistics')
print(df_ham.describe())

# Check for null values
print('\nCheck for Null')
print(df_ham.isna().any())

# View data frames
df_ham

Data Types
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lesion_id     10015 non-null  object 
 1   image_id      10015 non-null  object 
 2   dx            10015 non-null  object 
 3   dx_type       10015 non-null  object 
 4   age           9958 non-null   float64
 5   sex           10015 non-null  object 
 6   localization  10015 non-null  object 
dtypes: float64(1), object(6)
memory usage: 547.8+ KB
None

Summary Statistics
               age
count  9958.000000
mean     51.863828
std      16.968614
min       0.000000
25%      40.000000
50%      50.000000
75%      65.000000
max      85.000000

Check for Null
lesion_id       False
image_id        False
dx              False
dx_type         False
age              True
sex             False
localization    False
dtype: bool


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear
...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face


In [7]:
#drop dx_type and lesion_id as not needed
df_cleaned_ham = df_ham.drop(columns=['dx_type', 'lesion_id'])

# drop rows with missing values
df_cleaned_ham = df_cleaned_ham.dropna()

# drop rows where gender and localiztion is unknown
df_cleaned_ham = df_cleaned_ham[df_cleaned_ham['sex'] != 'unknown']
df_cleaned_ham = df_cleaned_ham[df_cleaned_ham['localization'] != 'unknown']

# encode categorical data - sex, localisation
one_hot_encoded_anatom = pd.get_dummies(df_cleaned_ham['localization'])
df_cleaned_ham = pd.concat([df_cleaned_ham, one_hot_encoded_anatom], axis=1)


one_hot_encoded_sex = pd.get_dummies(df_cleaned_ham['sex'])
df_cleaned_ham = pd.concat([df_cleaned_ham, one_hot_encoded_sex], axis=1)

df_cleaned_ham = df_cleaned_ham.drop(columns=['sex', 'localization'])

# rename dx to target and put it in the last column
df_cleaned_ham.rename(columns={'dx': 'target'}, inplace=True)
columns = list(df_cleaned_ham.columns)
columns.remove('target')
columns.append('target')

df_cleaned_ham = df_cleaned_ham[columns]

df_cleaned_ham

Unnamed: 0,image_id,age,abdomen,acral,back,chest,ear,face,foot,genital,hand,lower extremity,neck,scalp,trunk,upper extremity,female,male,target
0,ISIC_0027419,80.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,bkl
1,ISIC_0025030,80.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,bkl
2,ISIC_0026769,80.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,bkl
3,ISIC_0025661,80.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,bkl
4,ISIC_0031633,75.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,bkl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10010,ISIC_0033084,40.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,akiec
10011,ISIC_0033550,40.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,akiec
10012,ISIC_0033536,40.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,akiec
10013,ISIC_0032854,80.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,akiec


# Combine some localizations together
## For easy concatenation, so that fit into the same categories as ISIC 2020

These categories include: head/neck, lower extremity, oral/genital, palms/soles, torso, upper extremity 

In [8]:
# create 'head/neck' by combining 'face', 'ear', 'neck', 'scalp'
df_cleaned_ham['head/neck'] = df_cleaned_ham[['face', 'ear', 'neck', 'scalp']].sum(axis=1)
df_cleaned_ham = df_cleaned_ham.drop(columns=['face', 'ear', 'neck', 'scalp'])

# create 'oral/genital' by calling 'genital' 'oral/genital'
df_cleaned_ham.rename(columns={'genital': 'oral/genital'}, inplace=True)

# create 'palms/soles' by combining 'foot', 'hand'
df_cleaned_ham['palms/soles'] = df_cleaned_ham[['foot', 'hand']].sum(axis=1)
df_cleaned_ham = df_cleaned_ham.drop(columns=['foot', 'hand'])

# create 'torso' by combining 'abdomen', 'acral', 'back', 'chest', 'trunk'
df_cleaned_ham['torso'] = df_cleaned_ham[['abdomen', 'acral', 'back', 'chest', 'trunk']].sum(axis=1)
df_cleaned_ham = df_cleaned_ham.drop(columns=['abdomen', 'acral', 'back', 'chest', 'trunk'])

df_cleaned_ham

Unnamed: 0,image_id,age,oral/genital,lower extremity,upper extremity,female,male,target,head/neck,palms/soles,torso
0,ISIC_0027419,80.0,0,0,0,0,1,bkl,1,0,0
1,ISIC_0025030,80.0,0,0,0,0,1,bkl,1,0,0
2,ISIC_0026769,80.0,0,0,0,0,1,bkl,1,0,0
3,ISIC_0025661,80.0,0,0,0,0,1,bkl,1,0,0
4,ISIC_0031633,75.0,0,0,0,0,1,bkl,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
10010,ISIC_0033084,40.0,0,0,0,0,1,akiec,0,0,1
10011,ISIC_0033550,40.0,0,0,0,0,1,akiec,0,0,1
10012,ISIC_0033536,40.0,0,0,0,0,1,akiec,0,0,1
10013,ISIC_0032854,80.0,0,0,0,0,1,akiec,1,0,0


In [9]:
#save df to csv
df_cleaned_isic.to_csv('data/HAM 10000/HAM10000_metadata_prepared.csv', index=False)

# Combine HAM10000 and ISIC 2020

In [10]:
combined_df = pd.concat([df_cleaned_isic, df_cleaned_ham], ignore_index=True)
combined_df

Unnamed: 0,image_id,age,head/neck,lower extremity,oral/genital,palms/soles,torso,upper extremity,female,male,target
0,ISIC_0052212,50.0,0,1,0,0,0,0,1,0,nv
1,ISIC_0076995,55.0,0,0,0,0,1,0,1,0,nv
2,ISIC_0084086,60.0,0,1,0,0,0,0,0,1,nv
3,ISIC_0084270,40.0,0,1,0,0,0,0,0,1,nv
4,ISIC_0084395,45.0,0,0,0,0,1,0,1,0,nv
...,...,...,...,...,...,...,...,...,...,...,...
15697,ISIC_0033084,40.0,0,0,0,0,1,0,0,1,akiec
15698,ISIC_0033550,40.0,0,0,0,0,1,0,0,1,akiec
15699,ISIC_0033536,40.0,0,0,0,0,1,0,0,1,akiec
15700,ISIC_0032854,80.0,1,0,0,0,0,0,0,1,akiec


In [11]:
combined_df.to_csv('data/combined_data_prepared.csv', index=False)

# Delete Images with no meta data