# Data Cleaning for HAM10000 Dataset

Import the required libraries

In [39]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
# Install a global "ignore" filter: with no category or module given, this
# suppresses every warning raised after this cell runs.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# chained-assignment warnings from later cells; consider narrowing the filter
# to a specific category once the noisy warning is identified.
warnings.filterwarnings('ignore')

Load the dataset

In [40]:
# Relative path to the HAM10000 metadata CSV.
file_path = "HAM 10000/HAM10000_metadata.csv"

# Read the metadata table into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


Data cleaning

In [50]:
# Report how many values are missing in each column of the raw metadata.
print(df.isnull().sum())

# Derive the cleaned frame from `df` in a single chain, leaving `df` untouched:
#   1. drop every row that has a missing value in any column,
#   2. drop the dx_type column,
#   3. keep only rows whose sex AND localization are not the string 'unknown'.
# The trailing .copy() makes df_cleaned an independent frame, so the encoding
# step below never writes into a slice of another frame.
df_cleaned = (
    df.dropna()
    .drop(columns=['dx_type'])
    .loc[lambda frame: (frame['sex'] != 'unknown')
                       & (frame['localization'] != 'unknown')]
    .copy()
)

# One encoder per categorical column (sex, localization, diagnosis). The fitted
# encoders stay bound to these names so the integer code -> original label
# mapping can be read back later from their `classes_` attribute.
label_encoder_sex = LabelEncoder()
label_encoder_local = LabelEncoder()
label_encoder_dx = LabelEncoder()

# Replace each categorical column with its integer codes, then rename
# 'dx' to 'diagnosis'. assign() overwrites existing columns in their current
# position, so the column order is unchanged; no in-place mutation is used.
df_cleaned = df_cleaned.assign(
    sex=label_encoder_sex.fit_transform(df_cleaned['sex']),
    localization=label_encoder_local.fit_transform(df_cleaned['localization']),
    dx=label_encoder_dx.fit_transform(df_cleaned['dx']),
).rename(columns={'dx': 'diagnosis'})

df_cleaned

lesion_id        0
image_id         0
dx               0
dx_type          0
age             57
sex              0
localization     0
dtype: int64


Unnamed: 0,lesion_id,image_id,diagnosis,age,sex,localization
0,HAM_0000118,ISIC_0027419,2,80.0,1,11
1,HAM_0000118,ISIC_0025030,2,80.0,1,11
2,HAM_0002730,ISIC_0026769,2,80.0,1,11
3,HAM_0002730,ISIC_0025661,2,80.0,1,11
4,HAM_0001466,ISIC_0031633,2,75.0,1,4
...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,0,40.0,1,0
10011,HAM_0002867,ISIC_0033550,0,40.0,1,0
10012,HAM_0002867,ISIC_0033536,0,40.0,1,0
10013,HAM_0000239,ISIC_0032854,0,80.0,1,5


Inspect the label encodings

In [51]:
# Show which integer code each fitted LabelEncoder assigned to each original
# category. A class's position in `classes_` is its encoded value.

# Class arrays of the encoders fitted in the cleaning cell.
sex_classes = label_encoder_sex.classes_
anatom_site_classes = label_encoder_local.classes_  # classes of the 'localization' column
diagnosis_classes = label_encoder_dx.classes_

# (column name shown in the heading, class labels in encoded order)
encoded_columns = (
    ('sex', sex_classes),
    ('localization', anatom_site_classes),
    ('diagnosis', diagnosis_classes),
)

for position, (column, classes) in enumerate(encoded_columns):
    # Every heading after the first is preceded by a blank line.
    separator = "\n" if position else ""
    print(f"{separator}Encoded values and their corresponding meanings for '{column}' column:")
    for code, label in enumerate(classes):
        print(f"Encoded value {code}: {label}")

Encoded values and their corresponding meanings for 'sex' column:
Encoded value 0: female
Encoded value 1: male

Encoded values and their corresponding meanings for 'localization' column:
Encoded value 0: abdomen
Encoded value 1: acral
Encoded value 2: back
Encoded value 3: chest
Encoded value 4: ear
Encoded value 5: face
Encoded value 6: foot
Encoded value 7: genital
Encoded value 8: hand
Encoded value 9: lower extremity
Encoded value 10: neck
Encoded value 11: scalp
Encoded value 12: trunk
Encoded value 13: upper extremity

Encoded values and their corresponding meanings for 'diagnosis' column:
Encoded value 0: akiec
Encoded value 1: bcc
Encoded value 2: bkl
Encoded value 3: df
Encoded value 4: mel
Encoded value 5: nv
Encoded value 6: vasc
