In [1]:
import numpy as np
import pandas as pd

In [3]:
train = pd.read_csv("train-metadata.csv")
test = pd.read_csv("test-metadata.csv")

  train = pd.read_csv("train-metadata.csv")


In [80]:
train.shape

(401059, 55)

In [81]:
test.shape

(3, 44)

Test has fewer columns than train - only use the intersection so that we can apply to test data <br>

In [82]:
shared_cols = set(train.columns).intersection(set(test.columns))
len(shared_cols)

44

Use second line in the next cell if you want to use all columns instead.

In [83]:
train_trim = train[list(shared_cols)]
# train_trim = train 
train_trim.shape

(401059, 44)

String columns

In [84]:
len(train_trim.select_dtypes(include="object").columns)

10

Number columns

In [85]:
len(train_trim.select_dtypes(include="number").columns)

34

Null checking

In [86]:
nulls = train_trim.apply(lambda col: col.isnull().sum() / col.size * 100)
nulls.head()

tbp_lv_color_std_mean    0.0
tbp_lv_deltaLB           0.0
tbp_lv_stdL              0.0
tbp_lv_perimeterMM       0.0
attribution              0.0
dtype: float64

Columns with nulls

In [87]:
nulls[nulls > 0]

sex                    2.871647
anatom_site_general    1.435200
age_approx             0.697653
dtype: float64

Dropping rows with null age. Sex handled later, anatom_site_general not used.

In [88]:
train_trim = train_trim.dropna(subset=['age_approx'])

Extract string columns

In [89]:
string_cols = train_trim.select_dtypes(include="object")
string_cols.head()

Unnamed: 0,attribution,sex,copyright_license,tbp_lv_location,tbp_tile_type,tbp_lv_location_simple,isic_id,patient_id,image_type,anatom_site_general
0,Memorial Sloan Kettering Cancer Center,male,CC-BY,Right Leg - Upper,3D: white,Right Leg,ISIC_0015670,IP_1235828,TBP tile: close-up,lower extremity
1,Memorial Sloan Kettering Cancer Center,male,CC-BY,Head & Neck,3D: white,Head & Neck,ISIC_0015845,IP_8170065,TBP tile: close-up,head/neck
2,Memorial Sloan Kettering Cancer Center,male,CC-BY,Torso Back Top Third,3D: XP,Torso Back,ISIC_0015864,IP_6724798,TBP tile: close-up,posterior torso
3,ACEMID MIA,male,CC-0,Torso Front Top Half,3D: XP,Torso Front,ISIC_0015902,IP_4111386,TBP tile: close-up,anterior torso
4,Memorial Sloan Kettering Cancer Center,male,CC-BY,Torso Front Top Half,3D: white,Torso Front,ISIC_0024200,IP_8313778,TBP tile: close-up,anterior torso


Check for constant-value columns

In [90]:
string_cols.apply(lambda row: row.unique())

attribution               [Memorial Sloan Kettering Cancer Center, ACEMI...
sex                                                     [male, female, nan]
copyright_license                                   [CC-BY, CC-0, CC-BY-NC]
tbp_lv_location           [Right Leg - Upper, Head & Neck, Torso Back To...
tbp_tile_type                                           [3D: white, 3D: XP]
tbp_lv_location_simple    [Right Leg, Head & Neck, Torso Back, Torso Fro...
isic_id                   [ISIC_0015670, ISIC_0015845, ISIC_0015864, ISI...
patient_id                [IP_1235828, IP_8170065, IP_6724798, IP_411138...
image_type                                             [TBP tile: close-up]
anatom_site_general       [lower extremity, head/neck, posterior torso, ...
dtype: object

Drop ID and other useless string columns

In [91]:
to_drop = ["tbp_tile_type", "copyright_license", "isic_id", "patient_id", "attribution", "image_type"]
set(string_cols.columns).difference(set(to_drop))

{'anatom_site_general', 'sex', 'tbp_lv_location', 'tbp_lv_location_simple'}

3 location columns, increasing specificity. Using the middle one for one-hot encoding

In [92]:
string_cols["anatom_site_general"].value_counts()

anatom_site_general
posterior torso    121143
lower extremity    102071
anterior torso      87256
upper extremity     70088
head/neck           11962
Name: count, dtype: int64

In [93]:
string_cols["tbp_lv_location_simple"].value_counts()

tbp_lv_location_simple
Torso Back     121143
Torso Front     87256
Left Leg        52633
Right Leg       49438
Left Arm        36124
Right Arm       33964
Head & Neck     11962
Unknown          5741
Name: count, dtype: int64

In [94]:
string_cols["tbp_lv_location"].value_counts()

tbp_lv_location
Torso Back Top Third       70703
Torso Front Top Half       62991
Torso Back Middle Third    45859
Left Leg - Lower           27205
Right Leg - Lower          25008
Torso Front Bottom Half    24205
Left Leg - Upper           23454
Right Arm - Upper          22849
Right Leg - Upper          22719
Left Arm - Upper           22713
Head & Neck                11962
Left Arm - Lower           11820
Right Arm - Lower          10514
Unknown                     5741
Torso Back Bottom Third     4572
Left Leg                    1974
Right Leg                   1711
Left Arm                    1591
Right Arm                    601
Torso Front                   60
Torso Back                     9
Name: count, dtype: int64

One-hot encoding the sex column. Using a vector of length 2 here to account for the NaaNs - when the value is "unknown", the vector is [0, 0]. Alternatively, we could drop all NaaN rows and use a single indicator column.

In [95]:
male_indicator = string_cols["sex"].apply(lambda s: int(s == "male" and not s == "unknown"))
male_indicator.name = "Male"
female_indicator = string_cols["sex"].apply(lambda s: int(s == "female" and not s == "unknown"))
female_indicator.name = "Female"

In [96]:
numerical_sex = pd.concat([male_indicator, female_indicator], axis=1)
numerical_sex.tail()

Unnamed: 0,Male,Female
401054,1,0
401055,1,0
401056,0,1
401057,0,1
401058,1,0


One-hot encoding the location column

In [97]:
one_hot_location = pd.Series(string_cols["tbp_lv_location_simple"].value_counts().index[:-1]).apply(
    lambda location: string_cols["tbp_lv_location_simple"].apply(lambda loc: int(loc == location))).transpose()
one_hot_location.columns = string_cols["tbp_lv_location_simple"].value_counts().index[:-1]
one_hot_location.head()


tbp_lv_location_simple,Torso Back,Torso Front,Left Leg,Right Leg,Left Arm,Right Arm,Head & Neck
0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0


Total one-hot encoded columns

In [98]:
one_hot_combined = pd.concat([numerical_sex, one_hot_location], axis=1)
one_hot_combined.head()

Unnamed: 0,Male,Female,Torso Back,Torso Front,Left Leg,Right Leg,Left Arm,Right Arm,Head & Neck
0,1,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,1
2,1,0,1,0,0,0,0,0,0
3,1,0,0,1,0,0,0,0,0
4,1,0,0,1,0,0,0,0,0


Normalizing other numerical columns to the [0, 1] range with min-max scaling

In [99]:
numerical = train_trim.select_dtypes(include="number")

for column in numerical.columns: 
    numerical[column] = (numerical[column] - numerical[column].min()) / (numerical[column].max() - numerical[column].min())     

Concat with numerical columns to get the clean data

In [100]:
clean = pd.concat([one_hot_combined, numerical], axis=1)
clean.head()

Unnamed: 0,Male,Female,Torso Back,Torso Front,Left Leg,Right Leg,Left Arm,Right Arm,Head & Neck,tbp_lv_color_std_mean,...,tbp_lv_C,tbp_lv_nevi_confidence,tbp_lv_B,tbp_lv_deltaL,tbp_lv_minorAxisMM,tbp_lv_norm_border,tbp_lv_areaMM2,tbp_lv_symm_2axis_angle,age_approx,tbp_lv_Hext
0,1,0,0,0,0,1,0,0,0,0.0,...,0.54981,2.628592e-05,0.502444,0.836741,0.070059,0.690918,0.008153,0.485714,0.6875,0.267121
1,1,0,0,0,0,0,0,0,1,0.0,...,0.68505,1.334303e-09,0.491697,0.867883,0.03025,0.162262,0.001462,0.314286,0.6875,0.152394
2,1,0,1,0,0,0,0,0,0,0.0,...,0.738108,2.959177e-06,0.703178,0.835366,0.050841,0.447253,0.008491,0.6,0.6875,0.336171
3,1,0,0,1,0,0,0,0,0,0.051695,...,0.407316,0.2198945,0.402979,0.91727,0.12186,0.147329,0.016925,0.742857,0.75,0.308385
4,1,0,0,1,0,0,0,0,0,0.0,...,0.595283,1.378832e-05,0.49413,0.811955,0.036212,0.326168,0.005005,0.114286,0.625,0.230185


Save the clean data

In [101]:
clean.to_csv("clean_train-metadata.csv", index=False)