In [1]:
import pandas as pd
import numpy as np

#
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

#
import seaborn as sns
import plotly.express as px

#
import os
import random
import re
import math
import time

from tqdm import tqdm
from tqdm.keras import TqdmCallback

#from pandas_summary import DataFrameSummary
#from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from keras.preprocessing import image
# Register the private `sklearn.neighbors._base` module under the legacy name
# `sklearn.neighbors.base` BEFORE missingpy is imported below.
# NOTE(review): presumably missingpy still imports the old module path and
# would fail without this alias -- verify against the installed versions.
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

import warnings
from missingpy import MissForest
from PIL import Image
import awswrangler as wr
# Silences every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore') 
# NOTE(review): two matplotlib backends are requested back to back; presumably
# only the later `inline` selection is in effect -- confirm and drop one.
%matplotlib notebook
%matplotlib inline

In [16]:
# Seed both the stdlib and the NumPy global random generators with one value.
seed_val = 101
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed_val)

# Colour palette, ordered from light orange to near-black.
orange_black = [
    '#fdc029',
    '#df861d',
    '#FF6347',
    '#aa3d01',
    '#a30e15',
    '#800000',
    '#171820',
]

# Plot styling.
plt.style.use('ggplot')

# pandas display options, applied one at a time.
for option_name, option_value in {
    "display.max_rows": None,            # display all rows
    "display.max_columns": None,         # display all columns
    "display.max_colwidth": None,        # expand column width
    "display.html.use_mathjax": False,   # disable LaTeX-style MathJax rendering
}.items():
    pd.set_option(option_name, option_value)

In [17]:
# Local alternative to the S3 read below:
# df = pd.read_csv('./full_data_v2.csv', index_col = 0).rename(columns = {'duplicated': 'duplicate', 'class':'label'})
s3_path = 's3://rubyhan-w210-datasets/full_data.csv'
column_renames = {'duplicated': 'duplicate', 'class': 'label'}
df = wr.s3.read_csv(path=s3_path, index_col=0)
df = df.rename(columns=column_renames)

# Attach the per-image colour statistics; the inner join keeps only images
# present in both tables.
color_df = pd.read_csv('./full_data_with_color_data.csv')
color_columns = ['image_id', 'reds', 'greens', 'blues', 'mean_colors']
df = pd.merge(df, color_df[color_columns], on='image_id', how="inner")

# Keep only the rows whose `duplicate` flag equals False.
not_duplicate = df['duplicate'] == False
df = df.loc[not_duplicate]
df.head()

Unnamed: 0,image_id,diagnosis,age,sex,localization,source,severity,path,label,duplicate,dataset,split_1,split_2,split_3,label_1,label_2,label_3,split_4,split_5,split_6,reds,greens,blues,mean_colors
1182,ISIC_0027419,benign keratosis-like lesions,80.0,male,scalp,ISIC_2018,unknown,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0027419.jpg,Benign Marking or Mole,False,train,train,train,,Benign Marking or Mole,Benign Marking or Mole,Benign Marking or Mole,train,train,train,208.736267,162.703426,181.659333,184.366342
1183,ISIC_0025030,benign keratosis-like lesions,80.0,male,scalp,ISIC_2018,unknown,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0025030.jpg,Benign Marking or Mole,False,train,train,train,,Benign Marking or Mole,Benign Marking or Mole,Benign Marking or Mole,test,train,train,197.138056,156.542415,177.196333,176.958935
1184,ISIC_0026769,benign keratosis-like lesions,80.0,male,scalp,ISIC_2018,unknown,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0026769.jpg,Benign Marking or Mole,False,train,train,train,,Benign Marking or Mole,Benign Marking or Mole,Benign Marking or Mole,test,test,test,214.053785,156.414959,174.070967,181.513237
1185,ISIC_0025661,benign keratosis-like lesions,80.0,male,scalp,ISIC_2018,unknown,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0025661.jpg,Benign Marking or Mole,False,train,train,train,,Benign Marking or Mole,Benign Marking or Mole,Benign Marking or Mole,test,test,test,195.708563,142.608015,157.175893,165.164157
1186,ISIC_0031633,benign keratosis-like lesions,75.0,male,ear,ISIC_2018,unknown,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0031633.jpg,Benign Marking or Mole,False,val,val,train,,Benign Marking or Mole,Benign Marking or Mole,Benign Marking or Mole,train,train,train,211.709311,167.980289,185.249274,188.312958


In [18]:
# Relative frequency (NaN included) of the label and of the first three split
# assignments, printed in that order.
for column in ('label', 'split_1', 'split_2', 'split_3'):
    shares = df[column].value_counts(dropna=False, normalize=True)
    print(shares)

Unclassified                                          0.456911
Benign Marking or Mole                                0.286754
Toxin, Fungal, Bug, Viral, or Bacterial Infections    0.103720
Non-Cancerous Skin Condition                          0.074343
Potentially Malignant Skin Tumors                     0.057793
Autoimmue Disorder                                    0.020478
Name: label, dtype: float64
train    0.689978
val      0.138005
test     0.100000
NaN      0.072017
Name: split_1, dtype: float64
NaN      0.411562
train    0.408740
test     0.097947
val      0.081751
Name: split_2, dtype: float64
NaN      0.711353
train    0.216485
val      0.043297
test     0.028865
Name: split_3, dtype: float64


### Create Split 3 File

In [4]:
# Build the split-3 feature table:
#   1. treat 'unknown' and 0.0 as missing and drop unused columns,
#   2. keep rows that have both a split_3 assignment and a label,
#   3. add missingness flags and simple (mode/median/mean/max/min) imputations,
#   4. impute the remaining gaps with MissForest,
#   5. decode the categorical codes and one-hot encode sex / anatomy.
# `df` is only read here; the original loop rebound `df` to a dict, which broke
# every later cell that expects the loaded DataFrame.

# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
df2 = (
    df.replace('unknown', np.nan)
      .replace(0.0, np.nan)
      .drop(['duplicate', 'source', 'dataset', 'image_id',
             'label_1', 'label_2', 'label_3', 'split_1', 'split_2',
             'split_4', 'split_5', 'split_6', 'diagnosis',
             'severity'], axis=1)
)
df2 = df2[df2['split_3'].notna()]
df2 = df2[df2['label'].notna()].reset_index(drop=True)

# Missingness indicators plus simple single-value imputations.
df2['missing_anatomy'] = df2['localization'].isna().astype(int)
df2['anatomy_impute_mode'] = df2['localization'].fillna(df2['localization'].mode()[0])
df2['missing_sex'] = df2['sex'].isna().astype(int)
df2['sex_impute_mode'] = df2['sex'].fillna(df2['sex'].mode()[0])
df2['missing_age'] = df2['age'].isna().astype(int)
df2['age_impute_mode'] = df2['age'].fillna(df2['age'].mode()[0])
df2['age_impute_median'] = df2['age'].fillna(df2['age'].median())
df2['age_impute_mean'] = df2['age'].fillna(df2['age'].mean())
df2['age_impute_max'] = df2['age'].fillna(df2['age'].max())
df2['age_impute_min'] = df2['age'].fillna(df2['age'].min())

# `path` and `split_3` are set aside and re-attached after imputation.
path = df2.path
split_3 = df2.split_3
df3 = df2.drop(['path', 'split_3'], axis=1)

# Encode each categorical column as integer codes (missing -> NaN) and remember
# the code -> category-name mapping so the codes can be decoded afterwards.
dict_of_dfs = {}
for col in ['sex', 'localization', 'sex_impute_mode', 'anatomy_impute_mode', 'label']:
    df3[col] = df3[col].astype('category')
    dict_of_dfs[col] = dict(enumerate(df3[col].cat.categories))
    df3[col + '_cat'] = df3[col].cat.codes.replace(-1, np.nan).astype('category')
df3.info()

# MissForest imputation on the numeric columns and the encoded `*_cat` columns.
# NOTE(review): `label_cat` stays in df3_X, so the label takes part in imputing
# the features -- confirm this is intended before using them to predict `label`.
imputer = MissForest()
df3_X = df3.drop(['label', 'sex', 'localization', 'sex_impute_mode', 'anatomy_impute_mode'], axis=1)
cat_cols = [df3_X.columns.get_loc(col) for col in df3_X.select_dtypes(['category']).columns.tolist()]
df3_X_imputed = imputer.fit_transform(df3_X, cat_vars=cat_cols)
df3_X_imputed = pd.DataFrame(df3_X_imputed, columns=df3_X.columns.tolist()).rename(columns={'age': 'age_impute_mf'})

# Cast the imputed codes back to integers so they match the mapping keys.
df3_imputed = df3_X_imputed
cat_cols = [name for name in df3_imputed.columns if name.endswith('_cat')]
for col in cat_cols:
    df3_imputed[col] = df3_imputed[col].astype('int')

# Decode the codes. Columns that had no gaps keep their own name; `sex` and
# `localization` (imputed by MissForest) get an `_impute_mf` suffix.
df3_imputed2 = df3_imputed
for d, code_to_name in dict_of_dfs.items():
    if d in ['sex_impute_mode', 'anatomy_impute_mode', 'label']:
        decoded_name = d
    else:
        decoded_name = d + '_impute_mf'
    df3_imputed2[decoded_name] = df3_imputed2[d + '_cat'].map(code_to_name)
    df3_imputed2.drop([d + '_cat'], axis=1, inplace=True)

# One-hot encode the decoded categorical features, in a fixed column order.
for source_col, prefix in [
    ('sex_impute_mf', 'sex_mf'),
    ('localization_impute_mf', 'anatomy_mf'),
    ('sex_impute_mode', 'sex_mode'),
    ('anatomy_impute_mode', 'anatomy_mode'),
]:
    dummies = pd.get_dummies(df3_imputed2[source_col], prefix=prefix)
    df3_imputed2 = pd.concat([df3_imputed2, dummies], axis=1)

# The string columns are redundant once their dummies exist.
df3_imputed2 = df3_imputed2.drop(
    ['sex_impute_mf', 'localization_impute_mf', 'sex_impute_mode', 'anatomy_impute_mode'],
    axis=1,
)

# Add path and split_3 back; all three indexes are reset so rows align by
# position (the original reset `path` but not `split_3`).
df3_imputed2 = pd.concat(
    [
        df3_imputed2.reset_index(drop=True),
        path.reset_index(drop=True),
        split_3.reset_index(drop=True),
    ],
    axis=1,
)
df3_imputed2.head()

Unnamed: 0,age,sex,localization,path,label,split_3,reds,greens,blues,mean_colors
0,70.0,female,back,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0025209.jpg,Benign Marking or Mole,test,223.231963,163.819367,177.606848,188.219393
1,75.0,male,upper extremity,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0025915.jpg,Benign Marking or Mole,train,193.084081,150.232596,163.569293,168.96199
2,70.0,female,face,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0032343.jpg,Benign Marking or Mole,train,143.451707,108.244748,134.621444,128.772633
3,60.0,male,back,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0025033.jpg,Benign Marking or Mole,train,198.290763,169.923607,168.705607,178.973326
4,75.0,male,upper extremity,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0032128.jpg,Benign Marking or Mole,train,180.339181,155.822085,176.047952,170.736406


In [20]:
# Write the split-3 feature table to disk; index=True stores the row index as
# the first CSV column.
df3_imputed2.to_csv(path_or_buf='only_split_3.csv', index=True)

### Create No-Split File

In [21]:
# Build the feature table for ALL labelled rows (no split_3 filter), excluding
# the 'Autoimmue Disorder' class. The steps are the same as for the split-3 file:
# mark missing values, add simple imputations, run MissForest, decode, one-hot.
# `df` is only read here; the original loop rebound `df` to a dict, which made
# this cell fail when re-run and broke later cells that use the DataFrame.

# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
df2 = (
    df.replace('unknown', np.nan)
      .replace(0.0, np.nan)
      .drop(['duplicate', 'source', 'dataset', 'image_id',
             'label_1', 'label_2', 'label_3', 'split_1', 'split_2',
             'split_4', 'split_5', 'split_6', 'diagnosis',
             'severity'], axis=1)
)
# The literal below must match the spelling used in the data.
df2 = df2[~df2['label'].isin(['Autoimmue Disorder'])]
df2 = df2[df2['label'].notna()].reset_index(drop=True)

# Missingness indicators plus simple single-value imputations.
df2['missing_anatomy'] = df2['localization'].isna().astype(int)
df2['anatomy_impute_mode'] = df2['localization'].fillna(df2['localization'].mode()[0])
df2['missing_sex'] = df2['sex'].isna().astype(int)
df2['sex_impute_mode'] = df2['sex'].fillna(df2['sex'].mode()[0])
df2['missing_age'] = df2['age'].isna().astype(int)
df2['age_impute_mode'] = df2['age'].fillna(df2['age'].mode()[0])
df2['age_impute_median'] = df2['age'].fillna(df2['age'].median())
df2['age_impute_mean'] = df2['age'].fillna(df2['age'].mean())
df2['age_impute_max'] = df2['age'].fillna(df2['age'].max())
df2['age_impute_min'] = df2['age'].fillna(df2['age'].min())

# `path` and `split_3` are set aside and re-attached after imputation.
# split_3 is not filtered in this cell, so it may contain NaN.
path = df2.path
split_3 = df2.split_3
df3 = df2.drop(['path', 'split_3'], axis=1)

# Encode each categorical column as integer codes (missing -> NaN) and remember
# the code -> category-name mapping so the codes can be decoded afterwards.
dict_of_dfs = {}
for col in ['sex', 'localization', 'sex_impute_mode', 'anatomy_impute_mode', 'label']:
    df3[col] = df3[col].astype('category')
    dict_of_dfs[col] = dict(enumerate(df3[col].cat.categories))
    df3[col + '_cat'] = df3[col].cat.codes.replace(-1, np.nan).astype('category')

# MissForest imputation on the numeric columns and the encoded `*_cat` columns.
# NOTE(review): `label_cat` stays in df3_X, so the label takes part in imputing
# the features -- confirm this is intended before using them to predict `label`.
imputer = MissForest()
df3_X = df3.drop(['label', 'sex', 'localization', 'sex_impute_mode', 'anatomy_impute_mode'], axis=1)
cat_cols = [df3_X.columns.get_loc(col) for col in df3_X.select_dtypes(['category']).columns.tolist()]
df3_X_imputed = imputer.fit_transform(df3_X, cat_vars=cat_cols)
df3_X_imputed = pd.DataFrame(df3_X_imputed, columns=df3_X.columns.tolist()).rename(columns={'age': 'age_impute_mf'})

# Cast the imputed codes back to integers so they match the mapping keys.
df3_imputed = df3_X_imputed
cat_cols = [name for name in df3_imputed.columns if name.endswith('_cat')]
for col in cat_cols:
    df3_imputed[col] = df3_imputed[col].astype('int')

# Decode the codes. Columns that had no gaps keep their own name; `sex` and
# `localization` (imputed by MissForest) get an `_impute_mf` suffix.
df3_imputed2 = df3_imputed
for d, code_to_name in dict_of_dfs.items():
    if d in ['sex_impute_mode', 'anatomy_impute_mode', 'label']:
        decoded_name = d
    else:
        decoded_name = d + '_impute_mf'
    df3_imputed2[decoded_name] = df3_imputed2[d + '_cat'].map(code_to_name)
    df3_imputed2.drop([d + '_cat'], axis=1, inplace=True)

# One-hot encode the decoded categorical features, in a fixed column order.
for source_col, prefix in [
    ('sex_impute_mf', 'sex_mf'),
    ('localization_impute_mf', 'anatomy_mf'),
    ('sex_impute_mode', 'sex_mode'),
    ('anatomy_impute_mode', 'anatomy_mode'),
]:
    dummies = pd.get_dummies(df3_imputed2[source_col], prefix=prefix)
    df3_imputed2 = pd.concat([df3_imputed2, dummies], axis=1)

# The string columns are redundant once their dummies exist.
df3_imputed2 = df3_imputed2.drop(
    ['sex_impute_mf', 'localization_impute_mf', 'sex_impute_mode', 'anatomy_impute_mode'],
    axis=1,
)

# Add path and split_3 back; all three indexes are reset so rows align by
# position (the original reset `path` but not `split_3`).
df3_imputed2 = pd.concat(
    [
        df3_imputed2.reset_index(drop=True),
        path.reset_index(drop=True),
        split_3.reset_index(drop=True),
    ],
    axis=1,
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61083 entries, 0 to 61082
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   age                      42975 non-null  float64 
 1   sex                      43019 non-null  category
 2   localization             42380 non-null  category
 3   label                    61083 non-null  category
 4   reds                     61083 non-null  float64 
 5   greens                   61083 non-null  float64 
 6   blues                    61083 non-null  float64 
 7   mean_colors              61083 non-null  float64 
 8   missing_anatomy          61083 non-null  int64   
 9   anatomy_impute_mode      61083 non-null  category
 10  missing_sex              61083 non-null  int64   
 11  sex_impute_mode          61083 non-null  category
 12  missing_age              61083 non-null  int64   
 13  age_impute_mode          61083 non-null  float64 
 14  age_im

Unnamed: 0,age_impute_mf,reds,greens,blues,mean_colors,missing_anatomy,missing_sex,missing_age,age_impute_mode,age_impute_median,age_impute_mean,age_impute_max,age_impute_min,label,sex_mf_female,sex_mf_male,anatomy_mf_abdomen,anatomy_mf_acral,anatomy_mf_back,anatomy_mf_chest,anatomy_mf_ear,anatomy_mf_face,anatomy_mf_foot,anatomy_mf_genital,anatomy_mf_hand,anatomy_mf_head/neck,anatomy_mf_lower extremity,anatomy_mf_neck,anatomy_mf_oral/genital,anatomy_mf_palms/soles,anatomy_mf_scalp,anatomy_mf_torso,anatomy_mf_trunk,anatomy_mf_upper extremity,sex_mode_female,sex_mode_male,anatomy_mode_abdomen,anatomy_mode_acral,anatomy_mode_back,anatomy_mode_chest,anatomy_mode_ear,anatomy_mode_face,anatomy_mode_foot,anatomy_mode_genital,anatomy_mode_hand,anatomy_mode_head/neck,anatomy_mode_lower extremity,anatomy_mode_neck,anatomy_mode_oral/genital,anatomy_mode_palms/soles,anatomy_mode_scalp,anatomy_mode_torso,anatomy_mode_trunk,anatomy_mode_upper extremity,path,split_3
0,80.0,208.736267,162.703426,181.659333,184.366342,0.0,0.0,0.0,80.0,80.0,80.0,80.0,80.0,Benign Marking or Mole,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0027419.jpg,
1,80.0,197.138056,156.542415,177.196333,176.958935,0.0,0.0,0.0,80.0,80.0,80.0,80.0,80.0,Benign Marking or Mole,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0025030.jpg,
2,80.0,214.053785,156.414959,174.070967,181.513237,0.0,0.0,0.0,80.0,80.0,80.0,80.0,80.0,Benign Marking or Mole,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0026769.jpg,
3,80.0,195.708563,142.608015,157.175893,165.164157,0.0,0.0,0.0,80.0,80.0,80.0,80.0,80.0,Benign Marking or Mole,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0025661.jpg,
4,75.0,211.709311,167.980289,185.249274,188.312958,0.0,0.0,0.0,75.0,75.0,75.0,75.0,75.0,Benign Marking or Mole,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,./Data/ISIC_2018/Train/HAM10000_images_part_1_and_2/ISIC_0031633.jpg,


In [22]:
# Write the all-rows feature table to disk (row index included as the first
# CSV column), then show its (rows, columns) shape.
df3_imputed2.to_csv(path_or_buf='all_data_split_3.csv', index=True)
df3_imputed2.shape

(61083, 56)

In [23]:
# Absolute number of rows per label in the exported table.
df3_imputed2['label'].value_counts()

Unclassified                                          28493
Benign Marking or Mole                                17882
Toxin, Fungal, Bug, Viral, or Bacterial Infections     6468
Non-Cancerous Skin Condition                           4636
Potentially Malignant Skin Tumors                      3604
Name: label, dtype: int64