## Setup

### Installs, Packages, Seeds

In [92]:
import pandas as pd
# import matplotlib.pyplot as plt
# import matplotlib.image as mp_image
from IPython import display    # Easily show images in notebook
import os
# import cv2
import numpy as np
# from skimage import io
from PIL import Image
from google.colab import drive # Connect colab to google drive
from glob import glob
from pathlib import PurePath

# sklearn libraries
from sklearn.model_selection import train_test_split

In [100]:
# Set Seeds
np.random.seed(99)
pd_seed = 99
# torch.cuda.manual_seed(10)

### Mount Google Drive

In [8]:
drive.mount('/drive') 
%cd /drive/MyDrive/W210 - Capstone/

Mounted at /drive
/drive/.shortcut-targets-by-id/1oLqejM9KnDiIgUupEGkxGM3_vdqboxlI/W210 - Capstone


## Data Blend

To blend our data we'd like to start with two things: 
1. a list of all image paths **all_image_path**, with a related column to note which dataset it's from
2. a single y vector to represent all of the different classes **df.fill_in**

### First, doing this for ISIC 2018

ISIC 2018 has 3 datasets - train, test, and val. Train is the only set with the Y vector represented in the metadata, so the other images aren't all that interesting (we can't see if we're right or wrong). Because of that, only looking at the 'train' set below. for more info on the full splits, see [GL_modeling_img_only](https://colab.research.google.com/drive/11ytZd4whUOTOsveNcbmIyOHAWU6MDxbi?authuser=1#scrollTo=rUgahMpfsKKF) notebook. 

#### Images

In [5]:
data_dir = './Data/'

# ISIC 2018
ISIC_2018_path = 'ISIC_2018/Train/HAM10000_images_part_1_and_2/'

# ISIC 2020
ISIC_2020_path = 'ISIC_2020/Data/'

# Stanford Diverse
stanford_path = 'diverse_stanford/'

# Dermnet
dermnet_path = 'dermnet/'
dermnet_path_train = dermnet_path + 'train/'
dermnet_path_test = dermnet_path + 'test/'

# Extension Dictionary
exts = {'ISIC': '.jpg', 'stanford': '.png', 'dermnet': '.jpg'}

In [None]:
# Check to make sure we're seeing all of the images in train
print('ISIC 2018 Images:\t', len(os.listdir(data_dir + ISIC_2018_path)))
print('ISIC 2020 Images:\t', len(os.listdir(data_dir + ISIC_2020_path)))
print('Stanford Diverse Images:\t', len(os.listdir(data_dir + stanford_path)))
# print('Dermnet Images:\t', len(os.listdir(data_dir + dermnet_path_train)) + len(os.listdir(data_dir + dermnet_path_test)))

ISIC 2018 Images:	 10015
ISIC 2020 Images:	 33126
Stanford Diverse Images:	 659


In [None]:
# Get a list of all image paths
ISIC_2018_images = glob(os.path.join(data_dir+ISIC_2018_path, '*'+exts['ISIC']))
ISIC_2020_images = glob(os.path.join(data_dir+ISIC_2020_path, '*'+exts['ISIC']))
stanford_images = glob(os.path.join(data_dir+stanford_path, '*'+exts['stanford']))
dermnet_images = glob(os.path.join(data_dir+dermnet_path, '*', '*', '*'+exts['dermnet'])) # one level deeper since train and test are separate
print(len(ISIC_2018_images), len(ISIC_2020_images), len(stanford_images), len(dermnet_images))    # make sure they're the right length

10015 33126 656 19559


#### Metadata

Now, load metadata. We're interested in the: 
- image ID
- diagnosis
- severity (if noted)
- source

Notice each of the DFs has an image_id-like column. 
- ISIC  formats the ID as "ISIC_" with 7 digits trailing, without an extension, front-padding with 0's. 
- Stanford formats the ID as a 6 digit ID with '.png' extension, front-padding with 0's.
    - Because of this, will remove the extension below for consistency
- Dermnet...

In [None]:
# ISIC_2018
ISIC_2018_meta = pd.read_csv(data_dir + 'ISIC_2018/Train/HAM10000_metadata.csv')
ISIC_2018 = ISIC_2018_meta[['image_id', 
                            'dx', 
                            'age', 
                            'sex', 
                            'localization']
                           ].rename(columns = 
                                            {'dx': 'diagnosis'})
ISIC_2018['source'] = 'ISIC_2018'

# ISIC_2020
ISIC_2020_meta = pd.read_csv(data_dir + 'ISIC_2020/train.csv')
ISIC_2020 = ISIC_2020_meta[['image_name', 
                            'diagnosis', 
                            'benign_malignant', 
                            'age_approx', 
                            'sex', 
                            'anatom_site_general_challenge']
                           ].rename(columns = 
                                            {'image_name': 'image_id', 
                                            'benign_malignant':'severity', 
                                            'age_approx':'age', 
                                            'anatom_site_general_challenge':'localization'})
ISIC_2020['source'] = 'ISIC_2020'

# stanford_diverse
stanford_meta = pd.read_csv(data_dir + stanford_path + 'ddi_metadata.csv')
stanford = stanford_meta[['DDI_file', 'disease', 'malignant']].rename(columns = {'DDI_file': 'image_id', 'disease':'diagnosis','malignant':'severity'})
stanford['source'] = 'stanford_diverse'
stanford['image_id'] = stanford['image_id'].str.replace('.png', '') # For consistency? 
stanford['severity'] = stanford['severity'].map({True:'malignant', False:'benign'})

# Dermnet - no metadata, so create it mimicing above
image_id = [os.path.splitext(os.path.basename(i))[0] for i in dermnet_images]
diagnosis = [PurePath(i).parent.name for i in dermnet_images]
dermnet = pd.DataFrame(zip(image_id, diagnosis), columns = ['image_id', 'diagnosis'])
dermnet['source'] = 'dermnet'



In [None]:
len(ISIC_2020.diagnosis.unique())

9

### Concatenate all of the data into a single DF

In [None]:
full_data = pd.concat([ISIC_2018, ISIC_2020, stanford, dermnet])

### Next add image paths to metadata

First, start by creating a dictionary mapping all image_ids to image paths. Next, map to image_id. 

In [None]:
# # Split the path up to grab image ID from all paths
# # Create a dictionary mapping Image ID to Path
all_paths = ISIC_2018_images + ISIC_2020_images + stanford_images + dermnet_images
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x for x in all_paths}

full_data['path'] = full_data['image_id'].map(imageid_path_dict)

Last thing to do is map the lesion type for ISIC 2018 to 

In [None]:
# Map the lesion code to Full diagnosis name
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'dermatofibroma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

full_data['diagnosis'] = full_data['diagnosis'].replace(lesion_type_dict) # replace instead of map to retain orig values

In [None]:
# for col in full_data.columns: 
#     print(col, full_data[col].isnull().sum())

### Create Data dictionary

We'd like to have our dictionary have the following attributes: 
- Diagnosis Name
- Source - which of the 4 datasets
- Count of image IDs - will help understand if we have enough data
- Severity - Malignant vs. Benign

We can easily get the first three attributes with a groupby, counting the number of image IDs. 

Note: 
1. Not all of the diagnosis have severity data connected to them. 
2. Some diagnosis appear in different datasets, so we need to make sure the severity is the same, when represented. 

In [None]:
# Group by diagnosis and source, only taking the image_id column
full_data_csv = full_data.groupby(by = ['diagnosis', 'source'])\
                         .count()\
                         .sort_values(by = 'image_id', ascending = False)[['image_id']]\
                         .reset_index()

# make all of the diagnosis lowercase, in case there are duplicates
full_data_csv['diagnosis'] = full_data_csv['diagnosis'].str.lower()

Now to add in severity, we are going to have to join data, since we lost it when we aggregated with count. 

To do this, we can create a table grouping diagnosis and severity. 

In [None]:
diagnosis_severity = full_data.groupby(['diagnosis', 'severity']).count()['image_id'].reset_index()[['diagnosis', 'severity']]
diagnosis_severity['diagnosis'] = diagnosis_severity['diagnosis'].str.lower()

We can see that at least one of these diagnoses is over-represented with two severity levels. 

In [None]:
print('Number of diagnosis with severity:', len(diagnosis_severity))
print('Number of diagnosis without severity:', len(full_data[full_data['severity'].isnull()]['diagnosis'].unique()))
print('Total diagnosis:', len(full_data['diagnosis'].unique()))

Number of diagnosis with severity: 86
Number of diagnosis without severity: 30
Total diagnosis: 115


If we were to merge the data, we would see melanoma and dermatofibroma have 2 severity representations, which is a problem. 

In [None]:
full_data_csv.merge(diagnosis_severity, on='diagnosis', how = 'left')['diagnosis'].value_counts()

dermatofibroma                         3
melanoma                               2
unknown                                1
syringocystadenoma-papilliferum        1
lichenoid-keratosis                    1
                                      ..
verruca-vulgaris                       1
seborrheic-keratosis                   1
melanocytic-nevi                       1
seborrheic keratosis                   1
atypical-spindle-cell-nevus-of-reed    1
Name: diagnosis, Length: 114, dtype: int64

Make sure the duplicates are NAs. They only show up with one, so the other is NaN. this is not an issue then, as they won't join and cause duplicates. 

In [None]:
full_data[(full_data['diagnosis'] == 'melanoma') | (full_data['diagnosis'] == 'dermatofibroma')].groupby('diagnosis')['severity'].value_counts()

diagnosis       severity 
dermatofibroma  benign        22
melanoma        malignant    591
Name: severity, dtype: int64

Finally, write the data to a csv to start a data dictionary, first merging our severity in, so we have those for each diagnosis. 

In [None]:
full_data_csv = full_data_csv.merge(diagnosis_severity, on='diagnosis', how = 'left')
full_data_csv.to_csv(data_dir + 'data_dictionary_baseline.csv')

Let's also write the metadata + paths to a CSV so we don't have to re-run this code to access the df.

In [None]:
full_data.to_csv(data_dir + 'full_data.csv')

In [None]:
%ls

[0m[01;34msample_data[0m/


In [80]:
full_data = pd.read_csv(data_dir + 'full_data.csv', index_col = 0)

In [None]:
class_dict = pd.read_csv(data_dir + 'data_dictionary.csv', index_col = 0)

In [None]:
class_dict

Unnamed: 0,diagnosis,grouping_1,action,grouping_2,source,image_id,severity,severity source,% Representation,"Keep? \nY = Yes, M = Maybe, N = No",Removal Note,Similar to,Notes,Researcher (GJ or GL)
19,bullous disease photos,Autoimmue Disorder,See doctor,no cancer risk,dermnet,561,benign,research,1%,N,"Small sample, no similar conditions",,No similar category,Gerrit and George
22,vasculitis photos,Autoimmue Disorder,See doctor,no cancer risk,dermnet,521,benign,research,1%,M,,31?,See if these look similar,Gerrit and George
84,foreign-body-granuloma,Autoimmue Disorder,See doctor,no cancer risk,stanford_diverse,2,benign,data,0%,N,"Small sample, no similar conditions",,,George
82,graft-vs-host-disease,Autoimmue Disorder,See doctor,no cancer risk,stanford_diverse,2,benign,data,0%,N,"Small sample, no similar conditions",,,George
99,dermatomyositis,Autoimmue Disorder,See doctor,no cancer risk,stanford_diverse,1,benign,data,0%,N,"Small sample, no similar conditions",,,George
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,systemic disease,Unclassified,See Doctor,no cancer risk,dermnet,758,benign,research,1%,N,It can cause many different skin conditions,15,It is skin diseases caused by non-skin systemi...,George
21,lupus and other connective tissue diseases,Unclassified,See Doctor,no cancer risk,dermnet,525,benign,research,1%,N,It can cause many different skin conditions,,,George
108,blastic-plasmacytoid-dendritic-cell-neoplasm,Unclassified,See Doctor,malignant,stanford_diverse,1,malignant,data,0%,N,"Small sample, no similar conditions",,,Gerrit and George
90,leukemia-cutis,Unclassified,See Doctor,malignant,stanford_diverse,1,malignant,data,0%,N,"Small sample, no similar conditions",,,Gerrit and George


### Duplicates

From EDA, we know that there are two types of duplicate image_ids: 
1. image_ids where the assigned class is the same. 
2. image_ids where the assigned class is not the same. 

The second of these two types is problematic - how can we expect to train a model where one image has two different types of classes? To deal with this - we add a new column that denotes these two types of duplicates so we can filter them out where needed, before modeling. 

First, mapping true duplicates - case 1

In [41]:
# full_data['duplicated'] = np.where(full_data['image_id'].isin(duplicates), 'true duplicate', 'No')

Next, let's map the problematic duplicates - case 2. From eda we have a list of these - let's pull them in. 

In [82]:
problem_dups = pd.read_csv('./Data/problems.csv', index_col = 0)['image_id']
full_data['duplicated'] = np.where(full_data['image_id'].isin(problem_dups), True, False)

Now we have each of the duplicate groups properly mapped

In [83]:
full_data['duplicated'].value_counts()

False    62360
True       591
Name: duplicated, dtype: int64

Let's remove the true duplicates for ease of modeling. 

In [84]:
print(len(full_data))
full_data = pd.concat([full_data[full_data['duplicated'] == True],
                          full_data[full_data['duplicated'] != True].drop_duplicates(['image_id'], keep = 'first')])
print(len(full_data))

62951
62951


In [85]:
full_data['duplicated'].value_counts()

False    62360
True       591
Name: duplicated, dtype: int64

# Split to Train, Val, Test

In [129]:
# set the splits
train_split = .75
val_split = .15

train_val = train_split + val_split
test_split = 1 - train_val

adj_train_split = train_split/train_val
adj_val_split = val_split/train_val

In [130]:
unique_data = full_data[full_data['duplicated'] == False]


### Test

First, randomly sample 10% of all images to be used for true test at end. Not going to stratify these right now as a true random sample will be most representative of our use case. the stratification is not necessarily representative of likelihood of each disease in the real world.

In [131]:
test_ids = unique_data['image_id']\
          .sample(frac = test_split, 
                  replace = False, 
                  random_state = pd_seed)

In [135]:
full_data['dataset'] = np.where(full_data['image_id'].isin(test_ids), 'test', 'train')
unique_data['dataset'] = np.where(unique_data['image_id'].isin(test_ids), 'test', 'train')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [136]:
test_ids

8175     lupus-chronic-cutaneous-137
13824                   ISIC_4243470
3147                    ISIC_1045706
6414                    ISIC_0025025
11811                   ISIC_3636757
                    ...             
2319                 candida-penis-1
27630                   ISIC_8361472
1841                 lyme-disease-26
32018                   ISIC_9672376
8086                  lupus-acute-48
Name: image_id, Length: 6236, dtype: object

### Val/Train

In [137]:
data = unique_data[unique_data['dataset'] == 'train']

_, df_val  = train_test_split(data,
                              test_size = adj_val_split,
                              random_state = pd_seed,
                              stratify = data['class'])

val_ids = df_val['image_id']

In [138]:
# Note val
full_data['dataset'] = np.where(full_data['image_id'].isin(val_ids), 'val', full_data['dataset'])

In [142]:
# remove train/val/test label when duplicated - won't use these
full_data['dataset'] = np.where(full_data['duplicated'] == True, '', full_data['dataset'])

In [148]:
# full_data[full_data['duplicated'] == False]['dataset'].value_counts()

In [150]:
#write back to CSV for modeling ingestion
full_data.to_csv('./Data/full_data.csv', index = True)

## During class grouping, can look into this. 

In [None]:
# word_list = full_data_csv.diagnosis.str\
#                                    .cat(sep = ' ')\
#                                    .replace('-', ' ')\
#                                    .split()


In [None]:
# pd.set_option('display.max_rows', None)
# counts = pd.Series(word_list).value_counts()
# counts[counts.values >= 2]
# pd.Series(word_list).value_counts()[pd.Series(word_list).value_counts().values >= 2]

In [None]:
full_data[full_data['diagnosis'] == 'unknown']['path']

0        ./Data/ISIC_2020/Data/ISIC_2637011.jpg
1        ./Data/ISIC_2020/Data/ISIC_0015719.jpg
3        ./Data/ISIC_2020/Data/ISIC_0068279.jpg
4        ./Data/ISIC_2020/Data/ISIC_0074268.jpg
5        ./Data/ISIC_2020/Data/ISIC_0074311.jpg
                          ...                  
33120    ./Data/ISIC_2020/Data/ISIC_9999127.jpg
33121    ./Data/ISIC_2020/Data/ISIC_9999134.jpg
33122    ./Data/ISIC_2020/Data/ISIC_9999320.jpg
33123    ./Data/ISIC_2020/Data/ISIC_9999515.jpg
33124    ./Data/ISIC_2020/Data/ISIC_9999666.jpg
Name: path, Length: 27124, dtype: object