## Setup

### Installs, Packages, Seeds

In [1]:
import pandas as pd
# import matplotlib.pyplot as plt
# import matplotlib.image as mp_image
from IPython import display    # Easily show images in notebook
import os
# import cv2
import numpy as np
# from skimage import io
from PIL import Image
from google.colab import drive # Connect colab to google drive
from glob import glob
from pathlib import PurePath

# sklearn libraries
from sklearn.model_selection import train_test_split

In [2]:
# Set Seeds
np.random.seed(99)
pd_seed = 99
# torch.cuda.manual_seed(10)

### Mount Google Drive

In [3]:
drive.mount('/drive') 
%cd /drive/MyDrive/W210 - Capstone/

Mounted at /drive
/drive/MyDrive/W210 - Capstone


## Data Blend

To blend our data we'd like to start with two things: 
1. a list of all image paths **all_image_path**, with a related column to note which dataset it's from
2. a single y vector to represent all of the different classes **df.fill_in**

### First, doing this for ISIC 2018

ISIC 2018 has 3 datasets - train, test, and val. Train is the only set with the Y vector represented in the metadata, so the other images aren't all that interesting (we can't see if we're right or wrong). Because of that, only looking at the 'train' set below. for more info on the full splits, see [GL_modeling_img_only](https://colab.research.google.com/drive/11ytZd4whUOTOsveNcbmIyOHAWU6MDxbi?authuser=1#scrollTo=rUgahMpfsKKF) notebook. 

#### Images

In [None]:
data_dir = './Data/'

# ISIC 2018
ISIC_2018_path = 'ISIC_2018/Train/HAM10000_images_part_1_and_2/'

# ISIC 2020
ISIC_2020_path = 'ISIC_2020/Data/'

# Stanford Diverse
stanford_path = 'diverse_stanford/'

# Dermnet
dermnet_path = 'dermnet/'
dermnet_path_train = dermnet_path + 'train/'
dermnet_path_test = dermnet_path + 'test/'

# Extension Dictionary
exts = {'ISIC': '.jpg', 'stanford': '.png', 'dermnet': '.jpg'}

In [None]:
# Check to make sure we're seeing all of the images in train
print('ISIC 2018 Images:\t', len(os.listdir(data_dir + ISIC_2018_path)))
print('ISIC 2020 Images:\t', len(os.listdir(data_dir + ISIC_2020_path)))
print('Stanford Diverse Images:\t', len(os.listdir(data_dir + stanford_path)))
# print('Dermnet Images:\t', len(os.listdir(data_dir + dermnet_path_train)) + len(os.listdir(data_dir + dermnet_path_test)))

ISIC 2018 Images:	 10015
ISIC 2020 Images:	 33126
Stanford Diverse Images:	 659


In [None]:
# Get a list of all image paths
ISIC_2018_images = glob(os.path.join(data_dir+ISIC_2018_path, '*'+exts['ISIC']))
ISIC_2020_images = glob(os.path.join(data_dir+ISIC_2020_path, '*'+exts['ISIC']))
stanford_images = glob(os.path.join(data_dir+stanford_path, '*'+exts['stanford']))
dermnet_images = glob(os.path.join(data_dir+dermnet_path, '*', '*', '*'+exts['dermnet'])) # one level deeper since train and test are separate
print(len(ISIC_2018_images), len(ISIC_2020_images), len(stanford_images), len(dermnet_images))    # make sure they're the right length

10015 0 656 19559


#### Metadata

Now, load metadata. We're interested in the: 
- image ID
- diagnosis
- severity (if noted)
- source

Notice each of the DFs has an image_id-like column. 
- ISIC  formats the ID as "ISIC_" with 7 digits trailing, without an extension, front-padding with 0's. 
- Stanford formats the ID as a 6 digit ID with '.png' extension, front-padding with 0's.
    - Because of this, will remove the extension below for consistency
- Dermnet...

In [None]:
# ISIC_2018
ISIC_2018_meta = pd.read_csv(data_dir + 'ISIC_2018/Train/HAM10000_metadata.csv')
ISIC_2018 = ISIC_2018_meta[['image_id', 
                            'dx', 
                            'age', 
                            'sex', 
                            'localization']
                           ].rename(columns = 
                                            {'dx': 'diagnosis'})
ISIC_2018['source'] = 'ISIC_2018'

# ISIC_2020
ISIC_2020_meta = pd.read_csv(data_dir + 'ISIC_2020/train.csv')
ISIC_2020 = ISIC_2020_meta[['image_name', 
                            'diagnosis', 
                            'benign_malignant', 
                            'age_approx', 
                            'sex', 
                            'anatom_site_general_challenge']
                           ].rename(columns = 
                                            {'image_name': 'image_id', 
                                            'benign_malignant':'severity', 
                                            'age_approx':'age', 
                                            'anatom_site_general_challenge':'localization'})
ISIC_2020['source'] = 'ISIC_2020'

# stanford_diverse
stanford_meta = pd.read_csv(data_dir + stanford_path + 'ddi_metadata.csv')
stanford = stanford_meta[['DDI_file', 'disease', 'malignant']].rename(columns = {'DDI_file': 'image_id', 'disease':'diagnosis','malignant':'severity'})
stanford['source'] = 'stanford_diverse'
stanford['image_id'] = stanford['image_id'].str.replace('.png', '') # For consistency? 
stanford['severity'] = stanford['severity'].map({True:'malignant', False:'benign'})

# Dermnet - no metadata, so create it mimicing above
image_id = [os.path.splitext(os.path.basename(i))[0] for i in dermnet_images]
diagnosis = [PurePath(i).parent.name for i in dermnet_images]
dermnet = pd.DataFrame(zip(image_id, diagnosis), columns = ['image_id', 'diagnosis'])
dermnet['source'] = 'dermnet'



In [None]:
len(ISIC_2020.diagnosis.unique())

9

### Concatenate all of the data into a single DF

In [None]:
full_data = pd.concat([ISIC_2018, ISIC_2020, stanford, dermnet])

### Next add image paths to metadata

First, start by creating a dictionary mapping all image_ids to image paths. Next, map to image_id. 

In [None]:
# # Split the path up to grab image ID from all paths
# # Create a dictionary mapping Image ID to Path
all_paths = ISIC_2018_images + ISIC_2020_images + stanford_images + dermnet_images
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x for x in all_paths}

full_data['path'] = full_data['image_id'].map(imageid_path_dict)

Last thing to do is map the lesion type for ISIC 2018 to 

In [None]:
# Map the lesion code to Full diagnosis name
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'dermatofibroma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

full_data['diagnosis'] = full_data['diagnosis'].replace(lesion_type_dict) # replace instead of map to retain orig values

In [None]:
# for col in full_data.columns: 
#     print(col, full_data[col].isnull().sum())

### Create Data dictionary

We'd like to have our dictionary have the following attributes: 
- Diagnosis Name
- Source - which of the 4 datasets
- Count of image IDs - will help understand if we have enough data
- Severity - Malignant vs. Benign

We can easily get the first three attributes with a groupby, counting the number of image IDs. 

Note: 
1. Not all of the diagnosis have severity data connected to them. 
2. Some diagnosis appear in different datasets, so we need to make sure the severity is the same, when represented. 

In [None]:
# Group by diagnosis and source, only taking the image_id column
full_data_csv = full_data.groupby(by = ['diagnosis', 'source'])\
                         .count()\
                         .sort_values(by = 'image_id', ascending = False)[['image_id']]\
                         .reset_index()

# make all of the diagnosis lowercase, in case there are duplicates
full_data_csv['diagnosis'] = full_data_csv['diagnosis'].str.lower()

Now to add in severity, we are going to have to join data, since we lost it when we aggregated with count. 

To do this, we can create a table grouping diagnosis and severity. 

In [None]:
diagnosis_severity = full_data.groupby(['diagnosis', 'severity']).count()['image_id'].reset_index()[['diagnosis', 'severity']]
diagnosis_severity['diagnosis'] = diagnosis_severity['diagnosis'].str.lower()

We can see that at least one of these diagnoses is over-represented with two severity levels. 

In [None]:
print('Number of diagnosis with severity:', len(diagnosis_severity))
print('Number of diagnosis without severity:', len(full_data[full_data['severity'].isnull()]['diagnosis'].unique()))
print('Total diagnosis:', len(full_data['diagnosis'].unique()))

Number of diagnosis with severity: 86
Number of diagnosis without severity: 30
Total diagnosis: 115


If we were to merge the data, we would see melanoma and dermatofibroma have 2 severity representations, which is a problem. 

In [None]:
full_data_csv.merge(diagnosis_severity, on='diagnosis', how = 'left')['diagnosis'].value_counts()

dermatofibroma                         3
melanoma                               2
unknown                                1
syringocystadenoma-papilliferum        1
lichenoid-keratosis                    1
                                      ..
verruca-vulgaris                       1
seborrheic-keratosis                   1
melanocytic-nevi                       1
seborrheic keratosis                   1
atypical-spindle-cell-nevus-of-reed    1
Name: diagnosis, Length: 114, dtype: int64

Make sure the duplicates are NAs. They only show up with one, so the other is NaN. this is not an issue then, as they won't join and cause duplicates. 

In [None]:
full_data[(full_data['diagnosis'] == 'melanoma') | (full_data['diagnosis'] == 'dermatofibroma')].groupby('diagnosis')['severity'].value_counts()

diagnosis       severity 
dermatofibroma  benign        22
melanoma        malignant    591
Name: severity, dtype: int64

Finally, write the data to a csv to start a data dictionary, first merging our severity in, so we have those for each diagnosis. 

In [None]:
full_data_csv = full_data_csv.merge(diagnosis_severity, on='diagnosis', how = 'left')
full_data_csv.to_csv(data_dir + 'data_dictionary_baseline.csv')

Let's also write the metadata + paths to a CSV so we don't have to re-run this code to access the df.

In [None]:
full_data.to_csv(data_dir + 'full_data.csv')

In [None]:
%ls

[0m[01;34msample_data[0m/


In [None]:
full_data = pd.read_csv(data_dir + 'full_data.csv', index_col = 0)

In [None]:
class_dict = pd.read_csv(data_dir + 'data_dictionary.csv', index_col = 0)

In [None]:
class_dict

Unnamed: 0,diagnosis,grouping_1,action,grouping_2,source,image_id,severity,severity source,% Representation,"Keep? \nY = Yes, M = Maybe, N = No",Removal Note,Similar to,Notes,Researcher (GJ or GL)
19,bullous disease photos,Autoimmue Disorder,See doctor,no cancer risk,dermnet,561,benign,research,1%,N,"Small sample, no similar conditions",,No similar category,Gerrit and George
22,vasculitis photos,Autoimmue Disorder,See doctor,no cancer risk,dermnet,521,benign,research,1%,M,,31?,See if these look similar,Gerrit and George
84,foreign-body-granuloma,Autoimmue Disorder,See doctor,no cancer risk,stanford_diverse,2,benign,data,0%,N,"Small sample, no similar conditions",,,George
82,graft-vs-host-disease,Autoimmue Disorder,See doctor,no cancer risk,stanford_diverse,2,benign,data,0%,N,"Small sample, no similar conditions",,,George
99,dermatomyositis,Autoimmue Disorder,See doctor,no cancer risk,stanford_diverse,1,benign,data,0%,N,"Small sample, no similar conditions",,,George
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,systemic disease,Unclassified,See Doctor,no cancer risk,dermnet,758,benign,research,1%,N,It can cause many different skin conditions,15,It is skin diseases caused by non-skin systemi...,George
21,lupus and other connective tissue diseases,Unclassified,See Doctor,no cancer risk,dermnet,525,benign,research,1%,N,It can cause many different skin conditions,,,George
108,blastic-plasmacytoid-dendritic-cell-neoplasm,Unclassified,See Doctor,malignant,stanford_diverse,1,malignant,data,0%,N,"Small sample, no similar conditions",,,Gerrit and George
90,leukemia-cutis,Unclassified,See Doctor,malignant,stanford_diverse,1,malignant,data,0%,N,"Small sample, no similar conditions",,,Gerrit and George


### Duplicates

From EDA, we know that there are two types of duplicate image_ids: 
1. image_ids where the assigned class is the same. 
2. image_ids where the assigned class is not the same. 

The second of these two types is problematic - how can we expect to train a model where one image has two different types of classes? To deal with this - we add a new column that denotes these two types of duplicates so we can filter them out where needed, before modeling. 

First, mapping true duplicates - case 1

In [None]:
# full_data['duplicated'] = np.where(full_data['image_id'].isin(duplicates), 'true duplicate', 'No')

Next, let's map the problematic duplicates - case 2. From eda we have a list of these - let's pull them in. 

In [None]:
problem_dups = pd.read_csv('./Data/problems.csv', index_col = 0)['image_id']
full_data['duplicated'] = np.where(full_data['image_id'].isin(problem_dups), True, False)

Now we have each of the duplicate groups properly mapped

In [None]:
full_data['duplicated'].value_counts()

NameError: ignored

Let's remove the true duplicates for ease of modeling. 

In [None]:
print(len(full_data))
full_data = pd.concat([full_data[full_data['duplicated'] == True],
                          full_data[full_data['duplicated'] != True].drop_duplicates(['image_id'], keep = 'first')])
print(len(full_data))

62951
62951


In [None]:
full_data['duplicated'].value_counts()

False    62360
True       591
Name: duplicated, dtype: int64

# Split to Train, Val, Test

Can skip all of the preceding steps bc we've already done this once in 'data_merger'. can just load in full_data, and overwrite below where needed.

In [None]:
!ls ./Data

 archive		       diverse_stanford		  kaggle
'Data Classification.gsheet'   diverse_stanford_resized   problems.csv
'data_dictionary .csv'	       full_data.csv		  problems.gsheet
 data_dictionary.gsheet        ISIC_2018		  UCI
 dermnet		       ISIC_2020


## Adding in a few more classes and splits

#### First, map in the new labels based on the changes made in data_dictionary

In [None]:
! ls ./Data

 archive		       diverse_stanford		  kaggle
'Data Classification.gsheet'   diverse_stanford_resized   problems.csv
'data_dictionary .csv'	       full_data.csv		  problems.gsheet
 data_dictionary.gsheet        ISIC_2018		  UCI
 dermnet		       ISIC_2020


In [4]:
full_data = pd.read_csv('./Data/full_data.csv', index_col = 0)
label_mapper = pd.read_csv('./Data/data_dictionary.csv', index_col = 0)[['diagnosis', 'label_1', 'label_2', 'label_3']]

In [None]:
full_data = full_data.merge(label_mapper, how = 'left', on = 'diagnosis')
full_data.drop_duplicates(inplace = True) # Drop unwanted duplicates created by merge

In [5]:
# set the splits
train_split = .75
val_split = .15

train_val = train_split + val_split
test_split = 1 - train_val

adj_train_split = train_split/train_val
adj_val_split = val_split/train_val

In [6]:
unique_data = full_data[full_data['duplicated'] == False]

In [None]:
full_data['label_1'].unique()

array(['Non-Cancerous Skin Condition', 'Unclassified',
       'Toxin, Fungal, Bug, Viral, or Bacterial Infections',
       'Potentially Malignant Skin Tumors', 'Benign Marking or Mole'],
      dtype=object)

In [None]:
full_data['split_3'] = np.where(full_data['image_id'].isin(test_ids), 'test', 'train_val')


## Original Split

### Test

First, randomly sample 10% of all images to be used for true test at end. Not going to stratify these right now as a true random sample will be most representative of our use case. the stratification is not necessarily representative of likelihood of each disease in the real world.

In [None]:
test_ids = unique_data['image_id']\
          .sample(frac = test_split, 
                  replace = False, 
                  random_state = pd_seed)

In [None]:
full_data['dataset'] = np.where(full_data['image_id'].isin(test_ids), 'test', 'train')
unique_data['dataset'] = np.where(unique_data['image_id'].isin(test_ids), 'test', 'train')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Val/Train

In [None]:
data = unique_data[unique_data['dataset'] == 'train']

_, df_val  = train_test_split(data,
                              test_size = adj_val_split,
                              random_state = pd_seed,
                              stratify = data['class'])

val_ids = df_val['image_id']

In [None]:
# Note val
full_data['dataset'] = np.where(full_data['image_id'].isin(val_ids), 'val', full_data['dataset'])

In [None]:
# remove train/val/test label when duplicated - won't use these
full_data['dataset'] = np.where(full_data['duplicated'] == True, '', full_data['dataset'])

In [None]:
# full_data[full_data['duplicated'] == False]['dataset'].value_counts()

In [None]:
# write back to CSV for modeling ingestion
# full_data.to_csv('./Data/full_data.csv', index = True)

In [None]:
full_data['dataset'].value_counts()

train    46770
val       9354
test      6236
           591
Name: dataset, dtype: int64

## Split 1: Remove dermnet cancer from train

based on model_resnet_full.pt, we know that a majority of our poorly predicted images fell into the bottom two categories:
1. dermnet images
2. Caner vs. non-cancer

There is a significant overlap between these two categories, and we also have a large amount of data on cancer vs. not from both ISIC datasets. So we can see if removing dermnet from train might make our model slightly less confused. 

In [None]:
full_data['split_1'] = np.where(full_data['image_id'].isin(test_ids), 'test', 'train_val')

unique_data['split_1'] = np.where(unique_data['image_id'].isin(test_ids), 'test', 'train_val')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
unique_data['split_1'].value_counts()

train_val    56124
test          6236
Name: split_1, dtype: int64

Before we start our train/val splits, need to ensure that dermnet is not included.

In [None]:
data = unique_data[(unique_data['split_1'] == 'train_val') & 
                   ~((unique_data['source'] == 'dermnet') & 
                        ((unique_data['class'] == 'Benign Marking or Mole') |
                         (unique_data['class'] == 'Potentially Malignant Skin Tumors')))]

df_train, df_val  = train_test_split(data,
                              test_size = adj_val_split,
                              random_state = pd_seed,
                              stratify = data['class'])

val_ids = df_val['image_id']
train_ids = df_train['image_id']

In [None]:
len(data), len(df_train), len(df_val), len(df_train) + len(df_val)

(51633, 43027, 8606, 51633)

In [None]:
full_data['split_1'].value_counts()

train_val    56715
test          6236
Name: split_1, dtype: int64

In [None]:
# Note val
full_data['split_1'] = np.where(full_data['image_id'].isin(val_ids), 'val', full_data['split_1'])

# Note Train
full_data['split_1'] = np.where(full_data['image_id'].isin(train_ids), 'train', full_data['split_1'])

# Note unused
full_data['split_1'] = np.where(full_data['split_1'] == 'train_val', '', full_data['split_1'])

In [None]:
full_data['split_1'].value_counts()

train    43027
val       8606
test      6236
          5082
Name: split_1, dtype: int64

## Split 2: Remove poorly predicted classes

Now we want to take out the following subsets of data: 
1. ISIC_2020, where diagnosis = 'unknown' [subset of class - 'Unclassified']
2. Autoimmune Disorder (incorrectly spelled as 'autoimmue') [entire class]

Now that we won't be training on autoimmune, we can't have it show up in test with the autoimmune label. we could either leave it in test and re-classify it as unknown, or remove it altogether. for the sake of quick testing - let's frist just remove it entirely


In [None]:
test_ids = unique_data[unique_data['class'] != 'Autoimmue Disorder']['image_id']\
          .sample(frac = test_split, 
                  replace = False, 
                  random_state = pd_seed)

In [None]:
full_data['split_2'] = np.where(full_data['image_id'].isin(test_ids), 'test', 'train_val')

unique_data['split_2'] = np.where(unique_data['image_id'].isin(test_ids), 'test', 'train_val')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
data = unique_data[(unique_data['split_2'] == 'train_val') &
                   (unique_data['class'] != 'Autoimmue Disorder') &
                   (unique_data['diagnosis'] != 'unknown')]

df_train, df_val  = train_test_split(data,
                              test_size = adj_val_split,
                              random_state = pd_seed,
                              stratify = data['class'])

val_ids = df_val['image_id']
train_ids = df_train['image_id']

In [None]:
# Note val
full_data['split_2'] = np.where(full_data['image_id'].isin(val_ids), 'val', full_data['split_2'])

# Note Train
full_data['split_2'] = np.where(full_data['image_id'].isin(train_ids), 'train', full_data['split_2'])

# Note unused
full_data['split_2'] = np.where(full_data['split_2'] == 'train_val', '', full_data['split_2'])

In [None]:
full_data[full_data.split_2 != '']['class'].value_counts()

Benign Marking or Mole                                17882
Toxin, Fungal, Bug, Viral, or Bacterial Infections     6468
Non-Cancerous Skin Condition                           4636
Unclassified                                           4105
Potentially Malignant Skin Tumors                      3604
Name: class, dtype: int64

In [None]:
full_data.split_2.value_counts()

         26256
train    25489
test      6108
val       5098
Name: split_2, dtype: int64

## Split 2+3: Equal class sampling

Starting with split 2 as a baseline, now we will undersample each class down to ~3600 images to match the smallest class - malignant to try and reduce class imbalance. 


In [None]:
labs = ['Non-Cancerous Skin Condition',
       'Toxin, Fungal, Bug, Viral, or Bacterial Infections',
       'Potentially Malignant Skin Tumors', # correct Size
    #    'Autoimmue Disorder',                # excluding
       'Benign Marking or Mole', 
       'Unclassified']

Group data into train, val, and test - all with equal sample size

In [None]:
train_val_test = pd.DataFrame()

for lab in labs:
    s = full_data[(full_data['split_2'] != '') &
                  (full_data['class'] == lab)].\
                   sample(n = 3600, 
                          replace = False, 
                          random_state = pd_seed)
    
    train_val_test = pd.concat([train_val_test, s])

Now we can skim off test: 

In [None]:
test_ids = train_val_test['image_id']\
          .sample(frac = test_split, 
                  replace = False, 
                  random_state = pd_seed)

In [None]:
full_data['split_3'] = np.where(full_data['image_id'].isin(test_ids), 'test', 'train_val')

train_val_test['split_3'] = np.where(train_val_test['image_id'].isin(test_ids), 'test', 'train_val')


In [None]:
data = train_val_test[(train_val_test['split_3'] == 'train_val')]

df_train, df_val  = train_test_split(data,
                              test_size = adj_val_split,
                              random_state = pd_seed,
                              stratify = data['class'])

val_ids = df_val['image_id']
train_ids = df_train['image_id']

In [None]:
len(train_ids), len(val_ids), len(test_ids), len(train_val_test), len(train_ids) + len(val_ids) + len(test_ids) 

(13500, 2700, 1800, 18000, 18000)

In [None]:
# Note val
full_data['split_3'] = np.where(full_data['image_id'].isin(val_ids), 'val', full_data['split_3'])

# Note Train
full_data['split_3'] = np.where(full_data['image_id'].isin(train_ids), 'train', full_data['split_3'])

# Note unused
full_data['split_3'] = np.where(full_data['split_3'] == 'train_val', '', full_data['split_3'])

In [None]:
full_data['split_3'].value_counts()

         44951
train    13500
val       2700
test      1800
Name: split_3, dtype: int64

In [None]:
full_data[full_data.split_3 == 'val']['class'].value_counts()

Benign Marking or Mole                                542
Potentially Malignant Skin Tumors                     542
Unclassified                                          542
Toxin, Fungal, Bug, Viral, or Bacterial Infections    540
Non-Cancerous Skin Condition                          534
Name: class, dtype: int64

## Split 4
- Remove autoimmune diseases altogether
- remove diagnoses within toxin/fungal etc that are not in the top 3 in terms of value counts
- remove data with unknown as diagnosis

In [None]:
test_ids = unique_data[unique_data['diagnosis'] != 'unknown']['image_id']\
          .sample(frac = test_split, 
                  replace = False, 
                  random_state = pd_seed)

In [None]:
full_data['split_4'] = np.where(full_data['image_id'].isin(test_ids), 'test', 'train_val')

unique_data['split_4'] = np.where(unique_data['image_id'].isin(test_ids), 'test', 'train_val')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
data = unique_data[(unique_data['split_4'] == 'train_val') &
                   (unique_data['diagnosis'] != 'unknown')]

df_train, df_val  = train_test_split(data,
                              test_size = adj_val_split,
                              random_state = pd_seed,
                              stratify = data['label_1'])

val_ids = df_val['image_id']
train_ids = df_train['image_id']

In [None]:
# Note val
full_data['split_4'] = np.where(full_data['image_id'].isin(val_ids), 'val', full_data['split_4'])

# Note Train
full_data['split_4'] = np.where(full_data['image_id'].isin(train_ids), 'train', full_data['split_4'])

# Note unused
full_data['split_4'] = np.where(full_data['split_4'] == 'train_val', '', full_data['split_4'])

## Split 5: 
- Remove autoimmune diseases altogether
- remove diagnoses within toxin/fungal etc that are not in the top 3 in terms of value counts
- remove data with unknown as diagnosis
- remove melanoma skin cancer nevi and moles (dermnet) from Potentially malignant skin tumors class as there are some beningn/regular moles in here, and it’s causing common confusions

In [None]:
test_ids = unique_data[(unique_data['diagnosis'] != 'unknown') &
                       (unique_data['label_2'].isna() == False)]['image_id']\
                                              .sample(frac = test_split, 
                                                      replace = False, 
                                                      random_state = pd_seed)

full_data['split_5'] = np.where((full_data['image_id'].isin(test_ids)) &
                                (full_data['label_2'].isna() == False), 'test', 'train_val')

unique_data['split_5'] = np.where((unique_data['image_id'].isin(test_ids)) &
                                  (unique_data['label_2'].isna() == False), 'test', 'train_val')

data = unique_data[(unique_data['split_5'] == 'train_val') &
                   (unique_data['diagnosis'] != 'unknown') &
                   (unique_data['label_2'].isna() == False)]

df_train, df_val  = train_test_split(data,
                              test_size = adj_val_split,
                              random_state = pd_seed,
                              stratify = data['label_2'])

val_ids = df_val['image_id']
train_ids = df_train['image_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Note val
full_data['split_5'] = np.where(full_data['image_id'].isin(val_ids), 'val', full_data['split_5'])

# Note Train
full_data['split_5'] = np.where(full_data['image_id'].isin(train_ids), 'train', full_data['split_5'])

# Note unused
full_data['split_5'] = np.where(full_data['split_5'] == 'train_val', '', full_data['split_5'])

## Split 6: 
- Remove autoimmune diseases altogether
- remove diagnoses within toxin/fungal etc that are not in the top 3 in terms of value counts
- remove data with unknown as diagnosis
- remove melanoma skin cancer nevi and moles (dermnet) from Potentially malignant skin tumors class as there are some beningn/regular moles in here, and it’s causing common confusions
- Combine toxins and non-cancerous skin condition class

In [None]:
test_ids = unique_data[(unique_data['diagnosis'] != 'unknown') &
                       (unique_data['label_3'].isna() == False)]['image_id']\
                                              .sample(frac = test_split, 
                                                      replace = False, 
                                                      random_state = pd_seed)

full_data['split_6'] = np.where((full_data['image_id'].isin(test_ids)) &
                                (full_data['label_3'].isna() == False), 'test', 'train_val')

unique_data['split_6'] = np.where((unique_data['image_id'].isin(test_ids)) &
                                  (unique_data['label_3'].isna() == False), 'test', 'train_val')

data = unique_data[(unique_data['split_6'] == 'train_val') &
                   (unique_data['diagnosis'] != 'unknown') &
                   (unique_data['label_3'].isna() == False)]

df_train, df_val  = train_test_split(data,
                              test_size = adj_val_split,
                              random_state = pd_seed,
                              stratify = data['label_3'])

val_ids = df_val['image_id']
train_ids = df_train['image_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Note val
full_data['split_6'] = np.where(full_data['image_id'].isin(val_ids), 'val', full_data['split_6'])

# Note Train
full_data['split_6'] = np.where(full_data['image_id'].isin(train_ids), 'train', full_data['split_6'])

# Note unused
full_data['split_6'] = np.where(full_data['split_6'] == 'train_val', '', full_data['split_6'])

In [None]:
full_data.split_5.value_counts()

         28285
train    25995
val       5200
test      3466
Name: split_5, dtype: int64

In [None]:
full_data.split_6.value_counts()

         28285
train    25995
val       5200
test      3466
Name: split_6, dtype: int64

## Split 7: Same as split 2, but removing dermnet ambiguous benign/malignant diagnosis from malignant class. 

In [8]:
test_ids = unique_data[(unique_data['class'] != 'Autoimmue Disorder') &
                       (unique_data['diagnosis'] != 'melanoma skin cancer nevi and moles')]['image_id']\
          .sample(frac = test_split, 
                  replace = False, 
                  random_state = pd_seed)
          
full_data['split_7'] = np.where(full_data['image_id'].isin(test_ids), 'test', 'train_val')

unique_data['split_7'] = np.where(unique_data['image_id'].isin(test_ids), 'test', 'train_val')

data = unique_data[(unique_data['split_7'] == 'train_val') &
                   (unique_data['class'] != 'Autoimmue Disorder') &
                   (unique_data['diagnosis'] != 'unknown') &
                   (unique_data['diagnosis'] != 'melanoma skin cancer nevi and moles')]

df_train, df_val  = train_test_split(data,
                              test_size = adj_val_split,
                              random_state = pd_seed,
                              stratify = data['class'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [25]:
df_val['class'].value_counts()

Benign Marking or Mole                                2690
Toxin, Fungal, Bug, Viral, or Bacterial Infections     972
Non-Cancerous Skin Condition                           696
Potentially Malignant Skin Tumors                      452
Unclassified                                           208
Name: class, dtype: int64

In [20]:
s7 = data[data.split_7.isna() == False]
s7['class'].value_counts()

Benign Marking or Mole                                16140
Toxin, Fungal, Bug, Viral, or Bacterial Infections     5829
Non-Cancerous Skin Condition                           4178
Potentially Malignant Skin Tumors                      2710
Unclassified                                           1250
Name: class, dtype: int64

In [10]:
val_ids = df_val['image_id']
train_ids = df_train['image_id']

# Note val
full_data['split_7'] = np.where(full_data['image_id'].isin(val_ids), 'val', full_data['split_7'])

# Note Train
full_data['split_7'] = np.where(full_data['image_id'].isin(train_ids), 'train', full_data['split_7'])

# Note unused
full_data['split_7'] = np.where(full_data['split_7'] == 'train_val', '', full_data['split_7'])

In [14]:
full_data.split_2.value_counts()

train    25489
test      6108
val       5098
Name: split_2, dtype: int64

## Write Back Data

In [15]:
full_data.to_csv('./Data/full_data.csv', index = True)

In [None]:
full_data[full_data['duplicated'] == True]['split_6'].value_counts()

    586
Name: split_6, dtype: int64

In [16]:
bx = ['split_1', 'split_2', 'split_3', 'split_4', 'split_5', 'split_6', 'split_7']

for x in bx: 
    print(full_data[x].value_counts())

train    43027
val       8606
test      6236
Name: split_1, dtype: int64
train    25489
test      6108
val       5098
Name: split_2, dtype: int64
train    13500
val       2700
test      1800
Name: split_3, dtype: int64
train    26426
val       5286
test      3524
Name: split_4, dtype: int64
train    25995
val       5200
test      3466
Name: split_5, dtype: int64
train    25995
val       5200
test      3466
Name: split_6, dtype: int64
         26788
train    25089
test      6051
val       5018
Name: split_7, dtype: int64
