In [None]:
from google.colab import drive
import zipfile, os, urllib.request, glob, math, shutil
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Mount Google Drive: the input CSV is read from Drive and the resulting CSVs / zip archives are written back to it.
drive.mount('/content/drive')

Mounted at /content/drive


Όπως περιέγραψα και στο DataPreprocessing.ipynb, θα θέλαμε το test dataset να μην περιέχει εικόνες από lesions που περιλαμβάνονται και στο train dataset.

In [None]:
# Load the image metadata table from Drive
# (presumably the file written by DataPreprocessing.ipynb — confirm).
dataset = pd.read_csv('drive/MyDrive/dataset.csv')
dataset.head()

Unnamed: 0,image,category,age_approx,anatom_site_general,lesion_id,sex
0,ISIC_0000000,5,55.0,anterior torso,Unknown0,female
1,ISIC_0000001,5,30.0,anterior torso,Unknown1,female
2,ISIC_0000002,4,60.0,upper extremity,Unknown2,female
3,ISIC_0000003,5,30.0,upper extremity,Unknown3,male
4,ISIC_0000004,4,80.0,posterior torso,Unknown4,male


In [None]:
# Lesions that are represented by exactly one image, and how many of them there are.
temp_d = (
    dataset
    .groupby('lesion_id')
    .count()
    .loc[lambda counts: counts['category'] == 1]
    .reset_index()
)
len(temp_d)

8872

Βλέπουμε ότι 8872 lesions απεικονίζονται από μία μόνο φωτογραφία.

Επιπλέον, είδαμε από το προηγούμενο σημειωματάριο ότι έχουμε 13931 unique lesion_id. Επομένως, τα υπόλοιπα 13931 - 8872 = 5059 lesions αντιστοιχούν σε 25331 - 8872 = 16459 φωτογραφίες.

Για τον διαχωρισμό των δεδομένων, το split θα γίνει με την προϋπόθεση ότι όλες οι εικόνες ενός lesion_id θα βρίσκονται στο ίδιο υποσύνολο.


Πριν ακόμα από τον διαχωρισμό, θα κάνω από τώρα one-hot encoding στο dataset.

In [None]:
# Category distribution of the rows whose sex is unknown.
dataset.loc[dataset['sex'] == 'Unknown', 'category'].value_counts()

5    271
4     81
2     23
1      6
0      3
Name: category, dtype: int64

In [None]:
# Category distribution of the rows whose anatomical site is unknown.
dataset.loc[dataset['anatom_site_general'] == 'Unknown', 'category'].value_counts()

5    2094
2     275
4     127
1      72
7      31
0      22
6       6
3       4
Name: category, dtype: int64

In [None]:
# Category distribution of the rows whose age carries the -5 placeholder.
dataset.loc[dataset['age_approx'] == -5, 'category'].value_counts()

5    317
4     85
2     26
1      6
0      3
Name: category, dtype: int64

In [None]:
#one-hot encoding
def create_ohe(current_column):
  """Return a one-hot vector (list of 0/1) for every value of `current_column`.

  The vector positions follow the sorted distinct values of the column,
  with 'Unknown' excluded; an 'Unknown' entry is encoded as all zeros.

  Args:
    current_column: pandas Series holding the categorical values.

  Returns:
    A list with one encoding list per row, in row order.
  """
  categories = [c for c in sorted(current_column.unique().tolist()) if c != 'Unknown']
  width = len(categories)

  def encode(value):
    vector = [0] * width
    if value != 'Unknown':
      vector[categories.index(value)] = 1
    return vector

  return [encode(value) for value in current_column]

# Preview only: encode a copy so that `dataset` keeps its raw string values.
# The per-split frequency imputation further down looks for the literal
# 'Unknown' in these columns, so they must still hold strings at that point.
dataset_ohe_preview = dataset.copy()
for i in ['anatom_site_general', 'sex']:#the categorical vars
  dataset_ohe_preview[i] = create_ohe(dataset_ohe_preview[i])

dataset_ohe_preview.head()

Unnamed: 0,image,category,age_approx,anatom_site_general,lesion_id,sex
0,ISIC_0000000,5,55.0,"[1, 0, 0, 0, 0, 0, 0, 0]",Unknown0,"[1, 0]"
1,ISIC_0000001,5,30.0,"[1, 0, 0, 0, 0, 0, 0, 0]",Unknown1,"[1, 0]"
2,ISIC_0000002,4,60.0,"[0, 0, 0, 0, 0, 0, 0, 1]",Unknown2,"[1, 0]"
3,ISIC_0000003,5,30.0,"[0, 0, 0, 0, 0, 0, 0, 1]",Unknown3,"[0, 1]"
4,ISIC_0000004,4,80.0,"[0, 0, 0, 0, 0, 0, 1, 0]",Unknown4,"[0, 1]"


In [None]:
# One row per lesion: the first image found for each lesion_id is kept.
dataset_wdp = dataset.drop_duplicates(subset='lesion_id')
print(dataset_wdp.shape[0])
dataset_wdp.head()

13931


Unnamed: 0,image,category,age_approx,anatom_site_general,lesion_id,sex
0,ISIC_0000000,5,55.0,anterior torso,Unknown0,female
1,ISIC_0000001,5,30.0,anterior torso,Unknown1,female
2,ISIC_0000002,4,60.0,upper extremity,Unknown2,female
3,ISIC_0000003,5,30.0,upper extremity,Unknown3,male
4,ISIC_0000004,4,80.0,posterior torso,Unknown4,male


Δημιουργώ τα υποσύνολα train/val/test dataset κάνοντας split με stratify = 'category', καθώς θα θέλαμε το distribution του test dataset να ακολουθεί αυτό των συνολικών δεδομένων όσον αφορά το category των εικόνων. Ο διαχωρισμός γίνεται στο dataset με τα unique lesions.

In [None]:
def _ordered_by_image(frame):
  """Return `frame` sorted by image name with a fresh 0..n-1 index."""
  return frame.sort_values(by='image').reset_index(drop=True)

# First cut: 80% of the unique lesions for training, 20% held out, stratified by category.
train_dataset, temp_test_dataset = train_test_split(
    dataset_wdp,
    test_size=0.2,
    random_state=9,
    stratify=dataset_wdp['category'],
)
train_dataset = _ordered_by_image(train_dataset)
temp_test_dataset = _ordered_by_image(temp_test_dataset)
print("Length of train dataset:", len(train_dataset))
print("Length of temp test dataset:", len(temp_test_dataset))

# Second cut: the held-out lesions are halved into validation and test, again stratified.
val_dataset, test_dataset = train_test_split(
    temp_test_dataset,
    test_size=0.5,
    random_state=9,
    stratify=temp_test_dataset['category'],
)
val_dataset = _ordered_by_image(val_dataset)
test_dataset = _ordered_by_image(test_dataset)
print("Length of validation dataset:", len(val_dataset))
print("Length of test dataset:", len(test_dataset))

Length of train dataset: 11144
Length of temp test dataset: 2787
Length of validation dataset: 1393
Length of test dataset: 1394


Στη συνέχεια, διαχωρίζω τα συνολικά δεδομένα σε train/val/test τοποθετώντας εικόνες από το ίδιο lesion στο ίδιο split.

In [None]:
def _images_of(lesion_frame):
  """Return every row of `dataset` whose lesion_id appears in `lesion_frame`, re-indexed from 0."""
  belongs_to_split = dataset['lesion_id'].isin(lesion_frame['lesion_id'])
  return dataset[belongs_to_split].reset_index(drop=True)

# Expand each lesion-level split back to all of the images of its lesions.
final_train_dataset = _images_of(train_dataset)
final_val_dataset = _images_of(val_dataset)
final_test_dataset = _images_of(test_dataset)
print("Length of final train dataset:", len(final_train_dataset))
print("Length of final val dataset:", len(final_val_dataset))
print("Length of final test dataset:", len(final_test_dataset))

Length of final train dataset: 20134
Length of final val dataset: 2625
Length of final test dataset: 2572


Ένα test για να επιβεβαιώσουμε ότι όντως τα ίδια lesions είναι στο ίδιο fold.

In [None]:
# Printed checks: the two inner joins should be empty (0 rows), followed by
# the number of distinct lesions in each split.
print(len(final_train_dataset.merge(final_test_dataset,how = 'inner', on = 'lesion_id')))
print(len(final_train_dataset.merge(final_val_dataset,how = 'inner', on = 'lesion_id')))
print(final_train_dataset['lesion_id'].nunique())
print(final_val_dataset['lesion_id'].nunique())
print(final_test_dataset['lesion_id'].nunique())

# Hard guarantees, so a leaking split stops the notebook instead of only
# printing a non-zero number. The val/test pair is covered here as well.
assert not final_train_dataset['lesion_id'].isin(final_test_dataset['lesion_id']).any(), "train/test share a lesion_id"
assert not final_train_dataset['lesion_id'].isin(final_val_dataset['lesion_id']).any(), "train/val share a lesion_id"
assert not final_val_dataset['lesion_id'].isin(final_test_dataset['lesion_id']).any(), "val/test share a lesion_id"
# Every image must have landed in exactly one of the three splits.
assert len(final_train_dataset) + len(final_val_dataset) + len(final_test_dataset) == len(dataset), "splits do not cover the dataset"

0
0
11144
1393
1394


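Υλοποιώ και το standard scaling της μεταβλητής age_approx, καθώς και την αντικατάσταση των 'Unknown' τιμών και το one-hot encoding των κατηγορικών μεταβλητών (sex, anatom_site_general). Ο scaler, οι συχνότητες και οι κατηγορίες υπολογίζονται μόνο από το train dataset.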

In [None]:
from sklearn.preprocessing import StandardScaler

def _impute_unknown_by_lesion(frame, column, choices, weights, rng):
  """Overwrite the 'Unknown' entries of `column` in `frame` in place, with one random draw per lesion_id."""
  unknown_mask = frame[column] == 'Unknown'
  if not unknown_mask.any():
    return

  # Draw once per lesion so that every image of a lesion receives the same value.
  lesion_ids = frame.loc[unknown_mask, 'lesion_id']
  distinct_lesions = lesion_ids.unique()
  draws = rng.choices(choices, weights, k=len(distinct_lesions))
  replacement_by_lesion = dict(zip(distinct_lesions, draws))

  # to_numpy() makes the assignment positional, so a non-unique index cannot misalign it.
  frame.loc[unknown_mask, column] = lesion_ids.map(replacement_by_lesion).to_numpy()

def frequency_imputation(train_dataset, val_dataset, test_dataset, column, rng=None):
  """Replace 'Unknown' values of `column` by values sampled from the training distribution.

  The sampling weights are the frequencies of the known values of `column`
  in `train_dataset` only; the same weights are then used for all three
  frames. All 'Unknown' rows that share a lesion_id get the same replacement.
  The three frames are modified in place and nothing is returned.

  Args:
    train_dataset: DataFrame with 'lesion_id' and `column`; source of the frequencies.
    val_dataset: DataFrame with 'lesion_id' and `column`.
    test_dataset: DataFrame with 'lesion_id' and `column`.
    column: name of the categorical column to impute.
    rng: optional object exposing `choices` (e.g. `random.Random(seed)`) for
      reproducible draws; defaults to the global `random` module.
  """
  rng = random if rng is None else rng

  # Relative frequencies are enough: choices() normalises the weights itself.
  known_values = train_dataset.loc[train_dataset[column] != 'Unknown', column]
  frequencies = known_values.value_counts()
  replacement_choices = frequencies.index.tolist()
  replacement_weights = frequencies.tolist()

  for frame in (train_dataset, val_dataset, test_dataset):
    _impute_unknown_by_lesion(frame, column, replacement_choices, replacement_weights, rng)

def one_hot_encode(train_dataset, val_dataset, test_dataset,column):
  """One-hot encode `column` in place in the three frames, using the training vocabulary.

  The vector positions follow the sorted distinct values of `column` in
  `train_dataset`. A validation/test value that does not occur in the
  training data is encoded as an all-zero vector. Each cell of `column`
  ends up holding a list of 0/1; nothing is returned.
  """
  unique_vals = sorted(train_dataset[column].unique().tolist())
  width = len(unique_vals)

  def encode_seen(value):
    # Training values are always part of the vocabulary.
    vector = [0] * width
    vector[unique_vals.index(value)] = 1
    return vector

  def encode_maybe_unseen(value):
    # Values outside the training vocabulary become all zeros.
    return encode_seen(value) if value in unique_vals else [0] * width

  train_dataset[column] = [encode_seen(value) for value in train_dataset[column]]

  for held_out in (val_dataset, test_dataset):
    held_out[column] = [encode_maybe_unseen(value) for value in held_out[column]]

#numerical encodings
# Fit the scaler on the known training ages only; -5 is the placeholder for an unknown age.
fit_data = np.array(final_train_dataset.loc[final_train_dataset['age_approx'] != -5]['age_approx']).reshape(-1,1)

scaler = StandardScaler()
scaler.fit(fit_data)

scaler_mean = scaler.mean_

#replace -5(Unknown) values with mean
final_train_dataset.loc[final_train_dataset['age_approx'] == -5, ['age_approx']] = scaler_mean
final_val_dataset.loc[final_val_dataset['age_approx'] == -5, ['age_approx']] = scaler_mean
final_test_dataset.loc[final_test_dataset['age_approx'] == -5, ['age_approx']] = scaler_mean

# The training-set scaler is applied to all three splits; .tolist() on the
# (n, 1) result stores every age as a one-element list.
final_train_dataset['age_approx'] = scaler.transform(np.array(final_train_dataset['age_approx']).reshape(-1,1)).tolist()
final_val_dataset['age_approx'] = scaler.transform(np.array(final_val_dataset['age_approx']).reshape(-1,1)).tolist()
final_test_dataset['age_approx'] = scaler.transform(np.array(final_test_dataset['age_approx']).reshape(-1,1)).tolist()

# The imputation draws from the stdlib `random` generator; seed it (same
# value as the random_state of the splits) so a re-run reproduces the files.
random.seed(9)
for cat_var in ['sex', 'anatom_site_general']:
  frequency_imputation(final_train_dataset, final_val_dataset, final_test_dataset, cat_var)
  one_hot_encode(final_train_dataset, final_val_dataset, final_test_dataset, cat_var)

final_train_dataset.head()

Unnamed: 0,image,category,age_approx,anatom_site_general,lesion_id,sex
0,ISIC_0000000,5,[0.051918806491940125],"[1, 0, 0, 0, 0, 0, 0, 0]",Unknown0,"[1, 0]"
1,ISIC_0000002,4,[0.32994380437566795],"[0, 0, 0, 0, 0, 0, 0, 1]",Unknown2,"[1, 0]"
2,ISIC_0000003,5,[-1.3382061829266991],"[0, 0, 0, 0, 0, 0, 0, 1]",Unknown3,"[0, 1]"
3,ISIC_0000006,5,[-1.616231180810427],"[0, 0, 0, 0, 0, 0, 1, 0]",Unknown5,"[1, 0]"
4,ISIC_0000007,5,[-1.616231180810427],"[0, 0, 0, 0, 0, 0, 1, 0]",Unknown6,"[1, 0]"


In [None]:
# Each `sex` cell is a one-hot list at this point and lists are unhashable, so
# counting them directly depends on the pandas version. Counting the string
# form is always valid and prints the same labels ("[0, 1]", "[1, 0]").
final_test_dataset.sex.astype(str).value_counts()

[0, 1]    1325
[1, 0]    1247
Name: sex, dtype: int64

In [None]:
# Persist the three processed metadata tables to Drive (index column omitted).
# NOTE(review): the doubled '.csv.csv' suffix looks unintentional — confirm
# what the consuming notebooks expect before renaming the files.
for split_frame, csv_path in (
    (final_train_dataset, 'drive/MyDrive/train_dataset_meta_derm.csv.csv'),
    (final_val_dataset, 'drive/MyDrive/val_dataset_meta_derm.csv.csv'),
    (final_test_dataset, 'drive/MyDrive/test_dataset_meta_derm.csv.csv'),
):
  split_frame.to_csv(csv_path, index=False)

Κατέβασμα των δεδομένων.

In [None]:
urllib.request.urlretrieve("https://isic-challenge-data.s3.amazonaws.com/2019/ISIC_2019_Training_Input.zip", "imgs.zip")

with zipfile.ZipFile('imgs.zip', 'r') as zip_ref:
    zip_ref.extractall('img_folder')

!ls img_folder/ISIC_2019_Training_Input | wc -l

25333


Δημιουργία φακέλων και μεταφορά των εικόνων στους αντίστοιχους φακέλους (αναλόγως το split στο οποίο ανήκει η εικόνα και το category της).

In [None]:
# Numeric category code -> name of the class folder.
categories_dict = {0:'AK',1:'BCC',2:'BKL',3:'DF',4:'MEL',5:'NV',6:'SCC',7:'VASC',8:'UNK'}

def store_to_corresponding_folder(src_path_prefix,dest_path_prefix,current_dataset):
  """Copy the images of a split into `dest_path_prefix/<category name>/`.

  If `dest_path_prefix` already exists, a message is printed and nothing
  else happens. Otherwise the folder and one sub-folder per entry of
  `categories_dict` are created, every image listed in `current_dataset`
  is copied into the sub-folder of its category, and the number of .jpg
  files per sub-folder is printed.

  Args:
    src_path_prefix: folder that contains the source '<image>.jpg' files.
    dest_path_prefix: folder to create for this split.
    current_dataset: DataFrame with an 'image' column (file name without
      extension) and a 'category' column (key of `categories_dict`).
  """
  # An existing destination is left untouched.
  if os.path.exists(dest_path_prefix):
    print("Folder %s already exists..." %dest_path_prefix)
    return

  labels = list(categories_dict.values())

  # Split folder first, then one sub-folder per category name.
  os.makedirs(dest_path_prefix)
  for label in labels:
    os.makedirs(os.path.join(dest_path_prefix,label))

  # Copy each image next to the other images of its category.
  for _, record in current_dataset.iterrows():
    label_dir = os.path.join(dest_path_prefix,categories_dict[record['category']])
    shutil.copy(os.path.join(src_path_prefix,record['image'] + '.jpg'),label_dir)

  # Report how many .jpg files every category folder now holds.
  print("For " + dest_path_prefix + ': ')
  for label in labels:
    files_no = len(glob.glob(os.path.join(dest_path_prefix,label) + '/*.jpg'))
    print(label + ': ' + str(files_no))

# Sort the extracted images of every split into per-category folders
# (normally these would be stored on Drive as zip archives).
for dest_folder, split_frame in (
    ('train_imgs', final_train_dataset),
    ('val_imgs', final_val_dataset),
    ('test_imgs', final_test_dataset),
):
  store_to_corresponding_folder('img_folder/ISIC_2019_Training_Input', dest_folder, split_frame)

For train_imgs: 
AK: 700
BCC: 2663
BKL: 2085
DF: 179
MEL: 3598
NV: 10217
SCC: 492
VASC: 200
UNK: 0
For val_imgs: 
AK: 87
BCC: 341
BKL: 260
DF: 31
MEL: 487
NV: 1319
SCC: 74
VASC: 26
UNK: 0
For test_imgs: 
AK: 80
BCC: 319
BKL: 279
DF: 29
MEL: 437
NV: 1339
SCC: 62
VASC: 27
UNK: 0


In [None]:
# Images per category in train, validation and test (in that order), to compare with the folder counts.
for split_frame in (final_train_dataset, final_val_dataset, final_test_dataset):
  print(split_frame['category'].value_counts())

5    10217
4     3598
1     2663
2     2085
0      700
6      492
7      200
3      179
Name: category, dtype: int64
5    1319
4     487
1     341
2     260
0      87
6      74
3      31
7      26
Name: category, dtype: int64
5    1339
4     437
1     319
2     279
0      80
6      62
3      29
7      27
Name: category, dtype: int64


Αποθήκευση στο drive.

In [None]:
prefix = 'drive/MyDrive/'

# One zip archive per (split, category) folder, written to Drive as
# '<split>_<category>.zip'. 'UNK' is not in the list, so it is not archived.
for split_dir in ['train_imgs', 'val_imgs','test_imgs']:
  for label in ['MEL','NV','BCC','AK','BKL','DF','VASC','SCC']:
    shutil.make_archive(prefix + split_dir + '_' + label, 'zip', split_dir + '/' + label)