# Data Collection

This notebook assumes that you have registered a Kaggle account and created a JSON API key for interacting with Kaggle's API.

In [23]:
import os
import shutil
import pandas as pd
import numpy as np

Set the current working directory.

In [24]:
# Switch to the project folder under the user's home directory; the
# relative 'inputs/...' paths in this notebook are resolved from here.
project_root = os.path.join(os.path.expanduser('~'), 'Desktop/skin')
os.chdir(project_root)

current_dir = os.getcwd()
current_dir

'/home/pxp/Desktop/skin'

In [25]:
# Load the HAM10000 metadata table and preview its first rows.
metadata_csv = "inputs/skin cancer/HAM10000_metadata.csv"
meta_data = pd.read_csv(metadata_csv)
meta_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


# Data Preparation

## Split into train, validation, and test sets

Create directories

In [26]:
# Source folder holding the original images, and one working folder per split.
image_dir_path = 'inputs/skin cancer/Skin Cancer/Skin Cancer'
train_dir = 'inputs/work/train'
test_dir = 'inputs/work/test'
val_dir = 'inputs/work/validate'

# Distinct values of the "dx" column; each gets its own sub-folder per split.
cancer_types = meta_data["dx"].unique()

for split_dir in (train_dir, test_dir, val_dir):
    # Create the split folder itself, so it exists even with no classes.
    os.makedirs(split_dir, exist_ok=True)
    for cancer_type in cancer_types:
        os.makedirs(os.path.join(split_dir, cancer_type), exist_ok=True)

Remove any files already present in the split directories, so that re-running the notebook starts from empty folders.

In [27]:
# Gather every file found anywhere beneath the three split folders ...
stale_files = [
    os.path.join(folder, filename)
    for split_dir in (train_dir, test_dir, val_dir)
    for folder, _subdirs, filenames in os.walk(split_dir)
    for filename in filenames
]

# ... and delete them; the directory structure itself is left in place.
for stale_file in stale_files:
    os.remove(stale_file)

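Shuffle the images of each class, keep at most 500 per class, and copy them into the train (70%), test (20%), and validation (remainder) directories.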

In [28]:
# Split parameters, named here so they are easy to find and tune.
SEED = 42             # fixed seed: the same split on every Restart & Run All
MAX_PER_CLASS = 500   # limit the maximum number of images per class
TRAIN_FRAC = .7       # share of each class copied to the training set
TEST_FRAC = .2        # share copied to the test set; the rest is validation

# NOTE(review): the split is made per image, but the metadata shows several
# image_ids sharing one lesion_id, so images of the same lesion can end up in
# different splits. Consider splitting by lesion_id if that leakage matters.

classes = meta_data["dx"].unique()
for dx_class in classes:
    subs = meta_data[meta_data['dx'] == dx_class]
    # Shuffle reproducibly, then cap the class size.
    subs = subs.sample(frac=1, random_state=SEED).reset_index(drop=True)
    subs = subs[:MAX_PER_CLASS]
    train_count = int(subs.shape[0] * TRAIN_FRAC)
    test_count = int(subs.shape[0] * TEST_FRAC)
    for i, image_id in enumerate(subs["image_id"]):
        image_path = os.path.join(image_dir_path, image_id + ".jpg")
        # The position in the shuffled order decides the split.
        if i < train_count:
            split_dir = train_dir
        elif i < train_count + test_count:
            split_dir = test_dir
        else:
            split_dir = val_dir
        # Every row of `subs` has dx == dx_class, so this is the class folder.
        target = os.path.join(split_dir, dx_class)
        shutil.copy(image_path, target)

Count the files in each split (train, test, validation) by class.

In [29]:
# Split folders and their matching column labels, in the same order.
paths = [train_dir, test_dir, val_dir]
names = ['TRAIN', 'TEST', 'VALIDATE']

# One row per class, one column per split; cells are filled in below.
nums = pd.DataFrame(columns=names, index=cancer_types)

for name, split_path in zip(names, paths):
    for cancer_type in cancer_types:
        class_dir = os.path.join(split_path, cancer_type)
        nums.loc[cancer_type, name] = len(os.listdir(class_dir))

# Add the column totals as a final row, then the row totals as a final column.
nums.loc['TOTAL'] = nums.sum()
nums['TOTAL'] = nums.sum(axis=1)

print('Files present:')
nums

Files present:


Unnamed: 0,TRAIN,TEST,VALIDATE,TOTAL
bkl,350,100,50,500
nv,350,100,50,500
df,80,23,12,115
mel,350,100,50,500
vasc,99,28,15,142
bcc,350,100,50,500
akiec,228,65,34,327
TOTAL,1807,516,261,2584
