# Data Collection

This assumes that you have registered an account with Kaggle, and have created a json key to interact with Kaggle's API.  

In [1]:
import os
import shutil
import pandas as pd
import numpy as np



Set current directory

In [2]:
# from env import *
# os.chdir(os.environ['WORK_DIR'])

os.getcwd()

'/home/pxp/Desktop/skin/jupyter_notebooks'

In [3]:
meta_data = pd.read_csv("../inputs/source/HAM10000_metadata.csv")
meta_data.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


# Data Preparation

## Split train validation test set

Create directories

In [4]:
image_dir_path = '../inputs/source/Skin Cancer/Skin Cancer' 
train_dir = '../inputs/work/train'
test_dir = '../inputs/work/test'
val_dir = '../inputs/work/validate'

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

cancer_types = meta_data["dx"].unique()
for cancer_type in cancer_types:
    os.makedirs(os.path.join(train_dir, cancer_type), exist_ok=True)
    os.makedirs(os.path.join(test_dir, cancer_type), exist_ok=True)
    os.makedirs(os.path.join(val_dir, cancer_type), exist_ok=True)

Remove existing files

In [5]:
for dpath in [train_dir, test_dir, val_dir]:
    for root, dirs, files in os.walk(dpath):
        for file in files:
            os.remove(os.path.join(root, file))

Copy training files to target directories.

In [6]:
classes = meta_data["dx"].unique()
for target in classes:
    subs = meta_data[meta_data['dx']==target]
    subs = subs.sample(frac=1).reset_index(drop=True)  # randomize image order
    subs = subs[:300]  # limit the maximum number of images per class
    train_count = int(subs.shape[0]*.7)
    test_count = int(subs.shape[0]*.2)
    for i in range(subs.shape[0]):
        image_path = os.path.join(image_dir_path, subs["image_id"].iloc[i]+".jpg")
        if i < train_count:
            target = os.path.join(train_dir, subs["dx"].iloc[i])
        elif i < train_count + test_count:
            target = os.path.join(test_dir, subs["dx"].iloc[i])
        else:
            target = os.path.join(val_dir, subs["dx"].iloc[i])
        shutil.copy(image_path, target)

List training files by class.

In [7]:
paths = [train_dir, test_dir, val_dir]
names = ['TRAIN', 'TEST', 'VALIDATE']
nums = pd.DataFrame(columns=names, index=cancer_types)

for name in names: 
    for cancer_type in cancer_types:
        nums.loc[cancer_type, name] = len(os.listdir(os.path.join(paths[names.index(name)], cancer_type)))

# make a copy of nums 
tl = nums.copy()
tl.loc['TOTAL'] = nums.sum()
tl['TOTAL'] = nums.sum(axis=1)

print('Files present:')
tl

Files present:


Unnamed: 0,TRAIN,TEST,VALIDATE,TOTAL
bkl,210,60,30,300.0
nv,210,60,30,300.0
df,80,23,12,115.0
mel,210,60,30,300.0
vasc,99,28,15,142.0
bcc,210,60,30,300.0
akiec,210,60,30,300.0
TOTAL,1229,351,177,


## Generate Augmented images for lacking classes

Define a function to save arrays to file.

In [8]:
from PIL import Image

def save_array_as_image(array, filename):
    array = array.astype(np.uint8)
    img = Image.fromarray(array)
    img.save(filename)

Generate Files

In [9]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

target = nums['TRAIN'].max()
need = target - nums['TRAIN']

need = need[need > 0]
names = need.index.tolist()
need = need.tolist()

aug_dir = train_dir

datagen = ImageDataGenerator(
    rotation_range=180, 
    width_shift_range=0.10, 
    height_shift_range=0.10, 
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True, 
    vertical_flip=True, 
    fill_mode='nearest', 
    # rescale=1./255
)

counter = 0
for i in range(len(need)):
    if need[i] == 0:
        continue

    train_batches = datagen.flow_from_directory(
        train_dir,
        target_size=(224,224),
        batch_size=16,
        classes=[names[i]]
    )
    
    while need[i] > 0:
        need[i] -= 16
        out = next(train_batches)
        for j in range(len(out[0])):
            path = os.path.join(aug_dir, names[i])
            os.makedirs(path, exist_ok=True)
            save_array_as_image(out[0][j], os.path.join(path, "aug-" + names[i] + str(counter)+".jpg"))
            counter += 1


2024-05-09 02:31:34.845542: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found 80 images belonging to 1 classes.
Found 99 images belonging to 1 classes.


In [10]:
paths = [train_dir, test_dir, val_dir]
names = ['TRAIN', 'TEST', 'VALIDATE']
nums = pd.DataFrame(columns=names, index=cancer_types)

for name in names: 
    for cancer_type in cancer_types:
        nums.loc[cancer_type, name] = len(os.listdir(os.path.join(paths[names.index(name)], cancer_type)))

nums['TOTAL'] = nums.sum(axis=1)
nums.loc['TOTAL'] = nums.sum()

print('Files present:')
nums

Files present:


Unnamed: 0,TRAIN,TEST,VALIDATE,TOTAL
bkl,210,60,30,300
nv,210,60,30,300
df,224,23,12,259
mel,210,60,30,300
vasc,198,28,15,241
bcc,210,60,30,300
akiec,210,60,30,300
TOTAL,1472,351,177,2000
