# Week 4. Create directories and datasets
Cognitive Systems for Health Technology Applications<br>
Sakari Lukkarinen & Juha Kopu, 4.2.2018<br>
[Helsinki Metropolia University of Applied Sciences](http://metropolia.fi/en)

This is a helper script that:
1. Checks how many samples are in the original downloaded dataset
2. Defines the directory structure (path names) for the datasets
3. Creates the directories
4. Splits the data into train, validation, and test sets
5. Copies the data into the directories


In [None]:
import os, shutil

## 1. Check the downloaded data

In [None]:
# List all filenames in the master dataset and count how many samples there are.
# The path is assembled with os.path.join so that it resolves on every
# operating system, not only on Windows.
original_dir = os.path.join('..', '..', 'retinopathy-dataset-master')

class1 = 'nosymptoms'
original_nosymptoms_dir = os.path.join(original_dir, class1)
# os.listdir returns the names in arbitrary order; sort for a stable listing
nosymptoms_fnames = sorted(os.listdir(original_nosymptoms_dir))

class2 = 'symptoms'
original_symptoms_dir = os.path.join(original_dir, class2)
symptoms_fnames = sorted(os.listdir(original_symptoms_dir))

# Cell output: (number of symptom samples, number of no-symptom samples)
len(symptoms_fnames), len(nosymptoms_fnames)

## 2. Directory structure

In [None]:
# Base directory is where the datasets will be created.
# It is assembled with os.path.join so the relative path works on every
# operating system; a literal with backslashes is only a path on Windows.
base_dir = os.path.join('..', '..', 'dataset2')

# For training set
train_dir = os.path.join(base_dir, 'train')
train_nosymptoms_dir = os.path.join(train_dir, 'nosymptoms')
train_symptoms_dir = os.path.join(train_dir, 'symptoms')

# For validation set
validation_dir = os.path.join(base_dir, 'validation')
validation_nosymptoms_dir = os.path.join(validation_dir, 'nosymptoms')
validation_symptoms_dir = os.path.join(validation_dir, 'symptoms')

# For test set
test_dir = os.path.join(base_dir, 'test')
test_nosymptoms_dir = os.path.join(test_dir, 'nosymptoms')
test_symptoms_dir = os.path.join(test_dir, 'symptoms')

## 3. Create directories
Some hints: https://github.com/geekcomputers/Python
You might also be interested in: [How to delete the contents of a folder in Python?](https://stackoverflow.com/questions/185936/how-to-delete-the-contents-of-a-folder-in-python)

In [None]:
# Make sure the complete dataset folder tree exists.
# os.makedirs with exist_ok=True also creates missing parent folders and skips
# folders that are already present, so a tree that was only partly created by
# an earlier, interrupted run is completed instead of being left broken.
if os.path.exists(base_dir):
    print(base_dir, 'already exists!')
else:
    print('Creating dataset folders to:', base_dir)

# Creating the six class folders implicitly creates base_dir and the
# train/validation/test folders above them.
for leaf_dir in (
    train_nosymptoms_dir,
    train_symptoms_dir,
    validation_nosymptoms_dir,
    validation_symptoms_dir,
    test_nosymptoms_dir,
    test_symptoms_dir,
):
    os.makedirs(leaf_dir, exist_ok=True)

## 4. Split the data filenames into train, validation and test sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Disease (symptom) cases split

# The input is sorted and the random seed is fixed so that re-running the
# notebook gives the same split. Otherwise a new split would be copied on top
# of the files already in the dataset folders and mix the three sets.

# Take 20 % out for testing
train_symptoms_fnames, test_symptoms_fnames = train_test_split(
    sorted(symptoms_fnames), test_size=0.2, random_state=42)

# From the remaining 80% take 0.25 (=0.8*0.25 = 20% of total) out for validation
train_symptoms_fnames, validation_symptoms_fnames = train_test_split(
    train_symptoms_fnames, test_size=0.25, random_state=42)

# Cell output: sizes of the (train, validation, test) sets
len(train_symptoms_fnames), len(validation_symptoms_fnames), len(test_symptoms_fnames)
# For debugging purposes, remove the comment marks.
# print(train_symptoms_fnames)
# print(validation_symptoms_fnames)
# print(test_symptoms_fnames)

In [None]:
# Healthy (nosymptom) cases split

# The input is sorted and the random seed is fixed so that re-running the
# notebook gives the same split. Otherwise a new split would be copied on top
# of the files already in the dataset folders and mix the three sets.

# Take 20 % out for testing
train_nosymptoms_fnames, test_nosymptoms_fnames = train_test_split(
    sorted(nosymptoms_fnames), test_size=0.2, random_state=42)

# From the remaining 80% take 0.25 (=0.8*0.25 = 20% of total) out for validation
train_nosymptoms_fnames, validation_nosymptoms_fnames = train_test_split(
    train_nosymptoms_fnames, test_size=0.25, random_state=42)

# Cell output: sizes of the (train, validation, test) sets
len(train_nosymptoms_fnames), len(validation_nosymptoms_fnames), len(test_nosymptoms_fnames)
# For debugging purposes, remove the comment marks.
# print(train_nosymptoms_fnames)
# print(validation_nosymptoms_fnames)
# print(test_nosymptoms_fnames)

## 5. Copy data into the directories

In [None]:
%%timeit

# Copy the original files into the dataset folders

# Training set
# Disease 
for fname in train_symptoms_fnames:
    src = os.path.join(original_symptoms_dir, fname)
    dst = os.path.join(train_symptoms_dir, fname)
    shutil.copyfile(src, dst)
# Healthy 
for fname in train_nosymptoms_fnames:
    src = os.path.join(original_nosymptoms_dir, fname)
    dst = os.path.join(train_nosymptoms_dir, fname)
    shutil.copyfile(src, dst)

# Validation set
# Disease 
for fname in validation_symptoms_fnames:
    src = os.path.join(original_symptoms_dir, fname)
    dst = os.path.join(validation_symptoms_dir, fname)
    shutil.copyfile(src, dst)
# Healthy
for fname in validation_nosymptoms_fnames:
    src = os.path.join(original_nosymptoms_dir, fname)
    dst = os.path.join(validation_nosymptoms_dir, fname)
    shutil.copyfile(src, dst)

# Test set
# Disease
for fname in test_symptoms_fnames:
    src = os.path.join(original_symptoms_dir, fname)
    dst = os.path.join(test_symptoms_dir, fname)
    shutil.copyfile(src, dst)
# Healthy
for fname in test_nosymptoms_fnames:
    src = os.path.join(original_nosymptoms_dir, fname)
    dst = os.path.join(test_nosymptoms_dir, fname)
    shutil.copyfile(src, dst)

