In [9]:
"""
Robert Ruzzo
Utility.ipynb

The purpose of this notebook is to read the file containing the label data and create a stratified training and 
validation image set, which is copied into newly created directories. This could have been translated directly into 
a Python file, but I found it unnecessary since it was essentially a one-time use (in theory).
"""
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import shutil

In [10]:
# Load the label table; the later cells rely on its 'id' and 'label' columns.
df_data = pd.read_csv('/floyd/input/cancer/train_labels.csv')

# Build one DataFrame per class by selecting on the 'label' column.
# rows labelled 0
df_0 = df_data.loc[df_data['label'].eq(0)]
# rows labelled 1
df_1 = df_data.loc[df_data['label'].eq(1)]

# Last expression of the cell: number of rows per label value.
df_data['label'].value_counts()


0    130908
1     89117
Name: label, dtype: int64

In [11]:
# Hold out 10% of the rows as a validation split (fixed seed for repeatability).
# stratify=y makes both splits keep the class proportions found in 'label';
# it does not equalise the class counts.
y = df_data['label']

df_train, df_val = train_test_split(
    df_data,
    stratify=y,
    test_size=0.10,
    random_state=101,
)

# Show the (rows, columns) shape of each split, one per line.
print(df_train.shape, df_val.shape, sep='\n')

(198022, 2)
(22003, 2)


In [None]:
# Root directory the per-class folders are created in. The copy cell further
# down also reads the raw .tif files from this same path.
base_dir = '/floyd/home/data/train'

# Resulting layout (folder names must match the ones used when copying):
#
# base_dir/
#     not_a_tumor/   <- images whose label is 0
#     tumor/         <- images whose label is 1

no_tumor_tissue = os.path.join(base_dir, 'not_a_tumor')
has_tumor_tissue = os.path.join(base_dir, 'tumor')

# os.makedirs(..., exist_ok=True) keeps this cell re-runnable: a plain
# os.mkdir raises FileExistsError as soon as the folders already exist
# (e.g. on Restart & Run All after a previous run).
for class_dir in (no_tumor_tissue, has_tumor_tissue):
    os.makedirs(class_dir, exist_ok=True)

In [13]:
# Use the image id as the index of df_data so rows can be looked up by id
# (df_data.loc[image_id, 'label']).
# The guard makes the cell safe to run more than once: after the first run
# 'id' is the index rather than a column, and a second set_index('id') would
# raise KeyError.
if 'id' in df_data.columns:
    df_data.set_index('id', inplace=True)

In [15]:
# Get a list of train and val image ids
train_list = list(df_train['id'])
val_list = list(df_val['id'])

# Directory the original .tif files are read from.
image_src_dir = '/floyd/home/data/train'

# Maps the numeric label from the CSV to the class folder name.
# These must match the folder names created under base_dir.
LABEL_TO_FOLDER = {0: 'not_a_tumor', 1: 'tumor'}


def copy_images_to_class_folders(split_df, src_dir, dst_root):
    """Copy every image listed in ``split_df`` into its class sub-folder.

    The label is read from the same row as the id, so no lookup in another
    frame (and no particular index on it) is needed.

    Args:
        split_df: DataFrame with an 'id' column (file name without the
            '.tif' extension) and a 'label' column (0 or 1).
        src_dir: Directory containing the '<id>.tif' source files.
        dst_root: Directory containing the 'not_a_tumor' and 'tumor' folders.

    Raises:
        ValueError: If a row carries a label other than 0 or 1. The previous
            two independent ``if`` statements silently reused the label of
            the preceding image in that case.
    """
    for image_id, target in zip(split_df['id'], split_df['label']):
        folder = LABEL_TO_FOLDER.get(target)
        if folder is None:
            raise ValueError(
                'Unexpected label {!r} for image {!r}'.format(target, image_id)
            )

        # the id in the csv file does not have the .tif extension, so add it here
        fname = image_id + '.tif'
        src = os.path.join(src_dir, fname)
        dst = os.path.join(dst_root, folder, fname)
        shutil.copyfile(src, dst)


# NOTE(review): both splits are copied into the same class folders under
# base_dir, so the train/validation separation produced by train_test_split
# is not reflected on disk. Kept as-is to preserve the existing layout --
# confirm this is intended.

# Transfer the train images
copy_images_to_class_folders(df_train, image_src_dir, base_dir)

# Transfer the val images
copy_images_to_class_folders(df_val, image_src_dir, base_dir)

In [16]:
# Report the number of entries in each class folder under base_dir
# (first line: not_a_tumor, second line: tumor).
for class_suffix in ('/not_a_tumor', '/tumor'):
    entries = os.listdir(base_dir + class_suffix)
    print(len(entries))

130908
89117
