In [None]:
"""
Cell For Papermill Parameters
"""

# Split fractions read by make_test_train: `test_size` of the matched files is
# held out of training, then `val_size` of that held-out set becomes the
# validation set and the remainder the test set (roughly 60/20/20 with the
# values below).
test_size = 0.4
val_size = 0.5
# random_state handed to both train_test_split calls in make_test_train
seed = 0

# Source folders (no trailing slash). NOTE: "pso" is a typo for "pos"; the name
# is kept because later cells refer to it. A later cell slices characters
# [42:46] out of each matched path, so changing the length of these paths
# shifts what that slice extracts.
pso_folder = "../../dataset/src/pos"
neg_folder = "../../dataset/src/neg"

# Destination folders for the training split
train_pos = "../../dataset/ds1/train/pos/"
train_neg = "../../dataset/ds1/train/neg/"

# Destination folders for the test split
test_pos =  "../../dataset/ds1/test/pos/"
test_neg =  "../../dataset/ds1/test/neg/"

# Destination folders for the validation split
val_neg = "../../dataset/ds1/val/neg/"
val_pos = "../../dataset/ds1/val/pos/"

# Substring a file name must contain to be selected (used inside glob patterns
# as '*' + file_prefix + '*', so it is not strictly a prefix)
file_prefix = 'final'

# Labels spreadsheet; not referenced by any cell visible in this part of the notebook
labels_file = "../data/TCIA Biopsy Data_2020-07-14.xlsx"

In [None]:
import os
import pandas as pd
import glob
from pathlib import Path
from shutil import copy
from sklearn.model_selection import train_test_split
import numpy

In [None]:
def clean_folder(folder,file_prefix):

    """
    clean_folder deletes every entry directly inside ``folder`` whose name
    contains ``file_prefix``.

    Deletion is best-effort: a file that cannot be removed is reported on
    stdout and the remaining files are still processed.

    :param folder: directory to clean, with or without a trailing separator
    :param file_prefix: substring a file name must contain to be deleted
    """

    # os.path.join keeps the pattern inside `folder` even when the path has no
    # trailing separator; plain string concatenation would instead match (and
    # delete) siblings whose names merely start with the folder's name.
    pattern = os.path.join(folder, '*' + file_prefix + '*')
    for file_path in glob.glob(pattern):
        try:
            os.remove(file_path)
        except OSError as err:
            # Only filesystem errors are expected here; anything else
            # (e.g. KeyboardInterrupt) should propagate.
            print("Error while deleting file : ", file_path, err)

In [None]:
def make_test_train(f1,f2,f3,f4,seed,file_prefix,file_loc_list,test_fraction=None,val_fraction=None):

    """
    make_test_train splits the files found under ``f3`` into train/test/val
    sets and copies each file into the matching destination folder.

    :param f1: destination folder for the train split
    :param f2: destination folder for the test split
    :param f3: source folder that is searched for files
    :param f4: destination folder for the validation split
    :param seed: random_state used for both splits
    :param file_prefix: substring a file name must contain to be selected
    :param file_loc_list: list that receives one ``[split, path]`` entry per
        copied file; it is extended in place and also returned
    :param test_fraction: fraction of all files held out of training;
        defaults to the notebook-level ``test_size``
    :param val_fraction: fraction of the held-out files used for validation
        (the rest become the test split); defaults to the notebook-level
        ``val_size``
    :return: file_loc_list
    :raises ValueError: if no file matches, or if a split would end up empty
    """

    # Fall back to the notebook parameters so existing calls behave as before.
    if test_fraction is None:
        test_fraction = test_size
    if val_fraction is None:
        val_fraction = val_size

    # NOTE(review): when f3 has no trailing separator the '**' is glued onto
    # its last path component (e.g. 'pos**'). glob only treats '**' as
    # recursive when it is a whole component, so the pattern then matches one
    # level below any directory whose name starts with that component --
    # confirm this is the intended search. The pattern is left unchanged
    # because a later cell slices fixed character offsets out of these paths.
    pattern = f3 + '**/*' + file_prefix + '*'
    fileList = glob.glob(pattern, recursive=True)
    if not fileList:
        raise ValueError("No files matching %r; nothing to split" % pattern)

    # glob returns names in a filesystem-dependent order; sorting first makes
    # the seeded split reproducible across machines and runs.
    data = numpy.array(sorted(fileList))
    x_train, x_holdout = train_test_split(data, test_size=test_fraction, random_state=seed)
    x_test, x_val = train_test_split(x_holdout, test_size=val_fraction, random_state=seed)

    for split_name, files, dest in (("train", x_train, f1),
                                    ("test", x_test, f2),
                                    ("val", x_val, f4)):
        # Without an existing directory shutil.copy would treat `dest` as a
        # file name (or fail), so make sure the folder is there first.
        os.makedirs(dest, exist_ok=True)
        for file in files:
            path = str(file)
            file_loc_list.append([split_name, path])
            copy(path, dest)

    return file_loc_list

# MRI: split the source images into train/test/val folders

In [None]:
# Clear the files matching file_prefix out of every MRI destination folder,
# positives first and then negatives.
for _split_dir in (train_pos, test_pos, val_pos,
                   train_neg, test_neg, val_neg):
    clean_folder(_split_dir, file_prefix)

In [None]:
# Split the positive source folder first, then the negative one; both calls
# append their [split, path] entries to the same list.
file_loc_list = []
for _split_folders in (
    (train_pos, test_pos, pso_folder, val_pos),
    (train_neg, test_neg, neg_folder, val_neg),
):
    file_loc_list = make_test_train(*_split_folders, seed, file_prefix, file_loc_list)

In [None]:
# Character positions of the patient id inside each source path.
# NOTE(review): this is a fixed offset that depends on the exact length of the
# MRI source folder path and on the file naming scheme (the US section uses
# [46:50] for its own paths) -- confirm whenever either changes.
PATIENT_ID_SLICE = slice(42, 46)

# Build the table in one go instead of appending row by row:
#   s1 = split name, s2 = source path, s3 = patient id cut out of the path.
df = pd.DataFrame(
    [[split, str(path), str(path)[PATIENT_ID_SLICE]] for split, path in file_loc_list],
    columns=['s1', 's2', 's3'],
)

df.to_csv("train_test_val.csv", sep=',', encoding='utf-8', index=False)

# US: copy each image into the split already assigned to the same patient's MRI data

In [None]:
# This cell rebinds the same global names used by the MRI section, so every
# cell below it operates on the US folders.

# Source folders (no trailing slash). "pso" is a typo for "pos"; the name is
# kept because other cells refer to it. move_us_data slices characters [46:50]
# out of each matched path, so changing the length of these paths shifts what
# that slice extracts.
pso_folder = "../../dataset/src_US/pos"
neg_folder = "../../dataset/src_US/neg"

# Destination folders for the training split
train_pos = "../../dataset/ds2_US/train/pos/"
train_neg = "../../dataset/ds2_US/train/neg/"

# Destination folders for the test split
test_pos =  "../../dataset/ds2_US/test/pos/"
test_neg =  "../../dataset/ds2_US/test/neg/"

# Destination folders for the validation split
val_neg = "../../dataset/ds2_US/val/neg/"
val_pos = "../../dataset/ds2_US/val/pos/"

# Substring a file name must contain to be selected (read as a global by
# move_us_data and passed explicitly to clean_folder)
file_prefix = 'US_fnl'

In [None]:
def wheretoput(loc_df,infilename):

    """
    wheretoput looks up the split recorded for a key in the split table

    :param loc_df: DataFrame with an ``s1`` column (split name) and an ``s3``
        column (the key that ``infilename`` is compared against)
    :param infilename: value to search for in the ``s3`` column
    :return: one-element list holding the ``s1`` value of the first matching
        row, or an empty list when no row matches
    """

    is_match = loc_df['s3'] == infilename
    first_hit = loc_df[is_match].iloc[:1]
    return first_hit["s1"].values.tolist()

In [None]:
# Clear the files matching file_prefix out of every US destination folder,
# positives first and then negatives.
for _split_dir in (train_pos, test_pos, val_pos,
                   train_neg, test_neg, val_neg):
    clean_folder(_split_dir, file_prefix)

In [None]:
def move_us_data(src_folder,train,test,val):

    """
    move_us_data copies each file found under ``src_folder`` into the split
    folder recorded for its patient id in the global ``df`` table.

    Files whose patient id has no entry in ``df`` are copied to ``train``.
    Reads the notebook-level ``file_prefix`` and ``df``.

    :param src_folder: source folder location
    :param train: train folder location
    :param test: test folder location
    :param val: validation folder location
    """

    destinations = {'train': train, 'test': test, 'val': val}
    # Without an existing directory shutil.copy would treat the destination as
    # a file name (or fail), so make sure every target folder is there first.
    for folder in destinations.values():
        os.makedirs(folder, exist_ok=True)

    fileList = glob.glob(src_folder + '**/*' + file_prefix + '*' , recursive=True)
    for row in fileList:
        # NOTE(review): fixed character offset of the patient id; it depends on
        # the exact length of src_folder and on the file naming scheme.
        patient_id = row[46:50]
        loc = wheretoput(df, patient_id)

        if len(loc) == 1:
            # A split name other than train/test/val is skipped, as before.
            target = destinations.get(loc[0])
            if target is not None:
                copy(row, target)
        else:
            # Patient not present in the split table: default to training.
            copy(row, train)

In [None]:
# Distribute the positive US images first, then the negative ones.
for _us_folders in (
    (pso_folder, train_pos, test_pos, val_pos),
    (neg_folder, train_neg, test_neg, val_neg),
):
    move_us_data(*_us_folders)