In [1]:
import numpy as np
import pandas as pd

import os

In [2]:
def getFiles(organName, listDir, prefix=None):
    """
    Pick out the entries of a file list
    whose name contains a given string.
    --------------------------
    Input:
        organName:  substring to look for
                    (case-sensitive), str
        listDir:    list of file names
                    to search
        prefix:     optional string put in
                    front of every match
                    (plain concatenation,
                    no separator is added)
    Output:
        list of matching names, in the
        order they appear in listDir;
        prefixed when a non-empty prefix
        is given
    """
    hits = []
    for entry in listDir:
        # plain substring test on the file name
        if organName in entry:
            hits.append(entry)

    # no (or empty) prefix: hand back the bare names
    if not prefix:
        return hits

    # prepend the prefix so the files are reachable from another folder
    return [prefix + entry for entry in hits]

In [3]:
def filesTasks(file_list, organ, categories):
    """
    Group the files of one organ by task category.
    -----------------------------
    Input:
        file_list:  list of file names to search
        organ:      organ name, matched as a substring of the file name
        categories: list of category names, also matched as substrings
    Output:
        list with one entry per category (same order as `categories`);
        each entry is the list of this organ's files for that category
    """
    # narrow the listing down to this organ first ...
    organOnly = getFiles(organ, file_list)

    # ... then split the organ's files by category
    return [getFiles(category, organOnly) for category in categories]

In [4]:
def filestoDF(file_list):
    """
    Read every CSV in `file_list` into its own data frame.

    Input:  list of paths (anything pd.read_csv accepts)
    Output: list of data frames, in the same order as `file_list`
    """
    frames = []
    for source in file_list:
        # low_memory=False is forwarded to pandas as in the rest of this notebook
        frames.append(pd.read_csv(source, low_memory=False))
    return frames

In [5]:
def getOrgans(filelist):
    """
    Collect the distinct organ names
    encoded in a list of file names.
    -----------------------------
    Input: List of files
    Output: List of organ names, in order of first appearance

    Only names ending in 'csv' are considered. The organ name is
    whatever follows the last underscore in the part of the file
    name before its first dot.
    """
    candidates = (
        entry.split('.')[0].split('_')[-1]
        for entry in filelist
        if entry.endswith('csv')
    )

    # dict keys keep first-seen order and drop repeats
    return list(dict.fromkeys(candidates))

In [6]:
# Given one list of CSV file names per task, recombine each task's files
# into a single data frame.
def csv_concat(fileList, prefix="../data_200/"):
    """
    Read and stack the CSV files of every task.
    -----------------------------
    Input:
        fileList: list of lists; each inner list holds the file names
                  belonging to one task
        prefix:   string put in front of every file name before reading
                  (plain concatenation, so it should end with a path
                  separator); defaults to the notebook's data folder
    Output:
        list of data frames, one per inner list and in the same order;
        the rows of a task's files are stacked in the order the files
        are listed and the index is renumbered from 0

    Raises whatever pd.read_csv raises for an unreadable file, and
    ValueError (from pd.concat) when an inner list is empty.
    """
    allDFs = []

    # each inner list is one task
    for taskFiles in fileList:

        # load every file of the task from the data folder
        frames = [pd.read_csv(prefix + name) for name in taskFiles]

        # stack row-wise; ignore_index avoids duplicate index labels
        allDFs.append(pd.concat(frames, axis=0, ignore_index=True))

    return allDFs

In [7]:
def saveCSVs(dfs, organ, saveDir='../data_200/recombined/', cats=None):
    """
    Write one CSV per data frame, named '<category>_<organ>.csv'.
    -----------------------------
    Input:
        dfs:     data frames to save; the j-th frame is saved under the
                 j-th category name
        organ:   organ name used as the file-name suffix
        saveDir: folder the files are written to; it is created if it
                 does not exist yet
        cats:    category names, in the same order as `dfs`; defaults to
                 the four task categories used in this notebook
    Output:
        None (files are written to disk, without the index column)

    Raises IndexError when there are more data frames than categories.
    """
    if cats is None:
        cats = ['Abnormal Findings', 'disease_location',
                'Previous Surgeries', 'Indeterminate nodules']

    # to_csv does not create missing folders, so make sure the target exists
    if saveDir:
        os.makedirs(saveDir, exist_ok=True)

    for j, df in enumerate(dfs):
        savePath = os.path.join(saveDir, cats[j] + '_' + organ + '.csv')
        df.to_csv(savePath, index=False)

### import data

In [8]:
# Data folders and their directory listings.
data_small = "../data_200/data_silvia/"
data_big = '../data_200/'
files_small = sorted(os.listdir(data_small))
# os.listdir returns entries in arbitrary order; sort so that the order in
# which files are matched and concatenated is the same on every run/machine.
files_big = sorted(os.listdir(data_big))

In [9]:
# Task categories; each one is matched as a substring of the file names.
categories = [
    'Abnormal Findings',
    'disease_location',
    'Previous Surgeries',
    'Indeterminate nodules',
    'disease_status',
]

# Distinct organ names parsed from the CSV file names in the big data folder.
organNames = getOrgans(files_big)


In [11]:
# --------What's happening here?
# files_big lists CSV files that are already split per category and per organ
# (e.g. separate train and dev files). For every organ we
#   1. collect the files whose name contains the organ name,
#   2. group those files by task category
#      (Abnormal Findings, disease_location, Previous Surgeries),
#   3. concatenate the files of each category into one data frame, and
#   4. save one recombined CSV per category,
# so that afterwards there are no separate dev / train files per task any more.
# filesTasks() covers steps 1-2, csv_concat() step 3 and saveCSVs() step 4.
#
# disease_status is its own separate thing for each patient (summary at the
# end), so it is not recombined here. 'Indeterminate nodules' is skipped as
# well -- NOTE(review): the reason is not recorded in this notebook, confirm.

# organ names exactly as they appear in the file names
organsToRecombine = [
    'Liver',
    'Other',
    'Adrenals',
    'Pancreas',
    'Pelvis',
    'Kidney',
    'Peritoneum',
    'Upper GI lumen',
    'Lower GI lumen',
    'Lymph nodes',
    'Lungs',
]

# only the first three categories are recombined (see note above)
recombineCategories = categories[:3]

organTaskFiles = {}   # organ -> [files per category]
recombinedDFs = {}    # organ -> [recombined data frame per category]

for organ in organsToRecombine:
    organTaskFiles[organ] = filesTasks(files_big, organ, recombineCategories)
    recombinedDFs[organ] = csv_concat(organTaskFiles[organ])
    saveCSVs(recombinedDFs[organ], organ)

# Per-organ names kept so that cells referring to them keep working.
# (The intermediate <organ>, <organ>_af/_dl/_ps names are no longer created;
# use organTaskFiles[organ][k] instead.)
liverList, fixLiver = organTaskFiles['Liver'], recombinedDFs['Liver']
otherList, fixOther = organTaskFiles['Other'], recombinedDFs['Other']
adrenalsList, fixAdrenals = organTaskFiles['Adrenals'], recombinedDFs['Adrenals']
pancreasList, fixPancreas = organTaskFiles['Pancreas'], recombinedDFs['Pancreas']
pelvisList, fixPelvis = organTaskFiles['Pelvis'], recombinedDFs['Pelvis']
kidneyList, fixKidney = organTaskFiles['Kidney'], recombinedDFs['Kidney']
peritoneumList, fixPeritoneum = organTaskFiles['Peritoneum'], recombinedDFs['Peritoneum']
upperGIList, fixUpperGI = organTaskFiles['Upper GI lumen'], recombinedDFs['Upper GI lumen']
lowerGIList, fixLowerGI = organTaskFiles['Lower GI lumen'], recombinedDFs['Lower GI lumen']
lymphList, fixLymph = organTaskFiles['Lymph nodes'], recombinedDFs['Lymph nodes']
lungsList, fixLungs = organTaskFiles['Lungs'], recombinedDFs['Lungs']

In [None]:
# Print the Pancreas file names task by task, in the order in which they were
# concatenated -- an ad-hoc way of keeping track of which files went into
# which recombined data frame.

for taskFiles in pancreasList:
    print(taskFiles)

In [None]:
# Write the recombined Pancreas tables to '../data_200/recombined/' through
# the shared helper instead of repeating its category list and path logic
# here; the resulting file names are '<category>_Pancreas.csv'.
saveCSVs(fixPancreas, 'Pancreas')