In [1]:
# basic data manipulation
import numpy as np
import pandas as pd

# system
import os

In [2]:
def getFiles(organName, listDir, prefix=None):
    """
    Select the entries of a file listing whose
    name contains a given substring.
    --------------------------
    Input:
        organName:  str, substring to look for
                    (e.g. an organ name)
        listDir:    iterable of file names
                    to search
        prefix:     optional str prepended
                    to every match (e.g. an
                    OS path prefix); ignored
                    when None or empty
    Output:
        list of matching names, in the
        order they appear in listDir,
        with prefix prepended if given
    """
    selected = []
    for entry in listDir:
        # plain substring test, so the match is case sensitive
        if organName not in entry:
            continue
        # prepend the path prefix so that files are accessible
        selected.append(prefix + entry if prefix else entry)
    return selected

In [3]:
def filestoDF(file_list):
    """
    Read every csv file in file_list with pandas and
    return the resulting data frames as a list, in the
    same order as the input.
    """
    frames = []
    for csv_path in file_list:
        frames.append(pd.read_csv(csv_path, low_memory=False))
    return frames

In [4]:
def combineFiles(organ_files, disease_files, separate=False,
                 out_dir='../../data_200/data_silvia/'):
    """
    Combine organ files with the disease status tables.

    Every organ csv is read and inner-joined on 'idx' with the
    vertical concatenation of the first two disease status tables
    (train and dev). The suffixed columns produced by the merge are
    renamed to organ_* / disease_*, and a new column
    'Impression and Note' holds organ_sentence + disease_sentence
    (plain concatenation, no separator). Each merged DF is saved in
    out_dir as '<name>_disease_status.csv', where <name> is the
    input file name up to its first '.'.
    ---------------------------------------------------------
    Inputs:
        organ_files:   separate=True:  flat list of file paths. It is
                       split with getFiles into the paths containing
                       'train' and the paths containing 'dev'
                       (substring match on the whole path).
                       separate=False: list of file lists, each a
                       list of paths for one organ. A flat list of
                       paths is also accepted and treated as a
                       single group.
        disease_files: list whose first two items are the disease
                       status DFs (train and dev). Both need the
                       columns 'idx' and 'sentence'; presumably also
                       'label' (verify against the data).
        separate:      whether to group the organ files into train
                       and dev before merging (see organ_files).
        out_dir:       directory the merged csv files are written to.
                       Defaults to the location used originally.
    Output:
        list of the merged DFs, in the order they were processed.
    """
    if separate:
        # get train and dev files separately (to
        # merge impressions separately)
        file_groups = [getFiles('train', organ_files),
                       getFiles('dev', organ_files)]
    elif any(isinstance(item, str) for item in organ_files):
        # a flat list of paths: wrap it so that the inner loop walks
        # over paths rather than over the characters of each path
        file_groups = [organ_files]
    else:
        file_groups = organ_files

    # concatenate disease files vertically to merge on left (organs),
    # including both train and dev so that every idx can be matched.
    # This is needed in both modes: it used to be built only when
    # separate=True, which made the default call fail on unbound names.
    diseaseCombined = pd.concat([disease_files[0], disease_files[1]])

    new_dfs = []
    # train or dev list of files
    for fileList in file_groups:

        for file in fileList:
            # output name: input file name up to its first '.'
            old = os.path.basename(file)
            new = old.split('.')[0] + '_disease_status.csv'
            fileName = os.path.join(out_dir, new)

            # read as a pandas DF
            fileDF = pd.read_csv(file, low_memory=False)

            # merge df with concatenated disease status file
            combine = pd.merge(fileDF, diseaseCombined,
                               how='inner', on='idx')

            # rename columns for cleanliness, idx remains the same
            combine = combine.rename(columns={'sentence_x': 'organ_sentence',
                                              'sentence_y': 'disease_sentence',
                                              'label_x': 'organ_label',
                                              'label_y': 'disease_label'})

            # new column for organ specific + impression
            combine['Impression and Note'] = (combine['organ_sentence']
                                              + combine['disease_sentence'])

            new_dfs.append(combine)
            combine.to_csv(fileName, index=False)

    return new_dfs

### Combine impressions with notes 
For now, focus only on the following organs:
- Liver
- Pancreas

In [5]:
# directory holding the organ and disease status csv files
path = '../data_200'
# os.listdir returns entries in arbitrary order; sort them so that the
# file lists built from dir_list (and the order of the combined DFs)
# are the same on every run and every machine
dir_list = sorted(os.listdir(path))

In [6]:
# paths of the files belonging to each organ of interest; the data
# directory is prepended so that the files can be opened directly
liverFiles = getFiles(organName='Liver', listDir=dir_list, prefix=path + '/')
pancreasFiles = getFiles(organName='Pancreas', listDir=dir_list, prefix=path + '/')

# disease status tables: train first, dev second
disease_status = [
    pd.read_csv(path + '/' + name, low_memory=False)
    for name in ('train_disease_status.csv', 'dev_disease_status.csv')
]
trainDS, devDS = disease_status

In [7]:
# merge the disease status into every liver / pancreas file, grouping
# the train and dev files separately; the merged csv files are written
# to disk by combineFiles
liverCombined = combineFiles(organ_files=liverFiles,
                             disease_files=disease_status,
                             separate=True)
pancreasCombined = combineFiles(organ_files=pancreasFiles,
                                disease_files=disease_status,
                                separate=True)