In [None]:
import glob
import os
from PIL import Image

def check_image(path):
    try:
        Image.open(path)
        return True
    except:
        return False

imgs = [f for f in glob.glob("physionet.org/**", recursive=True) if os.path.isfile(f) and check_image(f)]
stemns = {img: os.path.basename(os.path.dirname(img)) for img in imgs}

In [None]:
import pandas as pd
# don't orient on index
img_df = pd.DataFrame.from_dict(stemns, orient='index')
# drop index
img_df.reset_index(inplace=True)
img_df.columns = ['path', 'study']
print(img_df)

In [None]:
txt_files = [f for f in glob.glob('files/p10/**', recursive=True) if os.path.isfile(f) and f.endswith('.txt')]

# open each text file and save in a dictionary
# key: filename
# value: list of lines
txts = {}
for txt_file in txt_files:
    with open(txt_file) as f:
        txts[os.path.splitext(os.path.basename(txt_file))[0]] = f.read()
print(next(iter(txts.items())))

In [None]:
# convert to pandas dataframe
import pandas as pd
df = pd.DataFrame.from_dict(txts, orient='index')
df.reset_index(inplace=True)
df.columns = ['study', 'report']
print(df)

In [None]:
def section_start(lines, section=' IMPRESSION'):
    """Finds line index that is the start of the section."""
    for idx, line in enumerate(lines):
        if line.startswith(section):
            return idx
    return -1

def extract_impressions(df):
    """Generates a csv containing report impressions."""
    df_imp = df.copy()
    for index, row in df_imp.iterrows():
        report = row['report'].splitlines()
        impression_idx = section_start(report)
        impression_and_findings_idx = section_start(report, section=' FINDINGS AND IMPRESSION:')
        seperator = ''
        if impression_idx != -1:
            impression = seperator.join(report[impression_idx:]).replace('IMPRESSION:', '').replace('\n', '').strip()
        elif impression_and_findings_idx != -1:
            impression = seperator.join(report[impression_and_findings_idx:]).replace('FINDINGS AND IMPRESSION:', '').replace('\n', '').strip()
        else:
            impression = ''

        df_imp.at[index,'report']= impression
    return df_imp

txt_df = extract_impressions(df)
txt_df

In [40]:
join_df = img_df.merge(txt_df, on='study', how='left')
# remove the 'path' column 
join_df.drop('study', axis=1, inplace=True)
# remove rows with empty reports
join_df['report'].replace('', pd.np.nan, inplace=True)
join_df.dropna(subset=['report'], inplace=True)
print(join_df)

                                                 path  \
0   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
1   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
2   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
3   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
4   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
5   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
6   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
7   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
8   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
9   physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
10  physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
11  physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
12  physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
13  physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
14  physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
15  physionet.org/files/mimic-cxr-jpg/2.0.0/files/...   
16  physionet.org/files/mimic-c

  join_df['report'].replace('', pd.np.nan, inplace=True)


In [41]:
join_df.to_csv('joined.csv', index=False)