In [1]:
import glob
import os
from PIL import Image

def check_image(path):
    """Report whether PIL can open ``path`` as an image.

    Args:
        path: Path of the file to probe.

    Returns:
        True if ``Image.open`` succeeds, False if it raises.
    """
    try:
        # ``Image.open`` leaves the underlying file open until the image is
        # closed. The ``with`` block releases the handle immediately, so
        # probing many files does not accumulate open file descriptors.
        with Image.open(path):
            return True
    except Exception:
        # Any failure to open is treated as "not an image". Catching
        # ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so a long scan can still
        # be interrupted.
        return False

import pandas as pd

# NOTE(review): absolute path on the author's machine; point it at your own
# copy of the dataset before re-running this cell.
IMAGE_GLOB = "/Users/rajpurkar/Documents/Code/197-CS-Harvard/lec17/physionet.org/**"

# Every regular file under the dataset root that PIL is able to open.
imgs = [f for f in glob.glob(IMAGE_GLOB, recursive=True) if os.path.isfile(f) and check_image(f)]

# Key each image by the name of the directory that contains it. When one
# directory holds several images, later entries overwrite earlier ones, so
# only the last one returned by glob is kept.
stemns = {os.path.basename(os.path.dirname(img)): img for img in imgs}

# Build the frame with an explicit 'path' column and an 'id' index. Unlike
# from_dict(...) followed by assigning column names, this also works when no
# image was found: the result is an empty frame that still has a 'path'
# column, instead of a length-mismatch ValueError.
img_df = pd.DataFrame(
    {'path': list(stemns.values())},
    index=pd.Index(list(stemns.keys()), name='id'),
)
print(img_df)

                                                        path
id                                                          
s59664767  /Users/rajpurkar/Documents/Code/197-CS-Harvard...
s57254866  /Users/rajpurkar/Documents/Code/197-CS-Harvard...
s58578322  /Users/rajpurkar/Documents/Code/197-CS-Harvard...
s52812156  /Users/rajpurkar/Documents/Code/197-CS-Harvard...
s59445954  /Users/rajpurkar/Documents/Code/197-CS-Harvard...
...                                                      ...
s57157809  /Users/rajpurkar/Documents/Code/197-CS-Harvard...
s54850191  /Users/rajpurkar/Documents/Code/197-CS-Harvard...
s54394630  /Users/rajpurkar/Documents/Code/197-CS-Harvard...
s58922574  /Users/rajpurkar/Documents/Code/197-CS-Harvard...
s55602594  /Users/rajpurkar/Documents/Code/197-CS-Harvard...

[948 rows x 1 columns]


In [3]:
txt_files = [f for f in glob.glob('files/p10/**', recursive=True) if os.path.isfile(f) and f.endswith('.txt')]

# Read each text file into a dictionary.
# key: file name without its directory or extension
# value: the complete file contents as a single string
txts = {}
for txt_file in txt_files:
    with open(txt_file) as f:
        txts[os.path.splitext(os.path.basename(txt_file))[0]] = f.read()

# Convert to a pandas DataFrame with an explicit 'report' column and an 'id'
# index. Building it this way also works when no .txt file was found: the
# result is an empty frame that still has a 'report' column, instead of a
# length-mismatch ValueError when assigning column names.
import pandas as pd
df = pd.DataFrame(
    {'report': list(txts.values())},
    index=pd.Index(list(txts.keys()), name='id'),
)
print(df)

                                                      report
id                                                          
s54807932   WET READ: ___ ___ ___ 6:03 PM\n  No acute car...
s55661237                                   FINAL REPORT\...
s55557490                                   FINAL REPORT\...
s56803082                                   FINAL REPORT\...
s55546501                                   FINAL REPORT\...
...                                                      ...
s59465958                                   FINAL REPORT\...
s54873685                                   FINAL REPORT\...
s51491771                                   FINAL REPORT\...
s58535769                                   FINAL REPORT\...
s59876900                                   FINAL REPORT\...

[22197 rows x 1 columns]


In [4]:
def section_start(lines, section=' IMPRESSION'):
    """Return the index of the first line beginning with ``section``, or -1 if none does."""
    # Lazily scan the lines; ``next`` stops at the first hit and falls back
    # to -1 when the generator is exhausted without a match.
    hits = (position for position, text in enumerate(lines) if text.startswith(section))
    return next(hits, -1)

def extract_impressions(df):
    """Return a copy of ``df`` whose ``report`` column holds only the impression text.

    For each report, everything from the first line starting with
    ``' IMPRESSION'`` to the end of the report is kept. If there is no such
    line, the first line starting with ``' FINDINGS AND IMPRESSION:'`` is used
    instead. The section header text is removed and surrounding whitespace is
    stripped. Reports containing neither section become the empty string.

    Args:
        df: DataFrame with a ``report`` column of strings. It is not modified.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    def impression_of(report):
        """Extract the impression text from a single report string."""
        lines = report.splitlines()

        # A dedicated IMPRESSION section takes precedence over a combined
        # FINDINGS AND IMPRESSION section.
        start = section_start(lines)
        header = 'IMPRESSION:'
        if start == -1:
            start = section_start(lines, section=' FINDINGS AND IMPRESSION:')
            header = 'FINDINGS AND IMPRESSION:'
        if start == -1:
            return ''

        # splitlines() already removed the line breaks; the pieces are
        # concatenated without a separator, so any spacing between wrapped
        # lines comes from whitespace already present on the lines themselves.
        return ''.join(lines[start:]).replace(header, '').strip()

    df_imp = df.copy()
    # Map row by row instead of writing back through ``.at[label, ...]``:
    # label-based assignment touches every row that shares the label, which
    # gives wrong results when the index is not unique.
    df_imp['report'] = df_imp['report'].map(impression_of)
    return df_imp

# Reduce every full report to its impression text. `df` itself is left
# untouched because extract_impressions works on a copy.
df_imp = extract_impressions(df)
# Bare expression so the notebook renders the resulting frame.
df_imp

Unnamed: 0_level_0,report
id,Unnamed: 1_level_1
s54807932,No acute cardiopulmonary abnormality.
s55661237,No acute cardiopulmonary process.
s55557490,Cardiac and mediastinal contours are stable. ...
s56803082,Left pleural effusion is small. Moderate port...
s55546501,No acute cardiopulmonary abnormality.
...,...
s59465958,Asymmetric left greater than right basilar opa...
s54873685,No acute cardiopulmonary process.
s51491771,
s58535769,No acute cardiopulmonary abnormalities


In [8]:
# Attach each image's impression by matching the 'id' index of both frames
# (DataFrame.join keeps every row of img_df), then discard the 'id' index in
# favour of a fresh positional one.
join_df = img_df.join(df_imp, on='id').reset_index(drop=True)

In [10]:
# Persist the joined path/report frame to 'joined.csv' in the working
# directory; index=False leaves the positional row index out of the file.
join_df.to_csv('joined.csv', index=False)

In [14]:
# Pull the two columns of the joined frame out as Series.
# NOTE(review): `imgs` was a list of file paths in the first cell; it is
# rebound here to a pandas Series, so cells re-run out of order will see a
# different type.
imgs = join_df['path']
titles = join_df['report']

In [16]:
# Display every extracted impression as a plain Python list.
titles.to_list()

['No acute cardiopulmonary process.',
 'Low lung volumes with patchy bibasilar airspace opacities likely reflective of atelectasis.  No pulmonary edema.',
 'Comparison to ___.  No relevant change is noted.  Stable mild platelike atelectasis at the right lung bases.  Moderate cardiomegaly with minimal fluid overload but no overt pulmonary edema.  No evidence of pleural effusions on the frontal or lateral radiograph.  No pneumonia.',
 'No acute cardiopulmonary abnormality.',
 'No acute cardiopulmonary process.',
 'No evidence of acute cardiopulmonary process.  Although no rib fractures are identified, this study has poor sensitivity for detection of rib fractures.  If there is further concern, dedicated rib views should be performed.',
 'No focal consolidation.',
 'No acute cardiopulmonary process.',
 'No acute cardiopulmonary process.',
 'No acute cardiopulmonary process.',
 'No comparison.  Borderline size of the cardiac silhouette.  No pleural effusions.  Mild fluid overload but no ov