In [1]:
import pandas as pd

notes_mimic_iii = pd.read_csv('data/mimiciii-14/NOTEEVENTS.csv',usecols=['TEXT']).rename(columns={'TEXT':'text'})
notes_mimic_iii

Unnamed: 0,text
0,Admission Date: [**2151-7-16**] Dischar...
1,Admission Date: [**2118-6-2**] Discharg...
2,Admission Date: [**2119-5-4**] D...
3,Admission Date: [**2124-7-21**] ...
4,Admission Date: [**2162-3-3**] D...
...,...
2083175,NPN\n\n\n#1 Infant remains in RA with O2 sats...
2083176,"Neonatology\nDOL #5, CGA 36 weeks.\n\nCVR: Con..."
2083177,Family Meeting Note\nFamily meeting held with ...
2083178,NPN 1800\n\n\n#1 Resp: [**Known lastname 2243*...


In [21]:
import pandas as pd

data = pd.read_csv('filtered_notes_for_annotation_task.csv').drop('ROW_ID',axis=1)
y= data.iloc[33,1:]
list(y)

[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [2]:

notes_mimic_cxr = pd.read_csv('data/mimic-cxr/cxr-study-list.csv.gz',usecols=['path'])

notes_mimic_cxr

Unnamed: 0,path
0,files/p10/p10000032/s50414267.txt
1,files/p10/p10000032/s53189527.txt
2,files/p10/p10000032/s53911762.txt
3,files/p10/p10000032/s56699142.txt
4,files/p10/p10000764/s57375967.txt
...,...
227830,files/p19/p19999442/s58708861.txt
227831,files/p19/p19999733/s57132437.txt
227832,files/p19/p19999987/s55368167.txt
227833,files/p19/p19999987/s58621812.txt


In [3]:

# notes_mimic_cxr = pd.read_csv('data/mimim-cxr/cxr-study-list.csv.gz',usecols=['path'])
#make sure reports are downloaded and unzipped properly
# from tqdm.notebook import tqdm_notebook
from tqdm import tqdm

def get_text(row):
    fpath= 'data/mimic-cxr/'+row
    with open(fpath, 'r') as file:
        text = file.read()
    return text

tqdm.pandas(desc='Assigning text for radiology studies from directory...')
notes_mimic_cxr['text'] = notes_mimic_cxr['path'].progress_apply(get_text)

notes_mimic_cxr

Assigning text for radiology studies from directory...: 100%|██████████| 227835/227835 [01:01<00:00, 3733.71it/s]


Unnamed: 0,path,text
0,files/p10/p10000032/s50414267.txt,FINAL REPORT\...
1,files/p10/p10000032/s53189527.txt,FINAL REPORT\...
2,files/p10/p10000032/s53911762.txt,FINAL REPORT\...
3,files/p10/p10000032/s56699142.txt,FINAL REPORT\...
4,files/p10/p10000764/s57375967.txt,FINAL REPORT\...
...,...,...
227830,files/p19/p19999442/s58708861.txt,FINAL REPORT\...
227831,files/p19/p19999733/s57132437.txt,FINAL REPORT\...
227832,files/p19/p19999987/s55368167.txt,FINAL REPORT\...
227833,files/p19/p19999987/s58621812.txt,FINAL REPORT\...


In [4]:
notes_mimic_cxr = notes_mimic_cxr.drop('path',axis=1)

In [5]:
notes_mimic_cxr.iloc[107].text

'                                 FINAL REPORT\n SINGLE FRONTAL VIEW OF THE CHEST\n \n REASON FOR EXAM:  Intubated patient, hypoxic respiratory failure.\n \n Comparison is made with prior study, ___.\n \n Cardiac size is normal.  Lines and tubes are in the standard position.  Large\n right and moderate left pleural effusions are grossly unchanged allowing the\n differences in positioning of the patient.  Right upper lobe opacity has\n improved consistent with improving atelectasis.  Pleural effusions are\n associated with atelectasis, larger on the right side.  There is mild vascular\n congestion.\n'

In [6]:
combined = pd.concat([notes_mimic_iii,notes_mimic_cxr])

In [79]:


def preprocess_and_clean_notes(admin_language, notes_df: pd.DataFrame) -> pd.DataFrame:
    """remove redundant information from the free text, which are discharge summaries,
    using both common NLP techniques and heuristic rules

    Args:
        notes_df (pd.DataFrame): MimicIII's NOTEEVENTS.csv.gz, including the columns:
            ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME',
            'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT']

    Returns:
        pd.DataFrame: notes_df, filtered of redundant text
    """
    logger.info(
        "Removing de-id token, admin language and other cruft...")
    with tqdm(total=3+len(admin_language)+6) as pbar:
        # notes_df["TEXT"] = notes_df["TEXT"].str.lower()
        # pbar.update(1)
        notes_df["text"] = notes_df["text"].replace(r"\[.*?\]", "", regex=True)
        pbar.update(1)
        for admin_token in admin_language:
            # Removing admin language...
            notes_df["text"] = notes_df["text"].str.replace(admin_token, "")
            pbar.update(1)
        for original, replacement in [
            ("\n", " "),
            ("\n\n", " "),
            ("\n\n\n", " "),
            ("w/", "with"),
            ("_", ""),
            ("#", ""),
            ("\d+", ""),
            ('\s+', ' '),
            ('\"', '')
        ]:
            notes_df["text"] = notes_df["text"].str.replace(
                original, replacement)
            pbar.update(1)
        pbar.update(1)
    return notes_df


class AdminLanguage:
    def __init__(self):
        self.explicit_removal = [
        "FINAL REPORT",
        "Admission Date",
        "Discharge Date",
        "Date of Birth",
        "Phone",
        "Date/Time",
        "ID",
        "Completed by",
        "Dictated By",
        "Attending",
        "Provider: ",
        "Provider",
        "Primary",
        "Secondary",
        " MD Phone",
        " M.D. Phone",
        " MD",
        " PHD",
        " X",
        " IV",
        " VI",
        " III",
        " II",
        " VIII",
        "JOB#",
        "JOB#: cc",
        "# Code",
        "Metoprolol Tartrate 25 mg Tablet Sig",
        ")",
        "000 unit/mL Suspension Sig",
        "0.5 % Drops ",
        "   Status: Inpatient DOB",
        "Levothyroxine 50 mcg Tablet Sig",
        "0.5 % Drops Sig",
        "Lidocaine 5 %(700 mg/patch) Adhesive Patch",
        "Clopidogrel Bisulfate 75 mg Tablet Sig",
        "Levofloxacin 500 mg Tablet Sig",
        "Albuterol 90 mcg/Actuation Aerosol ",
        "None Tech Quality: Adequate Tape #",
        "000 unit/mL Solution Sig",
        " x",
        " am",
        " pm",
    ]

In [80]:
combined_sample = combined[-1000:]

In [81]:
from loguru import logger

In [82]:
combined_sample = preprocess_and_clean_notes(AdminLanguage().explicit_removal,combined_sample)
combined_sample

2020-11-02 21:52:08.453 | INFO     | __main__:preprocess_and_clean_notes:14 - Removing de-id token, admin language and other cruft...
54it [00:00, 447.56it/s]                        


Unnamed: 0,text
226835,"HISTORY: Difficulty breathing, to assess for ..."
226836,"HISTORY: Elevated white count, to assess for ..."
226837,HISTORY: Chest congestion not resolved with a...
226838,EXAMINATION: CHEST (PA AND LAT INDICATION: ye...
226839,CHEST RADIOGRAPH PERFORMED ON . COMPARISON: N...
...,...
227830,PORTABLE AP CHEST-RAY INDICATION: Patient wit...
227831,INDICATION: -year-old with chest pain. TECHNI...
227832,PORTABLE CHEST OF COMPARISON: radiograph. FIN...
227833,CHEST RADIOGRAPH PERFORMED ON COMPARISON: Pri...


In [83]:
combined_sample = combined_sample['text'] + '\n\n'

In [76]:
import csv

In [84]:
combined_sample.to_csv('TEST.raw',sep='\n',header=None,index=None)

In [87]:
pd.read_csv('TEST.raw',header=None)

Unnamed: 0,0
0,"HISTORY: Difficulty breathing, to assess for ..."
1,"HISTORY: Elevated white count, to assess for ..."
2,HISTORY: Chest congestion not resolved with a...
3,EXAMINATION: CHEST (PA AND LAT INDICATION: ye...
4,CHEST RADIOGRAPH PERFORMED ON . COMPARISON: N...
...,...
995,PORTABLE AP CHEST-RAY INDICATION: Patient wit...
996,INDICATION: -year-old with chest pain. TECHNI...
997,PORTABLE CHEST OF COMPARISON: radiograph. FIN...
998,CHEST RADIOGRAPH PERFORMED ON COMPARISON: Pri...
