# EXTRACT TEXT

# 0. prepare

In [239]:
import pandas as pd

import os
import re, json, ast
import argparse

from tqdm import tqdm, trange
from typing import Union, List



ROOT_PATH = 'result'
if not os.path.isdir(ROOT_PATH):
    os.mkdir(ROOT_PATH)
    

def load_csv_file(file_path):
    df = pd.read_csv(file_path)
    return df

    
def save_csv_file(csv_data, file_path):
    csv_data.to_csv(file_path, index=False)
    return print('save successfully!')


def load_noteevents(file_path):

    df = pd.read_csv(file_path)

    # dataframe dtype config
    df.CHARTDATE = pd.to_datetime(df.CHARTDATE, format='%Y-%m-%d', errors='raise')
    df.CHARTTIME = pd.to_datetime(df.CHARTTIME, format='%Y-%m-%d %H:%M:%S', errors='raise')
    df.STORETIME = pd.to_datetime(df.STORETIME)

    return df

## 1. extract sections

**input:**
- mimic_table/NOTEEVENTS.csv

**output:**
- result/NOTEEVENTS_SECTIONS.csv

In [2]:
'''
Table --> Sections

1. load NOTEEVENTS.csv

2. get discharge sumamry notes
    a) NOTEVENTS.CATEGORY = 'Discharge Summary'
    b) NOTEVENTS.DESCRIPTION = 'Report'
    c) eliminate a short-note

3. preprocess discharge sumamry notes
    a) clean text
    b) split sections by headers
    
4. save csv file
    a) PK: NOTEVENTS.ROW_ID
    b) TEXT: string(doubled-list)
'''
def get_discharge_summary(df_notevents):

    cond1 = (df_notevents.CATEGORY == 'Discharge summary')
    cond2 = (df_notevents.DESCRIPTION == 'Report')

    df_discharge_smmary = df_notevents[cond1&cond2]
    df_discharge_smmary = df_discharge_smmary[['ROW_ID', 'TEXT']]
    
    # eliminate a short-note (subject_id=30561, hadm_id=178941)
    df_discharge_smmary = df_discharge_smmary[df_discharge_smmary.TEXT.apply(lambda x: len(x) > 100)]

    return df_discharge_smmary


def pattern_repl(matchobj):
    # Return a replacement string to be used for match object
    return ' '.rjust(len(matchobj.group(0)))  


def clean_text(text):
    # 1. Replace [**Patterns**] with spaces.
    text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    
    # 2. Replace `_` with spaces.
    new_text = re.sub(r'_', ' ', text)
    
    return new_text


def split_section(text):
    headers, sections = [], []
#     pattern = "^([A-z0-9 ]+)(:)|Discharge Date:|Sex:|JOB#:|Unit No:|FOLLOW-UP PLANS:"
    except_pattern = "(?!(Sig:)|(disp:))"
    include_keywords = "(Discharge Date:)|(Sex:)|(JOB#:)|(Unit No:)|(FOLLOW-UP PLANS:)"
    pattern = "^" + except_pattern + "([A-z0-9 ]+)(:)|" + include_keywords
    SEPERATORS = re.compile(pattern, re.I | re.M)
    start = 0
    
    for matcher in SEPERATORS.finditer(text):
        # cut off by the position of later SEPERATOR
        end = matcher.start()
        if end != start: # except for first line
            section = text[start:end]
            if ':' not in section: #
                pass
            else:
                section = section[len(header):].strip() # except for header in section
                sections.append(section)
        start = end
        end = matcher.end()
        
        # collect each title in the beginning of section
        header = text[start:end].lower()
        headers.append(header)
        
    # add last section
    section = text[start:]
    section = section[len(header):].strip()
    sections.append(section)
    
    return headers, sections


def clean_header(header):
    # delete : (colon)
    header = re.sub(r',', '', header)
    new_header = re.sub(r':', '', header)
    new_header = new_header.strip()
    return new_header


def clean_section(section):
    # Replace multiple spaces with a space.
    new_section = ' '.join(section.split())
    return new_section


def preprocess_discharge_summary(text):
    text = clean_text(text)
    headers, sections = split_section(text)
    
    new_headers, new_sections = [], []
    for idx in range(len(headers)):
        h = clean_header(headers[idx])
        s = clean_section(sections[idx])
        new_headers.append(h)
        new_sections.append(s)
    return [new_headers, new_sections]

In [24]:
LOAD_FILE_PATH = 'mimic_table/NOTEEVENTS.csv'
SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'NOTEEVENTS_SECTIONS.csv')


def main_notes():
    
    data = load_noteevents(file_path=LOAD_FILE_PATH)
    print('Load NOTEEVENTS successfully!')
    
    data = get_discharge_summary(data)
    print('Get discharge summary successfully!')
    
    notes = data.TEXT.apply(lambda x: json.dumps(preprocess_discharge_summary(x)))
    print('Preprocess notes successfully!')
    
    new_data = pd.concat([data.ROW_ID, notes], axis=1)
    save_csv_file(csv_data=new_data, file_path=SAVE_FILE_PATH)

    
if __name__ == '__main__':
    main_notes()

Load NOTEEVENTS successfully!
Get discharge summary successfully!
Preprocess notes successfully!
save successfully!


## 2. extract xx_sections

**input:**
- result/NOTEEVENTS_SECTIONS.csv  

**output:**
- result/xx_sections.csv
    - (1) px = procedures
    - (2) dx = diagnosis
    - (3) rx = prescriptions

### 2-1. px

In [4]:
PX_HEADERS = ['major surgical or invasive procedure']

In [154]:
'''
Sections --> PX_Sections

1. load NOTEEVENTS_SECTIONS.csv

2. extract px_section
    a) find px_section by finding PX_HEADERS
        - for now, just support the following
        'major surgical or invasive procedure' (38374)
        
3. preprocess px_section
    a) drop NA in header part
    b) drop NA in section part
    c) drop 'None'/'none'
    
4. save csv file
    a) PK: NOTEVENTS.ROW_ID
    b) TEXT
'''


# for now, not support multiple tgt_headers
def extract_px_section(text: str, tgt_headers=PX_HEADERS) -> List[List[str]]:
    
    text = json.loads(text) # change string format to dict
    headers, sections = text[0], text[1]
    
    px_pos = []
    px_headers, px_sections = [], []
    
    for idx in range(len(headers)):
        h = headers[idx]
        if h in tgt_headers:
            pos = headers.index(h)
            px_pos.append(pos)
        else:
            pass
    
    if len(px_pos) > 0:
        for pos in sorted(px_pos):
            px_headers.append(headers[pos])
            px_sections.append(sections[pos])

    return [px_headers, px_sections]


# def extract_prx_section(text):
#     prx_section = []
#     text = json.loads(text) # change string format to dict
#     headers, sections = text[0], text[1]
    
#     query = 'major surgical or invasive procedure'
#     try:
#         pos = headers.index(query)
#     except:
#         pos = ""
        
#     if pos:
#         prx_section = sections[pos]
            
#     return prx_section


def preprocess_px_sections(px_sections: pd.Series) -> pd.Series:
    
    # drop NA
    px_sections = px_sections[~px_sections.apply(lambda x: len(x[0]) == 0)]
    
    # drop NA
    px_sections = px_sections[~px_sections.apply(lambda x: len(''.join(x[1])) == 0)]
    
    # drop NA string form like 'None', 'none'
    string_NA = [['None'], ['none'], ['NA']]
    px_sections = px_sections[~px_sections.apply(lambda x: x[1] in string_NA)]
    
    return px_sections
        
    
# def eda_px_section(notes: pd.Series, option='cnt') -> int:
#     notes = notes.apply(lambda x: extract_px_section(text=x))
#     if option == 'cnt':
#         output = notes.apply(lambda x: len(x) > 0).sum()
#     return output

In [155]:
LOAD_FILE_PATH = os.path.join(ROOT_PATH, 'NOTEEVENTS_SECTIONS.csv')
SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'px_sections.csv')


def main_px():
    
    data = load_csv_file(file_path=LOAD_FILE_PATH)
    print('Load NOTEEVENTS_SECTIONS successfully!', len(data))
    
    notes = data['TEXT'].apply(lambda x: extract_px_section(x))
    print('Extract px sections succesfully!', len(notes))
    
    notes = preprocess_px_sections(notes)
    print('Preprocess px sections successfully!', len(notes))
    
    new_data   = pd.concat([data.ROW_ID, notes], axis=1)
    save_csv_file(csv_data=new_data, file_path=SAVE_FILE_PATH)

    
if __name__ == '__main__':
    main_px()

Load NOTEEVENTS_SECTIONS successfully! 55176
Extract px sections succesfully! 55176
Preprocess px sections successfully! 27989
save successfully!


### 2-2. dx

In [156]:
DX_HEADERS = [
    'discharge diagnosis',  # 41335 
    'discharge diagnoses', # 7615
    'primary diagnosis', # 2796
    'primary diagnoses',
    'primary dx',
    'primary', # 4869
    'secondary diagnosis', # 1800
    'secondary diagnoses',
    'secondary', # 4402
    'secondary dx',
    'secondary dianogsis',
    'final diagnosis',  # 1691
    'other diagnosis'
]

In [159]:
'''
Sections --> DX_Sections

1. load NOTEEVENTS_SECTIONS.csv

2. extract dx_section
    a) find dx_section by finding DX_HEADERS
    b) (for now) must satisfy that last dx_section be right before 'discharge conditon'
        
3. preprocess dx_section
    a) drop NA in header part
    b) drop NA in section part
    c) drop 'None'/'none'
    
4. save csv file
    a) PK: NOTEVENTS.ROW_ID
    b) TEXT
'''
    

def extract_dx_section(text: str, tgt_headers=DX_HEADERS) -> List[List[str]]:
    
    text = json.loads(text) # change string format to dict
    headers, sections = text[0], text[1]
    
    dx_pos = []
    dx_headers, dx_sections = [], []
    
    pos_next = -999
    h_next = ['discharge condition']
    
    for idx in range(len(headers)):
        h = headers[idx]
        if h in tgt_headers:
            pos = headers.index(h)
            dx_pos.append(pos) 
#             dx_headers.append(headers[pos])
#             dx_sections.append(sections[pos])
        if h in h_next:
            pos_next = headers.index(h_next[0])
    
    if len(dx_pos) > 0:
        for pos in sorted(dx_pos):
            dx_headers.append(headers[pos])
            dx_sections.append(sections[pos])
            
        if pos_next == (max(dx_pos)+1):
            pass
        else: 
            dx_headers = []
            dx_sections = []
    
    return [dx_headers, dx_sections]


def preprocess_dx_sections(dx_sections: pd.Series) -> pd.Series:
    
    # drop NA in header part
    dx_sections = dx_sections[~dx_sections.apply(lambda x: len(x[0]) == 0)]
    
    # drop NA in section part
    dx_sections = dx_sections[~dx_sections.apply(lambda x: len(''.join(x[1])) == 0)]
    
    # drop NA string form like 'None', 'none'
    string_NA = [['None'], ['none'], ['NA']]
    dx_sections = dx_sections[~dx_sections.apply(lambda x: x[1] in string_NA)]
    
    return dx_sections


# def eda_dx_section(notes: pd.Series, option='cnt') -> int:
#     notes = notes.apply(lambda x: extract_dx_section(text=x))
#     if option == 'cnt':
#         output = notes.apply(lambda x: len(x[0]) > 0).sum()
#     return output

In [160]:
LOAD_FILE_PATH = os.path.join(ROOT_PATH, 'NOTEEVENTS_SECTIONS.csv')
SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'dx_sections.csv')


def main_dx():
    
    data = load_csv_file(file_path=LOAD_FILE_PATH)
    print('Load NOTEEVENTS_SECTIONS successfully!', len(data))
    
    notes = data.TEXT.apply(lambda x: extract_dx_section(x))
    print('Extract dx sections succesfully!', len(notes))
    
    notes = preprocess_dx_sections(notes)
    print('Preprocess dx sections successfully!', len(notes))
    
    new_data   = pd.concat([data.ROW_ID, notes], axis=1)
    save_csv_file(csv_data=new_data, file_path=SAVE_FILE_PATH)

    
if __name__ == '__main__':
    main_dx()

Load NOTEEVENTS_SECTIONS successfully! 55176
Extract dx sections succesfully! 55176
Preprocess dx sections successfully! 36015
save successfully!


### 2-3. rx

In [161]:
RX_HEADERS = ['discharge medications']

In [162]:
'''
Sections --> RX_Sections

1. load NOTEEVENTS_SECTIONS.csv

2. extract rx_section
        
3. preprocess rx_section
    
4. save csv file
    
'''


# def extract_rx_section(text: str, tgt_headers=RX_HEADERS) -> List[List[str]]:
    
#     text = json.loads(text) # change string format to dict
#     headers, sections = text[0], text[1]
    
#     rx_headers, rx_sections = [], []
    
    
#     h_next = ['discharge condition']
#     pos_next = - 99999
    
#     for idx in range(len(headers)):
#         h = headers[idx]
#         if h in tgt_headers:
#             pos = headers.index(h)
#             dx_pos.append(pos) 
#             dx_headers.append(headers[pos])
#             dx_sections.append(sections[pos])
#         if h in h_next:
#             pos_next = headers.index(h_next[0])
    
#     if len(dx_pos) > 0:
#         if pos_next == max(dx_pos)+1 :
#             pass
#         else: 
#             dx_headers = []
#             dx_sections = []
    
#     return [dx_headers, dx_sections]

    
def extract_rx_section(text:str) -> List[List[str]]:
    
    text = json.loads(text) # change string format to dict
    headers, sections = text[0], text[1]
    
    rx_headers, rx_sections = [], []
    
    pos1, pos2, pos3, pos4 = -999, -999, -999, -999
    
    h1 = 'discharge medications'
    h2 = 'discharge disposition'
    h3 = 'discharge diagnosis'
    h4 = 'discharge condition'
    
    if h1 in headers:
        pos1 = headers.index(h1)
    if h2 in headers:
        pos2 = headers.index(h2)
    if h3 in headers:
        pos3 = headers.index(h3)
    if h4 in headers:
        pos4 = headers.index(h4)

    if pos1 + pos2 + pos3 + pos4 > 0: # have all together
        if pos1 < pos2 < pos3 < pos4: # well organized
            rx_headers = headers[pos1:pos2]
            rx_sections = sections[pos1:pos2]
#             rx_section = ' '.join(sections[pos1:pos2])
            
    return [rx_headers, rx_sections]


def preprocess_rx_sections(rx_sections: pd.Series) -> pd.Series:
    
    # drop NA in header part
    rx_sections = rx_sections[~rx_sections.apply(lambda x: len(x[0]) == 0)]
    
    # drop NA in section part
    rx_sections = rx_sections[~rx_sections.apply(lambda x: len(''.join(x[1])) == 0)]
    
    # drop NA string form like 'None', 'none'
    string_NA = [['None'], ['none'], ['NA']]
    rx_sections = rx_sections[~rx_sections.apply(lambda x: x[1] in string_NA)]
    
    return rx_sections

In [163]:
LOAD_FILE_PATH = os.path.join(ROOT_PATH, 'NOTEEVENTS_SECTIONS.csv')
SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'rx_sections.csv')


def main_rx():
    
    data = load_csv_file(file_path=LOAD_FILE_PATH)
    print('Load NOTEEVENTS_SECTIONS successfully!', len(data))
    
    notes = data['TEXT'].apply(lambda x: extract_rx_section(x))
    print('Extract rx sections succesfully!', len(notes))
    
    notes = preprocess_rx_sections(notes)
    print('Preprocess rx sections successfully!', len(notes))
    
    new_data   = pd.concat([data.ROW_ID, notes], axis=1)
    save_csv_file(csv_data=new_data, file_path=SAVE_FILE_PATH)

    
if __name__ == '__main__':
    main_rx()

Load NOTEEVENTS_SECTIONS successfully! 55176
Extract rx sections succesfully! 55176
Preprocess rx sections successfully! 35149
save successfully!


## 2. tokenize by scispacy and recover hadm_ids

**input:**
- result/xx_sections.csv

**output:**
- result/xx_sections.txt, result/xx_hadm_ids.txt
    - (1) px = procedures
    - (2) dx = diagnosis
    - (3) rx = prescriptions

In [164]:
# XX_SECTIONS = ['px', 'dx', 'rx']

In [281]:
from tqdm import tqdm, trange
import spacy, scispacy
nlp = spacy.load("en_core_sci_sm")


def save_txt_file(txt_file, save_file_path):
    with open(save_file_path, "w") as file:
        for txt in txt_file:
            file.write(txt)
            file.write('\n')
            file.write('\n')
    return print('save successfully!')


def preprocess_scispacy(nlp, section):
    tokenized_section = ' '.join([token.text for token in nlp(section)])
    return tokenized_section


# def recover_hadm_ids_from_noteevents(row_ids):
    
#     # load NOTEEVENTS.csv
#     noteevents = load_noteevents(file_path=NOTE_PATH)
#     noteevents = noteevents[['ROW_ID', 'HADM_ID']]
    
#     # convert ROW_ID to HADM_ID
#     hadm_ids = noteevents[noteevents['ROW_ID'].isin(target_df['ROW_ID'])].HADM_ID
#     hadm_ids = hadm_ids.astype(int).astype(str)
#     return hadm_ids


def convert_as_textual_data_form(data: str, xx_type: str):
    
    textual_data = ''
    
    # convert string to doubled-list
    text = ast.literal_eval(data)
    
    headers, sections = text[0], text[1]
    n_headers = len(headers)
    
    if xx_type == 'px':
        # n_headers = 1
        textual_data = sections[0]
        
    elif xx_type == 'dx':
        textual_data = ' '.join(sections) 
    
    else: # rx
        textual_data += (sections[0] + ' ')
        for idx in range(1, n_headers):
            textual_data += (headers[idx] + ' ')
            textual_data += (sections[idx] + ' ')
    return textual_data


def recover_hadm_ids(data_row_id: pd.Series, save_file_path: str):
    
     # load NOTEEVENTS.csv
    noteevents = load_noteevents(file_path=NOTE_PATH)
    noteevents = noteevents[['ROW_ID', 'HADM_ID']]
    
    # convert ROW_ID to HADM_ID
    hadm_ids = noteevents[noteevents['ROW_ID'].isin(data_row_id)].HADM_ID
    hadm_ids = hadm_ids.astype(int).astype(str)
    
    save_txt_file(txt_file=hadm_ids, save_file_path=save_file_path)
    print('save hadm_id.txt successfully!!', len(hadm_ids))


def tokenize_scispacy_sections(data_text: pd.Series, save_file_path: str):

    # tokenized by scispacy
    nlp = spacy.load("en_core_sci_sm")
    
    data_text_tokenized = data_text.copy()
#     data.TEXT = data_t.apply(lambda x: preprocess_scispacy(nlp, x))
    for i in trange(len(data_text)):
        data_text_tokenized.iloc[i] = preprocess_scispacy(nlp, data_text.iloc[i])
    
    # save section corpus
    save_txt_file(txt_file=data_text_tokenized, save_file_path=save_file_path)
    print('save section.txt successfully!!', len(data_text_tokenized))

### 2-1. px

In [275]:
NOTE_PATH = os.path.join('mimic_table','NOTEEVENTS.csv')
LOAD_FILE_PATH = os.path.join(ROOT_PATH, 'px_sections.csv')
ADM_SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'px_hadm_ids.txt')
SEC_SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'px_sections.txt')


def preprocess_px_sections_csv_files(load_file_path: str):
    
    data = load_csv_file(file_path=load_file_path)  # load data
    data = data[data['TEXT'].notna()]  # drop na
    data_text = data['TEXT'].apply(lambda x: convert_as_textual_data_form(data=x, xx_type='px'))  # convert
    
    data_row_id = data['ROW_ID']
    
    return data_row_id, data_text


def main_px_2():
    data_r, data_t = preprocess_px_sections_csv_files(load_file_path=LOAD_FILE_PATH)
    recover_hadm_ids(data_row_id=data_r, save_file_path=ADM_SAVE_FILE_PATH)
    tokenize_scispacy_sections(data_text=data_t, save_file_path=SEC_SAVE_FILE_PATH)
    
    
if __name__ == '__main__':
    main_px_2()



save successfully!
save hadm_id.txt successfully!! 27989


100%|██████████| 27989/27989 [04:11<00:00, 111.45it/s]

save successfully!
save section.txt successfully!! 27989





### 2-2. dx

In [279]:
NOTE_PATH = os.path.join('mimic_table','NOTEEVENTS.csv')
LOAD_FILE_PATH = os.path.join(ROOT_PATH, 'dx_sections.csv')
ADM_SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'dx_hadm_ids.txt')
SEC_SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'dx_sections.txt')


def preprocess_dx_sections_csv_files(load_file_path):
    
    data = load_csv_file(file_path=load_file_path)  # load data
    data = data[data['TEXT'].notna()]  # drop na
    data_text = data['TEXT'].apply(lambda x: convert_as_textual_data_form(data=x, xx_type='dx'))  # convert

    data_row_id = data['ROW_ID']
    
    return data_row_id, data_text

    
def main_dx_2():
    data_r, data_t = preprocess_dx_sections_csv_files(load_file_path=LOAD_FILE_PATH)
    recover_hadm_ids(data_row_id=data_r, save_file_path=ADM_SAVE_FILE_PATH)
    tokenize_scispacy_sections(data_text=data_t, save_file_path=SEC_SAVE_FILE_PATH)

    
if __name__ == '__main__':
    main_dx_2()



save successfully!
save hadm_id.txt successfully!! 36015


100%|██████████| 36015/36015 [06:05<00:00, 98.45it/s] 


save successfully!
save section.txt successfully!! 36015


### 2-3. rx

In [283]:
NOTE_PATH = os.path.join('mimic_table','NOTEEVENTS.csv')
LOAD_FILE_PATH = os.path.join(ROOT_PATH, 'rx_sections.csv')
ADM_SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'rx_hadm_ids.txt')
SEC_SAVE_FILE_PATH = os.path.join(ROOT_PATH, 'rx_sections.txt')


def preprocess_rx_sections_csv_files(load_file_path):
    
    data = load_csv_file(file_path=load_file_path)  # load data
    data = data[data['TEXT'].notna()]  # drop na
    data_text = data['TEXT'].apply(lambda x: convert_as_textual_data_form(data=x, xx_type='rx'))  # convert
    
    # length check
    data = data[data_text.apply(lambda x: len(x) > 200)]
    
    data_row_id = data['ROW_ID']
    data_text = data['TEXT']
    
    return data_row_id, data_text

    
def main_rx_2():
    data_r, data_t = preprocess_rx_sections_csv_files(load_file_path=LOAD_FILE_PATH)
    recover_hadm_ids(data_row_id=data_r, save_file_path=ADM_SAVE_FILE_PATH)
    tokenize_scispacy_sections(data_text=data_t, save_file_path=SEC_SAVE_FILE_PATH)
    
    
if __name__ == '__main__':
    main_rx_2()



save successfully!
save hadm_id.txt successfully!! 32218


100%|██████████| 32218/32218 [24:45<00:00, 21.68it/s]  


save successfully!
save section.txt successfully!! 32218


In [4]:

# def main():
#     data       = load_csv_file(file_path=LOAD_FILE_PATH)
#     # data = data.iloc[:10]
    
#     # not na 
#     data = data[data.TEXT.notna()]

#     # length > 200
#     data = data[data.TEXT.apply(lambda x: len(x) > 200)]

#     # delete ""
#     data1 = data.copy()
#     data1.TEXT = data.TEXT.apply(lambda x: x[1:-1])

#     # preprocessed by scispacy
#     nlp = spacy.load("en_core_sci_sm")
#     data.TEXT = data1.TEXT.apply(lambda x: preprocess_scispacy(nlp, x))
#     del data1
#     print('preprocess successfully!')

#     # recover and extract full info of data(subject_id, hamd_id)
#     noteevents = load_csv_file(file_path=NOTE_PAHT)
#     noteevents = noteevents[['ROW_ID', 'SUBJECT_ID', 'HADM_ID']]

#     # data=p / noteevents
#     data1 = noteevents[noteevents.ROW_ID.isin(data.ROW_ID)]
    
#     # save txt file
#     print('data len: {}', len(data))
#     save_txt_file(txt_file=data.TEXT, file_path=SEC_SAVE_FILE_PATH)
#     hadm_id = data1.HADM_ID.astype(int).astype(str)
#     print('data1 len: {}', len(hadm_id))
#     save_txt_file(txt_file=hadm_id, file_path=ADM_SAVE_FILE_PATH)


# if __name__ == '__main__':
#     main()