In [None]:
ENTITY_FILE_PATH = '/entity2id.txt'
TRIPLE_FILE_PATH = '/train2id.txt'

In [2]:
import pandas as pd
from collections import OrderedDict

In [3]:
def read_txt_by_df(filepath, mapping: OrderedDict):
    df = pd.read_csv(filepath_or_buffer=filepath,
                    sep='\t',
                    names=list(mapping.keys()),
                    dtype=mapping,
                    skiprows=[0]) # skip rows 0
    return df

In [4]:
entity_mapping = OrderedDict({'entity':str, 'id': int})
triple_mapping = OrderedDict({'h':int, 't':int, 'r':int})

In [5]:
df_entity = read_txt_by_df(filepath=ENTITY_FILE_PATH, mapping=entity_mapping)
df_triple = read_txt_by_df(filepath=TRIPLE_FILE_PATH, mapping=triple_mapping)

In [6]:
def filter_literal_by_relation(triple: pd.DataFrame,
                               entity: pd.DataFrame,
                               relation_id: int) -> pd.DataFrame:
    t_in_triples_when_rid = triple[triple.r == relation_id].t.values
    literals = entity[entity.id.isin(t_in_triples_when_rid)]
    literal = literals.copy()
    literal.entity = literals.entity.apply(lambda x: x.split('^^')[0].strip())
    return literal

In [7]:
literal_r2 = filter_literal_by_relation(triple=df_triple, entity=df_entity, relation_id=2)

In [8]:
len(literal_r2)

3190

단위별 값 범위 추출

In [9]:
literal_vals = literal_r2.entity

In [10]:
def is_digit(str):
    try:
        float(str)
        return True
    except ValueError:
        return False

In [11]:
def replace_all(text: str, dic: dict) -> str:
    for i, j in dic.items():
        text = text.replace(i, j)
    return text

In [12]:
def seperate_string_number(string):
    previous_character = string[0]
    groups = []
    newword = string[0]
    for x, i in enumerate(string[1:]):
        if i.isalpha() and previous_character.isalpha():
            newword += i
        elif i.isnumeric() and previous_character.isnumeric():
            newword += i
        else:
            groups.append(newword)
            newword = i

        previous_character = i

        if x == len(string) - 2:
            groups.append(newword)
            newword = ''
    return groups

In [13]:
def seperate_unitstr_literalval(string):
    previous_character = string[0]
    groups = []
    newword = string[0]
    for x, i in enumerate(string[1:]):
        unitstr_cond1 = (i.isalpha() or i in ['/', ' '])
        unitstr_cond2 = (previous_character.isalpha() or previous_character in ['/', ' '])
        numeric_cond1 = (i.isnumeric() or i in ['-', '.', ','])
        numeric_cond2 = (previous_character.isnumeric() or previous_character in ['-', '.', ','])
        if unitstr_cond1 and unitstr_cond2:
            newword += i
        elif numeric_cond1 and numeric_cond2:
            newword += i
        else:
            groups.append(newword.strip())
            newword = i

        previous_character = i

        if x == len(string) - 2:
            groups.append(newword.strip())
            newword = ''
    return groups

In [14]:
def preprocess_literal_vals(literal_val: str) -> (str, str, bool):
    exception_dict = {' ': '', '_': '', 'about': '', '--': '-'}
    
    string = replace_all(literal_val, exception_dict)
    groups = seperate_unitstr_literalval(string)
    
    num_val, unit_str, data_state = None, None, False
    
    if len(groups) == 1:
        groups = groups[0]
        num_val, unit_str = groups[0], groups[1]
        
    elif len(groups) == 2:
        n_dash = len(groups[0].split('-'))
        if (n_dash <= 2) & (groups[0][0].isnumeric()):
            if groups[0][-1] == '-':
                groups[0] = groups[0][:-1]
            num_val, unit_str, data_state = groups[0], groups[1], True
        
    elif len(groups) == 3:
        if groups[0].isalpha():
            num_val, unit_str, data_state = groups[1], groups[2], True
            
    elif len(groups) == 4:
        if 'to' in groups:
            num_val, unit_str, data_state = groups[0]+'-'+groups[2], groups[-1], True
        elif (groups[1] == groups[-1]) & (groups[2][0] == '-'):
            num_val, unit_str, data_state = groups[0]+groups[2], groups[-1], True

    else: # len(groups) = 5
        pass
    
    if data_state:
        correction_dict = {'to': '-', ',': ''}
        num_val = replace_all(num_val, correction_dict)
    
    return literal_val, num_val, unit_str, data_state

In [15]:
literal_vals_p = literal_vals.apply(lambda x: preprocess_literal_vals(x))

In [16]:
def organize_literal_vals_p(df_p):
    df = pd.DataFrame()
    df['origin_val'] = df_p.apply(lambda x: x[0])
    df['range_val_s'] = None
    df['range_val_e'] = None
    df['num_val'] = df_p.apply(lambda x: x[1])
    df['unit_str'] = df_p.apply(lambda x: x[2])
    df['data_state'] = df_p.apply(lambda x: x[3])
    df['range_yn'] = False
    
    # df1 = state_True
    df1 = df[df.data_state == True].copy()
    df1['range_val_s'] = df1.num_val.apply(lambda x: x.split('-')[0] if len(x.split('-'))==2 else None)
    df1['range_val_e'] = df1.num_val.apply(lambda x: x.split('-')[1] if len(x.split('-'))==2 else None)
    df1.loc[~df1['range_val_s'].isna(), 'num_val'] = None
    df1.loc[~df1['range_val_s'].isna(), 'range_yn'] = True
    
    output = df.copy()
    output.loc[df1.index] = df1
    return output

In [17]:
result = organize_literal_vals_p(literal_vals_p)

In [18]:
def check_min_max_val_of_each_unit(df):
    df = df[df.data_state == True] # get only valid data
    df_v = df[df.range_yn == False] # only value
    df_r = df[df.range_yn == True] # only range
    
    df_v = df_v[['num_val', 'unit_str']]
    df_rs = df_r[['range_val_s','unit_str']].rename(columns={'range_val_s':'num_val'})
    df_re = df_r[['range_val_e','unit_str']].rename(columns={'range_val_e':'num_val'})
    
    df_tgt = pd.concat([df_v, df_rs, df_re], axis=0).sort_index()
    
    output_cnt = df_tgt.groupby(['unit_str']).count().rename(columns={'num_val':'cnt'})
    output_min = df_tgt.groupby(['unit_str']).min().rename(columns={'num_val':'min_val'})
    output_max = df_tgt.groupby(['unit_str']).max().rename(columns={'num_val':'max_val'})
    
    return pd.concat([output_cnt, output_min, output_max], axis=1)

In [19]:
result2 = check_min_max_val_of_each_unit(result)

In [20]:
result2 = result2.sort_values(by='cnt', ascending=False)
result2.head(30)

Unnamed: 0_level_0,cnt,min_val,max_val
unit_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mg,1514,0.0,9950.0
unit,862,0.0,9900.0
ml,476,0.0,94.27
mcg,271,0.0,92.5
gm,100,0.0,90.0
puff,90,0.0,9.0
meq,82,0.0,95.0
g,78,0.0,90.0
mgpe,39,100.0,900.0
drop,36,0.0,9.0


---

Information of dosage in KG: it comes from PRESCRIPTIONS table, especially in DOSE_VAL_RX, DOSE_UNIT_RX column

In [21]:
# pre-defined units name in PRESCRIPTIONS table in MIMIC-III
UNITS = ['-', '_unit', 'aero', 'amp', 'appl', 'bag', 'bags', 'bar', 'box',
       'btl', 'bulk', 'cadd', 'can', 'cap', 'cart', 'con', 'conc', 'crea',
       'dbtl', 'dev', 'dose', 'drop', 'drp', 'ea', 'ene', 'enema', 'g',
       'g/118ml', 'gm', 'gr', 'gtt', 'in', 'inh', 'inj', 'iu', 'jar',
       'kit', 'l', 'liq', 'lot', 'loz', 'mcg', 'mcg/h', 'mcg/hr',
       'mcg/kg', 'mcg/kg/min', 'mcg/ml', 'md to order daily dose', 'meq',
       'mg', 'mg pe', 'mg/24h', 'mg/250 ml', 'mg/50 ml', 'mg/day',
       'mg/hr', 'mg/kg', 'mg/kg/hr', 'mg/min', 'mg/ml', 'million units',
       'ml', 'ml/hr', 'ml/syringe', 'mmol', 'mu', 'neb', 'none', 'oint',
       'pack', 'pad', 'pe (phenytoin sodium equivalent)', 'pkg', 'pkt',
       'ptch', 'puff', 'puff(s)', 'pwd', 'ring', 'scp', 'soln', 'spry',
       'stck', 'strp', 'supp', 'susp', 'syr', 'syrp', 'tab', 'trmt',
       'troc', 'tube', 'udcup', 'unit', 'unit/hr', 'unit/min', 'units',
       'vial', 'waf']