In [1]:
# !!! Only For Google Colab Debugging
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My Drive/lxmert/data/

Mounted at /content/gdrive
/content/gdrive/My Drive/lxmert/data


In [2]:
import pandas as pd
from collections import OrderedDict

In [3]:
def read_txt_by_df(filepath, mapping: OrderedDict):
    df = pd.read_csv(filepath_or_buffer=filepath,
                    sep='\t',
                     
                    names=list(mapping.keys()),
                    dtype=mapping,
                    skiprows=[0]) # skip rows 0
    return df

In [4]:
entity_mapping = OrderedDict({'entity':str, 'id': int})
triple_mapping = OrderedDict({'h':int, 't':int, 'r':int})

In [5]:
df_entity = read_txt_by_df(filepath='entity2id.txt', mapping=entity_mapping)
df_triple = read_txt_by_df(filepath='train2id.txt', mapping=triple_mapping)

dosage(relation_id=2) 관련 literal 값 추출

In [6]:
def filter_literal_by_relation(triple: pd.DataFrame,
                               entity: pd.DataFrame,
                               relation_id: int) -> pd.DataFrame:
    t_in_triples_when_rid = triple[triple.r == relation_id].t.values
    literals = entity[entity.id.isin(t_in_triples_when_rid)]
    literal = literals.copy()
    literal.entity = literals.entity.apply(lambda x: x.split('^^')[0].strip())
    return literal

In [7]:
literal_r2 = filter_literal_by_relation(triple=df_triple, entity=df_entity, relation_id=2)

In [8]:
len(literal_r2)

3190

단위별 값 범위 추출

In [9]:
literal_vals = literal_r2.entity

In [10]:
def is_digit(str):
    try:
        float(str)
        return True
    except ValueError:
        return False

In [11]:
def replace_all(text: str, dic: dict) -> str:
    for i, j in dic.items():
        text = text.replace(i, j)
    return text

In [12]:
def seperate_string_number(string):
    previous_character = string[0]
    groups = []
    newword = string[0]
    for x, i in enumerate(string[1:]):
        if i.isalpha() and previous_character.isalpha():
            newword += i
        elif i.isnumeric() and previous_character.isnumeric():
            newword += i
        else:
            groups.append(newword)
            newword = i

        previous_character = i

        if x == len(string) - 2:
            groups.append(newword)
            newword = ''
    return groups

In [13]:
def seperate_unitstr_literalval(string):
    previous_character = string[0]
    groups = []
    newword = string[0]
    for x, i in enumerate(string[1:]):
        unitstr_cond1 = (i.isalpha() or i in ['/', ' '])
        unitstr_cond2 = (previous_character.isalpha() or previous_character in ['/', ' '])
        numeric_cond1 = (i.isnumeric() or i in ['-', '.', ','])
        numeric_cond2 = (previous_character.isnumeric() or previous_character in ['-', '.', ','])
        if unitstr_cond1 and unitstr_cond2:
            newword += i
        elif numeric_cond1 and numeric_cond2:
            newword += i
        else:
            groups.append(newword.strip())
            newword = i

        previous_character = i

        if x == len(string) - 2:
            groups.append(newword.strip())
            newword = ''
    return groups

In [14]:
def preprocess_literal_vals(literal_val: str) -> (str, str, bool):
    exception_dict = {' ': '', '_': '', 'about': '', '--': '-'}
    
    string = replace_all(literal_val, exception_dict)
    groups = seperate_unitstr_literalval(string)
    
    num_val, unit_str, data_state = None, None, False
    
    if len(groups) == 1:
        groups = groups[0]
        num_val, unit_str = groups[0], groups[1]
        
    elif len(groups) == 2:
        n_dash = len(groups[0].split('-'))
        if (n_dash <= 2) & (groups[0][0].isnumeric()):
            num_val, unit_str, data_state = groups[0], groups[1], True
        
    elif len(groups) == 3:
        if groups[0].isalpha():
            num_val, unit_str, data_state = groups[1], groups[2], True
            
    elif len(groups) == 4:
        if 'to' in groups:
            num_val, unit_str, data_state = groups[0]+'-'+groups[2], groups[-1], True
        elif (groups[1] == groups[-1]) & (groups[2][0] == '-'):
            num_val, unit_str, data_state = groups[0]+groups[2], groups[-1], True

    else: # len(groups) = 5
        pass
    
    if data_state:
        correction_dict = {'to': '-', ',': ''}
        num_val = replace_all(num_val, correction_dict)
    
    return literal_val, num_val, unit_str, data_state

In [15]:
literal_vals_p = literal_vals.apply(lambda x: preprocess_literal_vals(x))

In [16]:
def organize_literal_vals_p(df_p):
    df = pd.DataFrame()
    df['origin_val'] = df_p.apply(lambda x: x[0])
    df['range_val_s'] = None
    df['range_val_e'] = None
    df['num_val'] = df_p.apply(lambda x: x[1])
    df['unit_str'] = df_p.apply(lambda x: x[2])
    df['data_state'] = df_p.apply(lambda x: x[3])
    df['range_yn'] = False
    
    # df1 = state_True
    df1 = df[df.data_state == True].copy()
    df1['range_val_s'] = df1.num_val.apply(lambda x: x.split('-')[0] if len(x.split('-'))==2 else None)
    df1['range_val_e'] = df1.num_val.apply(lambda x: x.split('-')[1] if len(x.split('-'))==2 else None)
    df1.loc[~df1['range_val_s'].isna(), 'num_val'] = None
    df1.loc[~df1['range_val_s'].isna(), 'range_yn'] = True
    
    output = df.copy()
    output.loc[df1.index] = df1
    return output

In [17]:
result = organize_literal_vals_p(literal_vals_p)

In [18]:
result

Unnamed: 0,origin_val,range_val_s,range_val_e,num_val,unit_str,data_state,range_yn
6,100ml,,,100,ml,True,False
10,40-60meq,40,60,,meq,True,True
26,1000mg,,,1000,mg,True,False
32,250ml,,,250,ml,True,False
36,0unit,,,0,unit,True,False
...,...,...,...,...,...,...,...
2901109,55gm,,,55,gm,True,False
2901110,1mcg/ml,,,1,mcg/ml,True,False
2901112,32units,,,32,units,True,False
2901114,208.33ml,,,208.33,ml,True,False


In [19]:
result[['range_val_s', 'range_val_e', 'num_val']].min(level=0)

Unnamed: 0,range_val_s,range_val_e,num_val
6,,,100
10,40,60,
26,,,1000
32,,,250
36,,,0
...,...,...,...
2901109,,,55
2901110,,,1
2901112,,,32
2901114,,,208.33


In [20]:
def check_min_max_val_of_each_unit(df):
    df = df[df.data_state == True]
    df_v = df[df.range_yn == False]
    df_r = df[df.range_yn == True]
    
    v1 = df_r[['range_val_s','range_val_e']].min(axis=1)
    v2 = df_v['num_val']
    v = pd.DataFrame(pd.concat([v1,v2]), columns=['val'])
    
    output = df[['unit_str']].join(v, how='left')
    output_cnt = output.groupby(['unit_str']).count().rename(columns={'val':'cnt'})
    output_min = output.groupby(['unit_str']).min().rename(columns={'val':'min_val'})
    output_max = output.groupby(['unit_str']).max().rename(columns={'val':'max_val'})
    return pd.concat([output_cnt, output_min, output_max], axis=1)

In [21]:
result2 = check_min_max_val_of_each_unit(result)

In [22]:
units = result2.index
try:
  cnt=0
  for (x,y,z) in result2.values:
    test =  [float(x),float(y),float(z)] 
    cnt+=1
except:
  print(x)
  print(y)
  print(z)

  #ranges = [[float(x),float(y),float(z)] for (x,y,z) in result2.values]

14

3


In [23]:
result2

Unnamed: 0_level_0,cnt,min_val,max_val
unit_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aero,3,1,80
amp,5,0.5,3
appl,13,0,8
bag,14,,3
bags,2,1,1
...,...,...,...
unit,580,,9900
unit/hr,1,3,3
units,17,0,80
vial,16,0,6


In [24]:
result[result['unit_str']=='bag']

Unnamed: 0,origin_val,range_val_s,range_val_e,num_val,unit_str,data_state,range_yn
1908,2bag,,,2.0,bag,True,False
2803,1bag,,,1.0,bag,True,False
89003,1-2bag,1.0,2.0,,bag,True,True
766627,1-bag,1.0,,,bag,True,True
1772320,0.5bag,,,0.5,bag,True,False
2596907,1.5bag,,,1.5,bag,True,False
2746728,11-2bag,11.0,2.0,,bag,True,True
2790404,1-1bag,1.0,1.0,,bag,True,True
2832367,0bag,,,0.0,bag,True,False
2864475,3bag,,,3.0,bag,True,False


In [38]:
import torch
a = torch.zeros(5)
b = torch.tensor([True,False,True,False,False],dtype=torch.bool)
#print(a)
torch.where(b,a,torch.tensor(100).type_as(a))
#print(a.masked_fill_(b,value=True))

tensor([  0., 100.,   0., 100., 100.])