In [54]:
import pandas as pd
import os

from tqdm.auto import tqdm
from constants import *

# Base name (without the ".csv" extension) of the entities file that is
# loaded from ENTITIES_PATH further down.
filename = "20443150"

# Pairing window: a later entity in the same sentence is kept as a candidate
# only if its "EndWord" value is <= the first entity's "StartWord" + MAX_DIST.
# NOTE(review): presumably measured in word offsets -- confirm against the
# code that produces the entities CSV.
MAX_DIST = 20

In [55]:
# Entities for the selected document: discard the "Unnamed: 0" column and
# every row that has no "Word" value.
entities_csv_path = os.path.join(ENTITIES_PATH, filename + ".csv")
with open(entities_csv_path, "r") as entities_file:
    entities_df = (
        pd.read_csv(entities_file)
        .drop(columns="Unnamed: 0")
        .dropna(subset=["Word"])
    )

# Pipe-delimited TUI relation triples (FirstTUI | RelationTUI | EndTUI).
# index_col=False keeps pandas from using the first column as the index.
# NOTE(review): presumably needed because each line ends with a trailing "|"
# -- confirm against the SRSTRE1 file.
umls_relations_path = os.path.join(ROOT_PATH, "data", "2021AB_SN", "SRSTRE1")
with open(umls_relations_path) as umls_relations_file:
    umls_relations_df = pd.read_csv(
        umls_relations_file,
        sep="|",
        names=["FirstTUI", "RelationTUI", "EndTUI"],
        index_col=False,
    )

def get_UMLS_score(StartTUI, EndTUI, umls_relations_df):
    """Count the relation rows that link ``StartTUI`` to ``EndTUI``.

    Parameters
    ----------
    StartTUI
        Value compared for equality against the "FirstTUI" column.
    EndTUI
        Value compared for equality against the "EndTUI" column.
    umls_relations_df : pandas.DataFrame
        Relation table with at least the columns "FirstTUI" and "EndTUI";
        each row is counted as one relation between the two TUIs.

    Returns
    -------
    int
        Number of rows where "FirstTUI" == StartTUI and "EndTUI" == EndTUI
        (0 when there is no such row or the table is empty).
    """
    # Both conditions are evaluated on the same frame and combined
    # element-wise, so the count never depends on index alignment between
    # successive filters (which is fragile when index labels repeat).
    pair_mask = (umls_relations_df["FirstTUI"] == StartTUI) & (
        umls_relations_df["EndTUI"] == EndTUI
    )
    # int() turns the numpy integer from .sum() into a plain Python int.
    return int(pair_mask.sum())

In [56]:
# Build one record per ordered pair (first entity, later entity) where the
# later entity is in the same "Sentence" and its "EndWord" value is at most
# the first entity's "StartWord" + MAX_DIST.

# Explicit column list so relations_df has the expected columns even when no
# pair is found (an empty list of dicts would give a frame with no columns).
RELATION_COLUMNS = ["First", "End", "FirstWord", "EndWord", "FirstTUI", "EndTUI",
                    "FirstGroup", "EndGroup", "Distance", "NumUMLS"]

# Relabel rows 0..n-1 so that a row's label equals its position in
# entities_df; the label of a filtered row can then be stored as "End".
entities_by_pos = entities_df.reset_index(drop=True)

relations_dicts = []
# (first TUI, second TUI) -> number of UMLS relation rows. The same TUI pair
# recurs for many entity pairs, so each pair is looked up only once.
umls_score_cache = {}

for i in range(len(entities_by_pos)):
    first = entities_by_pos.iloc[i]
    later = entities_by_pos.iloc[i + 1:]
    # Both masks are computed on `later` itself, so no index alignment is needed.
    same_sentence = later["Sentence"] == first["Sentence"]
    within_window = later["EndWord"] <= first["StartWord"] + MAX_DIST
    valid_relations = later.loc[same_sentence & within_window]

    for j in range(len(valid_relations)):
        second = valid_relations.iloc[j]
        # Position of the second entity in entities_df. It is read from the
        # row label rather than derived as i + j + 1, because the kept rows
        # are not necessarily the ones directly following row i.
        end_pos = int(valid_relations.index[j])

        tui_pair = (first["TUI"], second["TUI"])
        if tui_pair not in umls_score_cache:
            umls_score_cache[tui_pair] = get_UMLS_score(tui_pair[0], tui_pair[1], umls_relations_df)

        relations_dicts.append({"First": i,
                                "End": end_pos,
                                "FirstWord": first["Word"],
                                "EndWord": second["Word"],
                                "FirstTUI": first["TUI"],
                                "EndTUI": second["TUI"],
                                "FirstGroup": first["Group"],
                                "EndGroup": second["Group"],
                                # Gap between the end of the first entity and the start of the second.
                                "Distance": second["StartWord"] - first["EndWord"],
                                "NumUMLS": umls_score_cache[tui_pair]})

relations_df = pd.DataFrame(relations_dicts, columns=RELATION_COLUMNS)

In [57]:
# Keep only the pairs whose two entity texts differ and for which at least
# one UMLS relation row was counted between their TUIs.
distinct_words = relations_df["FirstWord"] != relations_df["EndWord"]
has_umls_relation = relations_df["NumUMLS"] > 0
relations_df = relations_df.loc[distinct_words & has_umls_relation]

In [58]:
# Show the filtered relation table as the cell's output.
relations_df

Unnamed: 0,First,End,FirstWord,EndWord,FirstTUI,EndTUI,FirstGroup,EndGroup,Distance,NumUMLS
12,5,8,study,endoscopic deployment,T062,T052,PROC,ACTI,9,1
14,5,10,study,management,T062,T057,PROC,ACTI,18,1
22,7,8,efficacy,endoscopic deployment,T080,T052,CONC,ACTI,2,1
24,7,10,efficacy,management,T080,T057,CONC,ACTI,11,1
26,7,12,efficacy,sleeve gastrectomy,T080,T061,CONC,PROC,15,1
...,...,...,...,...,...,...,...,...,...,...
2197,676,680,sleeve gastrectomy,leak complication,T061,T046,PROC,DISO,12,4
2198,677,678,effective,minimally invasive option,T080,T061,CONC,PROC,1,1
2199,677,679,effective,management,T080,T057,CONC,ACTI,6,1
2201,678,679,minimally invasive option,management,T061,T057,PROC,ACTI,3,1
