In [9]:
import os
import re

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

import json

from pyvis.network import Network

from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm

from constants import *

In [10]:
cnt = 0
PER_FILE = 10
list_keep_df = []

for filename in os.listdir(RELATIONS_PATH):
    
    with open(os.path.join(RELATIONS_PATH, filename), "r") as f:
        rel_df = pd.read_csv(f).drop("Unnamed: 0", axis=1)
    
    if len(rel_df) == 0:
        continue
    rel_df = rel_df.loc[rel_df["ScoreUMLS"] == 0]
    keep_df, _ = train_test_split(rel_df, train_size=PER_FILE)
    list_keep_df.append(keep_df)
    
unlabeled_df = pd.concat(list_keep_df, axis=0)

In [11]:
unlabel = [0] * len(unlabeled_df)
unlabel_name = ["No Label"] * len(unlabeled_df)
unlabeled_df["Label Id"] = unlabel
unlabeled_df["Label Full"] = unlabel_name

In [12]:
with open("relations_full.csv", "r") as f:
    labeled_df = pd.read_csv(f).drop("Unnamed: 0", axis=1)

labeled_df = labeled_df.sort_values("Document", axis=0)

label2id = {}
i = 1

for l in labeled_df["Label"].unique():
    label2id[l] = i
    i += 1

In [13]:
labeled_df = labeled_df.rename(columns={"Label": "Label Full"})
labeled_df["Label Id"] = labeled_df["Label Full"].apply(lambda x: label2id[x])

In [14]:
labeled_df

Unnamed: 0,First,End,FirstWord,EndWord,FirstCUI,EndCUI,Distance,ScoreUMLS,Sentence,Document,Label Full,Label Id
4806,342,343,embryological,avascular plane,C0013943,C0682475,1,0,57,19912285,QB,1
4832,1387,1388,neoadjuvant,adjuvant treatment,C0600558,C0600558,2,1,213,19912285,SY,2
4831,574,575,tumour,treatment,C0027651,C0039798,3,0,82,19912285,AQ,3
4830,570,574,tumour,treatment,C0027651,C0039798,19,1,82,19912285,AQ,3
4829,488,489,mortality,death,C0026565,C0011065,4,0,71,19912285,RO,4
...,...,...,...,...,...,...,...,...,...,...,...,...
10454,102,107,disease,tumour,C0012634,C0027651,10,1,14,35472516,CHD,9
10453,102,107,disease,tumour,C0012634,C0027651,10,1,14,35472516,RN,8
10450,43,48,severity,severe,C0439793,C0205082,14,1,7,35472516,RO,4
10462,200,206,leucocytosis,treatment,C0023518,C0039798,18,1,25,35472516,AQ,3


In [15]:
relations_full_df = pd.concat([labeled_df, unlabeled_df], axis=0).reset_index(drop=True).sort_values(["Document", "Sentence", "First"])

In [16]:
relations_full_df

Unnamed: 0,First,End,FirstWord,EndWord,FirstCUI,EndCUI,Distance,ScoreUMLS,Sentence,Document,Label Full,Label Id
31,199,202,registry,Registry,C0034975,C0034975,7,0,36,19912285,SY,2
28,276,277,control patient,patient,C0030705,C0030705,5,0,47,19912285,SY,2
30,276,277,control patient,patient,C0030705,C0030705,5,0,47,19912285,RQ,5
18252,276,279,control patient,AL,C0030705,C0001895,15,0,47,19912285,No Label,0
27,339,345,total mesorectal excision,rectum,C1273428,C0034896,17,1,57,19912285,RO,4
...,...,...,...,...,...,...,...,...,...,...,...,...
17643,195,202,quality,pus,C0332306,C0034161,13,0,25,35472516,No Label,0
16233,199,206,fever,treatment,C0015967,C0039798,19,0,25,35472516,AQ,3
16243,200,206,leucocytosis,treatment,C0023518,C0039798,18,1,25,35472516,AQ,3
17638,227,230,early identification,reduce,C0814435,C0392756,7,0,28,35472516,No Label,0


In [17]:
sentences_full_list = []

last_document = None

for first_id, end_id, first_word, second_word, document, sentence in tqdm(list(relations_full_df[["First", "End", "FirstWord", "EndWord", "Document", "Sentence"]].itertuples(index=False, name=None))):
    
    # Grabbing text to build sentences for the current document (should not happen too much, relations are ordered by document id)
    if not last_document or last_document != document:
        last_document = document
        
        with open(os.path.join(DATA_CLEAN_PATH, str(last_document) + ".txt"), "r") as f:
            text = f.read()
        
        with open(os.path.join(ENTITIES_PATH, str(last_document) + ".csv"), "r") as f:
            entities_df = pd.read_csv(f)
        
        sentences = []
        cursor = 0
        last_end = 0
        
        for word, start_char, end_char in list(entities_df[["Word", "StartChar", "EndChar"]].itertuples(index=False, name=None)):
            dot = re.search("\.", text[cursor:start_char])
            if dot:
                sentences.append({"Text": text[last_end:cursor + dot.span()[0]],
                                  "StartChar": last_end,
                                  "EndChar": cursor + dot.span()[0]})
                last_end = cursor + dot.span()[1]
            else:
                pass
            cursor = end_char
        
        sentences_df = pd.DataFrame(sentences)
    
    if sentence >= len(sentences_df):
        sentences_full_list.append(None)
        continue
        
    sent_text = sentences_df.iloc[sentence]["Text"]
    sent_start = sentences_df.iloc[sentence]["StartChar"]
    sent_end = sentences_df.iloc[sentence]["EndChar"]

    first_start_char = entities_df.iloc[first_id]["StartChar"]
    first_end_char = entities_df.iloc[first_id]["EndChar"]
    second_start_char = entities_df.iloc[end_id]["StartChar"]
    second_end_char = entities_df.iloc[end_id]["EndChar"]

    sentence_full = "[CLS] " + sent_text[:first_start_char - sent_start].strip() + \
                    " <e1>" + str(first_word) + "</e1>" + \
                    sent_text[first_end_char - sent_start:second_start_char - sent_start] + \
                    "<e2>" + str(second_word) + "</e2> " + \
                    sent_text[second_end_char - sent_start:].strip() + " [SEP]"
    sentences_full_list.append(sentence_full)
        
relations_full_df["Sentence Indexed"] = sentences_full_list
labels_full_list = list(relations_full_df["Label Id"].values)

  0%|          | 0/23255 [00:00<?, ?it/s]

In [19]:
entities_df

Unnamed: 0.1,Unnamed: 0,Word,Type,Source,StartChar,EndChar,Document,CUI,Score,TUI,Group,StartWord,EndWord,Sentence
0,0,meta-analysis,ENTITY,SciSpacy MD,18,31,35472516,C0282458,1.0,T170,CONC,4,4,0
1,1,systematic review,ENTITY,SciSpacy MD,36,53,35472516,C1955832,1.0,T170,CONC,6,7,0
2,2,early detection,ENTITY,SciSpacy MD,61,76,35472516,C0596473,0.9999999403953552,T060,PROC,10,11,0
3,3,anastomotic leakage,ENTITY,SciSpacy MD,80,99,35472516,C0919691,0.9999998807907104,T046,DISO,13,14,0
4,4,colorectal surgery,ENTITY,SciSpacy MD,103,121,35472516,C0009369,1.0,T091,OCCU,16,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,321,Commentary/Letter,ENTITY,SciSpacy MD,6696,6713,35472516,UNDEF,UNDEF,UNDEF,UNDEF,1005,1005,42
322,322,Editor,ENTITY,SciSpacy MD,6721,6727,35472516,C1707883,0.9999999403953552,T090,OCCU,1008,1008,42
323,323,author,ENTITY,SciSpacy MD,6733,6740,35472516,C3812881,1.0,T097,LIVB,1011,1011,43
324,324,manuscript,ENTITY,SciSpacy MD,6769,6779,35472516,C0600659,1.0,T073,OBJC,1017,1017,43


In [11]:
relations_full_df = relations_full_df.dropna(axis=0, subset="Sentence Indexed")

In [12]:
TRAIN_CLASS = 500
TEST_CLASS = 100

df_train, df_test = train_test_split(relations_full_df.loc[relations_full_df["Label Id"] == 0], train_size=TRAIN_CLASS, test_size=TEST_CLASS)

for i in range(1, len(relations_full_df["Label Id"].unique())):
    df_train_i, df_test_i = train_test_split(relations_full_df.loc[relations_full_df["Label Id"] == i], train_size=TRAIN_CLASS, test_size=TEST_CLASS)
    df_train = pd.concat([df_train, df_train_i], axis=0)
    df_test = pd.concat([df_test, df_test_i], axis=0)

In [13]:
json_train_sentences = json.dumps(list(df_train["Sentence Indexed"].values))
json_train_labels = json.dumps([int(x) for x in df_train["Label Id"].values])

with open("train_sentence.json", "w") as f:
    f.write(json_train_sentences)

with open("train_label_id.json", "w") as f:
    f.write(json_train_labels)
    
json_test_sentences = json.dumps(list(df_test["Sentence Indexed"].values))
json_test_labels = json.dumps([int(x) for x in df_test["Label Id"].values])

with open("test_sentence.json", "w") as f:
    f.write(json_test_sentences)

with open("test_label_id.json", "w") as f:
    f.write(json_test_labels)

In [15]:
relations_full_df.sample(n=10)

Unnamed: 0,First,End,FirstWord,EndWord,FirstCUI,EndCUI,Distance,ScoreUMLS,Sentence,Document,Label Full,Label Id,Sentence Indexed
22584,510,511,risk,bias,C0035647,C0005346,2,0,79,33555423,No Label,0,[CLS] Twelve studies had low risk of bias and ...
4889,111,115,bariatric surgery,bariatric procedure,C1456587,C1456587,16,0,15,26341085,SY,2,[CLS] Trend analyses on <e1>bariatric surgery<...
12858,11,12,endoscopic technique,treat,C0025664,C0087111,3,0,2,33403464,QB,1,[CLS] Aim of the present paper is to describe ...
4197,1052,1053,stomal site,wound ostomy,C1955856,C0029473,4,0,176,25633276,RQ,5,[CLS] <e1>stomal site</e1><e2>wound ostomy</e...
22094,578,579,esophageal,endomaxx,C1522619,UNDEF,3,0,74,33730227,No Label,0,[CLS] One patient began with an Evolution sten...
15169,521,525,program,single program,C0376691,C0376691,13,1,63,34519893,SY,2,"[CLS] Concerning patient-related variables, pa..."
20180,140,143,3-cm,seromuscular layer,UNDEF,C0225358,7,0,30,34270974,No Label,0,[CLS] <e1>3-cm</e1><e2>seromuscular layer</e2...
10493,251,254,die,patient death,C0011065,C0011065,9,0,30,32052298,SY,2,[CLS] The majority <e1>die</e1>the patients wh...
1519,740,742,re-anastomosis,anastomosis,C0677554,C0332853,8,0,102,22688419,RO,4,[CLS] <e1>re-anastomosis</e1> without hesitat...
2879,685,693,evaluation,assess,C0220825,C1516048,18,0,87,24002761,RO,4,[CLS] While we have continued to obtain a cont...


In [16]:
with open(os.path.join(DATA_CLEAN_PATH, "25633276.txt"), "r") as f:
    text = f.read()

print(text)

The role of fecal diversion using a loop ileostomy in patients undergoing rectal resection and anastomosis is controversial. There has been conflicting evidence on the perceived benefit vs. the morbidity of a defunctioning stoma. This is a review of the relevant surgical literature evaluating the risks, benefits, and costs of constructing a diverting ileostomy in current colorectal surgical practice.
Retrospective and prospective articles spanning the past 50 years were reviewed to identify the definition of an anastomotic leak, evaluate risk factors for AL, and assess methods of evaluation of the anastomosis. We then pooled the evidence for and against fecal diversion, the incidence and consequences of stomal complications, and the evidence comparing loop ileostomy vs. loop colostomy as the optimal method of fecal diversion.
Evidence shows that despite the fact that fecal diversion does not decrease postoperative mortality, it does significantly decrease the risk of anastomotic leak a

In [21]:
relations_full_df.loc[relations_full_df["Document"] == 25633276].sort_values("First").reset_index(drop=True)

Unnamed: 0,First,End,FirstWord,EndWord,FirstCUI,EndCUI,Distance,ScoreUMLS,Sentence,Document,Label Full,Label Id,Sentence Indexed
0,10,12,review,surgical literature,C0282443,C0023866,4,0,3,25633276,No Label,0,[CLS] This is a <e1>review</e1> of the relevan...
1,19,20,retrospective,prospective article,C0035363,C0023981,2,0,4,25633276,PAR,7,[CLS] <e1>retrospective</e1> and <e2>prospect...
2,25,28,evaluate,assess,C0220825,C1516048,6,0,4,25633276,RO,4,[CLS] Retrospective and prospective articles s...
3,28,30,assess,evaluation,C1516048,C0220825,3,0,4,25633276,RO,4,[CLS] Retrospective and prospective articles s...
4,90,91,trend,colorectal surgery,C0040833,C0009369,2,0,13,25633276,QB,1,[CLS] Some of the most prevalent <e1>trend</e1...
5,115,116,pelvic anastomosis,anastomotic leak,C0677554,C0919691,3,0,16,25633276,RO,4,[CLS] The most feared and devastating complica...
6,252,256,retrospective analysis,30-year,UNDEF,UNDEF,10,0,37,25633276,No Label,0,"[CLS] Conversely, a <e1>retrospective analysis..."
7,383,385,five,endoscopic testing,UNDEF,C0014245,6,0,57,25633276,No Label,0,[CLS] A systematic review of <e1>five</e1> non...
8,407,410,intraoperative,positive test,C0456904,C1446409,8,0,60,25633276,No Label,0,[CLS] It should be noted that the AL rate in t...
9,431,433,test arm,intervention,C0022885,C0184661,6,1,65,25633276,PAR,7,[CLS] Seven patients in the <e1>test arm</e1> ...
