<a href="https://colab.research.google.com/github/soulofshadow/KELM_for_UMLS/blob/main/KELM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries and dataset

In [1]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount("/content/drive")

# Root directory of this project inside your Google Drive; change it to match your own layout.
path="/content/drive/My Drive/Colab_Notebooks/KELM"


import os
import sys
# Work from the project root and add it to the import path so that project modules can be imported.
os.chdir(path)
sys.path.append(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
import ast
import json
import os

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
# Register tqdm with pandas; this provides the progress_apply method used later in the notebook.
tqdm.pandas()

# Folder holding the dataset files, located under the project root `path`.
dataset_path = os.path.join(path, "data")

#### Load text datasets

In [None]:
'''
Load the text dataset.

Each row of the dataset has the shape
{ 'text': the original sentence
  'entity': a list of entities extracted using MetaMap
}

Each entity is a dict with {'cui': the CUI identifier of this entity,
                            'name': the name of this entity,
                            'type': the type of this entity,
                            'pos': the position of this entity in the sentence
                                (when it appears several times, only one instance is listed),
                            'triger': the substring of the sentence that was recognised as this entity
                            }
'''

text_file = 'test.csv'
text_path = os.path.join(dataset_path, text_file)

# The CSV carries its saved index as an 'Unnamed: 0' column; discard it right after reading.
sentences = pd.read_csv(text_path).drop(columns=['Unnamed: 0'])

# Alternative loader for a headerless, tab-separated file with an id column followed by the text:
# sentences = pd.read_csv(text_path, header=None, sep="\t")
# sentences.columns = ['idk','text']
# sentences = sentences.drop(['idk'], axis = 1)

In [None]:
# Preview the first rows of the loaded sentences.
sentences.head()

In [5]:
# Number of rows (sentences) in the dataset.
len(sentences)

6385

In [None]:
# Missing values in the 'triplets' column come back from the CSV as NaN. Cast to object
# first so that the None placeholder is kept; on a numeric (all-NaN) column pandas
# would turn None straight back into NaN.
sentences['triplets'] = sentences['triplets'].astype(object).where(sentences['triplets'].notnull(), None)

# The 'entity' column is read as text; parse each cell back into a list of dicts.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot execute
# code embedded in the CSV. Cells that are not strings (already parsed) are left
# untouched so this cell can be re-run safely.
sentences['entity'] = sentences['entity'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

#### If there is no triplets.csv file, build the triplets from UMLS

In [5]:
# Load UMLS from the dataset folder.
# The relations it provides (from the MRREL.RRF file) are what the triplets are built from.
from utils.load_umls import UMLS
umls = UMLS(dataset_path)

8751471it [02:22, 61570.64it/s]


cui count: 3695485
str2cui count: 13396819
MRCONSO count: 6131827


25369590it [02:33, 165162.42it/s]


rel count: 18702888


4010842it [00:18, 216076.72it/s]

sty count: 3695485





In [6]:
# Turn every UMLS relation record into a [subject, object, relation] triplet.
# A record is kept only if, once stripped, it splits into exactly four tab-separated
# fields; the first, second and fourth fields are used and the third is ignored.
split_records = (record.strip().split("\t") for record in umls.rel)
triplets = [
    [fields[0], fields[1], fields[3]]
    for fields in split_records
    if len(fields) == 4
]

print("triplets count:", len(triplets))

triplets count: 11110308


In [8]:
# Convert the list of [subject, object, relation] lists into a DataFrame (one triplet per row).
triplets = pd.DataFrame(triplets)

In [9]:
# Name the columns, then remove repeated rows. drop_duplicates() returns a new frame
# instead of modifying the existing one, so its result has to be assigned back.
triplets.columns = ['subject', 'object', 'relation']
triplets = triplets.drop_duplicates()

# Save into the data folder, which is where the "Load triplets" cell reads triplets.csv from.
# The index is written as well; the loading cell drops it again as 'Unnamed: 0'.
triplets.to_csv(os.path.join(dataset_path, 'triplets.csv'))

#### Load triplets

In [6]:
# Read the saved (subject, object, relation) triplets from the data folder.
text_file = 'triplets.csv'
text_path = os.path.join(dataset_path, text_file)

# The file was saved together with its index, which is read back as an 'Unnamed: 0' column; discard it.
triplets = pd.read_csv(text_path).drop(columns=['Unnamed: 0'])

In [7]:
# Preview the first triplets; subject and object are still string CUIs at this point.
triplets.head()

Unnamed: 0,subject,object,relation
0,C2347441,C4762419,has_ingredient
1,C0022877,C0803531,has_class
2,C0301042,C3160584,has_inactive_ingredient
3,C0027530,C1953956,has_system
4,C0040300,C1507501,has_system


### Align

In [8]:
def tran_to_index(x):
    """Convert a string CUI to its integer form, e.g. "C1022345" -> 1022345.

    Args:
        x: A CUI string made of a one-character prefix followed by digits, or a
            value that is already an integer.

    Returns:
        int: The number that follows the first character of the CUI. An integer
        input is returned as a plain int, so converting the same value twice is
        harmless.

    Raises:
        ValueError: If the text after the first character is not a valid integer.
    """
    # Already converted (for example when the conversion cell is run a second time).
    if isinstance(x, (int, np.integer)):
        return int(x)
    return int(x[1:])

In [9]:
# Overwrite both CUI columns in place with their integer form.
for cui_column in ('subject', 'object'):
    triplets[cui_column] = triplets[cui_column].map(tran_to_index)

In [11]:
# Preview the triplets again; subject and object are now integers.
triplets.head()

Unnamed: 0,subject,object,relation
0,2347441,4762419,has_ingredient
1,22877,803531,has_class
2,301042,3160584,has_inactive_ingredient
3,27530,1953956,has_system
4,40300,1507501,has_system


In [51]:
'''
Lookup table of the shape
{(subject, object): relation}
'''
# A (subject, object) pair that occurs in several rows keeps only the relation of its last row.
dict_triplets = {
    (subject, obj): relation
    for subject, obj, relation in zip(triplets['subject'], triplets['object'], triplets['relation'])
}

In [63]:
def find_triplets(entities, triplets):
    """Return the UMLS relations that hold between the entities of one sentence.

    Args:
        entities: List of entity dicts for the sentence. The 'cui' and 'name' keys
            of each dict are read. CUIs that are not ints are converted in place,
            so the caller's dicts are modified.
        triplets: Triplets already stored for this sentence, or None / NaN when
            there are none yet.

    Returns:
        `triplets` unchanged when it holds a value. Otherwise a list of
        (subject name, object name, relation) tuples, one for every ordered pair
        of entities with different CUIs whose (subject CUI, object CUI) key is
        present in the module-level `dict_triplets`.
    """
    # pandas represents an empty CSV cell as a float NaN rather than None, so NaN
    # has to count as "no triplets yet" too; otherwise it would be handed back as
    # if it were a precomputed result.
    is_missing = triplets is None or (isinstance(triplets, float) and triplets != triplets)
    if not is_missing:
        return triplets

    # also do str CUI to int CUI for the entities of the text
    for entity in entities:
        if not isinstance(entity['cui'], int):
            entity['cui'] = tran_to_index(entity['cui'])

    aligned_triplets = []
    for entity1 in entities:
        for entity2 in entities:
            # Entries sharing a CUI (including an entity paired with itself) are skipped.
            if entity1['cui'] == entity2['cui']:
                continue
            # The key is ordered (subject, object), so both directions of a pair are tried.
            pair = (entity1['cui'], entity2['cui'])
            if pair in dict_triplets:
                aligned_triplets.append((entity1['name'], entity2['name'], dict_triplets[pair]))

    return aligned_triplets

In [64]:
# Fill the 'triplets' column row by row, showing a progress bar while doing so.
sentences['triplets'] = sentences.progress_apply(
    lambda row: find_triplets(row['entity'], row['triplets']),
    axis=1,
)

100%|██████████| 1/1 [00:00<00:00, 60.70it/s]


In [67]:
# Display the sentences together with their aligned triplets.
sentences

Unnamed: 0,text,entity,triplets
0,The hepatic ultrastructural aspect and the hep...,"[{'cui': 41623, 'name': 'Ultrastructure', 'typ...","[(Anemia, Hemolytic, Chronic hemolytic anemia,..."
