In [1]:
import mysql.connector
import json, os
import pandas as pd
from semrep import get_oneof_type
from semrep import filter_relations

# Extracting relations from SemRep output

## Specify mapping to parse lines from Semrep text.out

In [2]:
# Positions of the fields of interest within a '|'-separated line of the
# SemRep text.out file, grouped by the kind of record they belong to.
mappings = {
    "path": 1,
    "text": {"sent_id": 4, "sent_text": 6},
    "entity": {"cuid": 6, "label": 7, "sem_types": 8, "score": 15},
    "relation": {
        # subject side
        "subject_cui": 8,
        "subject_label": 9,
        "subject_sem_types": 10,
        "subject_sem_type": 11,
        "subject_score": 18,
        # predicate
        "predicate_type": 21,
        "predicate": 22,
        "negation": 23,
        # object side
        "object_cui": 28,
        "object_label": 29,
        "object_sem_types": 30,
        "object_sem_type": 31,
        "object_score": 38,
    },
}

## Parse the lines of the SemRep output and collect all extracted relations, keyed by path

In [3]:
# Path of the SemRep output file that is opened and parsed line by line.
text_out = "output_relations/custom_license_pdf/text.out"

In [4]:
# Maps each path (field 1 of a line) to the list of relation records found for it.
extraction = {}

# `with` guarantees the file is closed even if a malformed line raises.
with open(text_out, encoding='utf-8') as file:
    for line in file:
        # Only lines starting with 'SE' are considered.
        if not line.startswith('SE'):
            continue
        elements = line.split('|')
        path = elements[mappings['path']]
        # Field 5 holds the record kind; everything but relations is skipped.
        if elements[5] != 'relation':
            continue
        tmp = {}
        for key, ind in mappings['relation'].items():
            # The *_sem_types fields are comma-separated lists; the rest are scalars.
            tmp[key] = elements[ind].split(',') if 'sem_types' in key else elements[ind]
        # Append in place instead of rebuilding the list for every relation.
        extraction.setdefault(path, []).append(tmp)

## Output

In [5]:
# Persist the extracted relations as pretty-printed JSON.
with open("output_relations/custom_license_pdf/extraction.json", 'w') as file:
    json.dump(extraction, file, indent=4)

# Filter relations

## Read all relations and form (subject, predicate, object) triples

In [6]:
# JSON file of extracted relations that is loaded and turned into triples.
input_file = "output_relations/custom_license_pdf/extraction.json"

In [7]:
# One entry per path: {"path": ..., "relations": [(subject_cui, PREDICATE, object_cui), ...]}
all_relations = []
with open(input_file) as file:
    doc = json.load(file)

for doc_path, doc_relations in doc.items():
    triples = []
    for rel in doc_relations:
        predicate = rel["predicate"].upper()
        # A non-empty negation field marks the predicate as negated.
        if rel['negation'] != '':
            predicate = 'NEG_' + predicate
        triples.append((rel["subject_cui"], predicate, rel["object_cui"]))
    all_relations.append({"path": doc_path, "relations": triples})

## filter relations

Keep only the relations that connect entities belonging to one of the semantic types ['T047', 'T028', 'T121', 'T103', 'T184'], i.e. [Disease, Gene, PharmaSub, Chemical, Symptom].

In [8]:
# Initialize the MySQL connector.
# Connection settings can be overridden through environment variables so that
# credentials need not be stored in the notebook; the fallbacks are the values
# this notebook used previously.
cnx = mysql.connector.connect(user=os.environ.get('UMLS_DB_USER', 'root'),
                              password=os.environ.get('UMLS_DB_PASSWORD', '12345678'),
                              host=os.environ.get('UMLS_DB_HOST', '127.0.0.1'),
                              database=os.environ.get('UMLS_DB_NAME', 'umls'))
try:
    cursor = cnx.cursor()
    try:
        # Run every path's triples through filter_relations, keeping the path.
        filtered_relations = []
        for relation in all_relations:
            filtered_relation = filter_relations(cursor, relation["relations"])
            filtered_relations.append({"path": relation["path"], "relations": filtered_relation})
    finally:
        # Release the cursor even when filtering fails part-way through.
        cursor.close()
finally:
    cnx.close()

## Output to file

In [9]:
# Destination JSON file for the filtered relations.
output_filtered_relations = "output_relations/filtered_semrep_relations/custom_license_pdf.json"

In [10]:
# Persist the filtered relations as pretty-printed JSON.
with open(output_filtered_relations, 'w') as file:
    json.dump(filtered_relations, file, indent=4)

# Consolidate relations from various sources

## Read the .json files from the given folder to consolidate

In [11]:
# Folder whose .json files are merged into a single set of relations.
input_folder = "output_relations/filtered_semrep_relations"

## Consolidate all relations

In [12]:
paths = os.listdir(input_folder)

# Unique (subject, predicate, object) triples across every .json file in the folder.
relations = set()

for path in paths:
    if not path.endswith('.json'):
        continue
    with open(os.path.join(input_folder, path), encoding='utf-8') as file:
        entries = json.load(file)
    # A distinct name is used for the entries so the outer loop variable
    # `path` (the file name) is not rebound inside its own loop.
    for entry in entries:
        for r in entry['relations']:
            # Stored as tuples: JSON gives lists, which are not hashable.
            relations.add((r[0], r[1], r[2]))

In [13]:
# One row per unique triple collected in `relations`.
triple_columns = ['subject', 'predicate', 'object']
triple_rows = list(relations)
df = pd.DataFrame(triple_rows, columns=triple_columns)

# Append semantic type to CUI

In [14]:
# Readable name for each semantic type identifier (TUI) of interest.
type_name = {
    'T047': 'Disease',
    'T028': 'Gene',
    'T121': 'PharmaSub',
    'T103': 'Chemical',
    'T184': 'Symptom',
}
# The identifiers of interest are exactly the keys of `type_name`.
entity_types = set(type_name)

In [15]:
# Initialize the MySQL connector.
# (mysql.connector is already imported in the notebook's first cell; the
# repeated import is harmless.)
import mysql.connector

# Connection settings can be overridden through environment variables so that
# credentials need not be stored in the notebook; the fallbacks are the values
# this notebook used previously.
cnx = mysql.connector.connect(user=os.environ.get('UMLS_DB_USER', 'root'),
                              password=os.environ.get('UMLS_DB_PASSWORD', '12345678'),
                              host=os.environ.get('UMLS_DB_HOST', '127.0.0.1'),
                              database=os.environ.get('UMLS_DB_NAME', 'umls'))
cursor = cnx.cursor()

In [16]:
def append_type_name(cui):
    """Prefix a CUI with the readable name of its semantic type.

    Calls ``get_oneof_type`` with the module-level ``cursor`` and
    ``entity_types`` (presumably returning the concept's types among
    ``entity_types`` -- confirm against the ``semrep`` module) and requires
    exactly one result.

    Args:
        cui: concept identifier string, e.g. ``"C0037278"``.

    Returns:
        ``type_name[tui] + "_" + cui``, e.g. ``"Disease_C0037278"``.

    Raises:
        AssertionError: if the lookup does not yield exactly one type.
    """
    tuis = get_oneof_type(cursor, cui, entity_types)
    # Raised explicitly (rather than via `assert`) so the check is not stripped
    # under `python -O`, and so the message names the offending CUI.
    if len(tuis) != 1:
        raise AssertionError(
            "expected exactly one semantic type for %s, got %r" % (cui, tuis))
    # Read the single element without mutating the collection we were handed.
    tui = next(iter(tuis))
    return type_name[tui] + "_" + cui

In [17]:
# Rewrite both entity columns through append_type_name (subject first, then object).
for column in ('subject', 'object'):
    df[column] = df[column].apply(append_type_name)

In [18]:
# Release the database resources: the cursor first, then its connection.
cursor.close()
cnx.close()

## output to file

In [19]:
# Write the relation table to CSV without the DataFrame index column.
df.to_csv("output_relations/semrep_rela.csv",index=False)

In [20]:
# Display the resulting relation table.
df

Unnamed: 0,subject,predicate,object
0,Disease_C0037278,CAUSES,Disease_C0012634
1,Disease_C0023518,COEXISTS_WITH,Disease_C0004626
2,PharmaSub_C1827106,DISRUPTS,Disease_C0022660
3,Gene_C1540289,INTERACTS_WITH,Gene_C1423633
4,PharmaSub_C0026698,TREATS,Symptom_C0231218
5,Disease_C0014038,ISA,Symptom_C0037088
6,Symptom_C1457887,COEXISTS_WITH,Disease_C1175175
7,Disease_C0014059,CAUSES,Disease_C0338474
8,PharmaSub_C0250480,TREATS,Disease_C0010674
9,Disease_C0022680,CAUSES,Disease_C0035078
