In [1]:
import mysql.connector
import json, os
import pandas as pd
from semrep import get_oneof_type
from semrep import filter_relations, filter_entities

# Extracting entities from SemRep output

## Specify mapping to parse lines from Semrep text.out

In [2]:
# Zero-based field positions within a '|'-separated line of SemRep output,
# grouped by the kind of line each field belongs to. Only the 'entity' group
# is read by the parsing code visible in this notebook.
mappings = dict(
    path=1,
    text=dict(sent_id=4, sent_text=6),
    entity=dict(cuid=6, label=7, sem_types=8, score=15),
    relation=dict(
        subject_cui=8,
        subject_label=9,
        subject_sem_types=10,
        subject_sem_type=11,
        subject_score=18,
        predicate_type=21,
        predicate=22,
        negation=23,
        object_cui=28,
        object_label=29,
        object_sem_types=30,
        object_sem_type=31,
        object_score=38,
    ),
)

## Parse the SemRep output and collect all extracted entities per file

In [3]:
# SemRep output file that the parsing cell reads line by line.
text_out = 'output_relations/custom_license_pdf/text.out'

In [None]:
# Collect every entity line of the SemRep output, grouped by the identifier
# found in field 1 of the line: {path: [entity_dict, ...]}.
extraction = {}

# `with` closes the file even when a malformed line raises part-way through.
with open(text_out, encoding='utf-8') as file:
    for line in file:
        # Only lines starting with 'SE' are considered.
        if not line.startswith('SE'):
            continue
        elements = line.split('|')
        # Field 5 names the kind of line; everything but 'entity' is skipped.
        if elements[5] != 'entity':
            continue
        path = elements[1]
        entity = {
            # Semantic types are a comma-separated list; other fields are
            # kept as the raw strings found in the line.
            key: elements[ind].split(',') if 'sem_types' in key else elements[ind]
            for key, ind in mappings['entity'].items()
        }
        # Append in place instead of rebuilding the per-file list each time.
        extraction.setdefault(path, []).append(entity)

## Output

In [None]:
# Persist the per-file entity lists as JSON indented by four spaces.
with open("output_relations/custom_license_pdf/entities.json",'w') as out_file:
    json.dump(extraction, out_file, indent=4)

# Filter entities

## Form tuples of (FileId, ISABOUT, CUI)

In [None]:
# JSON file of per-file entity lists to turn into triples.
input_file = 'output_relations/custom_license_pdf/entities.json'

In [None]:
# Load the {file_id: [entity, ...]} mapping and flatten it into
# (file_id, 'ISABOUT', cui) triples, one per entity occurrence.
with open(input_file) as file:
    doc = json.load(file)

all_entities = [
    (file_id, 'ISABOUT', entity["cuid"])
    for file_id, entities in doc.items()
    for entity in entities
]

## Filter entities

Only keep entities that belong to one of the entity types ['T047', 'T028', 'T121', 'T103', 'T184'], that is, [Disease, Gene, PharmaSub, Chemical, Symptom].

In [None]:
# Connect to the UMLS MySQL database. Each setting can be overridden through a
# UMLS_DB_* environment variable so credentials need not live in the notebook;
# the fallbacks are the values this cell has always used.
cnx = mysql.connector.connect(user=os.environ.get('UMLS_DB_USER', 'root'),
                              password=os.environ.get('UMLS_DB_PASSWORD', '12345678'),
                              host=os.environ.get('UMLS_DB_HOST', '127.0.0.1'),
                              database=os.environ.get('UMLS_DB_NAME', 'umls'))
# try/finally releases the cursor and the connection even if filtering raises.
try:
    cursor = cnx.cursor()
    try:
        # filter_entities comes from the project's semrep module; presumably it
        # keeps only triples whose CUI has a wanted semantic type (TODO confirm).
        filtered_entities = filter_entities(cursor, all_entities)
    finally:
        cursor.close()
finally:
    cnx.close()

In [None]:
# Drop repeated triples: a set keeps a single copy of each.
filtered_entities = {*filtered_entities}

In [None]:
# Build the triple table. Sorting first makes the row order reproducible:
# iterating a set of string tuples yields a different order from run to run
# because string hashing is randomised per process.
df = pd.DataFrame(sorted(filtered_entities), columns=['subject','predicate','object'])

# Prefix each CUI with its semantic type name

In [None]:
# Semantic type identifier -> readable label that is put in front of a CUI.
type_name = dict(T047='Disease', T028='Gene', T121='PharmaSub',
                 T103='Chemical', T184='Symptom')
# The type identifiers of interest are exactly the keys of the table above.
entity_types = set(type_name)

In [None]:
# initialize mysql connector
import mysql.connector

# Connect to the UMLS MySQL database. Each setting can be overridden through a
# UMLS_DB_* environment variable so credentials need not live in the notebook;
# the fallbacks are the values this cell has always used.
cnx = mysql.connector.connect(user=os.environ.get('UMLS_DB_USER', 'root'),
                              password=os.environ.get('UMLS_DB_PASSWORD', '12345678'),
                              host=os.environ.get('UMLS_DB_HOST', '127.0.0.1'),
                              database=os.environ.get('UMLS_DB_NAME', 'umls'))
# Module-level cursor used by the type lookups in the following cells.
cursor = cnx.cursor()

In [None]:
def append_type_name(cui):
    """Return ``cui`` prefixed with the label of its semantic type.

    ``get_oneof_type`` is asked which of ``entity_types`` apply to ``cui``
    using the module-level ``cursor``; exactly one result is required. The
    label for that type is read from ``type_name`` and joined to the CUI
    with an underscore, e.g. ``<label>_<cui>``.

    Raises:
        AssertionError: if the lookup does not return exactly one type.
    """
    matches = get_oneof_type(cursor, cui, entity_types)
    assert len(matches) == 1
    label = type_name[matches.pop()]
    return label + "_" + cui

def strip_extension(path):
    """Return the part of ``path`` that comes before its first '.'.

    Everything from the first dot onward is dropped, so a multi-part suffix
    is removed in one go; a string without a dot is returned unchanged.
    """
    stem, _, _ = path.partition('.')
    return stem

In [None]:
# The same CUI occurs in many rows, so resolve each distinct CUI once and map
# the results back, instead of querying the database for every row.
try:
    cui_labels = {cui: append_type_name(cui) for cui in df['object'].unique()}
finally:
    # No later cell in the visible notebook uses the database, so release the
    # cursor and connection here whether or not the lookups succeeded.
    cursor.close()
    cnx.close()
df['object'] = df['object'].map(cui_labels)
# Reduce each file name in the subject column to the text before its first dot.
df['subject'] = df['subject'].apply(strip_extension)

## Output to file

In [None]:
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("output_relations/filtered_semrep_entities", exist_ok=True)
# Write this source's triples without the index column.
df.to_csv("output_relations/filtered_semrep_entities/custom_license_pdf.csv",index=False)

# Consolidate several sources

In [None]:
# Folder holding one filtered-entity CSV per source.
input_folder = 'output_relations/filtered_semrep_entities'

## Read dataframes

In [None]:
# Load every per-source CSV in the input folder and stack them into one frame.
# Only *.csv entries are read, so hidden files or sub-folders do not break
# read_csv. Names are sorted because os.listdir returns them in arbitrary
# order, which would otherwise make the row order vary between machines.
csv_names = sorted(name for name in os.listdir(input_folder)
                   if name.lower().endswith('.csv'))
dfs = [pd.read_csv(os.path.join(input_folder, name)) for name in csv_names]

df = pd.concat(dfs)

In [None]:
# Save the merged table without the index column.
# NOTE(review): the file name reads 'solidated' - possibly meant
# 'consolidated'; left unchanged because other code may read this path.
df.to_csv("output_relations/solidated_semrep_entity.csv", index=False)