# Code for Venny

This notebook computes the overlap between the genetic-entity terms extracted by the NER model and the terms found in the crewDB data tables (the reader, eraser, and writer CSV tables). The overlap was visualized with the online Venn diagram generator Venny 2.1.0 (https://bioinfogp.cnb.csic.es/tools/venny/).

In [1]:
import os
import pickle

import numpy as np
import pandas as pd

In [2]:
# Root directory holding the NER output pickle files and the crewDB csv tables.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because a later cell builds its csv paths from it.
dir = '..'

########## PREPROCESSING ##########

# Each pickle is read as a mapping {article: [text, entities]}.
# The text is dropped and every article is tagged with its journal of origin.
# NOTE(review): pickle.load executes arbitrary code from the file — only run
# this on pickle files you produced yourself or otherwise trust.
with open(f'{dir}/savedCellArticles.pkl', 'rb') as f:
    cell_data = pickle.load(f)  # 924 articles (per the original author's note)
filtered_cell_data = {
    doi: [ner_entities, 'cell']
    for doi, (_text, ner_entities) in cell_data.items()
}

with open(f'{dir}/savedNatureArticles.pkl', 'rb') as f:
    nature_data = pickle.load(f)  # 401 articles (per the original author's note)
filtered_nature_data = {
    doi: [ner_entities, 'nature']
    for doi, (_text, ner_entities) in nature_data.items()
}

# Merge both journals into one mapping {article: [entities, origin]}.
# On a duplicate article key the Nature entry wins, since it is unpacked last.
raw_combined_data = {**filtered_cell_data, **filtered_nature_data}  # 1325 articles

In [3]:
########## UMLS LEXICON NORMALIZATION ##########
# for more information about Norm tool see here: https://lhncbc.nlm.nih.gov/LSG/Projects/lvg/current/docs/userDoc/install/install.html

# Read the Norm tool output file. A line equal to "<delimiter>|<delimiter>"
# closes one entity; the last non-delimiter line seen before it is kept as
# that entity's Norm output.
normalized_words = []
delimiter = 'delim'
delimiter_line = f'{delimiter}|{delimiter}'
with open('NormOutput.txt', 'r', encoding='utf-8') as f:
    cache = ''
    # Iterate over the file object so that only real end-of-file stops the
    # scan. (A readline()/strip() loop that breaks on an empty string would
    # also stop at the first blank line and silently truncate the parse.)
    for raw_line in f:
        line = raw_line.strip()
        if not line:
            continue  # skip blank lines instead of treating them as EOF
        if line == delimiter_line:
            # NOTE(review): `cache` is not reset here, so an entity for which
            # Norm emitted no line at all would reuse the previous entity's
            # output — confirm Norm always emits at least one line per input.
            normalized_words.append(cache)
        else:
            cache = line
# Keep the second '|'-separated field of each retained line as the normalized form.
normalized_words = [word.split('|')[1] for word in normalized_words]

# Populate the "normalized word" column of each article's entities dataframe.
# Normalized words are consumed in order, len(entities['word']) per article,
# following the iteration order of raw_combined_data. A running offset avoids
# re-shifting the whole list after every article.
offset = 0
for article, [entities, origin] in raw_combined_data.items():
    n = len(entities['word'])
    entities['normalized word'] = normalized_words[offset:offset + n]
    offset += n
# Drop everything that was consumed so `normalized_words` ends up holding only
# unassigned leftovers; a non-empty list here points to a misalignment between
# NormOutput.txt and the entities tables.
del normalized_words[:offset]

In [4]:
########## FILTER LONG WORDS ##########

# Keep only entity rows whose 'word' is shorter than MAX_WORD_LENGTH characters
# (strict comparison: a word of exactly MAX_WORD_LENGTH characters is removed).
# NOTE(review): the previous comment said 150 characters while the code used
# 125. The code's value is kept so results are unchanged — confirm which
# threshold is actually intended.
MAX_WORD_LENGTH = 125

# Result has the same shape as raw_combined_data: {article: [entities, origin]}.
filtered_combined_data = {
    article: [entities[entities['word'].str.len() < MAX_WORD_LENGTH], origin]
    for article, [entities, origin] in raw_combined_data.items()
}

In [5]:
########## UNIQUE TERM FINDING ##########

# Collect every distinct 'word' whose 'entity' label is 'genetic', across all
# articles in filtered_combined_data.
genetic_terms = set()
for entities, _origin in filtered_combined_data.values():
    genetic_terms.update(entities.loc[entities['entity'] == 'genetic', 'word'])

# Sort so the list (and the file later written from it) is identical on every
# run; plain set iteration order for strings changes between interpreter runs.
unique_words = sorted(genetic_terms)

In [6]:
# Collect every distinct term in the crewDB tables: the 'gene' and 'domain'
# columns as-is, plus each comma-separated item of the 'Target entity' column.
cr_types = ['reader', 'eraser', 'writer']
crew_terms = set()
for cr_type in cr_types:
    crew_df = pd.read_csv(f'{dir}/{cr_type}tbl.csv')
    # dropna() on every column: a missing cell would otherwise enter the list
    # as a float NaN and break the later '\n'.join(...) when writing the file.
    crew_terms.update(crew_df['gene'].dropna())
    crew_terms.update(crew_df['domain'].dropna())
    for target_entity in crew_df['Target entity'].dropna():
        crew_terms.update(part.strip() for part in target_entity.split(','))
# A trailing or doubled comma yields an empty fragment, which is not a term.
crew_terms.discard('')

# Sort so the exported list is reproducible from run to run.
unique_crs = sorted(crew_terms)

In [7]:
venny_dir = 'venny_files' # directory to store venny files

# Create the output directory if it is missing; open(..., 'w') does not create
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(venny_dir, exist_ok=True)

# Write one term per line to the txt files that are pasted into Venny.
with open(f'{venny_dir}/unique_NER.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(unique_words))
with open(f'{venny_dir}/unique_crewDB.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(unique_crs))