# Disease analysis

This notebook contains the code used to analyze the disease entities in the NER output (see "NLP-selected/savedCellArticles.pkl" and "NLP-selected/savedNatureArticles.pkl" for the output), categorize them, and find the most frequent disease categories. This was part of the process in order to find most frequent re

In [3]:
import pandas as pd
import numpy as np
import pickle

In [4]:
dir = ".."  # folder holding the NER output pickle files and the crewDB csv tables

########## PREPROCESSING ##########

# Each pickle maps article -> [text, entities]. The raw mappings stay available
# as cell_data / nature_data; the filtered copies drop the text and tag every
# article with the journal it came from.
# NOTE(review): pickle.load can execute arbitrary code, so these files should
# only ever be ones produced by this project.
with open(f'{dir}/savedCellArticles.pkl', 'rb') as f:
    cell_data = pickle.load(f)  # 924 articles
filtered_cell_data = {
    article: [entities, 'cell']
    for article, (_text, entities) in cell_data.items()
}

with open(f'{dir}/savedNatureArticles.pkl', 'rb') as f:
    nature_data = pickle.load(f)  # 401 articles
filtered_nature_data = {
    article: [entities, 'nature']
    for article, (_text, entities) in nature_data.items()
}

# merged view of both journals, shaped {article: [entities, origin]};
# on a duplicate article key the nature entry replaces the cell entry
raw_combined_data = {**filtered_cell_data, **filtered_nature_data}  # 1325 articles

In [5]:
########## UMLS LEXICON NORMALIZATION ##########
# for more information about Norm tool see here: https://lhncbc.nlm.nih.gov/LSG/Projects/lvg/current/docs/userDoc/install/install.html

# sentinel line written after every entity so that the Norm output can later be
# matched back to the input, one record per entity
delimiter = 'delim'

# the sentinel must not itself occur as an entity, otherwise the records in the
# Norm output could not be told apart
for article, [entities, origin] in raw_combined_data.items():
    if (entities['word'] == delimiter).any():
        print('Match found.') # we don't want to see this

# write all entities to a text file
# delimiter is used because the UMLS Norm tool sometimes outputs more than one line of output per input word
# so this is a crude way of marking which lines in the output correspond to which line in the input
with open('NormInput.txt', 'w', encoding='utf-8') as f:
    for article, [entities, origin] in raw_combined_data.items():
        # Exactly one "<word>\n<delimiter>\n" record per entity. Writing per
        # entity (instead of joining and appending a final delimiter) means an
        # article without any entities contributes nothing, rather than a blank
        # line plus a stray delimiter that would shift every later record by one.
        f.writelines(f'{word}\n{delimiter}\n' for word in entities['word'].tolist())

In [6]:
# at this point, the norm tool should have been run externally on the files
# for more information about Norm tool see here: https://lhncbc.nlm.nih.gov/LSG/Projects/lvg/current/docs/userDoc/install/install.html

# read in the Norm tool output file, process each line of output
# Every input record ends with a "<delimiter>|<delimiter>" line; the last
# non-delimiter line seen before it is kept as that record's result.
normalized_words = []
with open('NormOutput.txt', 'r', encoding='utf-8') as f:
    cache = ''
    for raw_line in f:
        line = raw_line.strip()
        if not line:
            # a blank line is not the end of the file, just skip it
            continue
        if line == f'{delimiter}|{delimiter}':
            normalized_words.append(cache)
            # reset so that an input for which Norm printed nothing does not
            # silently inherit the previous word's result
            cache = ''
        else:
            cache = line

# keep the second '|'-separated field of each record (the normalized form);
# records without any output line become the empty string
normalized_words = [
    fields[1] if len(fields) > 1 else ''
    for fields in (record.split('|') for record in normalized_words)
]

# the records are consumed positionally below, so the totals have to agree
expected_total = sum(len(entities['word']) for entities, origin in raw_combined_data.values())
if len(normalized_words) != expected_total:
    raise ValueError(
        f'NormOutput.txt holds {len(normalized_words)} records but the articles '
        f'contain {expected_total} entities; was the Norm tool run on the current NormInput.txt?'
    )

# populate the "normalized word" column of the corresponding entities dataframe
# (articles are visited in the same order in which NormInput.txt was written)
offset = 0
for article, [entities, origin] in raw_combined_data.items():
    n = len(entities['word'])
    entities['normalized word'] = normalized_words[offset:offset + n]
    offset += n

In [7]:
########## FREQUENCY OF DISEASE WORDS ##########

# Collect every disease mention (normalized and original spelling) across all
# articles. The per-article arrays are gathered in lists and concatenated once,
# instead of growing an array with np.append on every iteration. The leading
# empty array keeps the result well-defined when there are no articles.
disease_chunks = [np.array([])]
unnormed_disease_chunks = [np.array([])]
for article, [entities, origin] in raw_combined_data.items():
    disease_rows = entities[entities['entity'] == 'disease']
    disease_chunks.append(disease_rows['normalized word'].astype('string').to_numpy())
    unnormed_disease_chunks.append(disease_rows['word'].astype('string').to_numpy())
disease_words = np.concatenate(disease_chunks)
unnormed_disease_words = np.concatenate(unnormed_disease_chunks)

# calculate frequency of each distinct word
uniq_disease_words, disease_counts = np.unique(disease_words, return_counts=True)
uniq_unnormed_disease_words = np.unique(unnormed_disease_words)

# sort by frequency, most frequent first; equal counts are ordered by word in
# reverse alphabetical order because the (count, word) pairs are compared as tuples
ranked_disease_words = sorted(zip(disease_counts, uniq_disease_words), reverse=True)
disease_by_counts = pd.DataFrame({
    'word': [word for count, word in ranked_disease_words],
    'count': [count for count, word in ranked_disease_words]
})

In [9]:
########## CATEGORIZATION OF DISEASE ##########

# The first 100 entries of the most_frequent_diseases.xlsx sheet were categorized
# by hand, outside of this notebook, before running the code below.

# load the sheet; empty cells become '' so uncategorized rows are easy to detect
disease_categories = pd.read_excel('most_frequent_diseases.xlsx').fillna('')

# add up the word counts of every category, ignoring rows without a category
categories_counts = {}
for category, word_count in zip(disease_categories['categorization'], disease_categories['count']):
    if category != '':
        categories_counts[category] = categories_counts.get(category, 0) + int(word_count)

# order the categories from the largest total to the smallest
categories_by_counts = {
    category: total
    for category, total in sorted(categories_counts.items(), key=lambda pair: pair[1], reverse=True)
}

In [10]:
categories_by_counts
# top 5 general categories (the generic "cancer" entry is left out, see below):
# 1) breast cancer
# 2) cardiovascular disease
# 3) neurological disease
# 4) blood cancer
# 5) lung cancer (tied with prostate cancer in the counts displayed below)
# "cancer" is by far the most abundant entity, but it is not 
# specific enough to be its own category. However, this suggests 
# that other types of cancer may deserve more weight than 
# cardiovascular or neurological disease.

{'cancer': 502,
 'breast cancer': 188,
 'cardiovascular disease': 134,
 'neurological disease': 115,
 'blood cancer': 87,
 'lung cancer': 56,
 'prostate cancer': 56,
 'brain cancer': 54,
 'liver cancer': 49,
 'mental health disorder': 42,
 'stomach cancer': 37,
 'colorectal cancer': 28,
 'viral infection': 20,
 'obesity': 15,
 'pancreatic cancer': 13,
 'skin cancer': 12,
 'lymphatic cancer': 11,
 'diabetes': 11,
 'immunodeficiency syndrome': 11,
 'ovarian cancer': 10,
 'kidney cancer': 10,
 'esophageal cancer': 7,
 'adenocarcinoma': 7,
 'neuroblastoma': 6,
 'pulmonary disease': 6,
 'arthritis': 6,
 'nasopharyngeal cancer': 5}

In [11]:
# find articles enriched for the words corresponding to the
# top 5 diseases of interest

In [12]:
# find keywords for each of the selected diseases:
# every word of the categorization sheet that was assigned to that disease
keywords_by_disease = {
    'breast cancer': [],
    'cardiovascular disease': [],
    'neurological disease': [],
    'blood cancer': [],
    'lung cancer': []
}
for disease in keywords_by_disease.keys():
    keywords_by_disease[disease] = list(disease_categories.loc[disease_categories['categorization'] == disease, 'word'])

# mark appearance of keywords for each article: adds one 0/1 column
# 'is_<disease>_keyword' per disease to the article's entities dataframe
for article, [entities, origin] in raw_combined_data.items():
    for disease, keywords in keywords_by_disease.items():
        # Vectorized membership test instead of a row-wise apply: same 0/1
        # values, but no Python call per row, and the result is always a Series
        # (a row-wise apply on a frame without rows does not yield one).
        entities[f'is_{disease}_keyword'] = entities['normalized word'].isin(keywords).astype('int64')

In [13]:
n_articles = 5 # number of articles per disease category
articles_by_disease = {
    disease: []
    for disease in (
        'breast cancer',
        'cardiovascular disease',
        'neurological disease',
        'blood cancer',
        'lung cancer',
    )
}

# for every disease category keep the n_articles articles with the most
# keyword hits, stored as (article, hits) pairs from most to least enriched
for disease in keywords_by_disease:
    flag_column = f'is_{disease}_keyword'
    articles_by_keyword_freqs = {
        article: sum(entities[flag_column])
        for article, [entities, origin] in raw_combined_data.items()
    }
    ranked_articles = sorted(articles_by_keyword_freqs.items(), key=lambda pair: pair[1], reverse=True)
    articles_by_disease[disease] = ranked_articles[:n_articles]

In [102]:
articles_by_disease

{'breast cancer': [('10.1016/j.jcpa.2012.01.021', 13),
  ('10.1016/j.bbagrm.2019.03.002', 12),
  ('10.1016/j.bbrc.2019.02.088', 12),
  ('10.1016/j.gene.2022.146463', 10),
  ('10.1016/j.freeradbiomed.2016.08.031', 9)],
 'cardiovascular disease': [('10.1016/j.tcm.2015.08.006', 18),
  ('10.1007/s10741-015-9483-x', 10),
  ('10.1038/s41371-019-0218-7', 9),
  ('10.1016/j.bbadis.2020.165836', 8),
  ('10.1016/j.neuint.2019.03.004', 7)],
 'neurological disease': [('10.1007/s00401-017-1732-8', 13),
  ('10.1016/j.nbd.2014.11.023', 9),
  ('10.1016/j.biopha.2018.01.110', 8),
  ('10.1007/s10571-013-0012-y', 8),
  ('10.1007/s11060-018-03018-6', 8)],
 'blood cancer': [('10.1038/leu.2010.276', 16),
  ('10.1038/leu.2012.86', 15),
  ('10.1016/j.leukres.2005.05.010', 8),
  ('10.1016/j.mehy.2013.04.021', 8),
  ('10.1007/s00018-018-2895-8', 8)],
 'lung cancer': [('10.1053/j.seminoncol.2005.07.007', 9),
  ('10.1007/s10555-015-9563-3', 9),
  ('10.1016/j.jss.2003.11.024', 6),
  ('10.3816/CLC.2008.n.053', 5),
 

In [75]:
def get_atc(doi):
    """Looks up the text of an article by its doi.

    The cell articles are searched first, then the nature articles.

    Args:
        doi: the article doi

    Returns:
        The first element stored for the doi (the article text), or None
        when the doi is in neither collection.
    """

    if doi in cell_data:
        cell_entry = cell_data[doi]
        return cell_entry[0]
    if doi in nature_data:
        nature_entry = nature_data[doi]
        return nature_entry[0]
    return None

def _atcs_for_disease(disease):
    """Returns the texts of the articles selected for the given disease category."""
    return [get_atc(doi) for doi, _hits in articles_by_disease[disease]]


# article texts of the selected articles, one list per disease category
breast_cancer_atcs = _atcs_for_disease('breast cancer')
cardiovascular_disease_atcs = _atcs_for_disease('cardiovascular disease')
neurological_disease_atcs = _atcs_for_disease('neurological disease')
blood_cancer_atcs = _atcs_for_disease('blood cancer')
lung_cancer_atcs = _atcs_for_disease('lung cancer')