In [1]:
import pandas as pd
import numpy as np
import re
import time
import os

In [2]:
import spacy

nlp = spacy.load("en_core_web_lg")


In [3]:
# treat empty strings and np.nan as NAm will impact dropna method
pd.options.mode.use_inf_as_na = True
# set Jupyter to display all output from a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [4]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words = list(stop_words)
stop_words.append('other')
stop_words = set(stop_words)
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajivdulepet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rajivdulepet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

df_allDocDetails_with_ranking = pd.read_csv("results/df_allDocDetails_with_ranking.csv")
df_pubmed_articles = pd.read_csv("results/df_pubmed_articles.csv")

df_allDocDetails_with_ranking.shape
df_allDocDetails_with_ranking.columns
df_pubmed_articles.shape
df_pubmed_articles.columns

df_allDocDetails_with_ranking.sample(5)

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
 
def filter_stop_words(sent):
    word_tokens = word_tokenize(sent)

    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

    filtered_sentence = []

    for w in word_tokens:
        if (w not in stop_words) and (not re.search(r'^\d+$', w)):
            filtered_sentence.append(w)

    return(' '.join(filtered_sentence))

In [6]:
def count_leading_spaces(sent):
    return len(sent) - len(sent.lstrip())

In [7]:
def isSectionHeader(x):
    if re.search(r'^\s*\d+\s+', x):
        x = re.sub(r'^\s*\d+\s+', '', x)
        return(1, x)
    return(0, x)

In [8]:
def remove_extraneous_info(item, spaces):
    if item['space'] in spaces:
        item['text'] = re.sub(r',.*', '', item['text'])
    return item

In [9]:
def remove_after_with(text):
    txt = re.sub(r'associated with.*$', '', text).strip()
    return re.sub(r'with.*$', '', txt).strip()

In [10]:
def extract_abbrev_alternate_search_term(items, item):
    res = re.search(r'\(([A-Z]+)?\)', item['text'])
    #print(item['text'])
    if res:
        #print(item)
        abbrev = res.groups(0)[0]
        non_abbrev = ''
        split_text = item['text'].split()
        for ind, tok in enumerate(split_text):
            if tok == f'({abbrev})':
                start_letter_abbrev = abbrev[0]
                ind_start = ind-len(abbrev)
                while(ind_start >= 0):
                    if split_text[ind_start][0] == start_letter_abbrev:
                        break
                    ind_start -= 1
                non_abbrev = ' '.join(split_text[ind_start:ind])
                #print(ind, tok)
        refined_text_with_only_abbrev = re.sub(f'\s*{non_abbrev}\s*',' ', item['text']).strip()
        refined_text_with_only_abbrev = re.sub(f'\s*\({abbrev}\)\s*', ' ' + abbrev + ' ', refined_text_with_only_abbrev).strip()
        refined_text_with_only_non_abbrev = re.sub(f'\s*\({abbrev}\)\s*',' ', item['text']).strip()
        items.append({'space':item['space'], 'text':refined_text_with_only_abbrev})
        items.append({'space':item['space'], 'text':refined_text_with_only_non_abbrev})
    else:
        items.append(item)


def extract_alternative_word_search_term(items, item):
    txt = item['text']
    m = re.search(r'([\w\-\+]+)\s*\/\s*([\w\-\+]+)', txt)
    txts = []
    if m:
        both_terms = m.group(0)
        term1 = m.group(1)
        term2 = m.group(2)
        
        alt1 = txt.replace(both_terms, term1)
        alt2 = txt.replace(both_terms, term2)
        txts.extend([alt1, alt2])
    
        txts = list(set(txts))
    else:
        txts.append(item['text'])
    
    for txt in txts:
        items.append({'space': item['space'], 'text': txt})


In [11]:
def extract_noun_phrases(item):
    doc = nlp(item['text'])
    clean_chunks = []
    for chunk in doc.noun_chunks:
        chunk_text = chunk.text
        chunk_text = re.sub(r'[,;\.\)\(]+', '', chunk_text)
        chunk_text = filter_stop_words(chunk_text.lower())
        if len(chunk_text) > 1:
            # remove any repeated words
            #split_chunk_text = chunk_text.split()
            #chunk_text = ' '.join(list(set([word for word in chunk_text.split()])))
            clean_chunks.append(chunk_text)
    item['suggestions'] = clean_chunks
    item['suggestions'].append(filter_stop_words(item['text'].lower()))
    item['suggestions'] = list(set(item['suggestions']))
    
    # add variations to handle british vs US english such as tumour vs tumor, pediatric vs paediatric
    variations = [suggestion.replace('paed', 'ped').replace('tumour', 'tumor') for suggestion in item['suggestions'] if re.search(r'tumour|paed', suggestion)]
    item['suggestions'].extend(variations)
    
    return item

In [12]:
%%time
search_space = {}
adult_search_space = {}
child_search_space = {}
JUNK_PATTERN = 'chapter|https|http|overview|iarc|all rights|Terms of use|privacy policy|copyright|BlueBooksOnline|introduction|foreword|abbreviation|references|subject|committee|declaration|contributor|volume|icd|contents|sources|contents|sources|classification|appendix'

for root, dirs, files in os.walk("./data/who"):
    for filename in files:
        if filename.endswith('.txt'):
            my_file = open(f"data/who/{filename}", "r", encoding="utf-8")
            
            childFile = False
            if re.search(r'paediatric|pediatric', filename):
                childFile = True

            # reading the file
            data = my_file.read()

            # replacing end splitting the text 
            # when newline ('\n') is seen.
            data_into_list = data.split("\n")
            data_into_list = [re.sub(r'–|−', '-', line) for line in data_into_list if line.strip() != '']
            data_into_list = [re.sub(r'[^\x20-\x7e]', ' ', line) for line in data_into_list if line.strip() != '']
            data_into_list = [re.sub(r'\(\/.*\)$', '', line) for line in data_into_list if line.strip() != '']
            data_into_list = [re.sub(r'\w*\([;\.\da-z]*\d\);*', '', line) for line in data_into_list if line.strip() != '']
            data_into_list = [line for line in data_into_list if not re.search(JUNK_PATTERN, line, flags=re.IGNORECASE)]
            data_into_list = [line for line in data_into_list if not re.search(r"^[\s,]+$", line, flags=re.IGNORECASE) ]
            data_into_list = [line for line in data_into_list if not re.search(r"^[\d\/APM:\s,]+$", line, flags=re.IGNORECASE) ]
            data_into_list = [line for line in data_into_list if not len(line.strip()) <= 1]
            data_into_list = [line for line in data_into_list if not re.search(r'^\(*\s*[a-z]\s*\)*$', line.strip())]
            data_into_list = [line.rstrip() for line in data_into_list]

            data_into_list_with_meta = [{'space':count_leading_spaces(line), 'text':re.sub('\s*\d+\.\s*', '', line.strip())} for line in data_into_list]

            refined_data_into_list_with_meta = []
            for index, item in enumerate(data_into_list_with_meta):
                # if starting letter is lower case then combine with previous line
                if (len(refined_data_into_list_with_meta) > 0) and re.search('^[a-z]', item['text']):
                    refined_data_into_list_with_meta[-1]['text'] = refined_data_into_list_with_meta[-1]['text'] + ' ' + item['text']
                else:
                    refined_data_into_list_with_meta.append(item)

            # create alternative variations, if something is abbreviated
            refined_refined_data_into_list_with_meta = []
            for index, item in enumerate(refined_data_into_list_with_meta):
                extract_abbrev_alternate_search_term(refined_refined_data_into_list_with_meta, item)


            # handle alternatives due to presence of /
            refined_refined_refined_data_into_list_with_meta = []
            for index, item in enumerate(refined_refined_data_into_list_with_meta):
                extract_alternative_word_search_term(refined_refined_refined_data_into_list_with_meta, item)

            # identify the  indented space
            spaces_list = []
            for index, item in enumerate(refined_refined_refined_data_into_list_with_meta):
                spaces_list.append(item['space'])
            spaces_list = list(set(spaces_list))
            smallest_space = min(spaces_list)
            spaces_list.remove(smallest_space)
            #second_largest_space = max(spaces_list)

            #print(spaces_list)
            #print(smallest_space)

            refined_refined_refined_refined_data_into_list_with_meta = []
            for index, item in enumerate(refined_refined_refined_data_into_list_with_meta):
                refined_refined_refined_refined_data_into_list_with_meta.append(remove_extraneous_info(item, spaces_list))

            # create initial search suggestions
            refined_refined_refined_refined_data_into_list_with_meta = [extract_noun_phrases(item) for item in refined_refined_refined_refined_data_into_list_with_meta]
            default_suggestions = refined_refined_refined_refined_data_into_list_with_meta[0]['suggestions']
            
            for idx, item in enumerate(refined_refined_refined_refined_data_into_list_with_meta):
                if idx == 0:
                    item['parent_index'] = -1
                    item['more_suggestions'] = default_suggestions
                elif item['space'] > refined_refined_refined_refined_data_into_list_with_meta[idx - 1]['space']:
                    item['parent_index'] = idx - 1
                    if item['parent_index'] == -1:
                        item['more_suggestions'] = default_suggestions
                    else:
                        item['more_suggestions'] = list(set(refined_refined_refined_refined_data_into_list_with_meta[item['parent_index']]['suggestions'] + refined_refined_refined_refined_data_into_list_with_meta[item['parent_index']]['more_suggestions']))
                elif item['space'] == refined_refined_refined_refined_data_into_list_with_meta[idx - 1]['space']:
                    item['parent_index'] = refined_refined_refined_refined_data_into_list_with_meta[idx - 1]['parent_index']
                    if item['parent_index'] == -1:
                        item['more_suggestions'] = default_suggestions
                    else:
                        item['more_suggestions'] = list(set(refined_refined_refined_refined_data_into_list_with_meta[item['parent_index']]['suggestions'] + refined_refined_refined_refined_data_into_list_with_meta[item['parent_index']]['more_suggestions']))
                else:
                    current_index = refined_refined_refined_refined_data_into_list_with_meta[idx - 1]['parent_index']
                    while(current_index >= 0):
                        if refined_refined_refined_refined_data_into_list_with_meta[current_index]['space'] == item['space']:
                            item['parent_index'] = refined_refined_refined_refined_data_into_list_with_meta[current_index]['parent_index']
                            if item['parent_index'] == -1:
                                item['more_suggestions'] = default_suggestions
                            else:
                                item['more_suggestions'] = list(set(refined_refined_refined_refined_data_into_list_with_meta[item['parent_index']]['suggestions'] + refined_refined_refined_refined_data_into_list_with_meta[item['parent_index']]['more_suggestions']))
                            break
                        elif refined_refined_refined_refined_data_into_list_with_meta[current_index]['space'] < item['space']:
                            item['parent_index'] = current_index
                            if item['parent_index'] == -1:
                                item['more_suggestions'] = default_suggestions
                            else:
                                item['more_suggestions'] = list(set(refined_refined_refined_refined_data_into_list_with_meta[item['parent_index']]['suggestions'] + refined_refined_refined_refined_data_into_list_with_meta[item['parent_index']]['more_suggestions']))
                            break
                        else:
                            current_index = refined_refined_refined_refined_data_into_list_with_meta[current_index]['parent_index']

            for idx, item in enumerate(refined_refined_refined_refined_data_into_list_with_meta):
                if idx == 0:
                    # skip first entry
                    continue
                #print(idx, item)
                #suggestions = re.split(r'\s*,\s*', item['suggestions'])
                for suggestion in item['suggestions']:
                    if suggestion in search_space:
                        search_space[suggestion] += item['more_suggestions']
                        search_space[suggestion] = list(set(search_space[suggestion]))
                    else:
                        search_space[suggestion] = item['more_suggestions']
                        search_space[suggestion] = list(set(search_space[suggestion]))
                        
                    if childFile:
                        if suggestion in child_search_space:
                            child_search_space[suggestion] += item['more_suggestions']
                            child_search_space[suggestion] = list(set(child_search_space[suggestion]))
                        else:
                            child_search_space[suggestion] = item['more_suggestions']
                            child_search_space[suggestion] = list(set(child_search_space[suggestion]))
                    else:
                        if suggestion in adult_search_space:
                            adult_search_space[suggestion] += item['more_suggestions']
                            adult_search_space[suggestion] = list(set(adult_search_space[suggestion]))
                        else:
                            adult_search_space[suggestion] = item['more_suggestions']
                            adult_search_space[suggestion] = list(set(adult_search_space[suggestion]))

            my_file.close()


CPU times: user 9.43 s, sys: 83 ms, total: 9.51 s
Wall time: 9.53 s


In [13]:
len(search_space)
len(child_search_space)
len(adult_search_space)

3637

739

3380

In [14]:
child_search_space

{'haematolymphoid disorders': ['pediatric tumors', 'paediatric tumours'],
 'myeloid neoplasms': ['haematolymphoid disorders',
  'myeloid neoplasms',
  'proliferations',
  'pediatric tumors',
  'myeloid neoplasms proliferations associated antecedent predisposing conditions',
  'antecedent predisposing conditions',
  'paediatric tumours'],
 'myeloproliferative neoplasms': ['haematolymphoid disorders',
  'pediatric tumors',
  'myeloid neoplasms',
  'paediatric tumours'],
 'cml': ['haematolymphoid disorders',
  'myeloid neoplasms',
  'myeloproliferative neoplasms',
  'pediatric tumors',
  'paediatric tumours'],
 'chronic myeloid leukaemia': ['haematolymphoid disorders',
  'myeloid neoplasms',
  'myeloproliferative neoplasms',
  'pediatric tumors',
  'paediatric tumours'],
 'myelodysplastic neoplasms': ['haematolymphoid disorders',
  'pediatric tumors',
  'myeloid neoplasms',
  'paediatric tumours'],
 'juvenile myelomonocytic leukaemia': ['haematolymphoid disorders',
  'myeloid neoplasms',


In [27]:
def find_odd_ones(some_space):
    odd_ones = []

    for key in some_space:
        vals = search_space[key]
        if len(key.split()) == 2:
            # 1 token
            odd_ones.append(key)
        elif re.search(r'\(|\)|:|;|\+|\\|/', key):
            odd_ones.append(key)

        for val in vals:
            if len(val.split()) == 1:
                # 1 token
                odd_ones.append(val)
            elif re.search(r'\(|\)|:|;|\+|\\|/', val):
                odd_ones.append(val)
    odd_ones = list(set(odd_ones))
    
    return odd_ones

search_space_odd_ones = find_odd_ones(search_space)
child_search_space_odd_ones = find_odd_ones(child_search_space)
adult_search_space_odd_ones = find_odd_ones(adult_search_space)

In [31]:
len(search_space_odd_ones)
pd.DataFrame(search_space_odd_ones).to_csv('data/search_space_odd_ones.csv', index=False)

1254

In [None]:
import pickle

# create a binary pickle file 
with open('results/who_child_search_terms_mapping.pkl', 'wb') as handle:
    pickle.dump(child_search_space, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('results/who_adult_search_terms_mapping.pkl', 'wb') as handle:
    pickle.dump(adult_search_space, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('results/who_search_terms_mapping.pkl', 'wb') as handle:
    pickle.dump(search_space, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open('results/who_search_terms_mapping.pkl', 'rb') as handle:
    out_search_space = pickle.load(handle)

In [None]:
from os.path import exists

exists('results/who_search_terms_mapping.pkl')

In [None]:
out_search_space['glioneuronal tumours']

In [None]:
list(search_space.keys())

In [None]:
extract_noun_phrases({'space':4, 'text':'Paediatric-type diffuse high-grade gliomas defined by H3 status'})

In [None]:
set([1,2,3,4]).difference(set([2,4]))

In [None]:
for key in search_space:
    vals = list(set(search_space[key]))
    vals = [re.sub(r'\s*,\s*|\s*&\s*', ' ', val) for val in vals]
    vals = [re.sub(r'\s+', '+', val) for val in vals]
    print(key, '--->', vals)


In [None]:
txt = """
Department of Medical Oncology, UZ Brussel, Laarbeeklaan 101, 1090 Brussels, Belgium. Bart.Neyns@uzbrussel.be
"""
doc = nlp(txt)
clean_chunks = []
for chunk in doc.noun_chunks:
    chunk_text = chunk.text
    clean_chunks.append(chunk_text)
print(clean_chunks)

In [None]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)