In [None]:
from datasets import load_dataset

dataset = load_dataset("Saptarshi7/covid_qa_cleaned_CS", use_auth_token=True)

In [None]:
import re
all_contexts = set()
for ctx in dataset['train']['context']:
    all_contexts.add(ctx)

sorted_all_contexts_on_len = sorted(list(all_contexts), key=len, reverse=True)    
top_contexts = len(all_contexts)

with open("covidqa-longest_len_context.txt", "w") as text_file:
    for idx, ctx in enumerate(sorted_all_contexts_on_len):
        if idx == top_contexts:
            break
        text_file.write(ctx)
        text_file.write('\n')

In [None]:
#Generating the mapping table: (matched_text, CUI, preferred candidate)
import json
import pandas as pd
from tqdm.notebook import tqdm

texts = json.load(open('covidqa-context-cleaned4MM-MM_output.json'))['AllDocuments']

Metamap_Tokenizations = []
for doc in tqdm(texts):
    for ctx_dict in doc['Document']['Utterances']:
        mappings = []
        ctx_text = ctx_dict['UttText']
        ctx_start_idx = int(ctx_dict['UttStartPos'])
        for phr in ctx_dict['Phrases']:
            if phr['Mappings'] != []:
                for phr_dict in phr["Mappings"][0]['MappingCandidates']: #Choosing the first candidate
                    start_idx = int(phr_dict['ConceptPIs'][0]['StartPos']) - ctx_start_idx
                    end_idx = start_idx + int(phr_dict['ConceptPIs'][0]['Length'])
                    mappings.append((ctx_text[start_idx:end_idx], phr_dict['CandidateCUI'], \
                                     phr_dict['CandidatePreferred']))
        Metamap_Tokenizations.append((ctx_text, mappings))

entities = set()
for mappings in Metamap_Tokenizations:
    for tup in mappings[1]:
        entities.add(tup[2])
print(f"Number of entities discovered: {len(entities)}")

natural_text = [y[0] for x in Metamap_Tokenizations for y in x[1]]
cuis = [y[1] for x in Metamap_Tokenizations for y in x[1]]
pc = [y[2] for x in Metamap_Tokenizations for y in x[1]]
CUI_Preferred_Concept_Lookup_Table = pd.DataFrame(zip(natural_text, cuis, pc), columns=['natural_text','CUI','Preferred_Concept']).drop_duplicates()
CUI_Preferred_Concept_Lookup_Table.to_csv('natural_text_CUI_PC.csv', index=False)
print('Our_CUI_PC table generated...')

In [None]:
import pandas as pd
s = pd.read_csv('natural_text_CUI_PC.csv')

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('csarron/bert-base-uncased-squad-v1')
model_vocab = list(tokenizer.vocab.keys())
pretrained_KGE_df = pd.read_csv('embeddings.csv', header=None)

In [None]:
from tqdm.notebook import tqdm
for row in tqdm(s.itertuples(), total=s.shape[0]):
    '''
    If the entity is in the model_vocab remove that row since we don't want another embedding for the same
    term. If the entities CUI is not in the list of pretrained ones, remove it since we can't learn a 
    mapping then.
    '''
    if (row.natural_text in model_vocab) or (row.CUI not in pretrained_KGE_df[0].to_list()):
        s.drop(axis=0, index=row.Index, inplace=True)

s.to_csv('Filtered_by_BERT_vocab_and_KGE.csv')

In [None]:
import pandas as pd
s = pd.read_csv('Filtered_by_BERT_vocab_and_KGE.csv')
s.drop(axis=1, inplace=True, columns=['Unnamed: 0'])
s.drop_duplicates(inplace=True)

In [1]:
import stanza
nlp = stanza.Pipeline('en', package=None, processors={'ner':['anatem',
                                                            'bc5cdr',
                                                            'bc4chemd',
                                                            'bionlp13cg',
                                                            'jnlpba',
                                                            'linnaeus',
                                                            'ncbi_disease',
                                                            's800',
                                                            'i2b2',
                                                            'radiology'], 'tokenize':'default'})
                      #tokenize_pretokenized=True)

2022-10-20 18:10:06 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-10-20 18:10:09 INFO: Loading these models for language: en (English):
| Processor | Package                                                                            |
--------------------------------------------------------------------------------------------------
| tokenize  | combined                                                                           |
| ner       | anatem;bc5cdr;bc4chemd;bionlp13cg;jnlpba;linnaeus;ncbi_disease;s800;i2b2;radiology |

2022-10-20 18:10:09 INFO: Use device: cpu
2022-10-20 18:10:09 INFO: Loading: tokenize
2022-10-20 18:10:09 INFO: Loading: ner
2022-10-20 18:10:15 INFO: Done loading processors!


In [2]:
doc = nlp([['ECMO extracorporeal membrane oxygenation']])
print(doc.entities) #if this is empty, it means, the NER didn't recognize anything

AssertionError: If neither 'pretokenized' or 'no_ssplit' option is enabled, the input to the TokenizerProcessor must be a string or a Document object.

In [None]:
#Don't need to run this now, since I run it on GPU01. It's extremely slow on cpu!
from tqdm.notebook import tqdm
for row in tqdm(s.itertuples(), total=s.shape[0]):
    prepared_for_NER = [[str(row.natural_text)]]
    #i.e. we want to keep only those entries that map to at least 1 entity
    if nlp(prepared_for_NER).entities == []: 
        s.drop(axis=0, index=row.Index, inplace=True)

In [None]:
import pandas as pd
s = pd.read_csv('Filtered_by_Stanza_NER.csv')
s.drop(axis=1, inplace=True, columns=['Unnamed: 0'])
s.drop_duplicates(inplace=True)

In [None]:
import wikipedia
import pandas as pd
from tqdm.notebook import tqdm

texts = []
split = []
ents = []
unique_entities = list(set(s.natural_text.to_list()))

for ent in tqdm(unique_entities):
    try:
        texts.append(wikipedia.page(wikipedia.search(ent, results=1)).content)
        split.append('train')
        ents.append(str(ent))
    except:
        continue

pd.DataFrame(zip(split, ents, texts), columns = ['split', 'ent', 'text']).to_csv('corpus.csv')