In [36]:
import fitz
import tner
import pandas as pd
tner_tagger = tner.TransformersNER('asahi417/tner-xlm-roberta-base-ontonotes5')

2022-03-16 12:40:03 INFO     *** initialize network ***


In [49]:
def get_sentences(document_fname):
    document = fitz.open(document_fname)
    sentences = []
    for page in document:
        page_text = page.get_text()
        page_sentences = page_text.split("\n")
        page_sentences = [sentence.strip() for sentence in page_sentences]
        sentences.extend(page_sentences)
    sentences = [sentence.strip() for sentence in sentences if len(sentence.strip())>0]
    return sentences
fname = "../PDF/Eurofins 01.pdf"
sentences = get_sentences(fname)
sentences

['1048318294708',
 'Eurofins COVID Testing Services Limited',
 'Queens Road,',
 'Teddington,',
 'Middlesex,',
 'TW11 0LY,',
 'UK',
 'Customer Services +44 (0)1925 980 595, covid19administration@eurofins.co.uk',
 'Kai Lukas Lau',
 'Travel testing customer',
 '02 March 2022',
 'Barcode of sample: COV1105006ACONLFDAG00337815',
 'Test Report SARS-CoV-2 (COVID-19)',
 'PATIENT INFORMATION',
 'Last name:',
 'Lau',
 'First name:',
 'Kai Lukas',
 'Birth date:',
 '2011-10-07',
 'Passport:',
 'K2217817Z',
 'Gender:',
 'Male',
 'Nationality:',
 'Singapore',
 'Application ID:',
 '4152458',
 'HRID/Badge ID:',
 '2202871023cxq',
 'Sample kit ID:',
 'COV1105006ACONLFDAG00337815',
 'Date and time of',
 'sampling:',
 '2022-02-08 17:02',
 'Sample received in the',
 'laboratory:',
 'Date and time of result',
 'reporting:',
 'Yes',
 '2022-02-08 17:26',
 'Date and time of report',
 'generation:',
 '2022-03-02 11:26',
 'TEST',
 'RESULT',
 'REFERENCE',
 'METHOD',
 'Fit to Fly Antigen Testing -',
 'Drive Throug

In [87]:
def get_entities(list_of_strings):
    sentence_id, category, start_pos, end_pos, text, confidence_score = [], [], [], [], [], []
    for id, sentence in enumerate(list_of_strings):
        result = tner_tagger.predict([sentence])[0]['entity']
        if len(result)>0:
            for item in result:
                sentence_id.append(id)
                category.append(item['type'])
                start_pos.append(item['position'][0])
                end_pos.append(item['position'][1])
                text.append(item['mention'])
                confidence_score.append(item['probability'])
    entities = pd.DataFrame({"SentenceID": sentence_id, "EntityType": category, "StartPosition": start_pos, "EndPosition": end_pos, "EntityText": text, "ConfidenceScore": confidence_score})
    entities = entities[entities.EntityType.isin(["person", "date", "time"])]
    # entities = entities.groupby('SentenceID').agg(",".join).reset_index()
    # Get Gender
    gender_df = get_gender(list_of_strings)
    if gender_df is not None:
        entities = pd.concat([entities, gender_df])
    # Get Test Result
    test_result_df = get_test_result(list_of_strings)
    if test_result_df is not None:
        entities = pd.concat([entities, test_result_df])
    # Get Test Type
    test_type_df = get_test_type(list_of_strings)
    if test_type_df is not None:
        entities = pd.concat([entities, test_type_df])
    
    entities = entities.sort_values("SentenceID")
    entities['EntityType'] = entities.EntityType.str.upper()
    return entities
entities = get_entities(sentences)
entities

Unnamed: 0,SentenceID,EntityType,StartPosition,EndPosition,EntityText,ConfidenceScore
6,8,PERSON,0,13,Kai Lukas Lau,0.982803
7,10,DATE,0,13,02 March 2022,0.773422
8,15,PERSON,0,3,Lau,0.851363
10,17,PERSON,0,9,Kai Lukas,0.988147
12,19,DATE,0,10,2011-10-07,0.647136
0,23,GENDER,0,4,Male,1.0
14,34,DATE,0,10,2022-02-08,0.738792
15,34,TIME,11,16,17:02,0.860903
16,40,DATE,0,10,2022-02-08,0.752259
17,40,TIME,11,16,17:26,0.866541


In [64]:
def get_gender(list_of_strings):
    for id, sentence in enumerate(list_of_strings):
        list_of_words = sentence.lower().split()
        if 'female' in list_of_words:
            return pd.DataFrame({"SentenceID": [id], "EntityType": ['gender'], "StartPosition": [sentence.lower().find("female")], "EndPosition": [sentence.lower().find("female")+6], "EntityText": ['Female'], "ConfidenceScore": [1.0]})
        if 'male' in list_of_words:
            return pd.DataFrame({"SentenceID": [id], "EntityType": ['gender'], "StartPosition": [sentence.lower().find("male")], "EndPosition": [sentence.lower().find("male")+4], "EntityText": ['Male'], "ConfidenceScore": [1.0]})
    return None
get_gender(sentences)

Unnamed: 0,SentenceID,EntityType,StartPosition,EndPosition,EntityText,ConfidenceScore
0,23,gender,0,4,Male,1.0


In [73]:
def get_test_result(list_of_strings):
    for id, sentence in enumerate(list_of_strings):
        filtered_sentence = ''.join(filter(str.isalpha, sentence))
        if 'negative' in filtered_sentence.lower():
            return pd.DataFrame({"SentenceID": [id], "EntityType": ['test_result'], "StartPosition": [sentence.lower().find("negative")], "EndPosition": [sentence.lower().find("negative")+8], "EntityText": ['Negative'], "ConfidenceScore": [1.0]})
        if 'positive' in filtered_sentence.lower():
            return pd.DataFrame({"SentenceID": [id], "EntityType": ['test_result'], "StartPosition": [sentence.lower().find("positive")], "EndPosition": [sentence.lower().find("positive")+8], "EntityText": ['Positive'], "ConfidenceScore": [1.0]})
    return None
get_test_result(sentences)

Unnamed: 0,SentenceID,EntityType,StartPosition,EndPosition,EntityText,ConfidenceScore
0,50,test_result,0,8,Negative,1.0


In [82]:
def get_test_type(list_of_strings):
    for id, sentence in enumerate(list_of_strings):
        filtered_sentence = ''.join(filter(str.isalpha, sentence))
        filtered_sentence = filtered_sentence.lower()
        if 'rtpcr' in filtered_sentence:
            return pd.DataFrame({"SentenceID": [id], "EntityType": ['test_type'], "StartPosition": [sentence.lower().find("rt")], "EndPosition": [sentence.lower().find("rt")+6], "EntityText": ['RT-PCR'], "ConfidenceScore": [1.0]})
        if 'pcr' in filtered_sentence:
            return pd.DataFrame({"SentenceID": [id], "EntityType": ['test_type'], "StartPosition": [sentence.lower().find("pcr")], "EndPosition": [sentence.lower().find("pcr")+6], "EntityText": ['PCR'], "ConfidenceScore": [1.0]})
        if 'rapid' in filtered_sentence:
            if 'antigen' in filtered_sentence:
                return pd.DataFrame({"SentenceID": [id], "EntityType": ['test_type'], "StartPosition": [sentence.lower().find("rapid")], "EndPosition": [sentence.lower().find("rapid")+6], "EntityText": ['Rapid Antigen'], "ConfidenceScore": [1.0]})
    return None
get_test_type(sentences)

Unnamed: 0,SentenceID,EntityType,StartPosition,EndPosition,EntityText,ConfidenceScore
0,51,test_type,8,14,Rapid Antigen,1.0


In [58]:
def extract_info(sentences):
    name, gender, collection, specimen, test_type, result = None, None, None, None, None, None
    for sentence in sentences:
        if sentence.lower().startswith("name:"):
            if not name:
                name = sentence.lower().split("name:")[-1].upper().strip()
        if not gender:
            if 'female' in sentence.lower().split():
                gender = 'FEMALE'
            elif 'male' in sentence.lower().split():
                gender = 'MALE'
        if not result:
            if 'negative' in sentence.lower().split():
                result = 'NEGATIVE'
            if '(negative)' in sentence.lower().split():
                result = 'NEGATIVE'
            if 'positive' in sentence.lower().split():
                result = 'POSITIVE'
            if '(positive)' in sentence.lower().split():
                result = 'POSITIVE'
        if not specimen:
            if 'swab' in sentence.lower():
                specimen = sentence.split(":")[-1].strip()
                specimen = specimen.split("swab")[0].strip() + ' swab'
        if not test_type:
            if 'rt' in sentence.lower():
                if 'pcr' in sentence.lower():
                    test_type = 'RT-PCR'
            elif 'pcr' in sentence.lower():
                test_type = 'PCR'
            elif 'rapid' in sentence.lower():
                if 'antigen' in sentence.lower():
                    test_type = 'RAPID ANTIGEN'
        if not collection:
            if 'collect' in sentence.lower():
                trf_doc = trf(sentence)
                candidate_date = [ent.text for ent in trf_doc.ents if ent.label_ == 'DATE']
                if candidate_date:
                    collection = candidate_date[0]

    result = {'Name': name,
              'Gender': gender,
              'Specimen': specimen,
              'Collection Date/Time': collection,
              'Test Type': test_type,
              'Result': result}
    return result
