#### To know the current working directory


In [1]:
pwd

'/home/sobha/Orion-CustomNER'

In [2]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize,sent_tokenize

[nltk_data] Downloading package punkt to /home/sobha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import os
import pandas as pd
from nltk import sent_tokenize

In [4]:
from glob import glob # glob (short for global) is used to return all file paths that match a specific pattern. .

### Step 1: Collect inclusion and exclusion criteria data separately

##### - Download chia dataset and list the directory contents

In [7]:
os.listdir(".")

['DataCollection_Preparation-Copy1.ipynb',
 'base_config.cfg',
 'middle.ann',
 '.ipynb_checkpoints',
 'train.spacy',
 'DataCollection_Preparation.ipynb',
 'spacyannotation.json',
 'preprocess.ipynb',
 'command prompt-spacy',
 'dev.spacy',
 'config.cfg',
 'DataCollection_Preprocessing.ipynb',
 'chia_with_scope.zip',
 'middle.txt',
 'screenshot.png',
 'inclusion_data.txt',
 'inclusion_sentence.txt',
 'exclusion_data.txt',
 'chia_with_scope',
 'output']

##### Since we need customized annotations, we collect all the txt files and annotate them based on our use-case requirements
- separate the txt files according to inclusion and exclusion criteria
- get the matching file paths using glob

In [8]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# the collected sentences come out in the same order on every run and machine.
inc_file_paths = sorted(glob("chia_with_scope/*_inc.txt"))  # inclusion-criteria files
exc_file_paths = sorted(glob("chia_with_scope/*_exc.txt"))  # exclusion-criteria files

In [9]:
#print(inc_file_paths)
#print(exc_file_paths)

##### Create a dataset by adding the contents of all the text files
 - Create 2 datasets : one for inclusion criteria and the other for exclusion criteria

In [10]:
# Accumulators for the tokenized text: one independent list per criteria type.
inc_data, exc_data = [], []  # inclusion criteria data, exclusion criteria data

##### loop through the files and append data to the main dataset

In [11]:
# Rebuild the list from scratch so that re-running this cell does not append
# the same sentences a second time.
inc_data = []  # inclusion criteria data
for inc_file in inc_file_paths:
    # The "unicode_escape" codec decodes bytes as Latin-1 and interprets
    # backslash sequences, which garbles any non-ASCII text. Read as UTF-8
    # instead (presumably the encoding of the Chia .txt files -- confirm);
    # errors="replace" keeps a stray undecodable byte from aborting the loop.
    with open(inc_file, "r", encoding="utf-8", errors="replace") as f:
        inc_data.extend(sent_tokenize(f.read()))

In [12]:
inc_data

['Type 2 Diabetes\nHypertension\nEstimated glomerular filtration rate (eGFR) > 30 ml/min\nUse of Ace Inh and ARB for control of blood pressure who are willing to be placed on alternate drug(s) in the washout period for blood pressure control',
 'Clinical and radiologic diagnosis of primary knee osteoarthritis (Kellgren & Lawrence I, II or III); \nCapability to understand the Informed Consent Form; \nChronic pain for at least 3 months prior to inclusion, measured by VAS.',
 '(VAS 4 or above); \nAbsence of skin injures, infections or tumor in the target knee; \nAvailability to comply with the visits.',
 'All patients presenting for elective shoulder arthroscopic procedures will be eligible for enrollment.',
 "Age: 18 to75 years old;\nPathologically diagnosed with advanced gastric cancer (including adenocarcinoma of the gastroesophageal junction) with measurable metastases outside the stomach (measuring = 10mm on spiral CT scan, satisfying the criteria in RECIST 1.1);\nFailure of prior th

In [13]:
len(inc_data)

2600

In [14]:
# Rebuild the list from scratch so that re-running this cell does not append
# the same sentences a second time.
exc_data = []  # exclusion criteria data
for exc_file in exc_file_paths:
    # The "unicode_escape" codec decodes bytes as Latin-1 and interprets
    # backslash sequences, which garbles any non-ASCII text. Read as UTF-8
    # instead (presumably the encoding of the Chia .txt files -- confirm);
    # errors="replace" keeps a stray undecodable byte from aborting the loop.
    with open(exc_file, "r", encoding="utf-8", errors="replace") as f:
        exc_data.extend(sent_tokenize(f.read()))

In [15]:
exc_data

['Research exemption requested\nHistory of PCV-13 vaccination\nHistory of cochlear implant\nCerebrospinal Fluid (CSF) leak\nCongestive Heart Failure (CHF)\nDiabetes Mellitus (DM)\nChronic Kidney Disease (CKD)\nHuman Immunodeficiency Virus (HIV)\nCommon Variable Immune Deficiency (CVID)\nPatients who have received the PPSV23 vaccine in the last 5 years\nWomen who are pregnant will also be excluded from the study by performing 2 point of care urine pregnancy tests ( prior to vaccinations)',
 'Patients having had an ophthalmic surgical procedure within 6 months of the beginning of the study.',
 'Patients with a diagnosis of glaucoma\nAny abnormality of the cornea which may prevent reliable applanation tonometry\nKnown allergy/ hypersensitivity reaction to Brimonidine\nContra-indication to Brimonidine including patients on monoamine oxidase inhibitors (MOA)\nPatients unwilling or unable to provide informed consent\nPatients with anticipated difficult airway management (as this may require 

In [16]:
len(exc_data)

3773

##### write them to one big txt file for creating annotations

#### inclusion_data 

In [17]:
# Dump every tokenized chunk to a single text file for annotation.
# - UTF-8 instead of "unicode_escape": the escape codec writes line breaks as
#   the literal two characters backslash-n and non-ASCII characters as \xNN.
# - write() plus an explicit newline instead of writelines(): writelines() on a
#   str iterates it character by character and adds no separator, so
#   consecutive sentences were fused together in the output.
with open("inclusion_data.txt", "w", encoding="utf-8") as f:
    for each_sentence in inc_data:
        f.write(each_sentence + "\n")

#### exclusion_data

In [18]:
# Dump every tokenized chunk to a single text file for annotation.
# - UTF-8 instead of "unicode_escape": the escape codec writes line breaks as
#   the literal two characters backslash-n and non-ASCII characters as \xNN.
# - write() plus an explicit newline instead of writelines(): writelines() on a
#   str iterates it character by character and adds no separator, so
#   consecutive sentences were fused together in the output.
with open("exclusion_data.txt", "w", encoding="utf-8") as f:
    for each_sentence in exc_data:
        f.write(each_sentence + "\n")

### Step 2: Create annotation data using Markup tool for NER model training

In [19]:
# custom ner training

#### Importing spaCy

In [20]:
import spacy
import collections
from collections import Counter

#### Check the spaCy Version

In [21]:
!python -m spacy info

[1m

spaCy version    3.2.1                         
Location         /home/sobha/anaconda3/lib/python3.7/site-packages/spacy
Platform         Linux-5.11.0-43-generic-x86_64-with-debian-bullseye-sid
Python version   3.7.6                         
Pipelines        en_core_web_sm (3.2.0)        



# Followed the below tutorial link
### Steps to build the custom NER model for detecting the job role in job postings in spaCy 3.0:

#### https://turbolab.in/build-a-custom-ner-model-using-spacy-3-0/



### Load a spaCy model and check if it has ner pipeline

In [5]:
# !python -m spacy download en_core_web_sm

import spacy 

In [6]:
# Load the small English pipeline, then display the names of its components.
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
from spacy import displacy

# Sample sentence for a quick look at the entities the loaded pipeline finds.
sentence = (
    'Daniil Medvedev and Novak Djokovic have built an intriguing rivalry '
    'since the Australian Open decider, which the Serb won comprehensively.'
)
doc = nlp(sentence)

# Render the recognised entities inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)



In [8]:
# (token, IOB tag, entity label) for every token that carries an entity label.
[(token, token.ent_iob_, token.ent_type_) for token in doc if token.ent_type_]

[(Daniil, 'B', 'PERSON'),
 (Medvedev, 'I', 'PERSON'),
 (Novak, 'B', 'PERSON'),
 (Djokovic, 'I', 'PERSON'),
 (Australian, 'B', 'NORP'),
 (Serb, 'B', 'NORP')]

### Used the spaCy NER annotation tool by agateteam (http://agateteam.org/spacynerannotate/) to annotate the text; the result below is assigned to "trainData" (for future reference, the annotated data is also saved to a file named spacyannotation.json)



###              Numeric   Nominal   Ordinal  Sentence/Sentiment                      - Entities

trainData= [("Aged 22 and older, undergoing 1 or 2 level spinal decompression",{"entities":[(0,18,"Numeric"),(19,63,"Ordinal")]}),
("Age 50 or over Diagnosed Giant Cell Arthritis Headache, jaw pain, vision loss Shoulder and/or hip pain",{"entities":[(0,15,"Numeric"),(25,46,"Nominal"),(46,102,"Ordinal")]}),
("Aged between 50-80.Have been diagnosed with Polymyalgia Rheumatica (PMR).Have experienced improvement in PMR symptoms with Prednisone.",{"entities":[(0,18,"Numeric"),(44,72,"Nominal"),(73,134,"Ordinal")]}),
("Aged 18 years or older Diagnosis of Ankylosing Spondylitis or Axial Spondyloarthritis",{"entities":[(0,23,"Numeric"),(36,85,"Nominal")]}),
("Aged over 18 years Confirmed diagnosis of bronchiectasis within 5 years Pulmonary exacerbation requiring antibiotics in the past 12 months",{"entities":[(0,18,"Numeric"),(42,56,"Nominal"),(72,138,"Ordinal")]}),
("Aged between 40 and 85 years Diagnosed with COPD A history of cardiovascular disease, including heart failure, ischaemic heart disease, tachyarrhythmias, and hypertension",{"entities":[(0,29,"Numeric"),(44,48,"Nominal"),(49,170,"Ordinal")]}),
("Aged 18-85, diagnosed non-cystic fibrosis bronchiectasis, two or more exacerbations in the past 12 months.",{"entities":[(0,10,"Numeric"),(22,56,"Nominal"),(58,106,"Ordinal")]}),
("Aged 18 or older, diagnosis of IgAN, on a stable dose of an ACE inhibitor or ARB / unable to tolerate this therapy.",{"entities":[(0,16,"Numeric"),(31,35,"Nominal"),(37,115,"Ordinal")]}),
("Aged 18 years or older, hypertensive, no more than 3 current blood-pressure lowering medications.",{"entities":[(0,23,"Numeric"),(24,36,"Nominal"),(38,97,"Ordinal")]}),
(" 18 years of age, undergoing a surgical procedure resulting in a closed approximately linear incision",{"entities":[(1,16,"Numeric"),(17,100,"Ordinal")]}),
("Are 18-50 years of age. Have a current/recent skin infection.No ongoing illness or history of serious chronic/progressive disease.",{"entities":[(0,22,"Numeric"),(61,130,"Ordinal"),(24,60,"Nominal")]}),
("Healthy woman aged between 18 and 49 years of age Pregnant with no known increased risk for complications Singleton pregnancy",{"entities":[(14,49,"Numeric"),(50,125,"Ordinal")]}),
("Aged between 5 - 14 years Newly diagnosed with Acute rheumatic fever",{"entities":[(0,25,"Numeric"),(47,68,"Nominal")]}),
("Viral respiratory disease Acute presentation to Middlemore Emergency Department or Intensive care unit",{"entities":[(26,102,"Ordinal"),(0,26,"Nominal")]}),
("Age>=18 Diagnosis of Essential Thrombocythemia Failure of standard therapy, e.g. Hydroxyurea",{"entities":[(0,7,"Numeric"),(31,46,"Nominal"),(47,92,"Ordinal")]}),
("Age >= 18 Diagnosed with Relapsed or Refractory Multiple Myeloma.",{"entities":[(0,10,"Numeric"),(25,65,"Nominal")]}),
("Age >= 18 Diagnosed with Myelodysplastic Syndrome (MDS)Anemia",{"entities":[(0,9,"Numeric"),(25,61,"Nominal")]}),
("age >= 18 Newly diagnosed multiple myeloma Ineligible for autologous stem cell transplant No prior treatment for multiple myeloma except localised radiotherapy or a short course of steroids",{"entities":[(0,9,"Numeric"),(26,43,"Nominal"),(43,189,"Ordinal")]}),
("Age>=18 Diagnosed with Relapsed or Refractory Multiple Myeloma No long-term use of Prednisone",{"entities":[(0,7,"Numeric"),(23,63,"Nominal")]}),
("Aged 18 to 65 Chronic Hepatitis B infection On stable NUC treatment (e.g Entecavir orTenofovir)",{"entities":[(0,13,"Numeric"),(13,43,"Nominal"),(44,95,"Ordinal")]}),
("Age less than 65 years, have diagnosed ulcerative colitis and experiencing increase in daily stool frequency and/or rectal bleeding, no allergy to antibiotic vancomycin, willing to undergo two colonoscopy procedures as part of the study (3 months apart)",{"entities":[(0,22,"Numeric"),(39,57,"Nominal"),(170,253,"Ordinal")]}),
("Aged between 18-70 years old Diagnosis of Non-alcoholic Fatty Liver Disease Have a BMI between 25 to 50 kg/m2",{"entities":[(0,29,"Numeric"),(42,75,"Nominal"),(83,109,"Numeric")]}),
("Male or postmenopausal female, aged 18-80 years old, diagnosed Nonalcoholic Steatohepatitis, no other chronic liver disease",{"entities":[(31,91,"Nominal")]}),
("Type 1 Myocardial infarction Have at least two coronary artery territories of disease > 50% Be on treatment for Diabetes",{"entities":[(0,28,"Nominal"),(92,120,"Ordinal"),(29,91,"Ordinal")]}),
("Aged 20 or older, myocardial ischemia, able to undergo PTCA, stenting and CABG,recent travel history, able to speak English",{"entities":[(0,16,"Numeric"),(18,37,"Nominal"),(39,78,"Ordinal"),(79,123,"Sentence/Sentiment")]})
]

In [61]:
trainData=[
("Aged 22 and older, undergoing 1 or 2 level spinal decompression",{"entities":[(0,18,"Numeric"),(19,63,"Ordinal")]}),
("Age 50 or over Diagnosed Giant Cell Arthritis Headache, jaw pain, vision loss Shoulder and/or hip pain",{"entities":[(0,15,"Numeric"),(25,46,"Nominal"),(46,102,"Ordinal")]}),
("Aged between 50-80.Have been diagnosed with Polymyalgia Rheumatica (PMR).Have experienced improvement in PMR symptoms with Prednisone.",{"entities":[(0,18,"Numeric"),(44,72,"Nominal"),(73,134,"Ordinal")]}),
("Aged 18 years or older Diagnosis of Ankylosing Spondylitis or Axial Spondyloarthritis",{"entities":[(0,23,"Numeric"),(36,85,"Nominal")]}),
("Aged over 18 years Confirmed diagnosis of bronchiectasis within 5 years Pulmonary exacerbation requiring antibiotics in the past 12 months",{"entities":[(0,18,"Numeric"),(42,56,"Nominal"),(72,138,"Ordinal")]}),
("Aged between 40 and 85 years Diagnosed with COPD A history of cardiovascular disease, including heart failure, ischaemic heart disease, tachyarrhythmias, and hypertension",{"entities":[(0,29,"Numeric"),(44,48,"Nominal"),(49,170,"Ordinal")]}),
("Aged 18-85, diagnosed non-cystic fibrosis bronchiectasis, two or more exacerbations in the past 12 months.",{"entities":[(0,10,"Numeric"),(22,56,"Nominal"),(58,106,"Ordinal")]}),
("Aged 18 or older, diagnosis of IgAN, on a stable dose of an ACE inhibitor or ARB / unable to tolerate this therapy.",{"entities":[(0,16,"Numeric"),(31,35,"Nominal"),(37,115,"Ordinal")]}),
("Aged 18 years or older, hypertensive, no more than 3 current blood-pressure lowering medications.",{"entities":[(0,23,"Numeric"),(24,36,"Nominal"),(38,97,"Ordinal")]}),
(" 18 years of age, undergoing a surgical procedure resulting in a closed approximately linear incision",{"entities":[(1,16,"Numeric"),(17,100,"Ordinal")]}),
("Are 18-50 years of age. Have a current/recent skin infection.No ongoing illness or history of serious chronic/progressive disease.",{"entities":[(0,22,"Numeric"),(61,130,"Ordinal"),(24,60,"Nominal")]}),
("Healthy woman aged between 18 and 49 years of age Pregnant with no known increased risk for complications Singleton pregnancy",{"entities":[(14,49,"Numeric"),(50,125,"Ordinal")]}),
("Aged between 5 - 14 years Newly diagnosed with Acute rheumatic fever",{"entities":[(0,25,"Numeric"),(47,68,"Nominal")]}),
("Viral respiratory disease Acute presentation to Middlemore Emergency Department or Intensive care unit",{"entities":[(26,102,"Ordinal"),(0,26,"Nominal")]}),
("Age>=18 Diagnosis of Essential Thrombocythemia Failure of standard therapy, e.g. Hydroxyurea",{"entities":[(0,7,"Numeric"),(31,46,"Nominal"),(47,92,"Ordinal")]}),
("Age >= 18 Diagnosed with Relapsed or Refractory Multiple Myeloma.",{"entities":[(0,10,"Numeric"),(25,65,"Nominal")]}),
("Age >= 18 Diagnosed with Myelodysplastic Syndrome (MDS)Anemia",{"entities":[(0,9,"Numeric"),(25,61,"Nominal")]}),
("age >= 18 Newly diagnosed multiple myeloma Ineligible for autologous stem cell transplant No prior treatment for multiple myeloma except localised radiotherapy or a short course of steroids",{"entities":[(0,9,"Numeric"),(26,43,"Nominal"),(43,189,"Ordinal")]}),
("Age>=18 Diagnosed with Relapsed or Refractory Multiple Myeloma No long-term use of Prednisone",{"entities":[(0,7,"Numeric"),(23,63,"Nominal")]}),
("Aged 18 to 65 Chronic Hepatitis B infection On stable NUC treatment (e.g Entecavir orTenofovir)",{"entities":[(0,13,"Numeric"),(13,43,"Nominal"),(44,95,"Ordinal")]}),
("Age less than 65 years, have diagnosed ulcerative colitis and experiencing increase in daily stool frequency and/or rectal bleeding, no allergy to antibiotic vancomycin, willing to undergo two colonoscopy procedures as part of the study (3 months apart)",{"entities":[(0,22,"Numeric"),(39,57,"Nominal"),(170,253,"Ordinal")]}),
("Aged between 18-70 years old Diagnosis of Non-alcoholic Fatty Liver Disease Have a BMI between 25 to 50 kg/m2",{"entities":[(0,29,"Numeric"),(42,75,"Nominal"),(83,109,"Numeric")]}),
("Male or postmenopausal female, aged 18-80 years old, diagnosed Nonalcoholic Steatohepatitis, no other chronic liver disease",{"entities":[(31,91,"Nominal")]}),
("Type 1 Myocardial infarction Have at least two coronary artery territories of disease > 50% Be on treatment for Diabetes",{"entities":[(0,28,"Nominal"),(92,120,"Ordinal"),(29,91,"Ordinal")]}),
("Aged 20 or older, myocardial ischemia, able to undergo PTCA, stenting and CABG,recent travel history, able to speak English",{"entities":[(0,16,"Numeric"),(18,37,"Nominal"),(39,78,"Ordinal"),(79,123,"Sentence/Sentiment")]}),
("Patients with biopsy-proven metastatic carcinoid tumors or other neuroendocrine tumors (Islet cell, Gastrinomas and VIPomas) with at least one measurable lesion (other than bone) that has either not been previously irradiated or if previously irradiated has demonstrated progression since the radiation therapy ",{"entities":[(14,20,"Nominal"),(28,55,"Nominal"),(65,87,"Nominal"),(87,124,"Nominal"),(133,161,"Ordinal"),(179,310,"Sentence/Sentiment")]}),
("Female patients must have a negative serum pregnancy test at screening. (Not applicable to patients with bilateral oophorectomy and/or hysterectomy or to those patients who are postmenopausal.) ",{"entities":[(37,52,"Nominal"),(73,99,"Sentence/Sentiment"),(105,127,"Nominal"),(135,147,"Nominal"),(177,191,"Nominal")]}),
("Karnofsky Performance Status > 60 ",{"entities":[(0,33,"Ordinal")]}),
("Must have a life expectancy of greater than three (3) months ",{"entities":[(11,60,"Numeric")]}),
("Patients on Sandostatin Lar (long acting somatostatin analogue) must be on a stable dose for 30 days prior to study entry and short acting somatostatin analogues must be judged to be on a clinically stable dose by the investigator prior to study entry ",{"entities":[(12,27,"Nominal"),(29,62,"Nominal"),(63,122,"Sentence/Sentiment"),(139,151,"Nominal"),(162,251,"Sentence/Sentiment")]}),
("The patient has no major impairment of renal or hepatic function, as defined by the following laboratory parameters: total bilirubin <1.5 X ULN; AST, ALT<2.5X ULN (<5 X ULN if liver metastases are present) ",{"entities":[(39,44,"Nominal"),(48,64,"Nominal"),(117,172,"Numeric")]}),
("Patients with biopsy-proven metastatic carcinoid tumors or other neuroendocrine tumors (Islet cell, Gastrinomas and VIPomas) with at least one measurable lesion (other than bone) that has either not been previously irradiated or if previously irradiated has demonstrated progression since the radiation therapy ",{"entities":[(14,20,"Nominal"),(28,55,"Nominal"),(65,87,"Nominal"),(87,124,"Nominal"),(133,161,"Ordinal"),(179,310,"Sentence/Sentiment")]}),
("ECOG performance status of 0,1, or 2. ",{"entities":[(0,37,"Ordinal")]}),
("Must have had a treatment-free interval of greater than 6 months following response to platinum. ",{"entities":[(0,96,"Sentence/Sentiment")]}),
("At least 4 weeks since last surgery or radiation therapy. ",{"entities":[(0,57,"Sentence/Sentiment")]}),
("Have had one prior platinum-based chemotherapy regimen for the treatment of primary disease. ",{"entities":[(19,54,"Nominal"),(54,92,"Sentence/Sentiment")]}),
("Have had one prior platinum-based chemotherapy regimen for the treatment of primary disease. ",{"entities":[]}),
("Patients with reproductive capability must agree to practice adequate contraception methods. ",{"entities":[(0,92,"Sentence/Sentiment")]}),
("Patient must give written informed consent before participating in any study-specific procedure, randomization, or receiving investigational product. ",{"entities":[(0,149,"Sentence/Sentiment")]}),
("Females of childbearing potential: negative serum or urine pregnancy test ",{"entities":[(0,49,"Numeric"),(72,153,"Numeric"),(181,244,"Numeric")]}),
("Serum creatinine less than or equal to 2.0 mg/dL (Note: Patients with a serum creatinine greater than or equal to 1.4 and less than or equal to 2.0 mg/dL must demonstrate a 24-hour urinary creatinine clearance greater than or equal to 50 mL/min) ",{"entities":[(0,49,"Numeric")]}),
("Serum bilirubin less than or equal to 1.5 x institutional upper limit of normal (ULN) ",{"entities":[(0,85,"Numeric")]}),
("Platelet count greater than or equal to 100 x 10^9/L ",{"entities":[(0,52,"Numeric")]}),
("Absolute neutrophil count (ANC) greater than or equal to 1.5 x 10^9/L without growth factor use in the 2 weeks before study randomization ",{"entities":[(0,70,"Numeric"),(103,110,"Numeric")]}),
("Hemoglobin (hgb) greater than or equal to 10 g/dL without transfusional support or growth factor use in the 4 weeks before study randomization ",{"entities":[(0,50,"Numeric"),(108,115,"Numeric")]}),
("Eastern Cooperative Oncology Group (ECOG) performance status of 0 - 2 ",{"entities":[(0,69,"Ordinal")]}),
("18 years of age or older ",{"entities":[(0,24,"Numeric")]}),
("Measurable disease ",{"entities":[(0,18,"Sentence/Sentiment")]}),
("Estimated weight loss less than or equal to 10% in the 3 months before study randomization ",{"entities":[(0,47,"Ordinal"),(55,64,"Numeric")]}),
("Life expectancy greater than or equal to 6 months ",{"entities":[(0,49,"Numeric")]}),
("Initial radiotherapy field of treatment to encompass greater than or equal to 30% of the esophagus ",{"entities":[(8,98,"Ordinal")]}),
("Unresectable (locally advanced) stage IIIa or IIIb disease ",{"entities":[(32,58,"Nominal")]}),
("Patients with a histologically or cytologically proven diagnosis of NSCLC ",{"entities":[(0,67,"Sentence/Sentiment"),(68,73,"Nominal")]}),
("Meet DSM-IV criteria for BPD as assessed by the Structured Clinical Interview for DSM-IV Personality Disorders (SCID-II). ",{"entities":[(5,20,"Nominal"),(25,28,"Nominal"),(82,121,"Nominal")]}),
("Be able to speak, read and write English and follow simple instructions for completing self-rated scales ",{"entities":[(0,104,"Sentence/Sentiment")]}),
("Be between age 18 and 55 years ",{"entities":[(3,30,"Numeric")]}),
("Provide written informed consent before beginning any study related activities ",{"entities":[(0,78,"Sentence/Sentiment")]}),
("Stable (8 wks or longer) concurrent medications including benzodiazepines, sedative hypnotics, antipsychotics, and antidepressants.",{"entities":[(0,131,"Sentence/Sentiment")]}),
("Completion of a 14-week open label trial of one the following SRI's: fluoxetine 80 mg/day, paroxetine 60 mg/day, fluvoxamine 300 mg/day, clomipramine 250 mg/day, sertraline 200 mg/day, citalopram 60 mg/day, escitalopram 30 mg/day and demonstrating a non or partial responses to SRI treatment (CGI-I of 3 or 4, Y-BOCS reduction of < 35%) ",{"entities":[(69,89,"Numeric"),(91,111,"Numeric"),(113,135,"Numeric"),(137,160,"Numeric"),(162,183,"Numeric"),(185,205,"Numeric"),(207,229,"Numeric")]}),
("Outpatient with primary DSM- IV OCD ",{"entities":[(24,35,"Nominal")]}),
("Patients must have a predicted life expectancy of at least 12 weeks. ",{"entities":[(30,68,"Numeric")]}),
("Patients must have a Zubrod performance status of 0-2. ",{"entities":[(0,53,"Ordinal")]}),
("Patients must have measurable disease. If prior radiation therapy was administered, measurable disease must be outside the radiation field. ",{"entities":[(0,37,"Ordinal"),(39,82,"Ordinal"),(84,138,"Ordinal")]}),
("Patients must agree to have a 20 cc blood sample drawn in addition to routine labs with each cycle of chemotherapy. ",{"entities":[(23,115,"Ordinal")]}),
("Tissue from tumor must be available. This may be paraffin embedded tissue from previous biopsy/resection or if it is not available, a repeat biopsy must be performed. The requirement for biopsy may be waived if alpha-fetoprotein is greater than 500 ng/mL and in the investigators opinion not explained by a concurrent hepatic inflammatory process. ",{"entities":[(12,18,"Nominal"),(211,255,"Numeric"),(255,347,"Sentence/Sentiment")]}),
("Unresectable, histologically confirmed hepatocellular carcinoma with evident disease limited to liver. ",{"entities":[(0,102,"Ordinal")]}),
("Patients must have adequate hepatic function as documented by a serum bilirubin less than or equal to 2x the institutional upper limit of normal, regardless of whether patients have liver involvement secondary to tumor. Patients may not have ascites or the ascites must be responsive to diuretics. ",{"entities":[(14,144,"Ordinal"),(177,218,"Ordinal")]}),
("Patients must have adequate renal function as documented by a calculated creatinine clearance 60. ",{"entities":[(15,98,"Ordinal")]}),
("Patients must have a pre-treatment granulocyte count (i.e., segmented neutrophils + bands) of greater than or equal to 1,500/mm3, a hemoglobin level of greater than or equal to 9 gm/dl, and platelet count greater than or equal to 50,000/mm3. The granulocyte requirement may be waived if in the investigator's opinion the lower count reflects hypersplenism with adequate bone marrow reserves. ",{"entities":[(35,128,"Numeric"),(132,184,"Numeric"),(190,240,"Numeric")]}),
("Patients must have a predicted life expectancy of at least 12 weeks. ",{"entities":[(30,68,"Numeric")]}),
("Patients must have a Zubrod performance status of 0-2. ",{"entities":[(0,53,"Ordinal")]}),
("Patients must have measurable disease. If prior radiation therapy was administered, measurable disease must be outside the radiation field. ",{"entities":[(0,37,"Ordinal"),(39,82,"Ordinal"),(84,138,"Ordinal")]}),
("Patients must agree to have a 20 cc blood sample drawn in addition to routine labs with each cycle of chemotherapy. ",{"entities":[(23,115,"Ordinal")]}),
("Tissue from tumor must be available. This may be paraffin embedded tissue from previous biopsy/resection or if it is not available, a repeat biopsy must be performed. The requirement for biopsy may be waived if alpha-fetoprotein is greater than 500 ng/mL and in the investigators opinion not explained by a concurrent hepatic inflammatory process. ",{"entities":[(12,18,"Nominal"),(211,255,"Numeric"),(255,347,"Sentence/Sentiment")]}),
("Unresectable, histologically confirmed hepatocellular carcinoma with evident disease limited to liver. ",{"entities":[(0,102,"Ordinal")]}),
("8. Left ventricular ejection fraction should be at least 30%. ",{"entities":[(3,61,"Ordinal")]}),
("7. Significant stenosis has been defined as a stenosis of more than 50% in luminal diameter (in at least one view, on visual interpretation or preferably by QCA); ",{"entities":[(15,23,"Nominal"),(24,161,"Ordinal")]}),
("6. Total occluded vessels. One total occluded major epicardial vessel or side branch can be included and targeted as long as one other major vessel has a significant stenosis amenable for SA, provided the age of occlusion is less than one month e.g. recent instability, infarction with ECG changes in the area subtended by the occluded vessel. Patients with total occluded vessels of unknown duration or existing longer than one month and a reference over 1.50 mm should not be included, not even as a third or fourth vessel to be dilated; ",{"entities":[(0,24,"Nominal"),(26,42,"Nominal"),(47,67,"Nominal"),(94,257,"Ordinal"),(259,313,"Ordinal")]}),
("5. Multivessel disease with at least one significant stenosis in LAD and with treatment of the lesion in another major epicardial coronary artery. A two-vessel disease or a three-vessel disease may be viewed as a combination of a side branch and a main epicardial vessel provided they supply different territories; left anterior descending, left circumflex and right coronary artery); ",{"entities":[(3,69,"Ordinal"),(73,145,"Ordinal"),(147,313,"Ordinal")]}),
("4. de novo native vessels; ",{"entities":[(0,22,"Ordinal")]}),
("3. At least 2 lesions (located in different vessels and in different territories) potentially amenable to stent implantation; ",{"entities":[(81,124,"Ordinal"),(3,80,"Ordinal")]}),
("2. Patients who are eligible for coronary revascularization (angioplasty or CABG); ",{"entities":[(0,78,"Ordinal")]}),
("Patients with stable (Canadian Cardiovascular Society 1, 2, 3 or 4) or unstable (Braunwald class IB, IC, IIB, IIC, IIIB, IIIC) angina pectoris and ischemia, or patients with atypical chest pain or even those who are asymptomatic provided they have documented myocardial ischaemia (e.g. treadmill exercise test, radionuclide scintigraphy, stress echocardiography, Holter tape); ",{"entities":[(0,67,"Ordinal"),(71,126,"Ordinal"),(126,155,"Ordinal"),(160,194,"Ordinal"),(259,279,"Nominal")]}),
("Single or twin pregnancies ",{"entities":[(0,26,"Nominal")]}),
("Pregnant women with abdomen discumfort and ultrasound diagnosis of polyhydramnios (AFI>25cm) ",{"entities":[(0,8,"Nominal"),(15,92,"Ordinal")]}),
("No prior treatment with Ventavis or other active treatments for primary pulmonary hypertension within 6 weeks of date of study inclusion (unless otherwise advised by Bayer Schering Pharma) ",{"entities":[(24,32,"Nominal"),(36,188,"Ordinal")]}),
("Patient with primary pulmonary hypertension (i.e. Idiopathic Pulmonary Arterial Hypertension or Familial Pulmonary Arterial Hypertension) and classified as NYHA functional class III (NYHA = New York Heart Association) ",{"entities":[(20,43,"Nominal"),(50,92,"Nominal"),(95,136,"Nominal"),(156,217,"Ordinal")]}),
("The treating physician has chosen Ventavis as a suitable long-term treatment for the patient ",{"entities":[(34,42,"Nominal"),(43,92,"Nominal")]}),
("Patients suspected to have vitamin B12 deficiency defined as a plasma vitamin B12 below the reference interval (<200 pmol/L). ",{"entities":[(27,49,"Nominal"),(50,124,"Nominal")]}),
("Non-smoking ",{"entities":[(0,11,"Ordinal")]}),
("Users of at least 2 cups of caffeinated coffee per day who are willing to be randomized to any of the interventions. ",{"entities":[(0,116,"Sentence/Sentiment")]}),
("Body mass index 25-35 kg/m2 ",{"entities":[(0,27,"Numeric")]}),
("Aged at least 18 years with an ability and willingness to give written informed consent. ",{"entities":[(0,22,"Numeric"),(23,87,"Ordinal")]}),
("If female, willing to use contraception throughout the study ",{"entities":[(11,60,"Ordinal")]}),
("Sufficient number of umbilical cord blood units available for transplantation ",{"entities":[(16,26,"Nominal"),(30,40,"Nominal"),(40,154,"Ordinal"),(155,232,"Ordinal")]}),
("Women between 40 to 70 years of age. ",{"entities":[(6,35,"Numeric")]}),
("Healthy postmenopausal women with 50 or more moderate to severe hot flushes. ",{"entities":[(0,75,"Ordinal")]}),
("Histologically proven recurrent or persistent endometrial cancer that is not amenable to curative treatment with surgery and/or radiation therapy AND has failed 2 previous treatment regimens ",{"entities":[(22,64,"Nominal"),(66,190,"Ordinal")]}),
("Measurable metastatic disease ",{"entities":[(11,29,"Nominal")]}),
("Subjects must have failed at least two previous chemotherapy regimens. Paclitaxel must have been a component of one or both regimens and cisplatin or carboplatin must have been a component of one or both regimens. ",{"entities":[(0,69,"Ordinal"),(71,213,"Ordinal")]}),
("Primary tumor must have been diagnosed histologically as either epithelial ovarian cancer, fallopian tube cancer, or primary peritoneal cancer (not borderline or low malignant potential epithelial carcinoma). ",{"entities":[(0,14,"Nominal"),(64,89,"Nominal"),(91,112,"Nominal"),(117,142,"Nominal"),(186,206,"Nominal")]}),
("Measurable metastatic disease as defined by Response Evaluation Criteria in Solid Tumors (RECIST) ",{"entities":[(11,29,"Nominal"),(44,97,"Ordinal")]}),
("Progression on prior therapy with a hormonal agent if estrogen receptor or progesterone receptor positive, and/or with trastuzumab if HER2-neu positive. If patient has progressed through hormone or trastuzumab therapy only, must have received one chemotherapy regimen. ",{"entities":[(0,105,"Ordinal"),(119,130,"Nominal"),(134,151,"Ordinal"),(153,267,"Ordinal")]}),
("Progression on or failure to respond to at least one previous chemotherapy regimen for metastatic disease ",{"entities":[(0,105,"Ordinal")]}),
("Measurable metastatic disease (>1cm) in at least one site other than bone-only ",{"entities":[(0,78,"Ordinal")]}),
("Metastatic cervical cancer (CX) ",{"entities":[(0,31,"Nominal")]}),
("Metastatic endometrial cancer (EM) ",{"entities":[(0,34,"Nominal")]}),
("Metastatic ovarian cancer (OV) ",{"entities":[(0,30,"Nominal")]}),
("Metastatic breast cancer (BR)",{"entities":[(0,29,"Nominal")]}),
("Diagnosis of one of the following malignancies: ",{"entities":[(34,46,"Nominal")]}),
("Normal organ function within 14 days of study entry",{"entities":[(0,51,"Ordinal")]}),
("If female and of childbearing potential, are willing to use adequate contraception (hormonal, barrier method, abstinence) prior to study entry and for the duration of study participation. ",{"entities":[(17,39,"Nominal"),(41,83,"Ordinal"),(84,92,"Ordinal"),(94,108,"Ordinal"),(110,120,"Ordinal"),(122,187,"Sentence/Sentiment")]}),
("Ovarian, endometrial or cervical - Gynecologic Oncology Group (GOG) performance score 2",{"entities":[(0,87,"Ordinal")]}),
("Breast - Karnofsky score > 50; ",{"entities":[(0,29,"Ordinal")]}),
("Body Mass Index (BMI) >21 kg/m^2 and <35 kg/m^2. ",{"entities":[(0,48,"Numeric")]}),
("HbA1c between 7.1% and 11.0%, inclusive. ",{"entities":[(0,39,"Ordinal")]}),
("Treated with a stable dose of one of the following for at least 3 months prior to screening: * >=1000 mg/day immediate-release metformin; or metformin >=1000 mg/day and sulfonylurea; or sulfonylurea/metformin combination therapy. ",{"entities":[(94,136,"Numeric"),(141,164,"Numeric"),(169,181,"Nominal"),(186,198,"Nominal"),(199,228,"Nominal"),(0,91,"Ordinal")]}),
(" Aged 20 or older,",{"entities":[(1,17,"Numeric")]}),
(" AGE>18 years",{"entities":[(1,13,"Numeric")]}),
(" Age>18 years age ,",{"entities":[(1,18,"Numeric")]}),
(" age>18 years age ,",{"entities":[(1,18,"Numeric")]}),
(" Aged above 18 years and under 65 years",{"entities":[(1,39,"Numeric")]}),
(" Aged over 18 years and under 65 years",{"entities":[]}),
(" Aged above 18 years and below 65 years",{"entities":[(1,39,"Numeric")]}),
(" aged above 18 years and below 65 years",{"entities":[(1,39,"Numeric")]}),
(" aged above 18 and under 65",{"entities":[(1,27,"Numeric")]}),
(" aged above 18 and below 65",{"entities":[(1,27,"Numeric")]}),
(" aged above 18",{"entities":[(1,14,"Numeric")]}),
(" AGE above 10",{"entities":[(1,13,"Numeric")]}),
(" age above 10",{"entities":[(1,13,"Numeric")]}),
(" AGE below 10",{"entities":[(1,13,"Numeric")]}),
(" age below 10",{"entities":[(1,13,"Numeric")]}),
(" aged below 20",{"entities":[(1,14,"Numeric")]}),
(" aged 60 and under",{"entities":[(1,18,"Numeric")]}),
(" aged 20 or younger",{"entities":[(1,19,"Numeric")]}),
("aged 20",{"entities":[(0,7,"Numeric")]}),
(" Age<65",{"entities":[(0,6,"Numeric")]}),
(" age less than 65 years old",{"entities":[(1,27,"Numeric")]}),
(" age less than 65 yrs",{"entities":[(1,21,"Numeric")]}),
(" Age less than 65 years,",{"entities":[(1,24,"Numeric")]}),
(" age 18 to 80 years old",{"entities":[(1,23,"Numeric")]}),
(" Aged 18-80 years",{"entities":[(1,17,"Numeric")]}),
(" aged 18-80 years old,",{"entities":[(1,21,"Numeric")]}),
(" age under 66 yrs",{"entities":[(1,17,"Numeric")]}),
(" AGE under 66",{"entities":[(1,13,"Numeric")]}),
(" Age under 66",{"entities":[(1,13,"Numeric")]}),
(" age under 66",{"entities":[(1,13,"Numeric")]}),
(" age under 66 years",{"entities":[(1,19,"Numeric")]}),
(" aged over 18 yrs",{"entities":[(1,17,"Numeric")]}),
(" aged over 18 years",{"entities":[(1,19,"Numeric")]}),
(" aged over 18",{"entities":[(1,13,"Numeric")]}),
(" Aged over 18",{"entities":[(1,13,"Numeric")]}),
(" age over 18",{"entities":[(1,12,"Numeric")]}),
(" Age over 18",{"entities":[(1,12,"Numeric")]}),
(" age <=65 yrs",{"entities":[(1,13,"Numeric")]}),
(" age<=65yrs",{"entities":[(1,11,"Numeric")]}),
(" age<18yrs",{"entities":[(1,10,"Numeric")]}),
(" age>=18yrs",{"entities":[(1,11,"Numeric")]}),
(" age>=18 yrs",{"entities":[(1,12,"Numeric")]}),
(" age >18 yrs",{"entities":[(1,12,"Numeric")]}),
(" age greater than 18 years and less than 65 years",{"entities":[(1,49,"Numeric")]}),
(" age greater than 18 and less than 65 years",{"entities":[(1,43,"Numeric")]}),
(" age less than 65 years",{"entities":[(1,23,"Numeric")]}),
(" age lesser than 45 years",{"entities":[(1,25,"Numeric")]}),
(" age greater than 18 years",{"entities":[(1,26,"Numeric")]}),
(" age greater than 18",{"entities":[(1,20,"Numeric")]}),
(" aged greater than 18",{"entities":[(1,21,"Numeric")]}),
(" Age greater than 18",{"entities":[(1,20,"Numeric")]}),
(" Aged above 18 years and under 65 years",{"entities":[(1,39,"Numeric")]}),
("Aged over 18 years and under 65 years",{"entities":[(0,37,"Numeric")]}),
(" aged between 18 and 65 years",{"entities":[(1,29,"Numeric")]}),
(" aged between 18 and 65",{"entities":[(1,23,"Numeric")]}),
(" Aged between 18 to 65",{"entities":[(1,22,"Numeric")]}),
(" aged 18 to 65",{"entities":[(1,14,"Numeric")]}),
(" Aged 18 to 65,",{"entities":[(1,14,"Numeric")]}),
(" AGE<=65",{"entities":[(1,8,"Numeric")]}),
(" age<=65",{"entities":[(1,8,"Numeric")]}),
(" age<65",{"entities":[(1,7,"Numeric")]}),
(" Age<=65",{"entities":[(1,8,"Numeric")]}),
(" Age<65",{"entities":[(1,7,"Numeric")]}),
(" aged between 5-65",{"entities":[(1,18,"Numeric")]}),
(" age between 5-65",{"entities":[(1,17,"Numeric")]}),
(" aged between 5-64 years",{"entities":[(1,24,"Numeric")]}),
(" Aged between 5-64 years,",{"entities":[(1,24,"Numeric")]}),
(" age > 18 < 65 years",{"entities":[(1,20,"Numeric")]}),
(" age>18<65years",{"entities":[(1,15,"Numeric")]}),
(" age>18<65",{"entities":[(1,10,"Numeric")]}),
(" age < = 65 years",{"entities":[(1,17,"Numeric")]}),
(" age < = 18 years",{"entities":[(1,17,"Numeric")]}),
(" age > = 18 years old",{"entities":[(1,21,"Numeric")]}),
(" age > = 18 years",{"entities":[(1,17,"Numeric")]}),
(" age > = 18",{"entities":[(1,11,"Numeric")]}),
(" AGE>=18",{"entities":[(1,8,"Numeric")]}),
(" age >= 18",{"entities":[(1,10,"Numeric")]}),
(" age>=18",{"entities":[(1,8,"Numeric")]}),
(" Age 18,",{"entities":[(2,8,"Numeric")]}),
(" age from 18-65 years",{"entities":[(1,21,"Numeric")]}),
(" age from 18-65",{"entities":[(1,15,"Numeric")]}),
(" aged from 18 to 65",{"entities":[(1,19,"Numeric")]}),
("AGED from 18 to 65",{"entities":[(0,18,"Numeric")]}),
(" Aged 18 years or older, ",{"entities":[(1,23,"Numeric")]}),
(" Aged between 5 - 14 years,",{"entities":[(1,26,"Numeric")]}),
(" aged from 18 to 65 years",{"entities":[(1,25,"Numeric")]}),
(" ages from 18 to 65 years",{"entities":[(1,25,"Numeric")]}),
(" ages from 18 to 65",{"entities":[(1,19,"Numeric")]}),
(" ages 18-65",{"entities":[(1,11,"Numeric")]}),
(" age 50-65",{"entities":[(1,10,"Numeric")]}),
("age 10-15",{"entities":[(0,9,"Numeric")]}),
(" ages 18 or younger",{"entities":[(1,19,"Numeric")]}),
(" Aged 18 years or younger, ",{"entities":[(1,25,"Numeric")]}),
("between ages 18-85 years of age",{"entities":[(0,31,"Numeric")]}),
("Male and females between ages 18-85 years of age",{"entities":[(17,47,"Numeric")]}),   
("type 2 diabetic, age 18 and over, informed consent, ",{"entities":[(0,15,"Nominal"),(17,32,"Numeric"),(34,50,"Sentence/Sentiment")]})  
]


### Convert the annotated data into a spaCy DocBin object

In [62]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Turn every (text, {"entities": [(start, end, label), ...]}) pair in trainData
# into a spaCy Doc with entity spans, collect the Docs in a DocBin and write it
# to ./train.spacy.
nlp = spacy.blank('en') # load a new spacy model
db = DocBin() # create a DocBin object
for text, annot in tqdm(trainData): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot['entities']: # add character indexes
        # char_span returns None when the character offsets cannot be mapped
        # onto tokens under the chosen alignment mode.
        span = doc.char_span(start, end, label=label, alignment_mode='contract')
        if span is None:
            # Say which annotation was dropped so its offsets can be corrected.
            print('Skipping entity', (start, end, label), repr(text[start:end]))
        else:
            ents.append(span)
    try:
        doc.ents = ents # label the text with the ents
    except ValueError as err:
        # Only the expected "invalid entity spans" failure is handled here;
        # anything else should surface instead of being silently swallowed.
        # The offending example is reported and left out of the DocBin.
        print(text, annot, err)
        continue
    db.add(doc)
db.to_disk('./train.spacy') # save the docbin object

100%|██████████| 213/213 [00:00<00:00, 1070.19it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity





### Generate the config file to train via Command line

In [63]:
!python -m spacy init fill-config /home/sobha/Orion-CustomNER/base_config.cfg /home/sobha/Orion-CustomNER/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/home/sobha/Orion-CustomNER/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Training the model using the command line

In [29]:
# In the base_config.cfg file, the paths were edited as shown below:
#train = ./train.spacy
#dev = ./dev.spacy

### Train the model and save its output to the folder passed as a command-line argument

In [65]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
!python -m spacy train /home/sobha/Orion-CustomNER/config.cfg --paths.train /home/sobha/Orion-CustomNER/train.spacy --paths.dev /home/sobha/Orion-CustomNER/dev.spacy --output /home/sobha/Orion-CustomNER/output

[38;5;4mℹ Saving to output directory: /home/sobha/Orion-CustomNER/output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-01-04 17:48:53,593] [INFO] Set up nlp object from config
[2022-01-04 17:48:53,608] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-01-04 17:48:53,615] [INFO] Created vocabulary
[2022-01-04 17:48:53,616] [INFO] Finished initializing nlp object
[2022-01-04 17:48:54,107] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     45.56    0.00    0.00    0.00    0.00
  6     200       1429.26   4104.22   36.04   43.48   30.77    0.36
 13     400       2694.80   3207.53   80.31   82.26   78.46    0.80
 23     600       2821.73   1914.56   91.34   93.55   89.23    0.91
 35     800       1625.94    7

In [31]:
# !spacy train /home/sobha/Orion-CustomNER/config.cfg --output /home/sobha/Orion-CustomNER/output --paths.train /home/sobha/Orion-CustomNER/train.spacy --paths.dev /home/sobha/Orion-CustomNER/train.spacy

### Load & Test the model

#### Load the model.

In [66]:
import spacy

# Load the `model-last` pipeline from the training output folder (path is
# relative to the notebook's working directory).
nlp = spacy.load('output/model-last') #load the model-last

#### Test the model's predictions on unseen data.

In [67]:
# Run the loaded pipeline `nlp` on a multi-criterion sentence and show the
# predicted entities inline.
from spacy import displacy

sentence = (
    'Aged 20 or older, myocardial ischemia, '
    'able to undergo PTCA, stenting and CABG'
)
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)


In [68]:

# Load the `model-best` pipeline from the training output folder (path is
# relative to the notebook's working directory).
nlp2 = spacy.load('output/model-best') #load the model-best

In [69]:
# Same sentence as the model-last check, but scored with `nlp2` (model-best) so
# the two checkpoints can be compared. The original cell called `nlp`, which
# re-ran model-last and left `nlp2` unused.
sentence = 'Aged 20 or older, myocardial ischemia, able to undergo PTCA, stenting and CABG'

doc = nlp2(sentence)

from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

In [70]:
# Score a run-on criterion string (no punctuation between criteria) with `nlp`
# and display the predicted entities.
from spacy import displacy

sentence2 = (
    'Type 1 Myocardial infarction '
    'Have at least two coronary artery territories of disease > 50% '
    'Be on treatment for Diabetes'
)
doc2 = nlp(sentence2)
displacy.render(doc2, style='ent', jupyter=True)

In [71]:
# Score an age criterion followed by diagnosis/eligibility text with `nlp`
# and display the predicted entities. The trailing space is part of the input.
from spacy import displacy

sentence = (
    'Age >= 18 '
    'Newly diagnosed multiple myeloma '
    'Ineligible for autologous stem cell transplant '
)
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)

In [72]:
# Score a compact age expression (no spaces around the operator) with `nlp`
# and display the predicted entities.
from spacy import displacy

sentence = 'Age>=18'
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)

In [74]:
# Score a spelled-out upper age bound with `nlp` and display the predicted
# entities.
from spacy import displacy

sentence = 'age less than 18'
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)

In [50]:
# Score a spelled-out lower age bound with `nlp` and display the predicted
# entities.
from spacy import displacy

sentence = 'age greater than 18'
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)

In [51]:
# Score a non-age numeric criterion with `nlp` and display the predicted
# entities.
from spacy import displacy

sentence = 'Life expectancy greater than or equal to 6 months'
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)

In [54]:
# Score several cancer-type criteria concatenated into one string with `nlp`
# and display the predicted entities. Note there is no space between "(CX)"
# and "Breast" in the input.
from spacy import displacy

sentence = (
    'Metastatic breast cancer (BR) '
    'Metastatic ovarian cancer (OV) '
    'Metastatic endometrial cancer (EM) '
    'Metastatic cervical cancer (CX)'
    'Breast - Karnofsky score > 50'
)
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)



In [75]:
# Score a comma-separated mix of diagnosis, age and consent criteria with
# `nlp` and display the predicted entities.
from spacy import displacy

sentence = 'type 2 diabetic, age 18 and over, informed consent,'
doc = nlp(sentence)
displacy.render(doc, style='ent', jupyter=True)