In [1]:
import spacy
from spacy import displacy
from spacy.tokens import Span
import pandas as pd
from pysbd.utils import PySBDFactory

In [2]:
# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is the shared pipeline object
# that later cells extend (add_pipe) and run over the criteria text (nlp.pipe).
nlp = spacy.load('en_core_web_sm')

In [3]:
# pySBD sentence-boundary component, see
# https://spacy.io/universe/project/python-sentence-boundary-disambiguation
# It is registered at the very front of the pipeline (first=True), so it runs
# before every other component of `nlp`.
sentence_boundary_component = PySBDFactory(nlp)
nlp.add_pipe(sentence_boundary_component, first=True)

In [4]:
# Import spaCy's rule-based Matcher and build one that shares the pipeline's
# vocabulary (nlp.vocab).
# NOTE(review): `matcher` is not referenced by any of the cells visible in this
# notebook - confirm whether it is still needed.
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [5]:
# Open the input once so that a missing or unreadable file fails right here,
# then close the handle immediately. No cell shown in this notebook reads from
# `myfile` (pd.read_csv is given the path, not the handle), so leaving it open
# only leaked a file descriptor for the lifetime of the kernel.
# `myfile` stays defined, as a closed file object.
with open('extracted_criteria.csv') as myfile:
    pass

In [6]:
# Load the pipe-delimited criteria extract into a DataFrame.
df = pd.read_csv('extracted_criteria.csv', delimiter='|')

In [7]:
# Force both criteria columns to plain Python strings: some cells hold floats,
# which would break the NLP step that follows. A float NaN becomes the literal
# string 'nan'.
for criteria_column in ('inclusion', 'exclusion'):
    df[criteria_column] = [str(value) for value in df[criteria_column]]

In [8]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space and trim both ends

In [9]:
def _squeeze_whitespace(text):
    """Return `text` with each whitespace run replaced by one space and both ends trimmed."""
    return ' '.join(text.split())


# Apply the same whitespace normalisation to both criteria columns.
df['inclusion'] = [_squeeze_whitespace(text) for text in df['inclusion']]
df['exclusion'] = [_squeeze_whitespace(text) for text in df['exclusion']]

In [10]:
# Create the NLP documents. NER is disabled because it is not needed; the parser is kept because it is required for sentence segmentation.

In [11]:
# Run the pipeline (with the 'ner' component disabled) over each criteria
# column and store the resulting documents next to the source text.
for text_column, doc_column in (('inclusion', 'inclusion_doc'),
                                ('exclusion', 'exclusion_doc')):
    df[doc_column] = list(nlp.pipe(df[text_column], disable=['ner']))

In [12]:
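# Clean up by dropping rows that still contain missing values. The inclusion/exclusion columns were converted to strings earlier (NaN became 'nan'), so this only removes rows with NaN in the other columns.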

In [13]:
# Discard every row that has a missing value in any column
# (the pandas defaults, written out explicitly).
df = df.dropna(axis=0, how='any')

In [14]:
# Display (rows, columns) of the frame as it stands after the dropna step.
df.shape

(10000, 6)

In [15]:
# For each document:
#     for each sentence in the document:
#         keep only the text of the sentence.
# The output is one list of sentence strings per document
# (1 doc = 1 list, and that list contains all of the document's sentences).

In [16]:
# Replace each criteria text with the list of its sentence strings, taken from
# the parsed document stored in the matching *_doc column (one list per row).
# The column is iterated directly; the extra inner copy of the whole column
# that the previous version built first was redundant.
df['inclusion'] = [[sent.text for sent in doc.sents] for doc in df['inclusion_doc']]
df['exclusion'] = [[sent.text for sent in doc.sents] for doc in df['exclusion_doc']]

In [17]:
# Take only nct_id and the sentence list into a new dataframe,
# then create one row per sentence, repeating the same nct_id for every sentence that came from that row.

In [18]:
# Build one long frame per criteria type: select the sentence-list column
# together with nct_id, then expand each list so every sentence gets its own
# row (the original row index and nct_id are repeated for each sentence).
inclusion_subset = df[['inclusion', 'nct_id']]
exclusion_subset = df[['exclusion', 'nct_id']]
inc_criteria_df = inclusion_subset.explode(column='inclusion')
exc_criteria_df = exclusion_subset.explode(column='exclusion')

In [19]:
# Remove the placeholder rows whose sentence is the literal string 'nan'
# (what a missing criteria text turned into when it was converted with str()).
inc_criteria_df = inc_criteria_df[~inc_criteria_df['inclusion'].eq('nan')]
exc_criteria_df = exc_criteria_df[~exc_criteria_df['exclusion'].eq('nan')]

In [20]:
# Report (rows, columns) for the inclusion frame first, then the exclusion frame.
for criteria_frame in (inc_criteria_df, exc_criteria_df):
    print(criteria_frame.shape)


(10740, 2)
(14693, 2)


In [21]:
# Write the inclusion sentences to a pipe-delimited file. The row index
# (repeated once per sentence after explode) is written as the first column.
inc_criteria_df.to_csv(path_or_buf='inclusion_criteria_2.csv', sep='|')

In [22]:
# Write the exclusion sentences to a pipe-delimited file. The row index
# (repeated once per sentence after explode) is written as the first column.
exc_criteria_df.to_csv(path_or_buf='exclusion_criteria_2.csv', sep='|')

In [23]:
# Preview the first five inclusion sentences with their nct_id.
inc_criteria_df.head(n=5)

Unnamed: 0,inclusion,nct_id
1,Inclusion Criteria:,NCT04464967
1,1. Aged ≥18 years old.,NCT04464967
1,2. Patients with Eastern Cooperative Oncology ...,NCT04464967
1,3. Patients must have histologically confirmed...,NCT04464967
1,4. Patients must have radiologically confirmed...,NCT04464967


In [24]:
# Preview the first five exclusion sentences with their nct_id.
exc_criteria_df.head(n=5)

Unnamed: 0,exclusion,nct_id
1,Exclusion Criteria: 1. Prior treatment with an...,NCT04464967
1,2. Patients with central nervous system involv...,NCT04464967
1,3. Patients with no other site for biopsy than...,NCT04464967
1,4. Patients with other concurrent severe and/o...,NCT04464967
1,5. Known history of human immunodeficiency vir...,NCT04464967


In [25]:
# Print the per-row boolean mask "sentence is not the literal string 'nan'"
# for the inclusion frame.
print(inc_criteria_df['inclusion'].ne('nan'))

1       True
1       True
1       True
1       True
1       True
        ... 
9998    True
9998    True
9998    True
9998    True
9998    True
Name: inclusion, Length: 10740, dtype: bool
