In [1]:
import spacy
from spacy import displacy
from spacy.tokens import Span
import pandas as pd

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
# Import the Matcher library
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [4]:
myfile = 'D:\\Users\\Prem2282@gmail.com\\Projects\\Python\\clinical data\\eligibilities.txt'

In [5]:
df = pd.read_csv(myfile, sep='|')

In [6]:
df.head()

Unnamed: 0,id,nct_id,sampling_method,gender,minimum_age,maximum_age,healthy_volunteers,population,criteria,gender_description,gender_based
0,4287621,NCT04464980,,All,18 Years,,No,,~ Inclusion Criteria for Retention Phase...,,
1,4287622,NCT04464967,,All,18 Years,75 Years,No,,~ Inclusion Criteria:~ - Capab...,,
2,4295848,NCT04357873,,All,18 Years,,No,,~ Inclusion Criteria:~ 1. Aged ...,,
3,4293842,NCT04384016,,All,12 Months,12 Years,Accepts Healthy Volunteers,,~ Inclusion Criteria:~ - Child...,,
4,4299025,NCT04316442,,All,18 Years,,No,,~ Inclusion Criteria:~ 1. Confi...,,


In [7]:
df[['nct_id','criteria']].head()

Unnamed: 0,nct_id,criteria
0,NCT04464980,~ Inclusion Criteria for Retention Phase...
1,NCT04464967,~ Inclusion Criteria:~ - Capab...
2,NCT04357873,~ Inclusion Criteria:~ 1. Aged ...
3,NCT04384016,~ Inclusion Criteria:~ - Child...
4,NCT04316442,~ Inclusion Criteria:~ 1. Confi...


In [8]:
df.shape

(349610, 11)

In [21]:
cancer_related = [ 'cancer' in str(item) for item in df['criteria']]

In [49]:
df_sample = df[cancer_related]

In [50]:
df_sample.shape

(68891, 11)

In [51]:
df_sample = df_sample[:10000]

In [52]:
# df_sample = df_sample.dropna()
df_sample.shape

(10000, 11)

In [53]:
df_sample.columns

Index(['id', 'nct_id', 'sampling_method', 'gender', 'minimum_age',
       'maximum_age', 'healthy_volunteers', 'population', 'criteria',
       'gender_description', 'gender_based'],
      dtype='object')

In [54]:
criteria = df_sample[['criteria','nct_id']]

In [55]:
criteria = criteria.dropna()
criteria.shape

(10000, 2)

In [56]:
def extract_criteria_text(text):
    text =text.replace('~',' ')
    text =text.replace('  ',' ')
    inc_ind = text.lower().find('inclusion criteria')
    exc_ind = text.lower().find('exclusion criteria')
    inc_cri = text[inc_ind:exc_ind]
    exc_cri = text[exc_ind:-1]
    return([inc_cri,exc_cri])
    

In [57]:
def extract_criteria(sample_criteria):

#     if(type(sample_criteria)==float):
#         print(sample_criteria)
        
    
    sample_criteria =sample_criteria.replace('~',' ')
    sample_criteria =sample_criteria.replace('  ',' ')
    doc = nlp.make_doc(sample_criteria)
    

    # pattern1 = [{"LIKE_NUM": True}, {"TEXT": "."}]
    pattern2 = [{"LOWER":"inclusion"},{"LOWER":"criteria"}]
    pattern3 = [{"LOWER":"exclusion"},{"LOWER":"criteria"}]
    # matcher.add('startWord', None, pattern1)
    # matcher.remove('startWord')
    matcher.add('topicWord', None, pattern2,pattern3)
    
    matches = matcher(doc)
#     for matchid,start,end in matches:
#         print(matchid,start,end)
    inc_start = 0
    inc_end = 0
    exc_start = 0
    exc_end = 0

    if len(matches) == 2:

        match_id,start,end = matches[0]
        inc_start = end + 1

        match_id,start,end = matches[1]
        inc_end = start - 1
        exc_start = end + 1
        exc_end = len(doc)

#     print(inc_start, inc_end, exc_start, exc_end)
    inclusion_criteria = Span(doc, inc_start, inc_end)
    exclusion_criteria = Span(doc, exc_start, exc_end)
#     print(inclusion_criteria.text)
#     print(exclusion_criteria.text)
    return([inclusion_criteria,exclusion_criteria])

In [58]:
#sample_criteria = criteria[5]

In [59]:
#extracted_criteria = extract_criteria(sample_criteria)


extracted_criterias = list(map(extract_criteria,criteria['criteria']))

In [60]:
extracted_criterias = list(map(extract_criteria_text,criteria['criteria']))

In [61]:
type(extracted_criterias)
len(extracted_criterias)

10000

In [62]:
criteria_df = pd.DataFrame(extracted_criterias,columns=['inclusion','exclusion'])

In [63]:
print(criteria_df.shape)
print(criteria.shape)

(10000, 2)
(10000, 2)


In [64]:
new_df = pd.concat([criteria['nct_id'] , criteria_df],axis = 1)

In [65]:
new_df.shape

(18400, 3)

In [66]:
new_df.to_csv('extracted_criteria.csv',sep = '|')

In [71]:
new_df.nct_id == 'NaN'

0        False
1        False
2        False
3        False
4        False
         ...  
56765    False
56784    False
56787    False
56788    False
56789    False
Name: nct_id, Length: 18400, dtype: bool

In [72]:
clean_df = new_df.dropna()

In [73]:
clean_df.shape

(1600, 3)

In [74]:
clean_df.head()

Unnamed: 0,nct_id,inclusion,exclusion
1,NCT04464967,Inclusion Criteria: 1. Aged ≥18 years old...,Exclusion Criteria: 1. Prior treatment wi...
2,NCT04357873,Inclusion Criteria: 1. Confirmed diagnosi...,Exclusion Criteria: 1. Isolated vascular ...
4,NCT04316442,Inclusion Criteria: - Signed informed con...,Exclusion Criteria: - Has a known hyperse...
5,NCT04240054,Inclusion Criteria: 1. Patients with acut...,Exclusion Criteria: 1. Patients with othe...
7,NCT04217798,Inclusion Criteria: 1. Histological diagn...,Exclusion Criteria: 1. Prior chemotherapy...


In [75]:
clean_df.to_csv('extracted_criteria_with_nct.csv',sep = '|')