In [29]:
def token_attributes(doc):
    """Flag every token whose text mentions "therapy" or "treatment".

    Sets ``token._.has_therapy`` and ``token._.has_treatment`` on each token.
    The substring test is case-insensitive, so capitalised forms such as
    "Therapy" or "CHEMOTHERAPY" are flagged as well (the noun-chunk helper
    getTherapyList lower-cases its text the same way).

    Args:
        doc: iterable of tokens exposing ``.text`` and the ``._`` extension
            namespace (presumably a spaCy ``Doc`` whose token extensions have
            already been registered -- confirm against the setup cell).

    Returns:
        The same ``doc`` object that was passed in.
    """
    for token in doc:
        lowered = token.text.lower()
        token._.has_therapy = 'therapy' in lowered
        token._.has_treatment = 'treatment' in lowered
    return doc

In [30]:
def doc_attributes(doc):
    """Aggregate token-level flags into document-level flags.

    Sets three attributes on ``doc._``:

    * ``has_therapy``   -- True if any token has ``token._.has_therapy``.
    * ``has_treatment`` -- True if any token has ``token._.has_treatment``.
    * ``has_diagnosis`` -- True if any token text contains "diagnos"
      (case-insensitive, so "Diagnosed" at the start of a sentence counts).

    Args:
        doc: iterable of tokens that also exposes a ``._`` namespace. The
            token flags must already be set (token_attributes does that).

    Returns:
        The same ``doc`` object that was passed in.
    """
    # Generators let any() stop at the first hit instead of building a list.
    doc._.has_therapy = any(token._.has_therapy for token in doc)
    doc._.has_treatment = any(token._.has_treatment for token in doc)
    doc._.has_diagnosis = any('diagnos' in token.text.lower() for token in doc)
    return doc

In [31]:
def getDiseaseList(doc):
    """Collect the text of every DISEASE entity and store it on the document.

    The result is written to ``doc._.diseaseList``. When no DISEASE entity is
    present the attribute is set to the empty string ``''`` rather than an
    empty list.

    Args:
        doc: object exposing ``.ents`` (entities with ``.text`` and
            ``.label_``) and the ``._`` extension namespace.

    Returns:
        The same ``doc`` object that was passed in.
    """
    disease_texts = []
    for ent in doc.ents:
        if ent.label_ == 'DISEASE':
            disease_texts.append(ent.text)

    # An empty result is stored as '' (not []).
    doc._.diseaseList = disease_texts if disease_texts else ''
    return doc

In [32]:
def getTherapyList(doc):
    """Store the noun chunks that mention "therapy" on the document.

    Every item of ``doc.noun_chunks`` whose lower-cased text contains
    "therapy" is kept. The chunk objects themselves (not their text) are
    stored in ``doc._.therapyList``.

    Args:
        doc: object exposing ``.noun_chunks`` and the ``._`` namespace.

    Returns:
        The same ``doc`` object that was passed in.
    """
    matching_chunks = []
    for chunk in doc.noun_chunks:
        if 'therapy' in chunk.text.lower():
            matching_chunks.append(chunk)
    doc._.therapyList = matching_chunks
    return doc

In [33]:
def getEthnicity(doc):
    """Record the text of the document's NORP entity as its ethnicity.

    If several entities are labelled "NORP", the last one wins. If there is
    none, ``doc._.ethnicity`` is left untouched.

    Args:
        doc: object exposing ``.ents`` and the ``._`` extension namespace.

    Returns:
        The same ``doc`` object that was passed in.
    """
    norp_texts = [ent.text for ent in doc.ents if ent.label_ == "NORP"]
    if norp_texts:
        # Matches the effect of assigning inside a loop: last NORP entity wins.
        doc._.ethnicity = norp_texts[-1]
    return doc

In [34]:
def getDate(doc):
    """Collect DATE/AGE entities, prefixed by a preceding comparator if any.

    For each entity labelled "DATE" or "AGE": when the entity immediately
    before it in ``doc.ents`` is labelled "ABOVE" or "BELOW", the stored value
    is ``"<comparator ent_id_> <entity text>"``; otherwise it is just the
    entity text. The resulting list is written to ``doc._.date`` only when at
    least one value was found.

    The first entity has no predecessor. Indexing ``doc.ents[ind - 1]`` with
    ``ind == 0`` would wrap around to the *last* entity and could attach an
    unrelated trailing comparator, so the predecessor is tracked explicitly.

    Args:
        doc: object exposing ``.ents`` (entities with ``.text``, ``.label_``
            and ``.ent_id_``) and the ``._`` extension namespace.

    Returns:
        The same ``doc`` object that was passed in.
    """
    comparator_labels = ('ABOVE', 'BELOW')
    date_list = []
    previous = None  # entity directly before the current one, if any
    for ent in doc.ents:
        if ent.label_ in ('DATE', 'AGE'):
            if previous is not None and previous.label_ in comparator_labels:
                date_list.append(previous.ent_id_ + " " + ent.text)
            else:
                date_list.append(ent.text)
        previous = ent

    if date_list:
        doc._.date = date_list
    return doc

In [35]:
def symbol_cleanup(col):
    """Pad comparison symbols with spaces and normalise whitespace.

    Each of the symbols ``≥ ≤ < > = ⩾`` is surrounded by single spaces, a
    space is inserted before every occurrence of "year" (so "18years" becomes
    "18 years"), and runs of whitespace are collapsed. Two-character operators
    therefore come out as two separate symbols, e.g. ">=" becomes "> =".

    Args:
        col: a pandas Series of strings (a DataFrame column). Non-string
            values such as NaN are not handled and will raise.

    Returns:
        A plain ``list`` of cleaned strings (not a Series), in input order.
    """
    # regex=False pins literal matching; the default of `regex` differs
    # between pandas versions, so it is not relied upon.
    for symbol in ('≥', '≤', '<', '>', '=', '⩾'):
        col = col.str.replace(symbol, " " + symbol + " ", regex=False)

    # Earlier revisions then tried to re-join '> =', '= >', '< =' and '= <'.
    # After the padding above two adjacent symbols are always separated by at
    # least two spaces, so those single-space replacements never matched and
    # have been dropped; the output is unchanged.

    col = col.str.replace('year', " year", regex=False)
    return [' '.join(text.split()) for text in col]

In [36]:
# Single-token GENDER rules: each surface form (matched case-insensitively via
# LOWER) is mapped to a canonical id of "male" or "female".
# The label/pattern/id dict shape presumably targets spaCy's EntityRuler -- confirm.
_GENDER_TERMS = [
    ("male", "male"),
    ("men", "male"),
    ("males", "male"),
    ("man", "male"),
    ("boy", "male"),
    ("guy", "male"),
    ("female", "female"),
    ("women", "female"),
    ("females", "female"),
    ("girl", "female"),
    ("girls", "female"),
    ("woman", "female"),
]

genderPatterns = [
    {"label": "GENDER", "pattern": [{"LOWER": term}], "id": gender}
    for term, gender in _GENDER_TERMS
]

In [37]:
# Surface-form (LOWER) based age/date rules. The "Old" suffix suggests this
# list was superseded by the lemma-based agePatterns -- confirm before reuse.
# Fix: "day"/"days" previously carried id "week" (copy-paste from the rows
# above); they now carry id "day".
agePatternsOld = [
    # "aged [between] <digits> - <digits>"
    {"label": "RANGE", "pattern": [{"LOWER": "aged"}, {"LOWER": "between", "OP": "?"}, {"IS_DIGIT": True}, {"TEXT": "-"}, {"IS_DIGIT": True}], "id": "between"},
    # age keywords
    {"label": "AGE_LABEL", "pattern": [{"POS": "ADJ", "OP": "?"}, {"LOWER": "age"}], "id": "age"},
    {"label": "AGE_LABEL", "pattern": [{"LOWER": "aged"}], "id": "age"},
    {"label": "AGE_LABEL", "pattern": [{"LOWER": "ages"}], "id": "age"},
    # time units; the id is the singular unit name
    {"label": "DATE_LABEL", "pattern": [{"LOWER": "years"}], "id": "year"},
    {"label": "DATE_LABEL", "pattern": [{"LOWER": "year"}], "id": "year"},
    {"label": "DATE_LABEL", "pattern": [{"LOWER": "months"}], "id": "month"},
    {"label": "DATE_LABEL", "pattern": [{"LOWER": "month"}], "id": "month"},
    {"label": "DATE_LABEL", "pattern": [{"LOWER": "weeks"}], "id": "week"},
    {"label": "DATE_LABEL", "pattern": [{"LOWER": "week"}], "id": "week"},
    {"label": "DATE_LABEL", "pattern": [{"LOWER": "days"}], "id": "day"},
    {"label": "DATE_LABEL", "pattern": [{"LOWER": "day"}], "id": "day"},
    # "old" / "older"
    {"label": "AGE_LABEL", "pattern": [{"LOWER": "old"}], "id": "old"},
    {"label": "AGE_LABEL", "pattern": [{"LOWER": "older"}], "id": "old"},
]

In [38]:
# Lemma-based age/date rules: AGE_LABEL marks age keywords, DATE_LABEL marks
# time units, and the id is the normalised keyword or unit.
# Fix: the "day" lemma previously carried id "week" (copy-paste from the row
# above); it now carries id "day".
agePatterns = [
    {"label": "AGE_LABEL", "pattern": [{"LEMMA": "age"}], "id": "age"},
    {"label": "AGE_LABEL", "pattern": [{"LEMMA": "aged"}], "id": "age"},
    {"label": "DATE_LABEL", "pattern": [{"LEMMA": "year"}], "id": "year"},
    {"label": "DATE_LABEL", "pattern": [{"LEMMA": "month"}], "id": "month"},
    {"label": "DATE_LABEL", "pattern": [{"LEMMA": "week"}], "id": "week"},
    {"label": "DATE_LABEL", "pattern": [{"LEMMA": "day"}], "id": "day"},
    {"label": "AGE_LABEL", "pattern": [{"LEMMA": "old"}], "id": "old"},
]

In [39]:
# Comparator rules. Labels: ABOVE / BELOW / EQUAL. The id encodes strictness:
#   G = greater than, GE = greater or equal, L = less than, LE = less or
#   equal, EQ = equal.
# getDate prefixes a following DATE/AGE entity with the id of an ABOVE/BELOW
# entity, so the id must reflect whether the bound is inclusive.
# Changes from the earlier list:
#   * "at least" / "atleast" (inclusive lower bound)  G -> GE
#   * "no more than" (inclusive upper bound)          L -> LE
#   * "or above" / "or below" now GE / LE, matching "or over" / "or less"
#   * "equal to" now uses id EQ, matching "="
#   * added "= <" (mirror of "= >") and "less than" (sibling of "lesser than")
comparePatterns = [
    # --- symbols ---------------------------------------------------------
    {"label": "ABOVE", "pattern": [{"TEXT": "≥"}], "id": "GE"},
    {"label": "ABOVE", "pattern": [{"TEXT": "⩾"}], "id": "GE"},
    {"label": "ABOVE", "pattern": [{"TEXT": ">"}], "id": "G"},
    {"label": "EQUAL", "pattern": [{"TEXT": "="}], "id": "EQ"},
    # Two-token operators: symbol_cleanup separates ">=" into ">" and "=".
    {"label": "ABOVE", "pattern": [{"TEXT": ">"}, {"TEXT": "="}], "id": "GE"},
    {"label": "ABOVE", "pattern": [{"TEXT": "="}, {"TEXT": ">"}], "id": "GE"},
    {"label": "BELOW", "pattern": [{"TEXT": "<"}, {"TEXT": "="}], "id": "LE"},
    {"label": "BELOW", "pattern": [{"TEXT": "="}, {"TEXT": "<"}], "id": "LE"},
    {"label": "BELOW", "pattern": [{"TEXT": "<"}], "id": "L"},
    {"label": "BELOW", "pattern": [{"TEXT": "≤"}], "id": "LE"},
    # --- single words ----------------------------------------------------
    {"label": "ABOVE", "pattern": [{"LOWER": "over"}], "id": "G"},
    {"label": "ABOVE", "pattern": [{"LOWER": "above"}], "id": "G"},
    {"label": "BELOW", "pattern": [{"LOWER": "under"}], "id": "L"},
    {"label": "BELOW", "pattern": [{"LOWER": "below"}], "id": "L"},
    # --- phrases ---------------------------------------------------------
    {"label": "BELOW", "pattern": [{"LOWER": "inferior"}, {"LOWER": "to"}], "id": "L"},
    {"label": "ABOVE", "pattern": [{"LOWER": "greater"}, {"LOWER": "than"}], "id": "G"},
    {"label": "ABOVE", "pattern": [{"LOWER": "more"}, {"LOWER": "than"}], "id": "G"},
    {"label": "BELOW", "pattern": [{"LOWER": "no"}, {"LOWER": "more"}, {"LOWER": "than"}], "id": "LE"},
    {"label": "ABOVE", "pattern": [{"LOWER": "at"}, {"LOWER": "least"}], "id": "GE"},
    {"label": "ABOVE", "pattern": [{"LOWER": "at"}, {"LOWER": "the", "OP": "?"}, {"LOWER": "least"}], "id": "GE"},
    {"label": "ABOVE", "pattern": [{"LOWER": "at"}, {"TEXT": "-", "OP": "?"}, {"LOWER": "least"}], "id": "GE"},
    {"label": "ABOVE", "pattern": [{"LOWER": "atleast"}], "id": "GE"},
    {"label": "BELOW", "pattern": [{"LOWER": "lesser"}, {"LOWER": "than"}], "id": "L"},
    {"label": "BELOW", "pattern": [{"LOWER": "less"}, {"LOWER": "than"}], "id": "L"},
    # --- trailing "or ..." forms ("18 or over") are inclusive ---------------
    {"label": "ABOVE", "pattern": [{"LOWER": "or"}, {"LOWER": "over"}], "id": "GE"},
    {"label": "ABOVE", "pattern": [{"LOWER": "or"}, {"LOWER": "greater"}], "id": "GE"},
    {"label": "BELOW", "pattern": [{"LOWER": "or"}, {"LOWER": "lesser"}], "id": "LE"},
    {"label": "BELOW", "pattern": [{"LOWER": "or"}, {"LOWER": "less"}], "id": "LE"},
    {"label": "ABOVE", "pattern": [{"LOWER": "or"}, {"LOWER": "above"}], "id": "GE"},
    {"label": "BELOW", "pattern": [{"LOWER": "or"}, {"LOWER": "below"}], "id": "LE"},
    # --- equality ----------------------------------------------------------
    {"label": "EQUAL", "pattern": [{"LOWER": "equal"}, {"LOWER": "to", "OP": "?"}], "id": "EQ"},
]

In [40]:
# RANGE rules for numeric intervals.
rangePatterns = []

# "<number> - <number>" (LIKE_NUM on both ends)
rangePatterns.append({
    "label": "RANGE",
    "pattern": [
        {"LIKE_NUM": True},
        {"TEXT": "-"},
        {"LIKE_NUM": True},
    ],
    "id": "range",
})

# "[from] <digits> to <digits>" (IS_DIGIT on both ends)
rangePatterns.append({
    "label": "RANGE",
    "pattern": [
        {"LOWER": "from", "OP": "?"},
        {"IS_DIGIT": True},
        {"LOWER": "to"},
        {"IS_DIGIT": True},
    ],
    "id": "range",
})

In [41]:
# Number rules: any NUM-tagged token is a VALUE; a number-like token at the
# start of a sentence followed by punctuation is labelled "S.NO".
# Both rules share the id "value".
numberPattern = [
    {
        "label": "VALUE",
        "pattern": [{"POS": "NUM"}],
        "id": "value",
    },
    {
        "label": "S.NO",
        "pattern": [
            {"IS_SENT_START": True, "LIKE_NUM": True},
            {"IS_PUNCT": True},
        ],
        "id": "value",
    },
]

In [42]:
# Lemma rules for occurrence / recurrence / progression / persistence words.
# The label string "OCCURANCE" (sic) is kept unchanged because other code may
# compare against it. The misspelled lemma "occurance" is kept so misspelled
# input still matches, and the correctly spelled "occurrence" is added --
# previously the correct spelling was never matched.
_OCCURRENCE_LEMMAS = (
    "occur",
    "recur",
    "progress",
    "recurrence",
    "recurrent",
    "occurance",
    "occurrence",
    "progression",
    "persistent",
    "persistence",
)

occurancePattern = [
    {"label": "OCCURANCE", "pattern": [{"LEMMA": lemma}]}
    for lemma in _OCCURRENCE_LEMMAS
]

In [43]:
# Performance-status scale rules. For every rule the label and the id are the
# same scale name (ECOG, KPS or WHO).
_PERFORMANCE_SCALE_RULES = [
    ("ECOG", [{"LOWER": "ecog"}]),
    # "[eastern] cooperative oncology [group]"
    ("ECOG", [{"LOWER": "eastern", "OP": "?"}, {"LOWER": "cooperative"}, {"LOWER": "oncology"}, {"LOWER": "group", "OP": "?"}]),
    ("KPS", [{"LOWER": "kps"}]),
    ("KPS", [{"LOWER": "karnofsky"}]),
    # Exact-case TEXT match: only the upper-case acronym, not the word "who".
    ("WHO", [{"TEXT": "WHO"}]),
    # "world health organization [score]"
    ("WHO", [{"LOWER": "world"}, {"LOWER": "health"}, {"LOWER": "organization"}, {"LOWER": "score", "OP": "?"}]),
]

ecogPatterns = [
    {"label": scale, "pattern": tokens, "id": scale}
    for scale, tokens in _PERFORMANCE_SCALE_RULES
]

In [44]:
# Life-expectancy phrases. Each phrase becomes one rule that matches its words
# case-insensitively, in order, with label and id "LIFE EXP".
_LIFE_EXPECTANCY_PHRASES = (
    "life expectancy",
    "expected survival",
    "estimated survival",
    "expected to survive",
)

lifePatterns = [
    {
        "label": "LIFE EXP",
        "pattern": [{"LOWER": word} for word in phrase.split()],
        "id": "LIFE EXP",
    }
    for phrase in _LIFE_EXPECTANCY_PHRASES
]

In [45]:
# MEASURE rules.
# 1) An optional adjective followed by one of the measurement lemmas.
_MEASURE_LEMMAS = ("count", "grade", "score", "stage", "level")

measurePatterns = [
    {
        "label": "MEASURE",
        "pattern": [{"POS": "ADJ", "OP": "?"}, {"LEMMA": lemma}],
        "id": "MEASURE",
    }
    for lemma in _MEASURE_LEMMAS
]

# 2) Fixed expressions; "PS" is matched with exact case via TEXT.
measurePatterns += [
    {"label": "MEASURE", "pattern": [{"LOWER": "performance"}, {"LOWER": "score"}], "id": "MEASURE"},
    {"label": "MEASURE", "pattern": [{"LOWER": "performance"}, {"LOWER": "status"}], "id": "MEASURE"},
    {"label": "MEASURE", "pattern": [{"TEXT": "PS"}], "id": "MEASURE"},
]

In [46]:
# SEVERITY rules (these carry no "id" key).
# Single-lemma rules first ...
severityPatterns = [
    {"label": "SEVERITY", "pattern": [{"LEMMA": lemma}]}
    for lemma in ("mild", "moderate", "severe", "advance")
]

# ... then the lemma "advanced" together with any surrounding modifiers:
# adjectives and adverbs before it, conjunctions and adjectives after it.
severityPatterns.append({
    "label": "SEVERITY",
    "pattern": [
        {"POS": "ADJ", "OP": "*"},
        {"POS": "ADV", "OP": "*"},
        {"LEMMA": "advanced"},
        {"POS": "CCONJ", "OP": "*"},
        {"POS": "ADJ", "OP": "*"},
    ],
})

In [47]:
# Lemmas that signal a diagnosis-related statement.
diagno_words = ['diagnosis','diagnose','symptom','symptomatic','confirm','prove','proven','suspect','exam','examine','examination']


def _diagnosis_pattern(modifier_pos, lemma):
    """Build one DIAGNOSIS rule for ``lemma`` with optional leading modifiers.

    The token sequence is: any number of ``modifier_pos`` tokens, any number
    of coordinating conjunctions, any number of ``modifier_pos`` tokens, then
    the lemma itself (e.g. "clinically and histologically" + "confirm").
    """
    return {
        "label": "DIAGNOSIS",
        "pattern": [
            {"POS": modifier_pos, "OP": "*"},
            {"POS": "CCONJ", "OP": "*"},
            {"POS": modifier_pos, "OP": "*"},
            {"LEMMA": lemma},
        ],
    }


# Two rules per lemma: one with adverb modifiers, one with adjective
# modifiers. The adjective variant used to be built but never appended, so
# phrases such as "histological diagnosis" only matched the bare lemma.
diagnosisPatterns = []
for word in diagno_words:
    diagnosisPatterns.append(_diagnosis_pattern("ADV", word))
    diagnosisPatterns.append(_diagnosis_pattern("ADJ", word))

# diagnosisPatterns = [{"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"POS": "ADJ", "OP":"*"},{"LEMMA": "diagnosis"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "diagnosis"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "symptom"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "symptomatic"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "confirm"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "prove"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "proven"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "suspect"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "treat"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "diagnose"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "examine"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"LEMMA": "examination"}], "id":"DIAGNOSIS"},
#                      {"label": "DIAGNOSIS", "pattern": [{"POS": "ADV", "OP":"*"},{"POS": "CCONJ", "OP":"*"},{"POS": "CCONJ", "OP":"?"},{"POS": "ADJ", "OP":"?"},{"LEMMA": "confirmation"}], "id":"DIAGNOSIS"},
#  ]

In [48]:
def _disease_rule(tokens, rule_id):
    """Wrap a token pattern in a rule dict with the DISEASE label."""
    return {"label": "DISEASE", "pattern": tokens, "id": rule_id}


# DISEASE rules. getDiseaseList selects entities by this label.
# NOTE(review): the cancer/metastasis rules carry id "DIAGNOSIS" while the
# generic rules carry id "DISEASE" -- confirm whether that split is intended.
cancerPatterns = [
    # Generic: adjectives followed by disease / disorder / infection.
    _disease_rule([{"POS": "ADJ", "OP": "*"}, {"LEMMA": "disease"}], "DISEASE"),
    _disease_rule([{"POS": "ADJ", "OP": "*"}, {"LEMMA": "disorder"}], "DISEASE"),
    _disease_rule([{"POS": "ADJ", "OP": "*"}, {"LEMMA": "infection"}], "DISEASE"),
    # Coordinated adjectives before "disease".
    _disease_rule([{"POS": "ADJ", "OP": "*"}, {"POS": "CCONJ", "OP": "*"}, {"POS": "ADJ", "OP": "*"}, {"LEMMA": "disease"}], "DISEASE"),
    # Cancer with optional adjective / noun modifiers.
    _disease_rule([{"POS": "ADJ", "OP": "?"}, {"LEMMA": "cancer"}], "DIAGNOSIS"),
    _disease_rule([{"POS": "ADJ", "OP": "?"}, {"POS": "ADJ", "OP": "?"}, {"LEMMA": "cancer"}], "DIAGNOSIS"),
    _disease_rule([{"POS": "ADJ", "OP": "?"}, {"POS": "NOUN", "OP": "*"}, {"LEMMA": "cancer"}], "DIAGNOSIS"),
    _disease_rule([{"POS": "NOUN", "OP": "?"}, {"POS": "ADJ", "OP": "?"}, {"LEMMA": "cancer"}], "DIAGNOSIS"),
    # Metastasis with optional adjective / noun modifiers.
    _disease_rule([{"POS": "ADJ", "OP": "?"}, {"POS": "NOUN", "OP": "*"}, {"LEMMA": "metastasis"}], "DIAGNOSIS"),
]

In [49]:
# THERAPY rules.
therapyPatterns = []

# Adjectives and/or nouns followed by the lemma "therapy".
therapyPatterns.append({
    "label": "THERAPY",
    "pattern": [{"POS": "ADJ", "OP": "*"}, {"POS": "NOUN", "OP": "*"}, {"LEMMA": "therapy"}],
    "id": "THERAPY",
})

# Nouns only, followed by the lemma "therapy".
therapyPatterns.append({
    "label": "THERAPY",
    "pattern": [{"POS": "NOUN", "OP": "*"}, {"LEMMA": "therapy"}],
    "id": "THERAPY",
})

# Any single token whose custom has_therapy extension is True
# (token_attributes sets that flag).
therapyPatterns.append({
    "label": "THERAPY",
    "pattern": [{"_": {"has_therapy": True}}],
    "id": "THERAPY",
})

In [50]:
# TREATMENT rules: an optional adjective before the lemma "receive", and an
# optional noun before the lemma "treatment".
treatmentPatterns = [
    {
        "label": "TREATMENT",
        "pattern": [{"POS": "ADJ", "OP": "?"}, {"LEMMA": "receive"}],
        "id": "TREATMENT",
    },
    {
        "label": "TREATMENT",
        "pattern": [{"POS": "NOUN", "OP": "?"}, {"LEMMA": "treatment"}],
        "id": "TREATMENT",
    },
]

In [51]:
# SUBJECT rules: single-lemma matches for words that refer to a study subject.
_SUBJECT_LEMMAS = ("patient", "subject", "person", "participant")

subjectPatterns = [
    {"label": "SUBJECT", "pattern": [{"LEMMA": lemma}], "id": "SUBJECT"}
    for lemma in _SUBJECT_LEMMAS
]

In [52]:
# METHOD rule: an optional adjective, any number of nouns, then the lemma
# "scan" (e.g. "<adjective> <noun> scan").
methodPatterns = [
    {
        "label": "METHOD",
        "pattern": [
            {"POS": "ADJ", "OP": "?"},
            {"POS": "NOUN", "OP": "*"},
            {"LEMMA": "scan"},
        ],
        "id": "METHOD",
    },
]

In [53]:
# UNIT rules for "upper limit of normal" and its abbreviations ULN / UNL
# (exact-case TEXT match), optionally preceded by a noun or by "x".
# Fix: the spelled-out rule previously carried id "SUBJECT" (copy-paste from
# the subject rules); every rule here now carries id "UNIT".
unitPatterns = [
    {"label": "UNIT", "pattern": [{"POS": "NOUN", "OP": "?"}, {"TEXT": "ULN"}], "id": "UNIT"},
    {"label": "UNIT", "pattern": [{"LOWER": "x", "OP": "?"}, {"TEXT": "ULN"}], "id": "UNIT"},
    {"label": "UNIT", "pattern": [{"POS": "NOUN", "OP": "?"}, {"TEXT": "UNL"}], "id": "UNIT"},
    {"label": "UNIT", "pattern": [{"LOWER": "x", "OP": "?"}, {"TEXT": "UNL"}], "id": "UNIT"},
    # "upper limit [of] normal"
    {"label": "UNIT", "pattern": [{"LEMMA": "upper"}, {"LEMMA": "limit"}, {"LEMMA": "of", "OP": "?"}, {"LEMMA": "normal"}], "id": "UNIT"},
]

In [54]:
# ELIGIBILITY rules: "eligible" / "ineligible", optionally preceded by a
# particle or determiner.
# Fixes:
#   * the attribute key is spelled "LEMMA" like in every other pattern list
#     (it was lower-case "lemma" here only);
#   * the id was "UNIT" (copy-paste from the unit rules) and is now
#     "ELIGIBILITY", matching the label.
eligibilityPatterns = [
    {"label": "ELIGIBILITY", "pattern": [{"POS": "PART", "OP": "?"}, {"LEMMA": "eligible"}], "id": "ELIGIBILITY"},
    {"label": "ELIGIBILITY", "pattern": [{"POS": "DET", "OP": "?"}, {"LEMMA": "eligible"}], "id": "ELIGIBILITY"},
    {"label": "ELIGIBILITY", "pattern": [{"POS": "PART", "OP": "?"}, {"LEMMA": "ineligible"}], "id": "ELIGIBILITY"},
]

In [55]:
# DIAGTERM rule: a token already typed as a DIAGNOSIS entity, followed by the
# words "of the". Because it tests ENT_TYPE, it can only match after DIAGNOSIS
# entities have been assigned.
diagTermPattern = [
    {
        "label": "DIAGTERM",
        "pattern": [
            {"ENT_TYPE": "DIAGNOSIS"},
            {"LOWER": "of"},
            {"LOWER": "the"},
        ],
        "id": "DIAGTERM",
    },
]

In [56]:
#{"label": "UNIT", "pattern": [{"LEMMA": "upper"},{"LEMMA": "limit"},{"LEMMA": "of","OP":"?"},{"LEMMA": "normal"}], "id":"SUBJECT"},                   
def getPatterns(removedLookup):
    """Turn a lookup table of phrases into token-level match patterns.

    Each row contributes one rule: the phrase in column ``text1`` is split on
    whitespace and every word becomes a case-insensitive ``{"LOWER": word}``
    token; the rule's label is the value in column ``cat1``.

    Rows whose ``text1`` is missing (not a string, e.g. NaN) or contains no
    words are skipped, since they would otherwise raise on ``.split()`` or
    produce an empty token pattern.

    Args:
        removedLookup: pandas DataFrame with columns ``text1`` and ``cat1``.

    Returns:
        list of ``{"label": ..., "pattern": [{"LOWER": ...}, ...]}`` dicts in
        row order.
    """
    patternList = []
    # Zipping the two columns avoids building a Series per row via iterrows().
    for phrase, label in zip(removedLookup["text1"], removedLookup["cat1"]):
        if not isinstance(phrase, str):
            continue
        words = phrase.split()
        if not words:
            continue
        patternList.append({
            "label": label,
            "pattern": [{"LOWER": word.lower()} for word in words],
        })
    return patternList