  # Identifying Entities in Healthcare Data Assignment 

### Loading Data

In [1]:
# importing necessary packages
!pip install pycrf
!pip install sklearn-crfsuite

import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd
from sklearn_crfsuite import CRF, scorers, metrics

model = spacy.load("en_core_web_sm")



In [2]:
#defining a function to load all the dataset and convert the word per line to sentences
def process_files(filename):
    input_files= open(filename, 'r')
    input_data=input_files.readlines()
    input_files.close()
    list_data=[]
    sent_data=""
    for word in input_data:
        word=word.strip()
        if word != "":
            if sent_data  =="":
                sent_data=word
            else:
                sent_data=sent_data+" "+word
        else:
            list_data.append(sent_data)
            sent_data=""
    return list_data
    
    

In [3]:
# Passing the filenames to above defined function to get the data
train_sentence=process_files('train_sent')
test_sentence=process_files('test_sent')
train_label=process_files('train_label')
test_label=process_files('test_label')

### Printing Sentences

In [4]:
# printing 5 sentences and respective labels from train data
for i in range(5):
    print("Sentences: ", train_sentence[i])
    print("Labels: ",train_label[i])
    print("\n")

Sentences:  All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O


Sentences:  The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O


Sentences:  Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )
Labels:  O O O O O O O O O O O O O O O


Sentences:  The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O


Sentences:  Arrest of dilation was the most common indication in 

In [5]:
# printing 5 sentences and respective labels from test data
for i in range(5):
    print("Sentences: ", test_sentence[i])
    print("Labels: ",test_label[i])
    print("\n")

Sentences:  Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O


Sentences:  As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration
Labels:  O O O O O O O O O O O O O O O O O O O


Sentences:  The daily high temperature ranged from 71 to 104 degrees F and AFI values ranged from 1.7 to 24.7 cm during the study period
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O


Sentences:  There was a significant correlation between the 2- , 3- , and 4-day mean temperature and AFI , with the 4-day mean being the most significant ( r = 0.31 , p & # 60 ; 0.001 )
Labels:  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O


Sentences:  Fluctuations in ambient

### Count the number of sentences in the processed train and test dataset

In [6]:
# counting the number of sentences in train data
print("The Number of Sentence in Train data",len(train_sentence))

The Number of Sentence in Train data 2599


In [7]:
# counting the number of sentences in test data
print("The Number of Sentence in Test data",len(test_sentence))


The Number of Sentence in Test data 1056


### Count the number of lines of labels in the processed train and test dataset.

In [8]:
# counting the number of labels in train data
print("The Number of Sentence in Train label",len(train_label))

The Number of Sentence in Train label 2599


In [9]:
# counting the number of labels in test data
print("The Number of Sentence in Test label",len(test_label))

The Number of Sentence in Test label 1056


### Extract those tokens which have NOUN or PROPN as their PoS tag and find their frequency

In [10]:
train_df = pd.DataFrame(columns=['sentence','word','pos','label'])
test_df = pd.DataFrame(columns=['sentence','word','pos','label'])

In [11]:
i=0
j=0

for sent,label in zip(train_sentence,train_label):
    i=i+1
    for sentence,lab in zip(sent.split(),label.split()):
        processed_sentence = model(sentence) # Process each sentence by spacy model
        for token in processed_sentence:
            train_df.loc[j,['sentence','word','pos','label']] = [i,token.text,token.pos_,lab]
            j=j+1

In [12]:
i=0
j=0

for sent,label in zip(test_sentence,test_label):
    i=i+1
    for sentence,lab in zip(sent.split(),label.split()):
        processed_sentence = model(sentence) # Process each sentence by spacy model
        for token in processed_sentence:
            test_df.loc[j,['sentence','word','pos','label']] = [i,token.text,token.pos_,lab]
            j=j+1

In [13]:
Count_tab = pd.DataFrame()
Count_tab = pd.concat((train_df,test_df),axis=0)

In [14]:
Count_tab.reset_index(inplace=True,drop=True)

### Print the top 25 most common tokens with NOUN or PROPN PoS tags

In [15]:
Count_tab[(Count_tab['pos'] == 'NOUN') | ((Count_tab['pos'] == 'PROPN'))]['word'].value_counts()[:25]

patients        492
treatment       281
cancer          200
A               182
therapy         175
disease         143
cell            140
lung            116
group            94
chemotherapy     88
gene             88
effects          85
results          79
women            77
TO_SEE           75
cases            71
risk             71
surgery          71
analysis         70
rate             67
human            67
response         66
survival         65
children         64
primary          63
Name: word, dtype: int64

### Defining features for CRF

In [29]:
# Defining a function to get the features value for a word.
def getFeaturesForOneWord(sentence, pos):
    word = sentence[pos]

    features = [
            'word.lower=' + word.lower(), # serves as word id
            'word[-3:]=' + word[-3:],     # last three characters
            'word[-2:]=' + word[-2:],     # last two characters
            'word.isupper=%s' % word.isupper(),  # is the word in all uppercase
            'word.isdigit=%s' % word.isdigit(),  # is the word a number
            'words.startsWithCapital=%s' % word[0].isupper() # is the word starting with a capital letter
          ]

    if(pos > 0):
        prev_word = sentence[pos-1]
        features.extend([
                'prev_word.lower=' + prev_word.lower(), 
                'prev_word.isupper=%s' % prev_word.isupper(),
                'prev_word.isdigit=%s' % prev_word.isdigit(),
                'prev_words.startsWithCapital=%s' % prev_word[0].isupper()
              ])
    else:
        features.append('BEG') # feature to track begin of sentence 

    if(pos == len(sentence)-1):
        features.append('END') # feature to track end of sentence

    return features

### Getting the features

### Write a code/function to get the features for a sentence

In [30]:
# defining a function to get the feature for a sentence
def getFeaturesForOneSentence(sentence):
    sentence_list = sentence.split()
    print("sentence_list: ",sentence_list)
    return [getFeaturesForOneWord(sentence_list, pos) for pos in range(len(sentence_list))]

### Write a code/function to get the labels of a sentence

In [31]:
# Define a function to get the labels for a sentence.
def getLabelsInListForOneSentence(labels):
    return labels.split()

### Define input and target variables

### Define the features' values for each sentence as input variable for CRF model in test and the train dataset

In [32]:
X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sentence]
X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sentence]

sentence_list:  ['All', 'live', 'births', '>', 'or', '=', '23', 'weeks', 'at', 'the', 'University', 'of', 'Vermont', 'in', '1995', '(', 'n', '=', '2395', ')', 'were', 'retrospectively', 'analyzed', 'for', 'delivery', 'route', ',', 'indication', 'for', 'cesarean', ',', 'gestational', 'age', ',', 'parity', ',', 'and', 'practice', 'group', '(', 'to', 'reflect', 'risk', 'status', ')']
sentence_list:  ['The', 'total', 'cesarean', 'rate', 'was', '14.4', '%', '(', '344', 'of', '2395', ')', ',', 'and', 'the', 'primary', 'rate', 'was', '11.4', '%', '(', '244', 'of', '2144', ')']
sentence_list:  ['Abnormal', 'presentation', 'was', 'the', 'most', 'common', 'indication', '(', '25.6', '%', ',', '88', 'of', '344', ')']
sentence_list:  ['The', '``', 'corrected', "''", 'cesarean', 'rate', '(', 'maternal-fetal', 'medicine', 'and', 'transported', 'patients', 'excluded', ')', 'was', '12.4', '%', '(', '273', 'of', '2194', ')', ',', 'and', 'the', '``', 'corrected', "''", 'primary', 'rate', 'was', '9.6', '%

sentence_list:  ['It', 'identifies', 'the', 'learning', 'experiences', 'that', 'should', 'be', 'part', 'of', 'general', 'internal', 'medicine', 'residency', 'training', ',', 'lists', 'the', 'clinical', 'competencies', 'that', 'are', 'important', 'for', 'primary', 'care', 'practice', ',', 'and', 'describes', 'the', 'role', 'of', 'the', 'integrative', 'disciplines', 'that', 'should', 'inform', 'the', 'care', 'of', 'every', 'patient']
sentence_list:  ['It', 'also', 'describes', 'a', 'process', 'that', 'program', 'directors', 'and', 'local', 'program', 'committees', 'can', 'use', 'to', 'develop', 'competency-based', 'curricula']
sentence_list:  ['We', 'report', '4', 'new', 'cases', 'of', 'localized', 'Darier', 'disease', 'and', 'review', 'the', 'English-language', 'literature']
sentence_list:  ['OBSERVATIONS', ':', 'Localized', 'Darier', 'disease', 'occurred', 'with', 'equal', 'frequency', 'in', 'males', 'and', 'females']
sentence_list:  ['The', 'most', 'frequent', 'site', 'of', 'involveme

sentence_list:  ['Survey', 'on', 'rotavirus', 'infections', 'in', 'a', 'German', 'pediatric', 'hospital']
sentence_list:  ['Long-term', 'effect', 'of', 'calcium', 'supplementation', 'during', 'pregnancy', 'on', 'the', 'blood', 'pressure', 'of', 'offspring', ':', 'follow', 'up', 'of', 'a', 'randomised', 'controlled', 'trial']
sentence_list:  ['The', 'catalytic', 'role', 'of', 'carbon', 'dioxide', 'in', 'the', 'decomposition', 'of', 'peroxynitrite']
sentence_list:  ['Ropinirole', 'for', 'the', 'treatment', 'of', 'early', 'Parkinson', "'s", 'disease']
sentence_list:  ['The', 'Ropinirole', 'Study', 'Group']
sentence_list:  ['Female', 'genital', 'mutilation', ':', 'a', 'contemporary', 'issue', ',', 'and', 'a', 'Victorian', 'obsession']
sentence_list:  ['Fluoride', 'treatment', 'increased', 'serum', 'IGF-1', ',', 'bone', 'turnover', ',', 'and', 'bone', 'mass', ',', 'but', 'not', 'bone', 'strength', ',', 'in', 'rabbits']
sentence_list:  ['Retrospective', 'comparison', 'of', 'techniques', 'to'

sentence_list:  ['for', 'patients', 'with', 'stage', 'i', 'or', 'ii', 'non-small', 'cell', 'lung', 'cancer', '(', 'nsclc', ')', ',', 'surgical', 'resection', 'is', 'considered', 'the', 'standard', 'of', 'care']
sentence_list:  ['efforts', 'at', 'improving', 'survival', 'for', 'early-stage', 'nsclc', 'patients', 'have', 'focused', 'on', 'the', 'use', 'of', 'chemotherapy', 'administered', 'postoperatively', '(', 'adjuvant', ')', 'or', 'preoperatively', '(', 'neoadjuvant', 'or', 'induction', ')', 'to', 'eradicate', 'micrometastatic', 'disease']
sentence_list:  ['patients', ':', 'patients', 'with', 'a', 'single', '(', 'n', '=', '2', ',', '764', ')', 'and', 'synchronous', 'nsclc', '(', 'n', '=', '85', ')', 'who', 'underwent', 'pulmonary', 'resection']
sentence_list:  ['background', ':', 'prophylactic', 'cranial', 'irradiation', 'halves', 'the', 'rate', 'of', 'brain', 'metastases', 'in', 'patients', 'with', 'small', 'cell', 'lung', 'cancer']
sentence_list:  ['reviewer', "'s", 'conclusions', 

sentence_list:  ['Furthermore', ',', 'when', 'all', 'deliveries', 'were', 'analyzed', ',', 'regardless', 'of', 'risk', 'status', 'but', 'limited', 'to', 'gestational', 'age', '>', 'or', '=', '36', 'weeks', ',', 'the', 'rates', 'did', 'not', 'change', '(', '12.6', '%', ',', '280', 'of', '2214', ';', 'primary', '9.2', '%', ',', '183', 'of', '1994', ')']
sentence_list:  ['As', 'the', 'ambient', 'temperature', 'increases', ',', 'there', 'is', 'an', 'increase', 'in', 'insensible', 'fluid', 'loss', 'and', 'the', 'potential', 'for', 'dehydration']
sentence_list:  ['The', 'daily', 'high', 'temperature', 'ranged', 'from', '71', 'to', '104', 'degrees', 'F', 'and', 'AFI', 'values', 'ranged', 'from', '1.7', 'to', '24.7', 'cm', 'during', 'the', 'study', 'period']
sentence_list:  ['There', 'was', 'a', 'significant', 'correlation', 'between', 'the', '2-', ',', '3-', ',', 'and', '4-day', 'mean', 'temperature', 'and', 'AFI', ',', 'with', 'the', '4-day', 'mean', 'being', 'the', 'most', 'significant', '(

sentence_list:  ['The', 'pallidal', 'radiofrequency', 'lesions', 'were', 'prolate', 'spheroid', 'shaped', 'and', 'were', 'composed', 'of', 'three', 'concentric', 'zones', 'in', 'the', 'early', 'postoperative', 'studies']
sentence_list:  ['Additional', 'edema', 'spreading', 'to', 'the', 'internal', 'capsule', 'was', 'noted', 'in', '32', 'of', '34', 'cases', 'and', 'to', 'the', 'optic', 'tract', 'in', '11', 'of', '34', 'cases']
sentence_list:  ['In', 'three', 'patients', 'no', 'lesion', 'was', 'identified', 'despite', 'sustained', 'clinical', 'improvement']
sentence_list:  ['The', 'lesion', 'was', 'located', 'in', 'the', 'posteroventral', 'GPi', 'in', 'all', 'cases', 'except', 'in', 'one', 'patient', 'in', 'whom', 'it', 'was', 'confined', 'to', 'the', 'GP', 'externus', '(', 'GPe', ')']
sentence_list:  ['This', '49-year-old', 'woman', 'did', 'not', 'experience', 'sustained', 'benefit']
sentence_list:  ['The', 'authors', 'found', 'no', 'consistent', 'correlations', 'between', 'lesion', 'si

sentence_list:  ['Current', 'studies', 'using', 'interferon', 'in', 'patients', 'with', 'low-grade', 'non-Hodgkin', "'s", 'lymphomas', 'are', 'evaluating', 'lower', ',', 'and', 'perhaps', 'better', 'tolerated', 'doses', 'of', 'interferon']
sentence_list:  ['While', 'the', 'demonstrated', 'antiviral', ',', 'antiproliferative', ',', 'and', 'immunomodulatory', 'properties', 'of', 'interferons', 'have', 'led', 'to', 'a', 'number', 'of', 'theories', 'regarding', 'their', 'potential', 'use', 'in', 'treating', 'individuals', 'with', 'chronic', 'myelogenous', 'leukemia', '(', 'CML', ')', ',', 'their', 'limited', 'availability', 'has', 'prevented', 'thorough', 'clinical', 'investigation']
sentence_list:  ['Intrathecal', 'sufentanil', 'provides', 'approximately', '2', 'h', 'of', 'excellent', 'labor', 'analgesia', 'with', 'minimal', 'motor', 'blockade']
sentence_list:  ['OBJECTIVE', ':', 'We', 'report', 'the', 'results', 'of', 'a', 'double-blind', ',', 'double-dummy', ',', 'active-control', 'stud

### Define the labels as the target variable for test and the train dataset

In [33]:
y_train = [getLabelsInListForOneSentence(labels) for labels in train_label]
y_test = [getLabelsInListForOneSentence(labels) for labels in test_label]

### Build the CRF Model

In [34]:
# Build the CRF model.
crf = sklearn_crfsuite.CRF(max_iterations=100)

try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

### Evaluation

### Predict the labels of each of the tokens in each sentence of the test dataset that has been pre processed earlier.

In [35]:
# predicting values
y_pred = crf.predict(X_test)

### Calculate the f1 score using the actual labels and the predicted labels of the test dataset

In [36]:
# calculating F1 score
metrics.flat_f1_score(y_test, y_pred, average='weighted')

0.9042560946986944

### Create the logic to get all the predicted treatments (T) labels corresponding to each disease (D) label in the test dataset

In [37]:
medication_data =  {} # Initialize a dictionary where we need to store disease treatment data


for i in range(len(y_pred)): # getting the predicted sequence labels
    labels = y_pred[i]
    disease = "";
    treatment = "";
  
    for j in range(len(labels)): # j is the words in a sentence
        if labels[j] == 'O': # if the label is 'o' no action required and getting the next label
            continue

        if(labels[j] == 'D'):  # if the label is 'D' store the correspoding word in disease variable and taking next label
            disease += test_sentence[i].split()[j] + " "
            continue

        if(labels[j] == 'T'): # if label is 'T' store the corresponding word in treatment
            treatment += test_sentence[i].split()[j] + " "
            
   # removing unnecessary space using strip()
    disease = disease.strip()
    treatment = treatment.strip()


    if disease is not "" and treatment is not "": # if we getting both disease and treatment from one sentence store the data
        if disease not in medication_data.keys():
            medication_data[disease] = [treatment]
        else:
            treatment_list = medication_data.get(disease)
            treatment_list.append(treatment)
            medication_data[disease] = treatment_list 

  if disease is not "" and treatment is not "": # if we getting both disease and treatment from one sentence store the data
  if disease is not "" and treatment is not "": # if we getting both disease and treatment from one sentence store the data


In [38]:
# checking the dictionary where we store the disease and respective treatment
medication_data

{'hereditary retinoblastoma': ['radiotherapy'],
 'unstable angina or non-Q-wave myocardial infarction': ['roxithromycin'],
 'coronary-artery disease': ['Antichlamydial antibiotics'],
 'cellulitis': ['G-CSF therapy intravenous antibiotic treatment'],
 'foot infection': ['G-CSF treatment'],
 "early Parkinson 's disease": ['Ropinirole monotherapy'],
 'female stress urinary incontinence': ['surgical treatment'],
 'stress urinary incontinence': ['therapy'],
 'preeclampsia ( proteinuric hypertension )': ['intrauterine insemination with donor sperm versus intrauterine insemination'],
 'cancer': ['organ transplantation and chemotherapy',
  'oral drugs chemotherapy'],
 'major pulmonary embolism': ['Thrombolytic treatment right-side hemodynamics'],
 'malignant pleural mesothelioma': ['thoracotomy , radiotherapy , and chemotherapy'],
 'pulmonary symptoms attributable': ['chemotherapy'],
 'non-obstructive azoospermia': ['testicular fine needle aspiration ( TEFNA ) open biopsy and testicular sperm 

### Predict the treatment for the disease name: 'hereditary retinoblastoma'

In [39]:
# predicting the medication for 'hereditary retinoblastoma'
medication_data.get('hereditary retinoblastoma')

['radiotherapy']

- radiotherepy is the treatment for 'hereditary retinoblastoma'