# Healthcare Data Assignment

In [1]:
# Install required libraries
!pip install pycrf
!pip install sklearn-crfsuite



In [2]:
import string
import spacy
import sklearn_crfsuite
import pandas as pd

from spacy.matcher import DependencyMatcher
from sklearn_crfsuite import metrics
from collections import Counter

In [3]:
# Print python, jupyter and all the imported library versions
import sys
import jupyter_core

print("python : {0}".format(sys.version))
print("jupyter : {0}".format(jupyter_core.__version__))
print("pandas : {0}".format(pd.__version__))
print("spacy : {0}".format(spacy.__version__))

python : 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
jupyter : 5.7.2
pandas : 2.2.2
spacy : 3.8.4


In [4]:
# Load the en core web small package from spacy
model = spacy.load("en_core_web_sm")

## Data Preprocessing

In [5]:
# Read train sentences and labels, test sentences and labels from corresponding file
with open('train_sent', 'r') as train_sent_file:
  train_sentences = train_sent_file.readlines()

with open('train_label', 'r') as train_label_file:
  train_labels = train_label_file.readlines()

with open('test_sent', 'r') as test_sent_file:
  test_sentences = test_sent_file.readlines()

with open('test_label', 'r') as test_label_file:
  test_labels = test_label_file.readlines()

In [6]:
# Create function to form sentences out of words
def form_sentences(words):
    sentences = []
    sentence = []
    for word in words:
        word = word.strip()
        if len(word) == 0:
            sentences.append(" ".join(sentence).strip())
            sentence = []
        else:
            sentence.append(word)
    return sentences

In [7]:
# Create function to form labels out of individual placeholders
def form_labels(labels):
    label_lines = []
    label_line = []
    for label in labels:
        label = label.strip()
        if len(label) == 0:
            label_lines.append(" ".join(label_line))
            label_line = []
        else:
            label_line.append(label)
    return label_lines

In [8]:
# Convert train sentences into actual sentences, print first 5
train_sentences = form_sentences(train_sentences)
train_sentences[:5]

['All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )',
 'The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )',
 'Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )',
 "The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )",
 "Arrest of dilation was the most common indication in both `` corrected '' subgroups ( 23.4 and 24.6 % , respectively )"]

In [9]:
# Convert train labels into label sequence, print first 5
train_labels = form_labels(train_labels)
train_labels[:5]

['O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'O O O O O O O O O O O O O O O O O O O O O O O O O',
 'O O O O O O O O O O O O O O O',
 'O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'O O O O O O O O O O O O O O O O O O O O O O']

In [10]:
# Convert test sentences into actual sentences, print first 5
test_sentences = form_sentences(test_sentences)
test_sentences[:5]

['Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 )',
 'As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration',
 'The daily high temperature ranged from 71 to 104 degrees F and AFI values ranged from 1.7 to 24.7 cm during the study period',
 'There was a significant correlation between the 2- , 3- , and 4-day mean temperature and AFI , with the 4-day mean being the most significant ( r = 0.31 , p & # 60 ; 0.001 )',
 'Fluctuations in ambient temperature are inversely correlated to changes in AFI']

In [11]:
# Convert test labels into label sequence, print first 5
test_labels = form_labels(test_labels)
test_labels[:5]

['O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'O O O O O O O O O O O O O O O O O O O',
 'O O O O O O O O O O O O O O O O O O O O O O O O',
 'O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'O O O O O O O O O O O']

In [12]:
print("Number of sentences in train dataset =", len(train_sentences))

Number of sentences in train dataset = 2599


In [13]:
print("Number of sentences in test dataset =", len(test_sentences))

Number of sentences in test dataset = 1056


In [14]:
print("Number of label lines in train dataset =", len(train_labels))

Number of label lines in train dataset = 2599


In [15]:
print("Number of label lines in test dataset =", len(test_labels))

Number of label lines in test dataset = 1056


## Concept Identification

In [16]:
# Setup dependency matcher for nouns, and exclude punctuations
def extract_nouns(sentences):

    pattern = [{
        "RIGHT_ID": "noun",
        "RIGHT_ATTRS": {
            "pos": {"in": ["NOUN", "PROPN"]},
            "is_punct": False
        }
    }]
    matcher = DependencyMatcher(model.vocab)
    matcher.add("NOUN_MATCHER", [pattern])
    
    extracted_nouns = []
    for sentence in sentences:
        doc = model(sentence)
        for match in matcher(doc):
            extracted_nouns.append(doc[match[1][0]].text.lower())
    
    return extracted_nouns

In [17]:
# Extract nouns from training and testing data sets combined
all_sentences = train_sentences + test_sentences
all_nouns = extract_nouns(all_sentences)

In [18]:
# Adding all extracted nouns, and printing top 25 most common tokens
noun_counter = Counter(all_nouns)
noun_counter.most_common(25)

[('patients', 507),
 ('treatment', 304),
 ('cancer', 211),
 ('therapy', 177),
 ('study', 174),
 ('disease', 149),
 ('cell', 142),
 ('lung', 118),
 ('results', 116),
 ('group', 111),
 ('effects', 99),
 ('gene', 91),
 ('chemotherapy', 91),
 ('use', 87),
 ('effect', 82),
 ('women', 81),
 ('analysis', 76),
 ('risk', 74),
 ('surgery', 73),
 ('cases', 72),
 ('p', 72),
 ('rate', 68),
 ('survival', 67),
 ('response', 66),
 ('children', 66)]

## Defining features for CRF





In [19]:
# Let's define the features to get the feature value for one word.
def featuresForOneWord(sentence, tokens, pos): 
    
    word = sentence[pos]
    word_token = tokens[pos]
    features = [
        'word.lower=' + word.lower(),                          # Word in lowercase
        'word.length=%s' % len(word),                          # Length of word
        'word[-3:]=' + word[-3:],                              # Last 3 characters of the word
        'word[-2:]=' + word[-2:],                              # Last 2 characters of the word
        'word[:-3]=' + word[:-3],                              # All characters except last 3 characters of the word
        'word[:-2]=' + word[:-2],                              # All characters except last 2 characters of the word
        'word.isupper=%s' % word.isupper(),                    # Is the word in uppercase
        'word.isdigit=%s' % word.isdigit(),                    # Is the word a number
        'word.startsWithCapital=%s' % word[0].isupper(),       # Is the first character in uppercase
        'word.postag=' + word_token.pos_                       # POS tag associated with the word
    ]

    if (pos > 0):
        prev_word = sentence[pos-1]
        prev_word_token = tokens[pos-1]
        features.extend([
            'prev_word.lower=' + prev_word.lower(),                        # Previous word in lowercase
            'prev_word.isdigit=%s' % prev_word.isdigit(),                  # If the previous word a number
            'prev_word.ispunct=%s' % (prev_word in string.punctuation),    # Is the previous word a punctuation
            'prev_word.postag=' + prev_word_token.pos_,                    # POS tag associated with the word
            
            # Previous word is associated to the current word or parent of current word via compound or adjectival modifier dependency
            'prev_word.related=%s' % (prev_word_token.dep_ in ['compound', 'amod'] and (prev_word_token.head.text in [word, word_token.head.text]))
        ])
    else:
        features.append('BEG')                                 # Is the word first in the sentence

    if pos == (len(sentence)-1):
        features.append('END')                                 # Is the word last in the sentence

    return features

## Getting the features

In [20]:
# Write a code to get features for a sentence.
def featuresForOneSentence(sentence):
    sentence_list = sentence.split()
    tokens = model(sentence)
    return [featuresForOneWord(sentence_list, tokens, pos) for pos in range(len(sentence_list))]

In [21]:
# Write a code to get the labels for a sentence.
def labelsInListForOneSentence(labels):
    return labels.split()

## Define input and target variables


In [22]:
# Defining input variables on both train and test dataset
X_train = [featuresForOneSentence(sentence) for sentence in train_sentences]
X_test = [featuresForOneSentence(sentence) for sentence in test_sentences]

In [23]:
# Defining target variables on both train and test dataset
Y_train = [labelsInListForOneSentence(labels) for labels in train_labels]
Y_test = [labelsInListForOneSentence(labels) for labels in test_labels]

## Building the model

In [24]:
# Build the CRF model.
crf = sklearn_crfsuite.CRF(max_iterations=200)
crf.fit(X_train, Y_train)

## Evaluating the model

In [25]:
# Predicting on test dataset using the trained model
Y_pred = crf.predict(X_test)

In [26]:
# Printing the f2 score for the model against predicted labels
f1_score = metrics.flat_f1_score(Y_test, Y_pred, average="weighted")
print("F1 score for prepared model =", f1_score)

F1 score for prepared model = 0.9200617128541253


## Identifying Diseases and Treatments

In [27]:
# Preparing a list of the diseases and corresponding treatments as found as per predicted labels
treatments = {}
for i, pred in enumerate(Y_pred):
    # print(test_sentences[i])
    # print(pred)
    treatment = []
    disease = []
    sentence = test_sentences[i]
    words = sentence.split()
    for j, label in enumerate(pred):
        if label == 'D':
            disease.append(words[j])
        elif label == 'T':
            treatment.append(words[j])

    if len(disease) > 0 and len(treatment) > 0:
        disease = " ".join(disease)
        if disease not in treatments:
            treatments[disease] = []
        treatments[disease].append(" ".join(treatment))

# Convert into data frame and print first 5 rows
treatments = pd.DataFrame(list(treatments.items()), columns=['Disease', 'Treatments'])
treatments.head()

Unnamed: 0,Disease,Treatments
0,gestational diabetes cases,[good glycemic control]
1,hereditary retinoblastoma,[radiotherapy]
2,unstable angina or non-Q-wave myocardial infar...,[roxithromycin]
3,coronary-artery disease,[Antichlamydial antibiotics]
4,primary pulmonary hypertension ( PPH ),[fenfluramines]


### Predict the treatment for the disease name: 'hereditary retinoblastoma'

In [28]:
disease = "hereditary retinoblastoma"
print("Treatment for", disease, "is", ",".join(treatments[treatments['Disease'] == 'hereditary retinoblastoma']['Treatments'].to_list()[0]))

Treatment for hereditary retinoblastoma is radiotherapy
