# Syntactic Processing - Assignment

## 0. Import libraries and load data and models

In [1]:
# mount google drive
from google import colab
colab.drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# install libraries
!pip install pycrf
!pip install sklearn-crfsuite

Collecting pycrf
  Downloading pycrf-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycrf
  Building wheel for pycrf (setup.py) ... [?25l[?25hdone
  Created wheel for pycrf: filename=pycrf-0.0.1-py3-none-any.whl size=1870 sha256=d955db0c493ad301c6369ed9651e0ed02af1c8b95d93f9f58d97cf20865c84f0
  Stored in directory: /root/.cache/pip/wheels/fd/3a/fb/e4d15c9c2b169f43811b23a863ee9717ff3eda5d2301789043
Successfully built pycrf
Installing collected packages: pycrf
Successfully installed pycrf-0.0.1
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting tabulate>=0.4.2 (from sklearn-crfsuite)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3

In [3]:
# import libraries
import os
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd
from tqdm import tqdm

In [4]:
# load model
nlp = spacy.load('en_core_web_sm')

In [5]:
# utility to load data
def load_data(filename):
  conn = open(filename)
  data = conn.read()
  conn.close()
  return data

In [6]:
# load dataset
train_sent_doc = load_data('/content/drive/MyDrive/data/train_sent')
train_label_doc = load_data('/content/drive/MyDrive/data/train_label')
test_sent_doc = load_data('/content/drive/MyDrive/data/test_sent')
test_label_doc = load_data('/content/drive/MyDrive/data/test_label')

## Task 1: Data preprocessing

### Construct the proper sentences from individual words and print the 5 sentences.

In [7]:
# define utility to build sentences
def build_sentences(data_doc, label_doc):
  sentences, labels = [], []
  sentence, label = '', ''
  data_lines = data_doc.split('\n')
  label_lines = label_doc.split('\n')

  for i in range(0, len(data_lines)):
    if len(data_lines[i]) == 0:
      sentences.append(sentence)
      labels.append(label)
      sentence, label = '', ''
    else:
      sentence = sentence + data_lines[i] + ' '
      label = label + label_lines[i] + ' '

  return sentences, labels

In [8]:
# prepare train and test data
train_sentences, train_labels = build_sentences(train_sent_doc, train_label_doc)
test_sentences, test_labels = build_sentences(test_sent_doc, test_label_doc)

In [9]:
# remove last row from train & test data as it is empty
train_sentences = train_sentences[:-1]
test_sentences = test_sentences[:-1]

train_labels = train_labels[:-1]
test_labels = test_labels[:-1]

In [10]:
# print 5 sentences of train data
for i in range(0, 5):
  print('{0}: {1}'.format(i+1, train_sentences[i]))
  print('\n')

1: All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status ) 


2: The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 ) 


3: Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 ) 


4: The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 ) 


5: Arrest of dilation was the most common indication in both `` corrected '' subgroups ( 23.4 and 24.6 % , respectively ) 




In [11]:
# print 5 sentences of test data
for i in range(0, 5):
  print('{0}: {1}'.format(i+1, test_sentences[i]))
  print('\n')

1: Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 ) 


2: As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration 


3: The daily high temperature ranged from 71 to 104 degrees F and AFI values ranged from 1.7 to 24.7 cm during the study period 


4: There was a significant correlation between the 2- , 3- , and 4-day mean temperature and AFI , with the 4-day mean being the most significant ( r = 0.31 , p & # 60 ; 0.001 ) 


5: Fluctuations in ambient temperature are inversely correlated to changes in AFI 




### Count the number of sentences in the processed train and test dataset

In [12]:
# print data shape
print('Sentences in train data:', len(train_sentences))
print('Sentences in test data:', len(test_sentences))

Sentences in train data: 2599
Sentences in test data: 1056


### Count the number of lines of labels in the processed train and test dataset.

In [13]:
# utility to count number of lines in labels
def count_lines(labels):
  count = 0
  for i in range(0, len(labels)):
    count += len(labels[i].split())
  return count

In [14]:
print('Number of lines in train labels:', count_lines(train_labels))
print('Number of lines in train labels:', count_lines(test_labels))

Number of lines in train labels: 45902
Number of lines in train labels: 18618


## Task 2. Concept Identification

### Extract those tokens which have NOUN or PROPN as their PoS tag and find their frequency

In [15]:
# utility to extract NOUN or PROPN as PoS tag
def extract_nouns(sentences):
  nouns = {}
  for sentence in sentences:
    tokens = nlp(sentence)
    for token in tokens:
      if token.pos_ in ['NOUN', 'PROPN']:
        nouns[token.text] = nouns.get(token.text, 0) + 1
  return nouns

In [16]:
# extract nouns from train and test data
all_sentences = []
all_sentences.extend(train_sentences)
all_sentences.extend(test_sentences)
nouns_freq_map = extract_nouns(all_sentences)

### Print the top 25 most common tokens with NOUN or PROPN PoS tags

In [17]:
# create a data frame for print
frequency_df = pd.DataFrame({
    'Nouns': nouns_freq_map.keys(),
    'Frequency': nouns_freq_map.values()
})

frequency_df.sort_values(by = 'Frequency', ascending=False).iloc[:25]

Unnamed: 0,Nouns,Frequency
17,patients,492
66,treatment,281
14,%,247
292,cancer,200
451,therapy,175
57,study,154
222,disease,142
1209,cell,140
1884,lung,116
10,group,94


## Task 3. Defining the features for CRF

- f1 = word starts with capital letter
- f2 = PoS tag of the word
- f3 = length of the word
- f4 = 1 if previeous words are 'suffers from', 'diagnosed with' or 'treated with'

In [18]:
# define utility to check if word is noun
def isnoun(word):
  if word in nouns_freq_map:
    return True

  return False

In [19]:
# define feature vectors for word
def get_word_features(sentence, idx):
  word = sentence[idx]
  tokens = nlp(' '.join(sentence))


  features = [
      'word.isNoun=%s' % isnoun(word),                  # is the word noun
      'word.dep=' + tokens[idx].dep_,                   # feature to identify the dependency parse tag
      'word.length=' + str(len(word)),                  # feature to identify the word length
      'word.lower=' + word.lower(),                     # serves as word id
      'word.postag=' + tokens[idx].pos_,                # PoS tag of current word
      'word[-3:]=' + word[-3:],                         # last three characters
      'word[-2:]=' + word[-2:],                         # last two characters
      'word.isupper=%s' % word.isupper(),               # is the word in all uppercase
      'word.isdigit=%s' % word.isdigit(),               # is the word a number
      'words.startsWithCapital=%s' % word[0].isupper()  # is the word starting with a capital letter
  ]

  if(idx > 0):
    prev_word = sentence[idx-1]
    features.extend([
        'prev_word.isNoun=%s' % isnoun(prev_word),                    # is the prev word noun
        'prev_word.dep=' + tokens[idx-1].dep_,                        # feature to identify the dependency parse tag
        'prev_word.length=' + str(len(prev_word)),                    # feature to identify the word length
        'prev_word.lower=' + prev_word.lower(),                       # previous word
        'prev_word.postag=' + tokens[idx-1].pos_,                     # PoS tag of previous word
        'prev_word.isupper=%s' % prev_word.isupper(),                 # is the previous word in all uppercase
        'prev_word.isdigit=%s' % prev_word.isdigit(),                 # is the previous word a number
        'prev_words.startsWithCapital=%s' % prev_word[0].isupper()    # is the previous word starting with a capital letter
    ])
  else:
    features.append('BEG') # feature to track begin of sentence

  if(idx == len(sentence)-1):
    features.append('END') # feature to track end of sentence

  return features

## Task 4: Getting the features and the labels of sentences

### Write a code/function to get the features for a sentence

In [20]:
# get feature vectors of sentence
def get_features_of_sentence(sentence):
  sentence_list = sentence.split()
  return [get_word_features(sentence_list, pos) for pos in range(len(sentence_list))]

### Write a code/function to get the labels of a sentence

In [21]:
def get_labels_of_sentence(labels):
  return labels.split()

In [22]:
# Apply function 'get_features_of_sentence' to get features on a single sentence
example_sentence = train_sentences[10]
print(example_sentence)

features = get_features_of_sentence(example_sentence)
features[8]

A 2- , 3- , and 4-day mean temperature prior to the test date was compared to AFI using a Spearman-rank Correlation 


['word.isNoun=True',
 'word.dep=nmod',
 'word.length=11',
 'word.lower=temperature',
 'word.postag=NOUN',
 'word[-3:]=ure',
 'word[-2:]=re',
 'word.isupper=False',
 'word.isdigit=False',
 'words.startsWithCapital=False',
 'prev_word.isNoun=True',
 'prev_word.dep=punct',
 'prev_word.length=4',
 'prev_word.lower=mean',
 'prev_word.postag=PUNCT',
 'prev_word.isupper=False',
 'prev_word.isdigit=False',
 'prev_words.startsWithCapital=False']

## Task 5: Defining input and target variables

In [23]:
# define X_train and y_train
X_train = [get_features_of_sentence(sentence) for sentence in tqdm(train_sentences)]
y_train = [get_labels_of_sentence(labels) for labels in tqdm(train_labels)]

100%|██████████| 2599/2599 [05:30<00:00,  7.87it/s]
100%|██████████| 2599/2599 [00:00<00:00, 1068306.16it/s]


In [24]:
# define X_test and y_test
X_test = [get_features_of_sentence(sentence) for sentence in tqdm(test_sentences)]
y_test = [get_labels_of_sentence(labels) for labels in tqdm(test_labels)]

100%|██████████| 1056/1056 [02:11<00:00,  8.02it/s]
100%|██████████| 1056/1056 [00:00<00:00, 975376.57it/s]


## Task 6: Building the model

In [25]:
# build the CRF model.
crf = sklearn_crfsuite.CRF(max_iterations=300)
crf.fit(X_train, y_train)

## Task 7: Evaluation

### Predict the labels of each of the tokens in each sentence of the test dataset that has been pre processed earlier

In [26]:
# predict the labels
y_train_pred = crf.predict(X_train)
y_test_pred = crf.predict(X_test)

### Calculate the f1 score using the actual labels and the predicted labels of the test dataset

In [27]:
# calculate the f1 score
train_f1 = metrics.flat_f1_score(y_train, y_train_pred, average='weighted')
test_f1 = metrics.flat_f1_score(y_test, y_test_pred, average='weighted')

print('f1 score of train data:', round(train_f1, 4))
print('f1 score of test data:', round(test_f1, 4))

f1 score of train data: 0.9471
f1 score of test data: 0.9165


## Task 8: Identifying the diseases and treatment using a custom NER

### Create the logic to get all the predicted treatments (T) labels corresponding to each disease (D) label in the test dataset

In [28]:
# build a dictionary which contains mapping b/w disease and it's treatment
disease_treatment_map = {}

for i in range(0, len(y_test)):
  sentence = test_sentences[i].split()
  disease = ''
  treatment = ''
  for j in range(0, len(y_test_pred[i])):
    if y_test_pred[i][j] == 'D':
      disease = disease + sentence[j] + ' '
    elif y_test_pred[i][j] == 'T':
      treatment = treatment + sentence[j] + ' '

  disease = disease.strip()
  treatment = treatment.strip()

  if len(disease) > 0 and len(treatment) > 0:
    disease_treatment_map[disease] = disease_treatment_map.get(disease, [])
    disease_treatment_map[disease].append(treatment)

### Predict the treatment for the disease name: 'hereditary retinoblastoma'

In [29]:
# prediction of given disease
disease_treatment_map['hereditary retinoblastoma']

['radiotherapy']