### Set up the Environment

In [1]:
from IPython.core.display import clear_output
!wget https://raw.githubusercontent.com/rahulrajpr/references/main/requirements_tf_cert_exam.txt
clear_output()

In [2]:
!wget https://raw.githubusercontent.com/rahulrajpr/references/main/helper_functions.py
clear_output()

In [3]:
!pip install -r requirements_tf_cert_exam.txt
!pip install split_folders
clear_output()

> Import Libraries

In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib as plt
import scipy
import warnings
warnings.filterwarnings('ignore')
import os
import random

> Check the versions of the library

In [5]:
print(f'tensorflow version : {tf.__version__}')
print(f'pandas version : {pd.__version__}')
print(f'numpy version : {np.__version__}')
print(f'scipy version : {scipy.__version__}')

tensorflow version : 2.10.0
pandas version : 1.4.2
numpy version : 1.22.4
scipy version : 1.7.3


### Download the DataSet

In [6]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git
!ls pubmed-rct

Cloning into 'pubmed-rct'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 33 (delta 5), reused 5 (delta 5), pack-reused 25[K
Unpacking objects: 100% (33/33), 177.08 MiB | 4.74 MiB/s, done.
Updating files: 100% (13/13), done.
PubMed_200k_RCT
PubMed_200k_RCT_numbers_replaced_with_at_sign
PubMed_20k_RCT
PubMed_20k_RCT_numbers_replaced_with_at_sign
README.md


In [7]:
!ls pubmed-rct/PubMed_200k_RCT_numbers_replaced_with_at_sign/

dev.txt  test.txt  train.zip


In [8]:
train_zip_dir = 'pubmed-rct/PubMed_200k_RCT_numbers_replaced_with_at_sign/train.zip'

In [9]:
from helper_functions import unzip_untar_data

In [10]:
unzip_untar_data(train_zip_dir)

File extension : .zip
File extracting.......
Completed successfully


In [11]:
train_dir = 'train.txt'
val_dir = 'pubmed-rct/PubMed_200k_RCT_numbers_replaced_with_at_sign/dev.txt'

> Get the line of the txt file


In [12]:
def read_lines(path):
  with open(path, 'r') as f:
    lines = f.readlines()
  return lines

In [13]:
train_lines = read_lines(train_dir)
val_lines = read_lines(val_dir)

In [14]:
type(train_lines)

list

In [15]:
train_lines[:20]

['###24491034\n',
 'BACKGROUND\tThe emergence of HIV as a chronic condition means that people living with HIV are required to take more responsibility for the self-management of their condition , including making physical , emotional and social adjustments .\n',
 'BACKGROUND\tThis paper describes the design and evaluation of Positive Outlook , an online program aiming to enhance the self-management skills of gay men living with HIV .\n',
 'METHODS\tThis study is designed as a randomised controlled trial in which men living with HIV in Australia will be assigned to either an intervention group or usual care control group .\n',
 "METHODS\tThe intervention group will participate in the online group program ` Positive Outlook ' .\n",
 'METHODS\tThe program is based on self-efficacy theory and uses a self-management approach to enhance skills , confidence and abilities to manage the psychosocial issues associated with HIV in daily life .\n',
 'METHODS\tParticipants will access the program f

In [16]:
def convert_lines_to_structured_data(lines):

  elements = lines [:]
  element_types = ['id' if str(x).startswith('##') else('break' if x == '\n' else 'item') for x in elements]

  id_list,order_list,text_list,label_list = [],[],[],[]

  for tp,el in zip(element_types,elements):  
    if tp == 'id':
      id = el
      row_num = 0
    elif tp == 'item':
      id_list.append(id)
      order_list.append(row_num)
      label_list.append(el.split('\t')[0])
      text_list.append(el.split('\t')[1])
      row_num = row_num+1
    else:
      pass
    
  return id_list,order_list,text_list,label_list

In [17]:
train_id, train_ord, train_sentences, train_labels = convert_lines_to_structured_data(train_lines)
val_id, val_ord, val_sentences, val_labels = convert_lines_to_structured_data(val_lines)

In [18]:
train_sentences[:3]

['The emergence of HIV as a chronic condition means that people living with HIV are required to take more responsibility for the self-management of their condition , including making physical , emotional and social adjustments .\n',
 'This paper describes the design and evaluation of Positive Outlook , an online program aiming to enhance the self-management skills of gay men living with HIV .\n',
 'This study is designed as a randomised controlled trial in which men living with HIV in Australia will be assigned to either an intervention group or usual care control group .\n']

In [19]:
np.unique(train_labels)

array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'],
      dtype='<U11')

In [20]:
len(train_sentences), len(val_sentences)

(2211861, 28932)

### Lets select random 10% of the data for training

- select only 10% of the data

In [21]:
ten_percent_len = int(len(train_sentences)*0.1)
len(train_sentences),ten_percent_len

(2211861, 221186)

In [22]:
train_df = pd.DataFrame({'id':train_id, 
                         'order':train_ord, 
                         'sentence':train_sentences,
                         'label':train_labels})

In [23]:
train_df_10_percent = train_df.sample(ten_percent_len, random_state =42)
len(train_df_10_percent)

221186

In [24]:
train_id = train_df_10_percent['id'].tolist()
train_ord = train_df_10_percent['order'].tolist()
train_sentences = train_df_10_percent['sentence'].tolist()
train_labels = train_df_10_percent['label'].tolist()

### Encoding of the Text Labels

> One Hot Encoding of the labels

> convert the list into a numpy array and expand its dimention so that it can feed int the onehot encoder

In [25]:
train_labels = np.expand_dims(np.array(train_labels), axis = 1)

In [26]:
val_labels = np.expand_dims(np.array(val_labels), axis = 1)

In [27]:
from sklearn.preprocessing import OneHotEncoder

In [28]:
onehot = OneHotEncoder(sparse_output = False)

In [29]:
onehot.fit(np.array(train_labels))

In [30]:
train_label_onehot = onehot.transform(train_labels)
val_label_onehot = onehot.transform(val_labels)

> Lets label encode the labels as well for the comparison of the results

In [31]:
from sklearn.preprocessing import LabelEncoder

In [32]:
label_encoder = LabelEncoder()

In [33]:
label_encoder.fit(train_labels)

In [34]:
train_label_label_encoded = label_encoder.transform(train_labels)
val_label_label_encoded = label_encoder.transform(val_labels)

### Standardization of Sentences


> lets define the text standardistion function

In [1]:
import string
import tensorflow as tf

def text_standardization(text, to_lower=True, rm_punctuations=True, rm_html=True, rm_newline=True):

  """
  Objective
  ---------
  Function that takes the raw text process and return the standardized text for natural language processing task

  Note : It is fuction need to map into each elements of an iterative object

  example --> train_sentences = [text_standardization(x) for x in train_sentences]

  Parameters
  ---------
  text : text to standardize
  to_lower : lower all the alphabets in the text
  rm_punctuations : bool -- > remove the punctuations
  rm_html : bool -- > remove the thml tages from the text
  rm_newline : bool -- > remove the newline ('\n')

  """
  
  if to_lower:
    x = tf.strings.lower(text)  # convert all the text into lowercase
  if rm_html:
    x = tf.strings.regex_replace(x, '<[^>]*>', '')  # replace the html tags with nothing
  if rm_punctuations:
    x = tf.strings.regex_replace(x, '[' + string.punctuation + ']', '')  # replace the punctuations with nothing
  if rm_newline:
    x = tf.strings.regex_replace(x, '\n', '')  # remove the new line characters
  x = x.numpy().decode('utf-8')  # convert into a numpy value and decode the text into utf-8 format
    
  return x

lets standardise the text

In [36]:
train_sentences = [text_standardization(x) for x in train_sentences]
train_sentences[:3]

['because most infant deaths occur in the first few month of life  maternal supplementation may improve infant survival ',
 'the objective of this study was to determine the efficacy and safety of nebulized   hypertonic saline solution and salbutamol in the treatment of mild to moderate bronchiolitis ',
 'this randomized  doubleblind  multicenter study was conducted to confirm a previous finding that lansoprazole relieves heartburn faster than omeprazole in patients with erosive esophagitis ']

In [37]:
val_sentences = [text_standardization(x) for x in val_sentences]
val_sentences[:3]

['adrenergic activation is thought to be an important determinant of outcome in subjects with chronic heart failure  chf   but baseline or serial changes in adrenergic activity have not been previously investigated in a large patient sample treated with a powerful antiadrenergic agent ',
 'systemic venous norepinephrine was measured at baseline   months  and  months in the betablocker evaluation of survival trial  best   which compared placebo treatment with the betablockersympatholytic agent bucindolol ',
 'baseline norepinephrine level was associated with a progressive increase in rates of death or death plus chf hospitalization that was independent of treatment group ']

### Define the goabal varibales

In [38]:
len_list = [len(x) for x in train_sentences]
np.percentile(len_list,95)

268.0

In [39]:
bs = 32
vocab_size = 10000
pad_len = 248
oov_token = '<OOV>'

pad_style = 'post'
trunc_style = 'post'

embed_len = 16

### Tokenize the text & Pad the sentences

In [40]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [41]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_token)

In [42]:
tokenizer.fit_on_texts(train_sentences)

In [43]:
train_sentences

['because most infant deaths occur in the first few month of life  maternal supplementation may improve infant survival ',
 'the objective of this study was to determine the efficacy and safety of nebulized   hypertonic saline solution and salbutamol in the treatment of mild to moderate bronchiolitis ',
 'this randomized  doubleblind  multicenter study was conducted to confirm a previous finding that lansoprazole relieves heartburn faster than omeprazole in patients with erosive esophagitis ',
 'twentyone patients       had respiratory depression  hypercarbia  etco   mm hg  or hypoxia  oxygen saturation    for over  minute     patients       had hypercarbia  and  patients     had both hypoxia and hypercarbia ',
 'plasma urea and creatinine concentrations were determined at induction of anesthesia and  or  h postoperatively ',
 'healthy male volunteers  n    age  years  randomly received a single sublingual dose of asenapine  mg after  h fasting  treatment a  reference   after a highfat

> converting the sentecnces into tokens

In [44]:
train_sentences = tokenizer.texts_to_sequences(train_sentences)
val_sentences = tokenizer.texts_to_sequences(val_sentences)

> lets pad the sentences

In [45]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [46]:
train_sentences = pad_sequences(train_sentences, maxlen = pad_len, padding = pad_style, truncating = trunc_style)
val_sentences = pad_sequences(val_sentences, maxlen = pad_len, padding = pad_style, truncating = trunc_style)

In [47]:
train_sentences.shape, val_sentences.shape

((221186, 248), (28932, 248))

### Create the Datapipeline

In [48]:
train_ds = tf.data.Dataset.from_tensor_slices((train_sentences, train_label_onehot))
val_ds = tf.data.Dataset.from_tensor_slices((val_sentences, val_label_onehot))

In [49]:
train_ds,val_ds

(<TensorSliceDataset element_spec=(TensorSpec(shape=(248,), dtype=tf.int32, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>,
 <TensorSliceDataset element_spec=(TensorSpec(shape=(248,), dtype=tf.int32, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>)

In [50]:
train_ds = train_ds.shuffle(int(0.1*len(train_sentences))).batch(bs).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(bs).prefetch(tf.data.AUTOTUNE)

In [51]:
train_ds, val_ds

(<PrefetchDataset element_spec=(TensorSpec(shape=(None, 248), dtype=tf.int32, name=None), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>,
 <PrefetchDataset element_spec=(TensorSpec(shape=(None, 248), dtype=tf.int32, name=None), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>)

### Create Embedding Layer

In [52]:
from tensorflow.keras import layers

In [53]:
embed_layer = layers.Embedding(input_dim = vocab_size, output_dim = embed_len, input_length = pad_len)

> text the embedding layer

In [54]:
embed_layer(train_sentences[0])

<tf.Tensor: shape=(248, 16), dtype=float32, numpy=
array([[ 0.02869818, -0.01960782,  0.04175866, ...,  0.02182112,
        -0.04136328,  0.00414128],
       [ 0.03557614, -0.00731556,  0.02285565, ..., -0.04692751,
         0.047684  ,  0.0314167 ],
       [-0.02349266,  0.04732765,  0.02178572, ..., -0.02010095,
        -0.02582886,  0.00987246],
       ...,
       [-0.00650854, -0.03234841, -0.00045926, ...,  0.04235596,
         0.00947078,  0.03146854],
       [-0.00650854, -0.03234841, -0.00045926, ...,  0.04235596,
         0.00947078,  0.03146854],
       [-0.00650854, -0.03234841, -0.00045926, ...,  0.04235596,
         0.00947078,  0.03146854]], dtype=float32)>

### Create the callbacks

In [55]:
from helper_functions import create_model_checkpoint

In [56]:
train_ds

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 248), dtype=tf.int32, name=None), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>

### Model 1

Embedding + Global AvgPooling + Dense

In [57]:
classes = label_encoder.classes_
num_classes = len(classes)

In [58]:
inputs = tf.keras.Input(shape = [pad_len,], name = 'input_layer')
x = embed_layer(inputs)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(units = 32, activation = 'relu')(x)
outputs = layers.Dense(units = num_classes, activation = 'softmax')(x)
model_1 = tf.keras.Model(inputs, outputs, name = 'model_1')

model_1.compile(loss = tf.keras.losses.categorical_crossentropy,
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

model_1.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 248)]             0         
                                                                 
 embedding (Embedding)       (None, 248, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 5)                 165       
                                                                 
Total params: 160,709
Trainable params: 160,709
Non-trainable params: 0
_____________________________________________________

In [59]:
history_1 = model_1.fit(train_ds,
                        epochs = 5, 
                        steps_per_epoch = int(len(train_ds)),
                        validation_data = val_ds,
                        validation_steps = int(len(val_ds)*0.25),
                        callbacks = [create_model_checkpoint(model_name = model_1.name)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [60]:
model_1.load_weights('checkpoints/model_1.h5')

In [61]:
model_1.evaluate(val_ds)



[0.5484117865562439, 0.8004977107048035]

In [62]:
from helper_functions import calculate_results
def pred_and_show_result(model,val_ds = val_ds,val_labels = val_label_label_encoded):
  prob = model.predict(val_ds)
  pred = np.argmax(prob, axis = 1)
  res = calculate_results(y_true = val_labels, y_pred = pred)
  return res

In [63]:
result_1 = pred_and_show_result(model_1)
result_1



{'accuracy': 0.8004977187888843,
 'precision': 0.7994580934005469,
 'recall': 0.8004977187888843,
 'f1': 0.799669917162396}

Model 2:

Embedding + Conv1D + MaxPool + Flatten + Dense

In [67]:
inputs = tf.keras.Input(shape = [pad_len,], name = 'input_layer')
x = embed_layer(inputs)
x = layers.Conv1D(filters = 16, kernel_size = 5)(x)
x = layers.MaxPool1D()(x)
x = layers.Conv1D(filters = 32, kernel_size = 5)(x)
x = layers.MaxPool1D()(x)
x = layers.Flatten()(x)
x = layers.Dense(units = 16, activation = 'relu')(x)
outputs = layers.Dense(units = num_classes, activation = 'softmax')(x)
model_2 = tf.keras.Model(inputs, outputs, name = 'model_2')

model_2.compile(loss = tf.keras.losses.categorical_crossentropy,
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

model_2.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 248)]             0         
                                                                 
 embedding (Embedding)       (None, 248, 16)           160000    
                                                                 
 conv1d_2 (Conv1D)           (None, 244, 16)           1296      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 122, 16)          0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 118, 32)           2592      
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 59, 32)           0         
 1D)                                                       

In [69]:
history_2 = model_2.fit(train_ds,
                        epochs = 5, 
                        steps_per_epoch = int(len(train_ds)),
                        validation_data = val_ds,
                        validation_steps = int(len(val_ds)*0.25),
                        callbacks = [create_model_checkpoint(model_name = model_2.name)])

model_2.load_weights('checkpoints/model_2.h5')

result_2 = pred_and_show_result(model_2)
result_2

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 0.821201437854279,
 'precision': 0.8169959767525753,
 'recall': 0.821201437854279,
 'f1': 0.8183513893778608}

Model 3 

- Embedding + LSTM + Dense

In [72]:
inputs = tf.keras.Input(shape = [pad_len,], name = 'input_layer')
x = embed_layer(inputs)
x = layers.LSTM(units = 64)(x)
x = layers.Dense(units = 16, activation = 'relu')(x)
outputs = layers.Dense(units = num_classes, activation = 'softmax')(x)
model_3 = tf.keras.Model(inputs, outputs, name = 'model_3')

model_3.compile(loss = tf.keras.losses.categorical_crossentropy,
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

model_3.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 248)]             0         
                                                                 
 embedding (Embedding)       (None, 248, 16)           160000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                20736     
                                                                 
 dense_5 (Dense)             (None, 16)                1040      
                                                                 
 dense_6 (Dense)             (None, 5)                 85        
                                                                 
Total params: 181,861
Trainable params: 181,861
Non-trainable params: 0
_________________________________________________________________


In [73]:
history_3 = model_3.fit(train_ds,
                        epochs = 5, 
                        steps_per_epoch = int(len(train_ds)),
                        validation_data = val_ds,
                        validation_steps = int(len(val_ds)*0.25),
                        callbacks = [create_model_checkpoint(model_name = model_3.name)])

model_3.load_weights('checkpoints/model_3.h5')

result_3 = pred_and_show_result(model_3)
result_3

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 0.3448430803262823,
 'precision': 0.1189167500489188,
 'recall': 0.3448430803262823,
 'f1': 0.1768485138356328}

LSTM seems noot working

### Model : 4 

- Embedding + Conv1D with diffrent parameters

In [78]:
inputs = tf.keras.Input(shape = [pad_len,], name = 'input_layer')
x = embed_layer(inputs)
x = layers.Conv1D(filters = 512, kernel_size = 3)(x)
x = layers.MaxPool1D()(x)
x = layers.Conv1D(filters = 124, kernel_size = 3)(x)
x = layers.MaxPool1D()(x)
x = layers.Flatten()(x)
x = layers.Dense(units = 16, activation = 'relu')(x)
outputs = layers.Dense(units = num_classes, activation = 'softmax')(x)

model_4 = tf.keras.Model(inputs, outputs, name = 'model_4')

model_4.compile(loss = tf.keras.losses.categorical_crossentropy,
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

model_4.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 248)]             0         
                                                                 
 embedding (Embedding)       (None, 248, 16)           160000    
                                                                 
 conv1d_8 (Conv1D)           (None, 246, 512)          25088     
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 123, 512)         0         
 1D)                                                             
                                                                 
 conv1d_9 (Conv1D)           (None, 121, 124)          190588    
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 60, 124)          0         
 1D)                                                       

In [79]:
history_4 = model_4.fit(train_ds,
                        epochs = 5, 
                        steps_per_epoch = int(len(train_ds)),
                        validation_data = val_ds,
                        validation_steps = int(len(val_ds)*0.25),
                        callbacks = [create_model_checkpoint(model_name = model_4.name)])

model_4.load_weights('checkpoints/model_4.h5')

result_4 = pred_and_show_result(model_4)
result_4

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


{'accuracy': 0.8171574726945943,
 'precision': 0.8148161639074278,
 'recall': 0.8171574726945943,
 'f1': 0.8154328979941129}

In [90]:
all_results = pd.DataFrame({model_1.name:result_1,
                            model_2.name:result_2,
                            model_3.name:result_3,
                            model_4.name:result_4}).transpose()

all_results.sort_values(by = 'f1', ascending = False, inplace = True)
all_results

Unnamed: 0,accuracy,precision,recall,f1
model_2,0.821201,0.816996,0.821201,0.818351
model_4,0.817157,0.814816,0.817157,0.815433
model_1,0.800498,0.799458,0.800498,0.79967
model_3,0.344843,0.118917,0.344843,0.176849


model_2 being the best model. lets save the model

In [93]:
model_2.load_weights('/content/checkpoints/model_2.h5')

In [94]:
model_2.save('best_model.h5')

In [95]:
loaded_model = tf.keras.models.load_model('best_model.h5')
loaded_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_layer (InputLayer)    [(None, 248)]             0         
                                                                 
 embedding (Embedding)       (None, 248, 16)           160000    
                                                                 
 conv1d_2 (Conv1D)           (None, 244, 16)           1296      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 122, 16)          0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 118, 32)           2592      
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 59, 32)           0         
 1D)                                                       

In [96]:
result_best_model = pred_and_show_result(loaded_model)
result_best_model



{'accuracy': 0.821201437854279,
 'precision': 0.8169959767525753,
 'recall': 0.821201437854279,
 'f1': 0.8183513893778608}

End of the document