In [81]:
#a research paper implementation of medical data which is abstract to comprehend
import tensorflow as tf

In [82]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git

fatal: destination path 'pubmed-rct' already exists and is not an empty directory.


In [83]:
def get_lines(filepath):
    with open(filepath, 'r') as f:
        return f.readlines()



def preprocess_text_with_line_numbers(filename):
    input_lines = get_lines(filename)
    abstract_lines = ""
    abstract_samples = []
    for line in input_lines:
        if line.startswith("###"):
            abstract_id = line
            abstract_lines=""
        elif line.isspace():
            abstract_line_split = abstract_lines.splitlines()

            for abstract_line_number, abstract_line in enumerate(abstract_line_split):
                line_data = {}
                target_text_split = abstract_line.split("\t")
                line_data["target"] = target_text_split[0]
                line_data["text"] = target_text_split[1].lower()
                line_data["line_number"] = abstract_line_number
                line_data["total_lines"] = len(abstract_line_split)-1
                abstract_samples.append(line_data)
        else:
            abstract_lines+=line
    return abstract_samples

In [84]:
import os
data_dir = 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/'
filenames = [data_dir + filename for filename in os.listdir(data_dir)]
filenames

['pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt',
 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt',
 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt']

In [85]:
get_lines(filenames[0])[:20]

['###24290286\n',
 'BACKGROUND\tIgE sensitization to Aspergillus fumigatus and a positive sputum fungal culture result are common in patients with refractory asthma .\n',
 'BACKGROUND\tIt is not clear whether these patients would benefit from antifungal treatment .\n',
 'OBJECTIVE\tWe sought to determine whether a @-month course of voriconazole improved asthma-related outcomes in patients with asthma who are IgE sensitized to A fumigatus .\n',
 'METHODS\tAsthmatic patients who were IgE sensitized to A fumigatus with a history of at least @ severe exacerbations in the previous @ months were treated for @ months with @ mg of voriconazole twice daily , followed by observation for @ months , in a double-blind , placebo-controlled , randomized design .\n',
 'METHODS\tPrimary outcomes were improvement in quality of life at the end of the treatment period and a reduction in the number of severe exacerbations over the @ months of the study .\n',
 'RESULTS\tSixty-five patients were randomized .

In [86]:
train_samples = preprocess_text_with_line_numbers(filenames[0])
val_samples = preprocess_text_with_line_numbers(filenames[1])
test_samples = preprocess_text_with_line_numbers(filenames[2])


In [87]:
train_samples[:3]

[{'target': 'BACKGROUND',
  'text': 'ige sensitization to aspergillus fumigatus and a positive sputum fungal culture result are common in patients with refractory asthma .',
  'line_number': 0,
  'total_lines': 9},
 {'target': 'BACKGROUND',
  'text': 'it is not clear whether these patients would benefit from antifungal treatment .',
  'line_number': 1,
  'total_lines': 9},
 {'target': 'OBJECTIVE',
  'text': 'we sought to determine whether a @-month course of voriconazole improved asthma-related outcomes in patients with asthma who are ige sensitized to a fumigatus .',
  'line_number': 2,
  'total_lines': 9}]

In [88]:
import pandas as pd
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)

train_df.head()


Unnamed: 0,target,text,line_number,total_lines
0,BACKGROUND,ige sensitization to aspergillus fumigatus and...,0,9
1,BACKGROUND,it is not clear whether these patients would b...,1,9
2,OBJECTIVE,we sought to determine whether a @-month cours...,2,9
3,METHODS,asthmatic patients who were ige sensitized to ...,3,9
4,METHODS,primary outcomes were improvement in quality o...,4,9


In [89]:
train_sentences = train_df.text.to_list()
val_sentences = val_df.text.to_list()
test_sentences = test_df.text.to_list()

train_sentences[:2]


['ige sensitization to aspergillus fumigatus and a positive sputum fungal culture result are common in patients with refractory asthma .',
 'it is not clear whether these patients would benefit from antifungal treatment .']

In [90]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse_output = False) 

train_labels_one_hot = one_hot_encoder.fit_transform(train_df['target'].to_numpy().reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.fit_transform(val_df['target'].to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.fit_transform(test_df['target'].to_numpy().reshape(-1, 1))

train_labels_one_hot #binary


array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [91]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

train_labels_encoded = label_encoder.fit_transform(train_df['target'].to_numpy())
val_labels_encoded = label_encoder.fit_transform(val_df['target'].to_numpy())
test_labels_encoded = label_encoder.fit_transform(test_df['target'].to_numpy())

train_labels_encoded #decimal

array([0, 0, 3, ..., 4, 1, 1])

In [92]:
num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_
class_names

array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'],
      dtype=object)

In [93]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
      y_true: true labels in the form of a 1D array
      y_pred: predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

#model_0 uses Naive Bayes Model

model_0 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ("clf", MultinomialNB())
])

model_0.fit(train_sentences, train_labels_encoded)
model_0.results = calculate_results(val_labels_encoded, model_0.predict(val_sentences))
model_0.results

{'accuracy': 67.64107976005333,
 'precision': 0.6805658878020778,
 'recall': 0.6764107976005332,
 'f1': 0.6349298999118057}

In [95]:
sent_length = [len(sentence.split()) for sentence in train_sentences]
tf.math.reduce_mean(sent_length).numpy()

26

In [96]:
import numpy as np
int(np.percentile(sent_length, 95))

55

In [97]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens = 68000,
    output_sequence_length = 55
) #55 is the 95 percentile of sent_length
text_vectorizer.adapt(train_sentences)

sample_sentence = 'hi there, how are you'
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 55), dtype=int64, numpy=
array([[4139,   63, 1093,   64,    1,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])>

In [98]:
embedding = tf.keras.layers.Embedding(
    input_dim = 68000,
    output_dim = 120,
    input_length = 55,
    mask_zero = True
)

embedding(text_vectorizer([sample_sentence]))

<tf.Tensor: shape=(1, 55, 120), dtype=float32, numpy=
array([[[-0.02091458, -0.02848719, -0.03392887, ..., -0.04245558,
          0.02222841,  0.00402117],
        [ 0.04975675, -0.01769758,  0.00190584, ..., -0.00498702,
         -0.03638126, -0.02260991],
        [ 0.01655388, -0.04954735,  0.04931574, ..., -0.01548467,
          0.04112792,  0.01318311],
        ...,
        [ 0.01021777,  0.04414475,  0.04980525, ..., -0.04040786,
          0.01880119,  0.01564344],
        [ 0.01021777,  0.04414475,  0.04980525, ..., -0.04040786,
          0.01880119,  0.01564344],
        [ 0.01021777,  0.04414475,  0.04980525, ..., -0.04040786,
          0.01880119,  0.01564344]]], dtype=float32)>

In [99]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_one_hot))
val_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_one_hot))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot))

train_dataset

<_TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [100]:
train_dataset = train_dataset.batch(64).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(64).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(64).prefetch(tf.data.AUTOTUNE)

train_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>

In [106]:
inputs = tf.keras.layers.Input(shape = (1, ), dtype = 'string')
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Conv1D(64, kernel_size = 5)(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(num_classes, activation = 'softmax')(x)

model_1 = tf.keras.models.Model(inputs, outputs) #model_1 uses CNN

model_1.compile(
    loss = tf.keras.losses.CategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

history_1 = model_1.fit(
    train_dataset,
    epochs = 5,
    steps_per_epoch = len(train_dataset),
    validation_data = val_dataset,
    validation_steps = int(0.2 * len(val_dataset))
    )




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [105]:
model_pred_probs = model_1.predict(val_dataset)
model_1_preds = tf.argmax(model_pred_probs, axis = 1).numpy()
model_1_results = calculate_results(val_labels_encoded, model_1_preds)
model_1_results



{'accuracy': 73.0254387913797,
 'precision': 0.7386353504832475,
 'recall': 0.7302543879137969,
 'f1': 0.7332701444386144}

In [107]:
model_1.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 1)]               0         
                                                                 
 text_vectorization_3 (Text  (None, 55)                0         
 Vectorization)                                                  
                                                                 
 embedding_3 (Embedding)     (None, 55, 120)           8160000   
                                                                 
 conv1d_9 (Conv1D)           (None, 51, 64)            38464     
                                                                 
 global_average_pooling1d_9  (None, 64)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_9 (Dense)             (None, 5)                 325 

In [110]:
#model_2 uses Universal Sentence Encoder

import tensorflow_hub as hub
embed = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2")


In [113]:
import random
random_sentence = random.choice(train_sentences)
print(random_sentence,'\n' ,len(random_sentence))
text_vectorizer([random_sentence])

to observe the protective effect of transcutaneous electrical acupoint stimulation ( teas ) on cerebral tissue in elderly hip replacement operation patients during general anesthesia under controlled hypotension . 
 213


<tf.Tensor: shape=(1, 55), dtype=int64, numpy=
array([[   6, 2107,    2, 2153,   74,    4, 4547, 2751, 5222,  703, 5755,
          19, 1081,  631,    5,  781, 1183, 1142,  860,   11,   56,  309,
         419,  341,  101, 1496,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]])>

In [114]:
embed_sample = embed([random_sentence])
embed_sample

<tf.Tensor: shape=(1, 512), dtype=float32, numpy=
array([[ 0.02192686,  0.0717385 ,  0.06441112, -0.0727673 , -0.02691519,
        -0.00605917,  0.03653429, -0.0550508 , -0.02536041,  0.02767338,
         0.0799851 ,  0.05593303,  0.02379033,  0.06863826,  0.02127163,
        -0.06800511, -0.08003996, -0.02161273, -0.06571784, -0.04634899,
        -0.03561794, -0.00475984, -0.02629415, -0.03132291,  0.02071399,
         0.01995087, -0.06193408,  0.0269776 , -0.0172271 ,  0.04608248,
         0.02273444,  0.08011284,  0.07033496, -0.05087849,  0.03109113,
         0.02127643,  0.02391165, -0.03701675, -0.03789214, -0.06972475,
         0.05086392,  0.00221968,  0.05823132, -0.0681831 ,  0.0267735 ,
         0.07565604, -0.01899219, -0.03650256,  0.03285763, -0.02870599,
        -0.07292692,  0.01659679,  0.02373334,  0.04792888, -0.06238138,
         0.0252289 , -0.03569742,  0.02599585,  0.05232928, -0.00568793,
        -0.03624795,  0.07307966,  0.00145131,  0.04784302,  0.0280991 ,
 

In [115]:
sentence_encoder = hub.KerasLayer(embed,
                                  input_shape=[],
                                  dtype=tf.string,
                                  trainable=False)

In [117]:
model_2 = tf.keras.models.Sequential([
    sentence_encoder,
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
history_2 = model_2.fit(
    train_dataset,
    epochs = 5,
    steps_per_epoch = len(train_dataset),
    validation_data = val_dataset,
    validation_steps = int(0.2 * len(val_dataset))
    )



Epoch 1/5


ValueError: in user code:

    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        
    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        
    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1151, in train_step
        
    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 1209, in compute_loss
        
    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        
    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/losses.py", line 143, in __call__
        
    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/losses.py", line 270, in call  **
        
    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/losses.py", line 2532, in binary_crossentropy
        
    File "/Users/suchirmvelpanur/anaconda3/lib/python3.11/site-packages/keras/src/backend.py", line 5822, in binary_crossentropy
        

    ValueError: `logits` and `labels` must have the same shape, received ((None, 1) vs (None, 5)).
