In [2]:
import tensorflow as tf
import numpy as np

In [3]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git


fatal: destination path 'pubmed-rct' already exists and is not an empty directory.


In [4]:
def get_lines(filepath):
    with open(filepath, 'r') as f:
        return f.readlines()


def preprocess_text_with_line_numbers(filename):
    input_lines = get_lines(filename)
    abstract_lines = ""
    abstract_samples = []
    for line in input_lines:
        if line.startswith("###"):
            abstract_id = line
            abstract_lines=""
        elif line.isspace():
            abstract_line_split = abstract_lines.splitlines()

            for abstract_line_number, abstract_line in enumerate(abstract_line_split):
                line_data = {}
                target_text_split = abstract_line.split("\t")
                line_data["target"] = target_text_split[0]
                line_data["text"] = target_text_split[1].lower()
                line_data["line_number"] = abstract_line_number
                line_data["total_lines"] = len(abstract_line_split)-1
                abstract_samples.append(line_data)
        else:
            abstract_lines+=line
    return abstract_samples


def split_chars(text):
    return " ".join(list(text))

In [5]:
import os
data_dir= 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/'
filenames=[data_dir + filename for filename in os.listdir(data_dir)]
filenames

['pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt',
 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt',
 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt']

In [6]:
train_samples= preprocess_text_with_line_numbers(filenames[1])
val_samples= preprocess_text_with_line_numbers(filenames[2])
test_samples= preprocess_text_with_line_numbers(filenames[0])

In [7]:
import pandas as pd
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df=pd.DataFrame(test_samples)
train_df.head()

Unnamed: 0,target,text,line_number,total_lines
0,OBJECTIVE,to investigate the efficacy of @ weeks of dail...,0,11
1,METHODS,a total of @ patients with primary knee oa wer...,1,11
2,METHODS,outcome measures included pain reduction and i...,2,11
3,METHODS,pain was assessed using the visual analog pain...,3,11
4,METHODS,secondary outcome measures included the wester...,4,11


In [8]:
train_sentences= train_df.text.to_list()
val_sentences= val_df.text.to_list()
test_sentences= test_df.text.to_list()
train_sentences[:2]

['to investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( oa ) .',
 'a total of @ patients with primary knee oa were randomized @:@ ; @ received @ mg/day of prednisolone and @ received placebo for @ weeks .']

In [9]:
char_vocab_length=28
import string
chars= string.ascii_lowercase+string.digits+string.punctuation
chars


'abcdefghijklmnopqrstuvwxyz0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
char_vocab_length= len(chars)+2
char_vocab_length

70

In [11]:
char_len = [len(sentence) for sentence in train_sentences]
np.percentile(char_len, 95)

290.0

In [12]:
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]


In [13]:
train_chars[0]

't o   i n v e s t i g a t e   t h e   e f f i c a c y   o f   @   w e e k s   o f   d a i l y   l o w - d o s e   o r a l   p r e d n i s o l o n e   i n   i m p r o v i n g   p a i n   ,   m o b i l i t y   ,   a n d   s y s t e m i c   l o w - g r a d e   i n f l a m m a t i o n   i n   t h e   s h o r t   t e r m   a n d   w h e t h e r   t h e   e f f e c t   w o u l d   b e   s u s t a i n e d   a t   @   w e e k s   i n   o l d e r   a d u l t s   w i t h   m o d e r a t e   t o   s e v e r e   k n e e   o s t e o a r t h r i t i s   (   o a   )   .'

In [14]:
char_vectorizer = tf.keras.layers.TextVectorization(max_tokens = 70,
                                                    output_sequence_length = 290)
char_vectorizer.adapt(train_chars)


In [15]:
char_vectorizer('hello there, how are you')

<tf.Tensor: shape=(290,), dtype=int64, numpy=
array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     

In [16]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse_output = False) 

train_labels_one_hot = one_hot_encoder.fit_transform(train_df['target'].to_numpy().reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.fit_transform(val_df['target'].to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.fit_transform(test_df['target'].to_numpy().reshape(-1, 1))

train_labels_one_hot #binary

array([[0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [17]:
char_embed = tf.keras.layers.Embedding(input_dim = len(char_vectorizer.get_vocabulary()),
                                       output_dim = 25,
                                       mask_zero = True)

In [18]:
char_embed(char_vectorizer(['hello there']))

<tf.Tensor: shape=(1, 290, 25), dtype=float32, numpy=
array([[[-0.04830536, -0.01438912,  0.04642378, ..., -0.04206916,
          0.00383711,  0.00743097],
        [-0.04830536, -0.01438912,  0.04642378, ..., -0.04206916,
          0.00383711,  0.00743097],
        [ 0.03335663, -0.03880579, -0.04235015, ...,  0.01424539,
         -0.03403411,  0.00419496],
        ...,
        [ 0.03335663, -0.03880579, -0.04235015, ...,  0.01424539,
         -0.03403411,  0.00419496],
        [ 0.03335663, -0.03880579, -0.04235015, ...,  0.01424539,
         -0.03403411,  0.00419496],
        [ 0.03335663, -0.03880579, -0.04235015, ...,  0.01424539,
         -0.03403411,  0.00419496]]], dtype=float32)>

In [19]:
train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(64).prefetch(tf.data.AUTOTUNE)
val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(64).prefetch(tf.data.AUTOTUNE)
test_char_dataset = tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot)).batch(64).prefetch(tf.data.AUTOTUNE)

train_char_dataset


<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>

In [20]:
inputs = tf.keras.layers.Input(shape = (1, ), dtype = 'string')
x = char_vectorizer(inputs)
x = char_embed(x)
x = tf.keras.layers.Conv1D(32, kernel_size = 3)(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(5, activation = 'softmax')(x)

model_1 = tf.keras.models.Model(inputs, outputs) #model_1 uses CNN

model_1.compile(
    loss = tf.keras.losses.CategoricalCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

history_1 = model_1.fit(
    train_char_dataset,
    epochs = 5,
    steps_per_epoch = len(train_char_dataset),
    validation_data = val_char_dataset,
    validation_steps = int(0.2 * len(val_char_dataset))
    )


Epoch 1/5




[1m2814/2814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.3903 - loss: 1.4045 - val_accuracy: 0.4574 - val_loss: 1.3066
Epoch 2/5
[1m2814/2814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.4443 - val_loss: 1.3381
Epoch 3/5
[1m   1/2814[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m15s[0m 6ms/step - accuracy: 0.3750 - loss: 1.4569

2024-04-01 17:50:07.981217: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
  self.gen.throw(typ, value, traceback)


[1m2814/2814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - accuracy: 0.4478 - loss: 1.3191 - val_accuracy: 0.4614 - val_loss: 1.2987
Epoch 4/5
[1m2814/2814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00 - val_accuracy: 0.4636 - val_loss: 1.2949
Epoch 5/5


2024-04-01 17:50:20.920488: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m2814/2814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - accuracy: 0.4529 - loss: 1.3114 - val_accuracy: 0.4774 - val_loss: 1.2952


In [21]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

train_labels_encoded = label_encoder.fit_transform(train_df['target'].to_numpy())
val_labels_encoded = label_encoder.fit_transform(val_df['target'].to_numpy())
test_labels_encoded = label_encoder.fit_transform(test_df['target'].to_numpy())

train_labels_encoded #decimal

array([3, 2, 2, ..., 4, 1, 1])

In [22]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def calculate_results(y_true, y_pred):
  """
  Calculates model accuracy, precision, recall and f1 score of a binary classification model.

  Args:
      y_true: true labels in the form of a 1D array
      y_pred: predicted labels in the form of a 1D array

  Returns a dictionary of accuracy, precision, recall, f1-score.
  """
  # Calculate model accuracy
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [23]:
model_pred_probs = model_1.predict(val_char_dataset)
model_1_preds = tf.argmax(model_pred_probs, axis = 1).numpy()
model_1_results = calculate_results(val_labels_encoded, model_1_preds)
model_1_results

[1m 72/471[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step



[1m471/471[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


{'accuracy': 46.58038825286212,
 'precision': 0.432814509284679,
 'recall': 0.4658038825286212,
 'f1': 0.42739596855801143}

In [24]:
model_1.summary()

In [26]:
import pydot
from tensorflow.keras.utils import plot_model
plot_model(model_1)

AttributeError: module 'pydot' has no attribute 'InvocationException'

In [28]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens = 68000,
    output_sequence_length = 55
) #55 is the 95 percentile of sent_length
text_vectorizer.adapt(train_sentences)

In [29]:
text_embed = tf.keras.layers.Embedding(input_dim = len(char_vectorizer.get_vocabulary()),
                                       output_dim = 25,
                                       mask_zero = True)

text_embed(char_vectorizer(['hello there']))

<tf.Tensor: shape=(1, 290, 25), dtype=float32, numpy=
array([[[ 0.03521017, -0.00461576, -0.02322393, ..., -0.01394815,
          0.02561522,  0.04939162],
        [ 0.03521017, -0.00461576, -0.02322393, ..., -0.01394815,
          0.02561522,  0.04939162],
        [ 0.02231257,  0.03739584,  0.01798662, ...,  0.0444636 ,
         -0.01652571,  0.02009152],
        ...,
        [ 0.02231257,  0.03739584,  0.01798662, ...,  0.0444636 ,
         -0.01652571,  0.02009152],
        [ 0.02231257,  0.03739584,  0.01798662, ...,  0.0444636 ,
         -0.01652571,  0.02009152],
        [ 0.02231257,  0.03739584,  0.01798662, ...,  0.0444636 ,
         -0.01652571,  0.02009152]]], dtype=float32)>

In [30]:
token_inputs = tf.keras.layers.Input(shape = (1, ), dtype = tf.string)
token_vectorizer = text_vectorizer(token_inputs)
token_embed = text_embed(token_vectorizer)
token_avg = tf.keras.layers.GlobalAveragePooling1D()(token_embed)
token_outputs = tf.keras.layers.Dense(123, activation = 'relu')(token_avg)
token_model = tf.keras.models.Model(token_inputs, token_outputs)

char_inputs = tf.keras.layers.Input(shape = (1, ),dtype = 'string')
char_vector = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vector)
char_output = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(24))(char_embeddings)
char_model = tf.keras.models.Model(char_inputs, char_output)

token_char_concat = tf.keras.layers.Concatenate()([token_model.output, char_model.output])

combined_dropout = tf.keras.layers.Dropout(0.5)(token_char_concat)
combined_dense = tf.keras.layers.Dense(5, activation = 'softmax')(combined_dropout)

model_2 = tf.keras.Model(inputs = [token_model.input, char_model.input],
                                outputs = combined_dense)

model_2.compile(loss = tf.keras.losses.CategoricalCrossentropy(),
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

model_2.summary()

In [31]:
plot_model(model_2, dpi = 70)

AttributeError: module 'pydot' has no attribute 'InvocationException'

In [None]:
#creating new dataset

train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars))
train_char_token_one_hot = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_char_token_dataset = tf.data.Dataset.zip(train_char_token_data, train_char_token_one_hot).batch(64).prefetch(tf.data.AUTOTUNE)

val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
val_char_token_one_hot = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_dataset = tf.data.Dataset.zip(val_char_token_data, val_char_token_one_hot).batch(64).prefetch(tf.data.AUTOTUNE)

test_char_token_data = tf.data.Dataset.from_tensor_slices((test_sentences, test_chars))
test_char_token_one_hot = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_char_token_dataset = tf.data.Dataset.zip(test_char_token_data, test_char_token_one_hot).batch(64).prefetch(tf.data.AUTOTUNE)

