## Combine Token Embedding and Character Embedding

In [1]:
# prepare data
import os
import pandas as pd

train_data_file_name = 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt'
val_data_file_name   = 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt'
test_data_file_name  = 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt'

def create_data_df(file_name):
    data = []
    lines = open(file_name,'r').readlines()
    abstract_lines = ''
    
    for line in lines:
        if line.startswith('###'):
            abstract_lines = ''
        elif line.startswith('\n'):
            all_lines = abstract_lines.splitlines()
            line_no = 0
            total_lines = len(all_lines) - 1
            for abstract in all_lines:
                current_abstract = abstract.split('\t')
                target, text = current_abstract[0],current_abstract[1]
                item = {'line_number': line_no,
                    'target': target,
                       'text': text.lower(),
                       'total_lines' :total_lines}
                data.append(item)
                line_no += 1
        else:
            abstract_lines += line
    return pd.DataFrame(data)
   

In [2]:
train_df = create_data_df(train_data_file_name)
val_df = create_data_df(val_data_file_name)
train_df.head()

Unnamed: 0,line_number,target,text,total_lines
0,0,OBJECTIVE,to investigate the efficacy of @ weeks of dail...,11
1,1,METHODS,a total of @ patients with primary knee oa wer...,11
2,2,METHODS,outcome measures included pain reduction and i...,11
3,3,METHODS,pain was assessed using the visual analog pain...,11
4,4,METHODS,secondary outcome measures included the wester...,11


In [3]:
# one hot encoding of text labels.
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder(sparse_output=False)
train_labels_one_hot  = one_hot.fit_transform(train_df['target'].to_numpy().reshape(-1,1))
val_labels_one_hot    = one_hot.fit_transform(val_df['target'].to_numpy().reshape(-1,1))

In [4]:
train_df['target'].to_numpy()

array(['OBJECTIVE', 'METHODS', 'METHODS', ..., 'RESULTS', 'CONCLUSIONS',
       'CONCLUSIONS'], dtype=object)

In [5]:
from tensorflow.data import Dataset
import tensorflow as tf

# prepare tensorflow dataset
train_dataset = Dataset.from_tensor_slices((train_df['text'].tolist(),train_labels_one_hot))
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

val_dataset = Dataset.from_tensor_slices((val_df['text'].tolist(),val_labels_one_hot))
val_dataset = val_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

train_dataset,val_dataset

2023-06-14 06:06:13.112200: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-14 06:06:17.374769: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-14 06:06:17.384865: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-14 06:06:17.385543: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been bu

(<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>,
 <_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>)

### Text Vectorization

In [6]:
# create token vectorization layer
import numpy as np
from tensorflow.keras.layers import TextVectorization, Embedding

max_tokens = 68000 # vocab size
# find the lengths of each sentences
sentence_lengths = [ len(sentence) for sentence in train_df['text'].tolist()]
output_sequence_length = int(np.percentile(sentence_lengths,95)) # how many tokens to consider to cover 95 % of the tokens.
text_vectorizor = TextVectorization(max_tokens=max_tokens,
                                   output_sequence_length=output_sequence_length,
                                   name='text_vectorizer_layer')
text_vectorizor.adapt(train_df['text'].tolist())

### Token Embedding

In [7]:
# create token embedding layer.
from tensorflow.keras.layers import Embedding

input_dimension = len(text_vectorizor.get_vocabulary()) # length of vacabulary size
output_dimension = 128 # feature vector
print(f'input dimension is {input_dimension}')
token_vector_embed = Embedding(input_dim=input_dimension,
                              output_dim=output_dimension,
                              mask_zero=True,
                              name='token_vector_embedding')

input dimension is 64841


### Token Embeddings Model

In [8]:
# create model for token embedding.

import tensorflow as tf
from tensorflow.keras.layers import Input, TextVectorization, Embedding, Dense
from tensorflow.keras import layers, Model

inputs = Input(shape=[],dtype=tf.string,name='token_input_layer')
x = text_vectorizor(inputs)
x = token_vector_embed(x)
outputs = Dense(units=128,activation='relu',name='token_embeddings_output_layer')(x)
token_embeddings_model = Model(inputs,outputs, name='token_embeddings_model')
token_embeddings_model.summary()

Model: "token_embeddings_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_input_layer (InputLay  [(None,)]                0         
 er)                                                             
                                                                 
 text_vectorizer_layer (Text  (None, 290)              0         
 Vectorization)                                                  
                                                                 
 token_vector_embedding (Emb  (None, 290, 128)         8299648   
 edding)                                                         
                                                                 
 token_embeddings_output_lay  (None, 290, 128)         16512     
 er (Dense)                                                      
                                                                 
Total params: 8,316,160
Trainable params: 8,

### Token Embedding Model with Universal Sentence Encoder

In [9]:
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras import Model

use = 'https://tfhub.dev/google/universal-sentence-encoder/4'
universal_sentence_encoder_layer = hub.KerasLayer(use,
                                                 trainable=False,
                                                 name='universal_sentence_encoder_layer')
token_inputs = Input(shape=[],dtype=tf.string,name='token_input')
token_embeddings = universal_sentence_encoder_layer(token_inputs)
token_outputs = Dense(units=128,activation='relu',name='fully_connected_layer')(token_embeddings)
token_model_with_use = Model(inputs=token_inputs,
                             outputs=token_outputs)
token_model_with_use.summary()

2023-06-14 06:06:48.907883: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string
	 [[{{node inputs}}]]
2023-06-14 06:06:49.016792: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype string and shape [?]
	 [[{{node Placeholder}}]]


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_input (InputLayer)    [(None,)]                 0         
                                                                 
 universal_sentence_encoder_  (None, 512)              256797824 
 layer (KerasLayer)                                              
                                                                 
 fully_connected_layer (Dens  (None, 128)              65664     
 e)                                                              
                                                                 
Total params: 256,863,488
Trainable params: 65,664
Non-trainable params: 256,797,824
_________________________________________________________________


## Character Embedding Model

In [10]:
# convert sentence to characters

def split_line_to_char_sequence(sentence):
    return ' '.join(list(sentence))

def update(x):
    return '2' + x

In [11]:
train_df['chars'] = train_df['text']
train_df['chars'] = train_df['chars'].apply(split_line_to_char_sequence)

val_df['chars'] = val_df['text']
val_df['chars'] = val_df['chars'].apply(split_line_to_char_sequence)

In [12]:
train_df.head() 

Unnamed: 0,line_number,target,text,total_lines,chars
0,0,OBJECTIVE,to investigate the efficacy of @ weeks of dail...,11,t o i n v e s t i g a t e t h e e f f i ...
1,1,METHODS,a total of @ patients with primary knee oa wer...,11,a t o t a l o f @ p a t i e n t s w ...
2,2,METHODS,outcome measures included pain reduction and i...,11,o u t c o m e m e a s u r e s i n c l u d ...
3,3,METHODS,pain was assessed using the visual analog pain...,11,p a i n w a s a s s e s s e d u s i n g ...
4,4,METHODS,secondary outcome measures included the wester...,11,s e c o n d a r y o u t c o m e m e a s u ...


In [13]:
val_df.head() 

Unnamed: 0,line_number,target,text,total_lines,chars
0,0,BACKGROUND,ige sensitization to aspergillus fumigatus and...,9,i g e s e n s i t i z a t i o n t o a s ...
1,1,BACKGROUND,it is not clear whether these patients would b...,9,i t i s n o t c l e a r w h e t h e r ...
2,2,OBJECTIVE,we sought to determine whether a @-month cours...,9,w e s o u g h t t o d e t e r m i n e ...
3,3,METHODS,asthmatic patients who were ige sensitized to ...,9,a s t h m a t i c p a t i e n t s w h o ...
4,4,METHODS,primary outcomes were improvement in quality o...,9,p r i m a r y o u t c o m e s w e r e i ...


In [14]:
# max_token in character embedding is printable ascii chracters
import string
import numpy as np

max_tokens = len(string.ascii_lowercase + string.digits + string.punctuation)
max_tokens

68

In [15]:
lengths_of_chars = [len(chars) for chars in train_df['chars'].tolist()]
output_sequence_length = int(np.percentile(lengths_of_chars,95))
output_sequence_length

579

In [16]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Dense, Input,Embedding, Bidirectional, LSTM
from tensorflow.keras import Model

text_vectorizer = TextVectorization(max_tokens=max_tokens,
                                  output_sequence_length=output_sequence_length,
                                   standardize='lower',
                                  name='text_vectorize')
text_vectorizer.adapt(train_df['chars'].tolist())
size_of_vocab = len(text_vectorizer.get_vocabulary())
print('size of vocabulary',size_of_vocab)
features = 25
char_embedder = Embedding(input_dim=size_of_vocab,
                         output_dim=features,
                          mask_zero=True,
                         name='char_embedding_layer')

# finally the model
char_inputs = Input(shape=(1,), dtype=tf.string,name='char_input')
char_vectors = text_vectorizer(char_inputs)
char_embeddings = char_embedder(char_vectors)
char_bi_lstm = Bidirectional(LSTM(features))(char_embeddings)
char_model = Model(inputs=char_inputs,outputs=char_bi_lstm)
char_model.summary()

size of vocabulary 57
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 char_input (InputLayer)     [(None, 1)]               0         
                                                                 
 text_vectorize (TextVectori  (None, 579)              0         
 zation)                                                         
                                                                 
 char_embedding_layer (Embed  (None, 579, 25)          1425      
 ding)                                                           
                                                                 
 bidirectional (Bidirectiona  (None, 50)               10200     
 l)                                                              
                                                                 
Total params: 11,625
Trainable params: 11,625
Non-trainable params: 0
_________________________________

## Concat Token Model and Char Model

In [17]:
from tensorflow.keras.layers import Concatenate

token_char_concat = Concatenate(name='token_char_hybrid')([token_model_with_use.output,char_model.output])
token_char_concat

<KerasTensor: shape=(None, 178) dtype=float32 (created by layer 'token_char_hybrid')>

## Create Other layers on top of concatenated layer

In [18]:
from tensorflow.keras.layers import Dropout

combined_dropout = Dropout(0.5)(token_char_concat)
combined_dense = Dense(units=128,activation='relu')(combined_dropout)
combined_final_dropout = Dropout(0.5)(combined_dense)
combined_output = Dense(5,activation='softmax')(combined_final_dropout)

# final concatenated model
model_4 = Model(inputs=[token_model_with_use.input,char_model.input],
               outputs=combined_output,
               name='model_4_token_and_char_embeddings')
model_4.summary()

Model: "model_4_token_and_char_embeddings"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 char_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 token_input (InputLayer)       [(None,)]            0           []                               
                                                                                                  
 text_vectorize (TextVectorizat  (None, 579)         0           ['char_input[0][0]']             
 ion)                                                                                             
                                                                                                  
 universal_sentence_encoder_lay  (None, 512)         256797824   [

## Plot the model (276)

In [19]:
from tensorflow.keras.utils import plot_model

plot_model(model_4,show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


## Prepare Dataset (277)

In [21]:
from tensorflow.data import Dataset

# train data set
train_token_char_data = Dataset.from_tensor_slices( (train_df['text'].tolist(),train_df['chars'].tolist()))
train_token_char_label = Dataset.from_tensor_slices(train_labels_one_hot)
train_token_char_dataset = Dataset.zip((train_token_char_data,train_token_char_label))

# prefetch
train_token_char_dataset = train_token_char_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
print(train_token_char_dataset)

# val data set
val_token_char_data = Dataset.from_tensor_slices((val_df['text'].tolist(),val_df['chars'].tolist()))
val_token_char_label = Dataset.from_tensor_slices(val_labels_one_hot)
val_token_char_dataset = Dataset.zip((val_token_char_data,val_token_char_label))

val_token_char_dataset = val_token_char_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
print(val_token_char_dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None)), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>
<_PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.string, name=None)), TensorSpec(shape=(None, 5), dtype=tf.float64, name=None))>


# Build Fit and Evaluate (278)

In [22]:
# compile
model_4.compile(loss='categorical_crossentropy',
               optimizer=tf.keras.optimizers.Adam(),
               metrics=['accuracy'])

In [23]:
tf.debugging.set_log_device_placement(True)
model_4_history = model_4.fit(train_token_char_dataset,
                              steps_per_epoch=int(0.1*len(train_token_char_dataset)),
                             epochs=3,
                             validation_data=val_token_char_dataset,
                             validation_steps=int(0.1* len(val_token_char_dataset)))

Epoch 1/3


2023-06-14 06:07:27.310940: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype double and shape [180040,5]
	 [[{{node Placeholder/_3}}]]
2023-06-14 06:07:31.019962: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]
2023-06-14 06:07:32.153267: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for

2023-06-14 06:07:32.511537: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall' with dtype float and shape [?,512]
	 [[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall}}]]
2023-06-14 06:07:32.511850: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1' with dtype float and shape [?,1]
	 [[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_1}}]]
2023-06-14 06:07:32.512024: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor s

2023-06-14 06:07:37.531138: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1]
	 [[{{node gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis}}]]
2023-06-14 06:07:39.640751: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	w



2023-06-14 06:09:11.340990: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_3' with dtype double and shape [30212,5]
	 [[{{node Placeholder/_3}}]]


Epoch 2/3
Epoch 3/3


In [24]:
model_4.evaluate(val_token_char_dataset)



[0.6859030723571777, 0.73533695936203]

In [25]:
model_4_pred_probs = model_4.predict(val_token_char_dataset)
model_4_pred_probs



array([[4.1611385e-01, 3.8750154e-01, 1.6606586e-03, 1.8853272e-01,
        6.1912090e-03],
       [3.2891273e-01, 5.1673281e-01, 3.5368409e-03, 1.4805631e-01,
        2.7613144e-03],
       [2.9735744e-01, 2.1243472e-01, 4.1535951e-02, 4.0741053e-01,
        4.1261382e-02],
       ...,
       [2.0100757e-04, 1.4720390e-03, 2.3350893e-02, 9.2542308e-05,
        9.7488344e-01],
       [1.1757055e-02, 7.4301951e-02, 3.1550628e-01, 6.8736807e-03,
        5.9156108e-01],
       [2.7003905e-01, 3.5344753e-01, 2.7998176e-01, 2.5034536e-02,
        7.1497060e-02]], dtype=float32)

In [26]:
model_4_pred = tf.argmax(model_4_pred_probs,axis=1)
model_4_pred

<tf.Tensor: shape=(30212,), dtype=int64, numpy=array([0, 1, 3, ..., 4, 4, 1])>

In [27]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

def calculate_scores(y_true,y_pred):
  accuracy = accuracy_score(y_true,y_pred)
  precision = precision_score(y_true,y_pred,average='weighted')
  recall = recall_score(y_true,y_pred,average='weighted')
  f1 = f1_score(y_true,y_pred,average='weighted')
  output = {'accuracy_score' : accuracy,
            'precision_score':precision,
            'recall_score':recall,
            'f1_score':f1}
  return output

In [28]:
from sklearn.preprocessing import LabelEncoder

labelencode = LabelEncoder()
train_label_encoded = labelencode.fit_transform(train_df['target'].to_numpy())
val_label_encoded = labelencode.transform(val_df['target'].to_numpy())

calculate_scores(val_label_encoded.tolist(),model_4_pred.numpy().tolist())

{'accuracy_score': 0.7353369522044221,
 'precision_score': 0.737317171918279,
 'recall_score': 0.7353369522044221,
 'f1_score': 0.7311062540475689}