### Import Libraries

In [None]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from sentence_transformer import *
from multi_task_transformer import *
from custom_lr_schedule import *

### Load Sentences

#### The below function `load_sentences` reads a JSON file containing sentences and their labels, extracts the texts and labels for Named Entity Recognition (NER) and sentiment analysis, and ensures the NER labels are padded to a consistent length. It finally returns the texts, padded NER labels, and sentiment labels, all ready for further processing.

In [5]:
# Load sentences and labels
def load_sentences(file_path, max_length=25):
    """Load sentences and their NER / sentiment labels from a JSON file.

    The file must contain a top-level ``"sentences"`` list; every item is
    read through its ``"text"``, ``"task_a_ner"`` and ``"task_b_sentiment"``
    keys.

    Args:
        file_path: Path of the JSON file to read.
        max_length: Length every NER label sequence is brought to.

    Returns:
        A tuple ``(texts, task_a_labels, task_b_labels)`` where ``texts`` is
        a list of the raw sentence strings, ``task_a_labels`` is the array
        returned by ``pad_sequences`` (NER labels padded with 0 at the end up
        to ``max_length``) and ``task_b_labels`` is an ``int32`` NumPy array
        with one sentiment label per sentence.

    Raises:
        KeyError: If ``"sentences"`` or one of the per-item keys is missing.
    """
    # JSON is UTF-8 by specification; relying on the platform default
    # encoding can garble or reject non-ASCII sentences on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    sentences = data['sentences']
    texts = [item['text'] for item in sentences]
    ner_labels = [item['task_a_ner'] for item in sentences]
    sentiment_labels = [item['task_b_sentiment'] for item in sentences]

    # Padding the NER labels to ensure all have the same length
    task_a_labels = pad_sequences(ner_labels, maxlen=max_length, padding='post', value=0)
    task_b_labels = np.array(sentiment_labels, dtype=np.int32)  # Ensure consistent data type

    return texts, task_a_labels, task_b_labels

### Tokenize Sentences

#### The below function `tokenize_sentences` converts the list of sentences into sequences of integers using a tokenizer, which maps words to unique indices. It then pads these sequences to a uniform length to ensure they all have the same size, making them ready for input into a neural network.

In [6]:
# Tokenize sentences
def tokenize_sentences(texts, max_length=25):
    """Turn sentences into equally long integer sequences.

    A new ``Tokenizer`` is fitted on ``texts`` and then used to encode the
    same ``texts``; the encoded sequences are padded at the end up to
    ``max_length``.

    Args:
        texts: Sentences to fit the tokenizer on and to encode.
        max_length: Length of every returned sequence.

    Returns:
        A tuple ``(padded_sequences, word_index)``: the output of
        ``pad_sequences`` and the fitted tokenizer's ``word_index`` mapping.
    """
    word_tokenizer = Tokenizer()
    word_tokenizer.fit_on_texts(texts)
    token_id_lists = word_tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(token_id_lists, maxlen=max_length, padding='post')
    vocabulary = word_tokenizer.word_index
    return padded, vocabulary

### Task 1: Single Task Transformer 

#### We define a sentence transformer model using Keras. We start by creating an input layer for sentences of a fixed length. Then, we add an embedding layer to convert words into dense vectors of a specified dimension. Next, we incorporate a transformer block, which includes multi-head attention and layer normalization, to capture the relationships between words. We further refine this representation with two dense layers (the first using ReLU activation), and then condense the information with a global average pooling layer. Finally, a dense layer projects the pooled result into a fixed-length vector representation of the sentence, and the model is built and returned. The function does not compile the model; in this notebook it is only used to produce embeddings with `predict`, so no optimizer or loss is attached.

### Code (in sentence_transformer.py): 
``` python
def create_sentence_transformer_model(vocab_size, max_length, embedding_dim=128, num_heads=2, ff_dim=128):
    """Build an (uncompiled) Keras model that maps token ids to one vector per sentence.

    Args:
        vocab_size: Size of the embedding vocabulary (``input_dim`` of the Embedding).
        max_length: Number of token ids per input sentence.
        embedding_dim: Width of the token embeddings and of the returned sentence vector.
        num_heads: Number of attention heads.
        ff_dim: Width of the ReLU dense layer that follows the attention.

    Returns:
        A ``Model`` taking inputs of shape ``(max_length,)`` per sample and
        producing ``embedding_dim`` values per sample.
    """
    token_ids = Input(shape=(max_length,))
    # NOTE(review): no mask_zero=True here, so padded positions presumably take
    # part in attention and in the average pooling - confirm this is intended.
    embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(token_ids)

    # Self-attention: the embedded sequence is passed as both call arguments.
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(embedded, embedded)
    normalized = LayerNormalization(epsilon=1e-6)(attended)

    # Position-wise feed-forward part: expand with ReLU, project back to embedding_dim.
    hidden = Dense(ff_dim, activation='relu')(normalized)
    token_features = Dense(embedding_dim)(hidden)

    # Average over the sequence axis, then one linear projection as the sentence vector.
    pooled = GlobalAveragePooling1D()(token_features)
    sentence_vector = Dense(embedding_dim)(pooled)

    return Model(inputs=token_ids, outputs=sentence_vector)
```

In [9]:
# Load data and create model
# max_length is shared by label padding, token padding and the model's input shape.
max_length = 25
texts, task_a_labels, task_b_labels = load_sentences('sample_sentences.json', max_length)
padded_sequences, word_index = tokenize_sentences(texts, max_length)
# One slot more than the number of known words: pad_sequences fills with 0,
# which therefore also needs a row in the embedding table.
vocab_size = len(word_index) + 1

# Build the single-task sentence encoder with its default hyper-parameters and show its layers.
model = create_sentence_transformer_model(vocab_size, max_length)
model.summary()

2024-06-09 18:36:16.005922: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-06-09 18:36:16.006004: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-06-09 18:36:16.006024: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-06-09 18:36:16.006183: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-09 18:36:16.006231: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 25)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 25, 128)              3968      ['input_1[0][0]']             
                                                                                                  
 multi_head_attention (Mult  (None, 25, 128)              131968    ['embedding[0][0]',           
 iHeadAttention)                                                     'embedding[0][0]']           
                                                                                                  
 layer_normalization (Layer  (None, 25, 128)              256       ['multi_head_attention[0][

### Embeddings returned by the sentence transformer model

In [10]:
# Test the model
# No training of `model` appears before this cell, so these vectors come from
# the layers' initial weights; the cell only demonstrates the output shape.
embeddings = model.predict(padded_sequences)
print("Embeddings shape:", embeddings.shape)
# Only the first two sentences are printed to keep the output short.
print("Sample embeddings:", embeddings[:2])

Embeddings shape: (4, 128)
Sample embeddings: [[ 0.23459706 -0.30983886  0.5449509  -0.41976482 -0.13037814 -0.62629455
  -0.50043684  0.12368797  0.8506861   1.0991418   0.03754522  0.224706
  -0.00927556  0.1778245   0.07703954  0.68957615  0.79017144 -0.17868286
   0.3920527   0.32240635 -0.17693391 -0.24006581 -0.6862731   0.79360986
   0.5506003  -0.46879065 -1.0408198  -0.07338049 -0.47277907 -0.01008147
  -0.01873752 -0.21998017  0.21115679 -0.8307386   0.49108008 -0.35911646
  -0.33599764  0.04394788 -0.3097751  -0.74753404 -0.7493551  -0.9350343
  -0.9181761   0.02630088 -0.18897507 -0.27805424 -0.3535875   0.43355793
   0.29065672  0.54873705 -1.099189   -0.24294516  0.6208929   0.24252032
  -0.6512095   1.0038126   0.99975574 -0.3793778   0.43597817  0.05830157
   0.07843599 -0.5443698  -0.8149005   0.01671231 -0.824543    0.21585312
  -0.04120785  0.03209037  0.7750724  -0.18239939 -0.19385614  0.13677363
   0.8025378   0.17499128 -0.6129162   0.46142155 -0.3958252   0.7570

2024-06-09 18:36:16.687192: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


### Task 2: Multi-Task Transformer Model

#### In the multi-task transformer model, we start by defining an input layer to handle sentences of a fixed length. We then add an embedding layer to convert words into dense vectors, followed by a transformer block with multi-head attention and layer normalization to capture intricate word relationships. We refine these with two dense layers (the first using ReLU activation). This shared per-token representation then feeds two task-specific heads. For Named Entity Recognition (NER), a softmax layer is applied to every token position, so each token receives its own label distribution. For Sentiment Analysis, the token representations are average-pooled into a fixed-length sentence vector, passed through a dense layer, and classified by a second softmax layer. By sharing the core transformer and adding task-specific heads, we efficiently handle multiple NLP tasks in one model. It's a neat way to leverage shared knowledge across tasks while maintaining specialized outputs for each task.

### Code (in multi_task_transformer.py)
``` python 
def create_multi_task_model(vocab_size, max_length, embedding_dim=128, num_heads=2, ff_dim=128, num_classes_task_a=5, num_classes_task_b=2):
    """Build an (uncompiled) two-headed Keras model on top of one shared encoder.

    Args:
        vocab_size: Size of the embedding vocabulary (``input_dim`` of the Embedding).
        max_length: Number of token ids per input sentence.
        embedding_dim: Width of the token embeddings and of the pooled sentence vector.
        num_heads: Number of attention heads.
        ff_dim: Width of the ReLU dense layer that follows the attention.
        num_classes_task_a: Number of softmax classes per token for the ``'task_a'`` (NER) output.
        num_classes_task_b: Number of softmax classes per sentence for the ``'task_b'`` (sentiment) output.

    Returns:
        A ``Model`` with outputs ``[task_a_output, task_b_output]``; the
        output layers are named ``'task_a'`` and ``'task_b'``.
    """
    token_ids = Input(shape=(max_length,))
    embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(token_ids)

    # Shared encoder: self-attention, layer norm, then a two-layer feed-forward part.
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(embedded, embedded)
    normalized = LayerNormalization(epsilon=1e-6)(attended)
    hidden = Dense(ff_dim, activation='relu')(normalized)
    token_features = Dense(embedding_dim)(hidden)

    # Task A (NER): the same softmax classifier is applied to every time step,
    # so this head works on the per-token features, before any pooling.
    ner_head = TimeDistributed(Dense(num_classes_task_a, activation='softmax'), name='task_a')
    task_a_output = ner_head(token_features)

    # Sentence-level branch: average over the sequence axis, then a linear projection.
    pooled = GlobalAveragePooling1D()(token_features)
    sentence_vector = Dense(embedding_dim)(pooled)

    # Task B (sentiment): one softmax over the sentence vector.
    sentiment_head = Dense(num_classes_task_b, activation='softmax', name='task_b')
    task_b_output = sentiment_head(sentence_vector)

    return Model(inputs=token_ids, outputs=[task_a_output, task_b_output])
```

In [19]:
# Reload the data so this section can be run independently of the Task 1 cells.
max_length = 25  # Define max_length for padding
texts, task_a_labels, task_b_labels = load_sentences('sample_sentences.json', max_length)

# Tokenize and pad the sentences
padded_sequences, word_index = tokenize_sentences(texts, max_length)

# Create the multi-task model
# +1 because pad_sequences fills with 0, which needs its own embedding row.
vocab_size = len(word_index) + 1
multi_task_model = create_multi_task_model(vocab_size, max_length)

In [22]:
# `optimizer` (with the custom learning-rate schedule) is only created in the
# Task 4 section further down, so referencing it here fails with NameError on
# a fresh kernel run from top to bottom. This demo cell therefore builds its
# own default Adam optimizer.
multi_task_model.compile(
    optimizer=Adam(),
    # Both heads predict integer class ids, hence the sparse loss for each.
    loss={
        'task_a': 'sparse_categorical_crossentropy',
        'task_b': 'sparse_categorical_crossentropy',
    },
    metrics=['accuracy'],
)



In [23]:
# For demonstration purposes, let's use random labels for training.
# A seeded generator keeps the labels identical on every run of this cell.
label_rng = np.random.default_rng(42)
# Upper bounds 5 and 2 match the default num_classes_task_a / num_classes_task_b
# of create_multi_task_model (the upper bound is exclusive).
task_a_labels_random = label_rng.integers(0, 5, size=(len(texts), max_length))
task_b_labels_random = label_rng.integers(0, 2, size=(len(texts),))

# Train the model briefly
multi_task_model.fit(padded_sequences, {'task_a': task_a_labels_random, 'task_b': task_b_labels_random}, epochs=3)

# Now we predict using the trained model
predictions = multi_task_model.predict(padded_sequences)

# Extract the outputs of both heads. Despite the "embeddings" names, both
# heads end in a softmax, so these are per-class probabilities.
task_a_embeddings = predictions[0]
task_b_embeddings = predictions[1]

Epoch 1/3


2024-06-09 18:56:36.936933: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


2024-06-09 18:56:37.999227: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




### Embeddings returned by the multi-task model

In [24]:
# Show the embeddings
# task_a_embeddings / task_b_embeddings are the two outputs of
# multi_task_model.predict; both heads end in a softmax, so the printed values
# are class probabilities (per token for Task A, per sentence for Task B).
print("Embeddings for Task A (NER):")
print(task_a_embeddings)

print("\nEmbeddings for Task B (Sentiment):")
print(task_b_embeddings)

# If you want to see the embeddings for a specific input, you can index into the predictions
# For example, embeddings for the first input sentence
print("\nEmbeddings for the first input sentence (Task A):")
print(task_a_embeddings[0])

print("\nEmbeddings for the first input sentence (Task B):")
print(task_b_embeddings[0])

Embeddings for Task A (NER):
[[[0.28270373 0.14954853 0.15299174 0.27705252 0.1377034 ]
  [0.2827034  0.14954832 0.15299176 0.27705282 0.13770373]
  [0.28270358 0.14954841 0.15299182 0.27705264 0.1377036 ]
  [0.2827036  0.14954856 0.15299189 0.2770525  0.13770348]
  [0.28270352 0.1495485  0.15299183 0.27705264 0.13770358]
  [0.2827037  0.1495485  0.15299173 0.27705264 0.13770345]
  [0.28270346 0.14954846 0.1529918  0.27705267 0.13770363]
  [0.2827048  0.14954919 0.15299171 0.27705175 0.13770251]
  [0.2827048  0.14954919 0.15299171 0.27705175 0.13770251]
  [0.2827048  0.14954919 0.15299171 0.27705175 0.13770251]
  [0.2827048  0.14954919 0.15299171 0.27705175 0.13770251]
  [0.2827048  0.14954919 0.15299171 0.27705175 0.13770251]
  [0.2827048  0.14954919 0.15299171 0.27705175 0.13770251]
  [0.2827048  0.14954919 0.15299171 0.27705175 0.13770251]
  [0.2827048  0.14954919 0.15299171 0.27705175 0.13770251]
  [0.2827048  0.14954919 0.15299171 0.27705175 0.13770251]
  [0.2827048  0.14954919 0.

### Task 4: Layer-wise Learning Rate Implementation

#### The custom learning rate schedule class in TensorFlow is designed to dynamically adjust the learning rate during training. We start by initializing it with a base learning rate, a decay rate, and the number of layers. The `__call__` method is then used to calculate the learning rate at each step by dividing the base learning rate by one plus the decay rate times the current step: `lr(step) = base_learning_rate / (1 + decay_rate * step)`. This makes sure that as training progresses, the learning rate gradually decreases, helping the model converge better. Note that the number of layers is only stored and serialized; it does not enter the formula, so the same learning rate applies to every layer. We also have a `get_config` method to make the schedule serializable, returning the configuration parameters. This custom schedule helps us fine-tune our model's training process more effectively.

### Code (in custom_lr_schedule.py):
```python
class CustomLearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule that shrinks the rate as the step count grows.

    The rate at a given step is
    ``base_learning_rate / (1 + decay_rate * step)``.
    """

    def __init__(self, base_learning_rate, decay_rate, num_layers):
        """Store the schedule parameters.

        Args:
            base_learning_rate: Learning rate returned at step 0.
            decay_rate: Factor multiplied with the step in the denominator.
            num_layers: Stored and returned by ``get_config``; it is not used
                by ``__call__``.
        """
        super().__init__()
        self.base_learning_rate = base_learning_rate
        self.decay_rate = decay_rate
        self.num_layers = num_layers

    def __call__(self, step):
        """Return the learning rate for the given optimizer step."""
        # The step is cast to float32 before it enters the arithmetic.
        step_as_float = tf.cast(step, tf.float32)
        denominator = 1 + self.decay_rate * step_as_float
        return self.base_learning_rate / denominator

    def get_config(self):
        """Return the constructor arguments as a dict so the schedule can be serialized."""
        config = {}
        config['base_learning_rate'] = self.base_learning_rate
        config['decay_rate'] = self.decay_rate
        config['num_layers'] = self.num_layers
        return config
```

In [12]:
# Applying the custom learning rate schedule
base_learning_rate = 0.001
decay_rate = 0.01
num_layers = 10  # stored by the schedule but not used in its formula
learning_rate_schedule = CustomLearningRateSchedule(base_learning_rate, decay_rate, num_layers)
optimizer = Adam(learning_rate=learning_rate_schedule)

# Start from a freshly initialised model: compile() does not reset weights, so
# without this the training on the real labels would continue from weights
# that were fitted to the random demo labels in the Task 2 section.
multi_task_model = create_multi_task_model(vocab_size, max_length)
multi_task_model.compile(
    optimizer=optimizer,
    # Both heads predict integer class ids, hence the sparse loss for each.
    loss={
        'task_a': 'sparse_categorical_crossentropy',
        'task_b': 'sparse_categorical_crossentropy',
    },
    metrics=['accuracy'],
)

# Print model summary
multi_task_model.summary()



Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 25)]                 0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 25, 128)              3968      ['input_2[0][0]']             
                                                                                                  
 multi_head_attention_1 (Mu  (None, 25, 128)              131968    ['embedding_1[0][0]',         
 ltiHeadAttention)                                                   'embedding_1[0][0]']         
                                                                                                  
 layer_normalization_1 (Lay  (None, 25, 128)              256       ['multi_head_attention_1

### Training on the real labels and predictions returned by the model

In [14]:
# Ensure shapes and data types are correct
# Inputs and Task A labels should agree on (num_sentences, max_length);
# Task B has a single label per sentence.
print(f"Shape of padded_sequences: {padded_sequences.shape}, Data type: {padded_sequences.dtype}")
print(f"Shape of task_a_labels: {task_a_labels.shape}, Data type: {task_a_labels.dtype}")
print(f"Shape of task_b_labels: {task_b_labels.shape}, Data type: {task_b_labels.dtype}")

# Train the model
# The label dict is keyed by the output layer names 'task_a' / 'task_b'.
multi_task_model.fit(padded_sequences, {'task_a': task_a_labels, 'task_b': task_b_labels}, epochs=30)

# Test the model
# Predictions are made on the same sentences the model was trained on, so this
# is a sanity check rather than an evaluation on held-out data.
predictions = multi_task_model.predict(padded_sequences)
print("Predictions for Task A (NER):", predictions[0])
print("Predictions for Task B (Sentiment):", predictions[1])

Shape of padded_sequences: (4, 25), Data type: int32
Shape of task_a_labels: (4, 25), Data type: int32
Shape of task_b_labels: (4,), Data type: int32
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Predictions for Task A (NER): [[[9.69794631e-01 2.94138286e-02 1.77745038e-04 1.67104736e-04
   4.46671183e-04]
  [9.69783187e-01 2.94256043e-02 1.77686859e-04 1.67015300e-04
   4.46528778e-04]
  [9.69785511e-01 2.94231623e-02 1.77693553e-04 1.67029706e-04
   4.46548365e-04]
  [9.69771266e-01 2.94377878e-02 1.77641807e-04 1.66935381e-04
   4.46410879e-04]
  [9.69795287e-01 2.94132046e-02 1.77738897e-04 1.67101651e-04
   4.46660415e-04]
  [9.69797432e-01 2.94111241e-02 1.77733178e-04 1.67104576e