## Model Development

Objective: Develop a custom transformer model using Keras framework and then enhance the project by retraining a pre-trained model for comparision.

Hints:
- Modular Code: Keep your code modular to facilitate easy switching between the custom model and the BART model for different experiments.
- Documentation: Document each step in your model development process, including parameter settings and the rationale behind chosen architectures.
- Version Control: Commit all changes, especially new scripts and configurations, to GitHub to maintain a robust version history.
- Continuous Monitoring: Regularly monitor training progress using TensorBoard integrated with Keras to visualize performance metrics.

#### Imports

In [54]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D

import numpy as np
import pandas as pd

In [55]:
# Check if GPU enabled 
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

#### Get the data

In [56]:
data = np.load('../data/dataset_cleaned.npz', allow_pickle=True)

X_train = data['X_train']
y_train = data['y_train']
X_test = data['X_test']
y_test = data['y_test']
X_val = data['X_val']
y_val = data['y_val']


In [31]:
X = np.concatenate((X_train, X_test, X_val))
y = np.concatenate((y_train, y_test, y_val))

max_len_input = lenA(max(X, key=len).split())
max_len_output = len(max(y, key=len).split())
print(max_len_input)

2845


In [32]:
tokenizer = Tokenizer(num_words=max_len_input)
tokenizer.fit_on_texts(X)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_val = tokenizer.texts_to_sequences(X_val)
y_train = tokenizer.texts_to_sequences(y_train)
y_test = tokenizer.texts_to_sequences(y_test)
y_val = tokenizer.texts_to_sequences(y_val)

vocab_size = len(tokenizer.get_config()['word_counts'])
print(vocab_size)

452075


In [33]:
# Standardize Data by padding sequences
X_train = pad_sequences(X_train, maxlen=max_len_input, padding ='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len_input, padding ='post', truncating='post')
X_val = pad_sequences(X_val, maxlen=max_len_input, padding ='post', truncating='post')
y_train = pad_sequences(X_train, maxlen=max_len_output, padding ='post', truncating='post')
y_test = pad_sequences(X_test, maxlen=max_len_output, padding ='post', truncating='post')
y_val = pad_sequences(X_val, maxlen=max_len_output, padding ='post', truncating='post')

### Develop Custom Transformer Model with Keras

In [34]:
# Positional Encoding Function
def get_positional_encoding(seq_length, d_model):
    angle_rates = 1 / np.power(10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / d_model)
    angle_rads = np.arange(seq_length)[:, np.newaxis] * angle_rates
    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

In [35]:
# Transformer Block Function
def transformer_block(x, num_heads, d_model, dff, rate, training):
    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(x, x)
    attn_output = Dropout(rate)(attn_output, training=training)
    out1 = LayerNormalization(epsilon=1e-6)(x + attn_output)
    ffn_output = Dense(dff, activation='relu')(out1)
    ffn_output = Dense(d_model)(ffn_output)
    ffn_output = Dropout(rate)(ffn_output, training=training)
    return LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

In [36]:
# Build the Transformer Model for Text Summarization
def build_model(max_len_input, max_len_output, vocab_size, num_heads=8, d_model=128, dff=512, rate=0.1):
    
    # Input to the encoder
    encoder_inputs = Input(shape=(max_len_input,), name="encoder_input")
    encoder_embedding = Embedding(vocab_size, d_model, name="encoder_embedding")(encoder_inputs)
    encoder_pos_encoding = get_positional_encoding(max_len_input, d_model)
    encoder_embedding += encoder_pos_encoding
    
    # Encoder
    encoder_output = encoder_embedding
    for _ in range(4):
        encoder_output = transformer_block(encoder_output, num_heads, d_model, dff, rate, training=True)
        
    # Input to the decoder
    decoder_inputs = Input(shape=(max_len_output,), name="decoder_input")
    decoder_embedding = Embedding(vocab_size, d_model, name="decoder_embedding")(decoder_inputs)
    decoder_pos_encoding = get_positional_encoding(max_len_output, d_model)
    decoder_embedding += decoder_pos_encoding
    
    # Decoder
    decoder_output = decoder_embedding
    for _ in range(4):
        decoder_output = transformer_block(decoder_output, num_heads, d_model, dff, rate, training=True)
        
    # Output layer
    outputs = Dense(vocab_size, activation="softmax")(decoder_output)    
    
    # Define the model
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

In [37]:
# Create the model
import os
import re
import keras
import mlflow
import datetime
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

create_new_model = True

version = 0

models = os.listdir("../models/transformer/")
sub1 = "transformer_model_v"
sub2 = ".keras"
s=str(re.escape(sub1))
e=str(re.escape(sub2))

print(models)

for m in models:
    v = re.findall(s+"(.*)"+e,m)[0]
    if int(v) > version:
        version = int(v)

                    
if create_new_model:
    
    
    version = version + 1

    name = "transformer_model_v" + str(version) + ".keras" 

    mlflow.set_experiment(name)
    experiment = mlflow.get_experiment_by_name(name)
    
    mlflow.autolog()
    
    with mlflow.start_run():
     
        model = build_model(max_len_input, max_len_output, vocab_size)

        log_dir = "../logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
        checkpoint_callback = ModelCheckpoint("../models/transformer/" + name, save_best_only=True)
        early_stopping_callback = EarlyStopping(patience=5, restore_best_weights=True)
        reduce_lr_callback = ReduceLROnPlateau(patience=5, factor=0.1)

        history = model.fit(
            [X_train, y_train], y_train, batch_size=32, epochs=4, verbose=True, validation_data=([X_val, y_val], y_val), callbacks=[tensorboard_callback, checkpoint_callback, early_stopping_callback, reduce_lr_callback]
        )

        model.save("../models/transformer/transformer_model_v" + str(version) + ".keras")
else:
    model = keras.models.load_model("../models/transformer/transformer_model_v" + str(version) + ".keras")
    
model.summary()

2024/04/24 15:54:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.


['transformer_model_v1.keras', 'transformer_model_v3.keras', 'transformer_model_v2.keras']


2024/04/24 15:54:38 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 1/4
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 8s/step - accuracy: 0.0083 - loss: nan - val_accuracy: 0.0229 - val_loss: nan - learning_rate: 0.0010
Epoch 2/4
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 8s/step - accuracy: 0.0110 - loss: nan - val_accuracy: 0.0229 - val_loss: nan - learning_rate: 0.0010
Epoch 3/4
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 7s/step - accuracy: 0.0167 - loss: nan - val_accuracy: 0.0229 - val_loss: nan - learning_rate: 0.0010
Epoch 4/4
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 7s/step - accuracy: 0.0205 - loss: nan - val_accuracy: 0.0229 - val_loss: nan - learning_rate: 0.0010


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [38]:
def predict_summary(input_text, tokenizer, model, max_len_input, max_len_output):
    
  input_seq = tokenizer.texts_to_sequences([input_text])
  input_seq_padded = pad_sequences(input_seq, maxlen=max_len_input, padding='post', truncating='post')

  decoder_input = [1]
  decoder_input_padded = pad_sequences([decoder_input], maxlen=max_len_output, padding ='post', truncating='post')
  summary = []

  for _ in range(0, max_len_output - 1):
        
    predictions = model.predict([input_seq_padded, decoder_input_padded])
    next_word_id = np.argmax(predictions[0, len(decoder_input)-1, :])
    summary.append(next_word_id)
    
    
    decoder_input.append(next_word_id)
    decoder_input_padded = pad_sequences([decoder_input], maxlen=max_len_output, padding ='post', truncating='post')
    
    predicted_sequence = tokenizer.sequences_to_texts([summary])

  return predicted_sequence
    

In [39]:
input_string = "Columbia University has extended remote classes at its main New York City campus for the rest of term amid tense Gaza war protests that have spread to US colleges nationwide. The hybrid learning comes as some students have reported antisemitic harassment around Columbia's campus. Some 133 were arrested on Monday in protests at New York University. Dozens of arrests were also made in rallies at Yale, while Harvard has restricted access to the campus. Gaza war demonstrations have also cropped up at colleges in the US Midwest and on the West Coast, where one campus has been closed. Nine students were arrested in Minneapolis on Tuesday morning as they attempted to set up a protest camp in front of a library on the University of Minnesota campus. On Monday, President Joe Biden said he condemned both \"the antisemitic protests\" as well as \"those who don't understand what's going on with the Palestinians\". Columbia provost Angela Olinto announced students would have the option of attending classes remotely at the Ivy League institution's main Morningside Campus until the last day of classes on 29 April. \"Safety is our highest priority,\" she said in an email on Monday night. Jewish students have expressed concern about antisemitism on and around Columbia's campus."

In [40]:
output = predict_summary(input_string, tokenizer, model, max_len_input, max_len_output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 892ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

In [41]:
print(output)

['']


### Select and Retrain Pre-trained Model

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import mlflow
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM 

from transformers import AutoTokenizer, TFBartForConditionalGeneration
import pandas as pd

In [2]:
data = np.load('../data/dataset_cleaned.npz', allow_pickle=True)

X_train = data['X_train']
y_train = data['y_train']
X_test = data['X_test']
y_test = data['y_test']
X_val = data['X_val']
y_val = data['y_val']

max_len_input = len(max(X_train, key=len).split())
max_len_output = len(max(y_train, key=len).split())

train_source_texts = X_train.tolist()
train_target_texts = y_train.tolist()
val_source_texts = X_val.tolist()
val_target_texts = y_val.tolist()

print(len(X_train))


621


In [5]:
def convert_to_dataset(data, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices(data)
    dataset = dataset.map(lambda x: {key: tf.reshape(val, [-1]) for key, val in x.items()})
    dataset = dataset.shuffle(buffer_size=10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset



In [6]:
def prepare_data_for_bart(source_texts, target_texts, max_len=512):
    """
    Prepares data for BART model training.
    Args:
      source_texts: List of strings containing document text.
      target_texts: List of strings containing corresponding summaries.
      max_len: Maximum sequence length for tokenization (optional).
    Returns:
      A dictionary containing tokenized inputs and labels.
    """
    inputs = tokenizer(source_texts, padding="max_length", truncation=True, max_length=max_len, return_tensors="tf")
    labels = tokenizer(target_texts, padding="max_length", truncation=True, max_length=max_len, return_tensors="tf")

    print("Input IDs shape:", inputs['input_ids'].shape)
    print("Labels IDs shape:", labels['input_ids'].shape)

    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "decoder_attention_mask": labels.attention_mask,
        "labels": labels.input_ids  # labels are shifted for teacher forcing
    }

    

In [7]:

# Define model name
model_name = "facebook/bart-base"

# Load tokenizer and pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFBartForConditionalGeneration.from_pretrained(model_name)

# Prepare and convert training and validation data
train_data_dict = prepare_data_for_bart(train_source_texts, train_target_texts)
val_data_dict = prepare_data_for_bart(val_source_texts, val_target_texts)
train_dataset = convert_to_dataset(train_data_dict, batch_size=8)
val_dataset = convert_to_dataset(val_data_dict, batch_size=8)



All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


Input IDs shape: (621, 512)
Labels IDs shape: (621, 512)
Input IDs shape: (208, 512)
Labels IDs shape: (208, 512)


In [8]:
import os 
import re
import tf_keras

# Define optimizer and loss function
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

create_new_fine_tuned_model = True

version = 0
models = os.listdir("../models/fine_tuned/")
sub1 = "fine_tuned_model_v"
sub2 = ".keras"
s=str(re.escape(sub1))
e=str(re.escape(sub2))

for m in models:
    v = re.findall(s+"(.*)"+e,m)[0]
    if int(v) > version:
        version = int(v)

                    
if create_new_fine_tuned_model:
    
    
    version = version + 1

    name = "fine_tuned_model_v" + str(version) + ".keras" 

    mlflow.set_experiment(name)
    experiment = mlflow.get_experiment_by_name(name)
    
    mlflow.autolog()
    
    with mlflow.start_run():

        # Training loop
        epochs = 1

        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")
            batch_count = 0
            for batch in train_dataset:
                batch_count += 1
                print(f"Training batch {batch_count}...")
                with tf.GradientTape() as tape:
                    outputs = model(**batch)
                    loss_value = loss(batch["labels"], outputs.logits)
                    print(f"Batch {batch_count} loss: {loss_value.numpy()}")
                grads = tape.gradient(loss_value, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))
                tf.keras.backend.clear_session()  # Clear memory

            print(f"Completed {batch_count} batches.")
            
            
            model.save_pretrained("../models/fine_tuned/fine_tuned_model_v" + str(version) + ".keras")

            # Evaluate on validation data
            # val_loss, val_acc = model.evaluate(val_dataset)
            # print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")


else:
    model = keras.models.load_model("../models/fine_tuned/fine_tuned_model_v" + str(version) + ".keras")
    

model.summary()
    

2024/04/25 14:48:23 INFO mlflow.tracking.fluent: Experiment with name 'fine_tuned_model_v2.keras' does not exist. Creating a new experiment.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024/04/25 14:48:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2024/04/25 14:48:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been

Epoch 1/1
Training batch 1...
Batch 1 loss: 14.645500183105469
Training batch 2...



KeyboardInterrupt



In [None]:
from transformers import AutoTokenizer, TFBartForConditionalGeneration

# Define model name
model_name = "facebook/bart-base"

# Load tokenizer and pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFBartForConditionalGeneration.from_pretrained(model_name)

# Generate summaries using the fine-tuned model
new_text = """KerasNLP is a natural language processing library that works natively with TensorFlow, JAX, or PyTorch. Built on Keras 3, these models, layers, metrics, and tokenizers can be trained and serialized in any framework and re-used in another without costly migrations.

KerasNLP supports users through their entire development cycle. Our workflows are built from modular components that have state-of-the-art preset weights when used out-of-the-box and are easily customizable when more control is needed.

This library is an extension of the core Keras API; all high-level modules are Layers or Models that receive that same level of polish as core Keras. If you are familiar with Keras, congratulations! You already understand most of KerasNLP."""
encoded_text = tokenizer.encode(new_text, return_tensors="tf")
summary_ids = model.generate(encoded_text)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(f"Generated Summary: {summary}")

### Setup MLflow for Experiment tracking

### Training and Evaluation Setup