## Model Development

Objective: Develop a custom transformer model using Keras framework and then enhance the project by retraining a pre-trained model for comparision.

Hints:
- Modular Code: Keep your code modular to facilitate easy switching between the custom model and the BART model for different experiments.
- Documentation: Document each step in your model development process, including parameter settings and the rationale behind chosen architectures.
- Version Control: Commit all changes, especially new scripts and configurations, to GitHub to maintain a robust version history.
- Continuous Monitoring: Regularly monitor training progress using TensorBoard integrated with Keras to visualize performance metrics.

#### Imports

In [400]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D

import numpy as np
import pandas as pd

In [401]:
# Check if GPU enabled 
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

#### Get the data

In [405]:
data = np.load('../data/cnn_dailymail.npz', allow_pickle=True)
#data = np.load('../data/cnn_dailymail_small.npz', allow_pickle=True)

X_train = data['X_train']
y_train = data['y_train']
X_test = data['X_test']
y_test = data['y_test']
X_val = data['X_val']
y_val = data['y_val']

data = np.load('../data/news_api_data.npz', allow_pickle=True)
#data = np.load('../data/news_api_data_small.npz', allow_pickle=True)

X_train = np.concatenate((data['X_train'], X_train))
y_train = np.concatenate((data['y_train'], y_train))
X_test = np.concatenate((data['X_test'], X_test))
y_test = np.concatenate((data['y_test'], y_test))
X_val = np.concatenate((data['X_val'], X_val))
y_val = np.concatenate((data['y_val'], y_val))


In [406]:
X = np.concatenate((X_train, X_test, X_val))
y = np.concatenate((y_train, y_test, y_val))

max_len_input = len(max(X, key=len).split())
max_len_output = len(max(y, key=len).split())
print(max_len_input)

2845


In [384]:
tokenizer = Tokenizer(num_words=max_len)
tokenizer.fit_on_texts(X)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_val = tokenizer.texts_to_sequences(X_val)
y_train = tokenizer.texts_to_sequences(y_train)
y_test = tokenizer.texts_to_sequences(y_test)
y_val = tokenizer.texts_to_sequences(y_val)

vocab_size = len(tokenizer.get_config()['word_counts'])
print(vocab_size)

433377


In [385]:
tokenizer.texts_to_sequences(["This is a test of the tokenizer"])

[[27, 11, 3, 1396, 4, 1]]

In [386]:
tokenizer.sequences_to_texts([[17, 11, 3, 2453, 5, 1]]) # tokenizer must not be in any of the articles

['his is a lots and the']

In [387]:
# Standardize Data by padding sequences
X_train = pad_sequences(X_train, maxlen=max_len_input, padding ='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=max_len_input, padding ='post', truncating='post')
X_val = pad_sequences(X_val, maxlen=max_len_input, padding ='post', truncating='post')
y_train = pad_sequences(X_train, maxlen=max_len_output, padding ='post', truncating='post')
y_test = pad_sequences(X_test, maxlen=max_len_output, padding ='post', truncating='post')
y_val = pad_sequences(X_val, maxlen=max_len_output, padding ='post', truncating='post')

### Develop Custom Transformer Model with Keras

In [388]:
# Positional Encoding Function
def get_positional_encoding(seq_length, d_model):
    angle_rates = 1 / np.power(10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / d_model)
    angle_rads = np.arange(seq_length)[:, np.newaxis] * angle_rates
    sines = np.sin(angle_rads[:, 0::2])
    cosines = np.cos(angle_rads[:, 1::2])
    pos_encoding = np.concatenate([sines, cosines], axis=-1)[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

In [389]:
# Transformer Block Function
def transformer_block(x, num_heads, d_model, dff, rate, training):
    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(x, x)
    attn_output = Dropout(rate)(attn_output, training=training)
    out1 = LayerNormalization(epsilon=1e-6)(x + attn_output)
    ffn_output = Dense(dff, activation='relu')(out1)
    ffn_output = Dense(d_model)(ffn_output)
    ffn_output = Dropout(rate)(ffn_output, training=training)
    return LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

In [390]:
# Build the Transformer Model for Text Summarization
def build_model(max_len_input, max_len_output, vocab_size, num_heads=8, d_model=128, dff=512, rate=0.1):
    
    # Input to the encoder
    encoder_inputs = Input(shape=(max_len_input,), name="encoder_input")
    encoder_embedding = Embedding(vocab_size, d_model, name="encoder_embedding")(encoder_inputs)
    encoder_pos_encoding = get_positional_encoding(max_len_input, d_model)
    encoder_embedding += encoder_pos_encoding
    
    # Encoder
    encoder_output = encoder_embedding
    for _ in range(4):
        encoder_output = transformer_block(encoder_output, num_heads, d_model, dff, rate, training=True)
        
    # Input to the decoder
    decoder_inputs = Input(shape=(max_len_output,), name="decoder_input")
    decoder_embedding = Embedding(vocab_size, d_model, name="decoder_embedding")(decoder_inputs)
    decoder_pos_encoding = get_positional_encoding(max_len_output, d_model)
    decoder_embedding += decoder_pos_encoding
    
    # Decoder
    decoder_output = decoder_embedding
    for _ in range(4):
        decoder_output = transformer_block(decoder_output, num_heads, d_model, dff, rate, training=True)
        
    # Output layer
    outputs = Dense(vocab_size, activation="softmax")(decoder_output)    
    
    # Define the model
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

In [391]:
# Create the model
model = build_model(max_len_input, max_len_output, vocab_size)
model.summary()

In [392]:
history = model.fit(
    [X_train, y_train], y_train, batch_size=32, epochs=2, validation_data=([X_val, y_val], y_val)
)

Epoch 1/2
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 9s/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan
Epoch 2/2
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 8s/step - accuracy: 0.0000e+00 - loss: nan - val_accuracy: 0.0000e+00 - val_loss: nan


In [373]:
# Evaluate model
loss, accuracy = model.evaluate([X_test, y_test], y_test, batch_size=128)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.0585 - loss: 10.6353


In [342]:
y_pred = model.predict(X_test)

AttributeError: 'NoneType' object has no attribute 'shape'

In [393]:
def predict_summary(input_text, tokenizer, model, max_len_input, max_len_output):
    
  input_seq = tokenizer.texts_to_sequences([input_text])
  input_seq_padded = pad_sequences(input_seq, maxlen=max_len_input)
  decoder_input = [1]
  decoder_input_padded = pad_sequences([decoder_input], maxlen=max_len_output, padding ='post', truncating='post')
  summary = []

  for i in range(0, max_len_output):
    predictions = model.predict([input_seq_padded, decoder_input_padded])
    next_word_id = np.argmax(predictions[0, len(decoder_input)-1, :])
    summary.append(next_word_id)
    decoder_input.append(next_word_id)
    decoder_input_padded = pad_sequences([decoder_input], maxlen=max_len_output, padding ='post', truncating='post')
    
    predicted_sequence = tokenizer.sequences_to_texts([summary])

  return predicted_sequence
    

In [397]:
input_string = "Columbia University has extended remote classes at its main New York City campus for the rest of term amid tense Gaza war protests that have spread to US colleges nationwide. The hybrid learning comes as some students have reported antisemitic harassment around Columbia's campus. Some 133 were arrested on Monday in protests at New York University. Dozens of arrests were also made in rallies at Yale, while Harvard has restricted access to the campus. Gaza war demonstrations have also cropped up at colleges in the US Midwest and on the West Coast, where one campus has been closed. Nine students were arrested in Minneapolis on Tuesday morning as they attempted to set up a protest camp in front of a library on the University of Minnesota campus. On Monday, President Joe Biden said he condemned both \"the antisemitic protests\" as well as \"those who don't understand what's going on with the Palestinians\". Columbia provost Angela Olinto announced students would have the option of attending classes remotely at the Ivy League institution's main Morningside Campus until the last day of classes on 29 April. \"Safety is our highest priority,\" she said in an email on Monday night. Jewish students have expressed concern about antisemitism on and around Columbia's campus."

In [398]:
output = predict_summary(input_string, tokenizer, model, max_len_input, max_len_output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16

In [399]:
print(output)

['']


In [347]:
#X_test.shape
blank = np.zeros((X_test.shape[0], max_len_output))
blank.shape

(20, 41)

In [349]:
y_pred = model.predict([X_test, blank])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 721ms/step


In [353]:
summaries = tokenizer.sequences_to_texts([y_pred.tolist()])

TypeError: unhashable type: 'list'

### Select and Retrain Pre-trained Model

### Setup MLflow for Experiment tracking

### Training and Evaluation Setup