## Import necessary libraries

In [None]:
 !pip install tensorflow==2.11
 !pip install rouge
 !pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==2.11
  Downloading tensorflow-2.11.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.3/588.3 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.2/439.2 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras<2.12,>=2.11.0
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7

In [None]:
# Import necessary libraries for text preprocessing
import re
import string
import nltk
import contractions
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')

# Import necessary libraries for deep learning model
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import callbacks, models, layers, preprocessing as kprocessing
from tensorflow.keras.layers import LSTM, Dense, Input, Embedding, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Attention
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.utils.vis_utils import plot_model

# Import necessary libraries for evaluation
from rouge import Rouge
rouge = Rouge()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data Reading

In [None]:
# Load at most the first 100,000 rows of the train and test CSV files.
train, test = (
    pd.read_csv(csv_path, nrows=100000)
    for csv_path in (
        "/content/drive/MyDrive/data/train.csv",
        "/content/drive/MyDrive/data/test.csv",
    )
)

In [None]:
train  # show the raw training frame as loaded from CSV

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...
...,...,...,...
99995,0cd7ec4013b0a6dbbc9f15ce8a7ff757db5b8ad9,"By . Deni Kirkova . PUBLISHED: . 05:54 EST, 1 ...",Eleven innocent princesses raunch up as Miley ...
99996,0cd84db8ebc91821d07d910e830d8710021a2d6a,A furious Harvard Business School professor ha...,Ben Edelman is an associate professor of busin...
99997,0cd875596eada9164e7d349d654697c342020b65,"By . Daily Mail Reporter . Lewis Dale, 17, is ...","Lewis Dale, 17, weeps in the dock as he is tol..."
99998,0cd9048c654458e89b9f2a219da1b4d12bc6550f,By . Daily Mail Reporter . PUBLISHED: . 11:09 ...,CCTV shows driver chase a man on a scooter and...


# Implementation of Data Preprocessing 
## Drop Duplicates to avoid Redundancy

In [None]:
# Remove rows whose 'article' text repeats an earlier row, in both splits.
for frame in (train, test):
    frame.drop_duplicates(subset=['article'], inplace=True)

## Define function to preprocess data

In [None]:
def preprocess_text(text,n,punkt=True):
    """Normalise one raw article or highlight string.

    Steps, in order: lowercase; strip HTML markup; delete parenthesised
    spans such as "(CNN)"; delete double quotes; collapse whitespace; expand
    contractions; delete possessive "'s"; optionally delete punctuation;
    replace every remaining non-letter with a space; optionally remove
    English stop words; discard one-character tokens.

    Args:
        text: Raw input string.
        n: ``0`` removes stop words; any other value keeps them.
        punkt: When ``True`` (default) punctuation is deleted outright, so
            hyphenated words are fused together. When ``False`` it is left
            for the final non-letter pass, which turns it into spaces.

    Returns:
        The cleaned text as one space-separated string.
    """
    text = text.lower()
    text = BeautifulSoup(text, "lxml").text                  # drop HTML tags, keep their text
    text = re.sub(r'\([^)]*\)', '', text)                    # drop parenthesised spans
    text = re.sub('"','', text)                              # drop double quotes
    text = " ".join(text.split())                            # collapse runs of whitespace
    text = contractions.fix(text)                            # expand contractions
    # The possessive must be removed while the apostrophe still exists. It used
    # to run after punctuation stripping, where it could never match.
    text = re.sub(r"'s\b","",text)
    if punkt is True:
        text = re.sub(r'[^\w\s]', '', text)                  # delete punctuation outright
    text = re.sub("[^a-zA-Z]", " ", text)                    # digits, underscores etc. become spaces

    tokens = text.split()
    if n == 0:
        tokens = [w for w in tokens if w not in stop_words]
    # One-character tokens are discarded.
    return " ".join(w for w in tokens if len(w) > 1)

In [None]:
# Drop the 'id' column, which adds no value to model training.
# DataFrame.drop returns a new frame, so the result must be assigned back;
# previously it was discarded and the column stayed in place.
# errors='ignore' keeps the cell re-runnable once the column is gone.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')
test

Unnamed: 0,article,highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."
...,...,...
11485,Our young Earth may have collided with a body ...,Oxford scientists say a Mercury-like body stru...
11486,A man facing trial for helping his former love...,Man accused of helping former lover kill woman...
11487,A dozen or more metal implements are arranged ...,Marianne Power tried the tuning fork facial at...
11488,Brook Lopez dominated twin brother Robin with ...,Brooklyn Nets beat the Portland Trail Blazers ...


In [None]:
# Turn empty strings into NaN first, then drop every row that has a NaN.
# In the previous order the NaNs created by replace() were never dropped.
for frame in (train, test):
    frame.replace('', np.nan, inplace=True)
    frame.dropna(axis=0, inplace=True)

### Call preprocessing data function on the Highlights and Articles column in both Train and Test data

In [None]:
# Containers filled later by the length filter.
refined_txt_tr, refined_summ_tr, refined_txt_tst, refined_summ_tst = [],[],[],[]

# Articles are cleaned with n=0 (stop words removed); highlights with n=1
# (stop words kept).
tr_articles = [preprocess_text(a, 0) for a in train['article']]
tst_articles = [preprocess_text(a, 0) for a in test['article']]
tr_highlights = [preprocess_text(h, 1) for h in train['highlights']]
tst_highlights = [preprocess_text(h, 1) for h in test['highlights']]

train['Text'], train['Summaries'] = tr_articles, tr_highlights
test['Text'], test['Summaries'] = tst_articles, tst_highlights

# Take the array snapshots only after the 'Text' and 'Summaries' columns
# exist. Reading them first raised KeyError on a fresh kernel.
tr_Texts, tst_Texts = np.array(train['Text']), np.array(test['Text'])
tr_Summaries, tst_Summaries = np.array(train['Summaries']), np.array(test['Summaries'])

  text = BeautifulSoup(text, "lxml").text                            # removes any HTML tags
  text = BeautifulSoup(text, "lxml").text                            # removes any HTML tags


In [None]:
train                                                                           # inspect the training frame to confirm the new 'Text' and 'Summaries' columns

Unnamed: 0,id,article,highlights,Text,Summaries
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ...",associated press published est october updated...,bishop john folda of north dakota is taking ti...
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...,ralph mata internal affairs lieutenant miamida...,criminal complaint cop used his role to help c...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t...",drunk driver killed young woman headon crash c...,craig ecclestontodd had drunk at least three p...
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...,breezy sweep pen president vladimir putin wrot...,nina dos santos says europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...,fleetwood team still record sky bet league one...,fleetwood top of league one after win at scunt...
...,...,...,...,...,...
99995,0cd7ec4013b0a6dbbc9f15ce8a7ff757db5b8ad9,"By . Deni Kirkova . PUBLISHED: . 05:54 EST, 1 ...",Eleven innocent princesses raunch up as Miley ...,deni kirkova published est november updated es...,eleven innocent princesses raunch up as miley ...
99996,0cd84db8ebc91821d07d910e830d8710021a2d6a,A furious Harvard Business School professor ha...,Ben Edelman is an associate professor of busin...,furious harvard business school professor gone...,ben edelman is an associate professor of busin...
99997,0cd875596eada9164e7d349d654697c342020b65,"By . Daily Mail Reporter . Lewis Dale, 17, is ...","Lewis Dale, 17, weeps in the dock as he is tol...",daily mail reporter lewis dale facing lengthy ...,lewis dale weeps in the dock as he is told he ...
99998,0cd9048c654458e89b9f2a219da1b4d12bc6550f,By . Daily Mail Reporter . PUBLISHED: . 11:09 ...,CCTV shows driver chase a man on a scooter and...,daily mail reporter published est february upd...,cctv shows driver chase man on scooter and swe...


## Prepare inputs for Encoder and Decoder for Modeling

In [None]:
# Upper bounds, in whitespace-separated words, for article text and summary.
max_len_text, max_len_summary = 300, 100

In [None]:
def _within_length_limits(texts, summaries, text_limit, summary_limit):
    """Return (texts, summaries) keeping only pairs within both word limits.

    Lengths are counted in whitespace-separated words. A pair is kept when
    the summary has at most ``summary_limit`` words and the text has at most
    ``text_limit`` words.
    """
    kept_texts, kept_summaries = [], []
    for text, summary in zip(texts, summaries):
        if len(summary.split()) <= summary_limit and len(text.split()) <= text_limit:
            kept_texts.append(text)
            kept_summaries.append(summary)
    return kept_texts, kept_summaries


# The result lists are rebuilt from scratch, so re-running this cell cannot
# append the same rows a second time.
refined_txt_tr, refined_summ_tr = _within_length_limits(
    tr_Texts, tr_Summaries, max_len_text, max_len_summary)
refined_txt_tst, refined_summ_tst = _within_length_limits(
    tst_Texts, tst_Summaries, max_len_text, max_len_summary)

In [None]:
# Put the length-filtered texts and summaries into two-column frames.
df_train = pd.DataFrame({
    'short_text': refined_txt_tr,
    'short_summary': refined_summ_tr,
})
df_test = pd.DataFrame({
    'short_text': refined_txt_tst,
    'short_summary': refined_summ_tst,
})
df_train

Unnamed: 0,short_text,short_summary
0,associated press published est october updated...,bishop john folda of north dakota is taking ti...
1,ralph mata internal affairs lieutenant miamida...,criminal complaint cop used his role to help c...
2,accused making many fashion faux pas holiday p...,prime minister and his family are enjoying an ...
3,daily mail reporter moment train announcer stu...,london midland service had been pulling into t...
4,number job descriptions waiting darren fletche...,tony pulis believes saido berahino should look...
...,...,...
39971,tim hauser founder singer grammywinning vocal ...,tim hauser the founder and singer of the gramm...
39972,monkey baby formed unlikely bond primate stray...,baby girl and monkey have become firm friends ...
39973,yearold burglar arrested allegedly feeding two...,jason rutt allegedly burgled three homes on ac...
39974,deni kirkova published est november updated es...,eleven innocent princesses raunch up as miley ...


In [None]:
df_test  # show the length-filtered test frame

Unnamed: 0,short_text,short_summary
0,ever noticed plane seats appear getting smalle...,experts question if packed out planes are putt...
1,drunk teenage boy rescued security jumping lio...,drunk teenage boy climbed into lion enclosure ...
2,dougie freedman verge agreeing new twoyear dea...,nottingham forest are close to extending dougi...
3,liverpool target neto also wanted psg clubs sp...,fiorentina goalkeeper neto has been linked wit...
4,moment crew firefighters struggled haul giant ...,giant pig fell into the swimming pool at his h...
...,...,...
4905,fiveyearold namesake grandson famed college ba...,jerry tarkanian five was taken to the hospital...
4906,lydia ko shot saturday ana inspiration second ...,lydia ko shot her second straight overpar roun...
4907,backache striking us younger ever almost half ...,some per cent of under surveyed said they had ...
4908,brook lopez dominated twin brother robin point...,brooklyn nets beat the portland trail blazers ...


In [None]:
# Wrap every summary in the 'sostok' / 'eostok' boundary markers.
for frame in (df_train, df_test):
    frame['short_summary'] = 'sostok ' + frame['short_summary'] + ' eostok'

## Define data for modeling : X_train, y_train, X_test, y_test

In [None]:
# Model inputs are the cleaned texts; targets are the token-wrapped summaries.
X_train = df_train['short_text']
y_train = df_train['short_summary']
X_test = df_test['short_text']
y_test = df_test['short_summary']

In [None]:
y_train  # show the target summaries, now wrapped in 'sostok' ... 'eostok'

0        sostok bishop john folda of north dakota is ta...
1        sostok criminal complaint cop used his role to...
2        sostok prime minister and his family are enjoy...
3        sostok london midland service had been pulling...
4        sostok tony pulis believes saido berahino shou...
                               ...                        
39971    sostok tim hauser the founder and singer of th...
39972    sostok baby girl and monkey have become firm f...
39973    sostok jason rutt allegedly burgled three home...
39974    sostok eleven innocent princesses raunch up as...
39975    sostok cctv shows driver chase man on scooter ...
Name: short_summary, Length: 39976, dtype: object

The output above shows the start (`sostok`) and end (`eostok`) tokens added to every summary in `y_train`. Each summary receives exactly one start token, so the number of start tokens equals the number of summaries in the training data.

In [None]:
lim = 4                   # frequency threshold: words seen fewer times than this count as rare
count = total_count = 0   # rare-word counter and distinct-word counter

In [None]:
x_tokenizer = Tokenizer()                                                          # unrestricted tokenizer, used only to obtain word frequencies
x_tokenizer.fit_on_texts(list(X_train))

# Both counters are computed from scratch here. They used to be incremented
# from values set in another cell, so re-running this cell inflated them.
total_count = len(x_tokenizer.word_counts)                                         # number of distinct words
count = sum(1 for freq in x_tokenizer.word_counts.values() if freq < lim)          # distinct words seen fewer than `lim` times

#words_cnt = total_count - count

## Build Tokenizer

In [None]:
y_tokenizer = Tokenizer()  # unrestricted tokenizer for the summaries; fitted below to obtain word frequencies

In [None]:
# Rebuild the article tokenizer, capped at the number of non-rare words.
x_tokenizer = Tokenizer(num_words = total_count - count )
x_tokenizer.fit_on_texts(list(X_train))

# fit_on_texts adds to the existing word counts, so fitting a tokenizer made
# in an earlier cell would double the counts on every re-run. A fresh
# instance keeps this cell idempotent.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_train))

In [None]:
# Encode the article texts as integer id sequences.
x_train_seq, x_test_seq = (
    x_tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Pad with zeros at the end, up to max_len_text ids per row.
x_train, x_test = (
    pad_sequences(seq, maxlen=max_len_text, padding='post')
    for seq in (x_train_seq, x_test_seq)
)

# Size passed to the encoder embedding layer.
x_voc_size = x_tokenizer.num_words + 1

In [None]:
lim = 6                                                                             # frequency threshold for summary words (rebinds the earlier `lim`)

summary_freqs = y_tokenizer.word_counts
tot_cnt = len(summary_freqs)                                                        # number of distinct summary words
cnt = sum(1 for freq in summary_freqs.values() if freq < lim)                       # distinct words seen fewer than `lim` times

In [None]:
# Rebuild the summary tokenizer, capped at the number of non-rare words.
y_tokenizer = Tokenizer(num_words=tot_cnt - cnt)
y_tokenizer.fit_on_texts(list(y_train))

# Encode the summaries as id sequences, then zero-pad at the end up to
# max_len_summary. y_train / y_test are rebound from text to id arrays here.
y_train_seq, y_test_seq = (
    y_tokenizer.texts_to_sequences(split) for split in (y_train, y_test)
)
y_train, y_test = (
    pad_sequences(seq, maxlen=max_len_summary, padding='post')
    for seq in (y_train_seq, y_test_seq)
)

# Size passed to the decoder embedding and output layers.
y_voc_size = y_tokenizer.num_words + 1

In [None]:
y_tokenizer.word_counts['sostok'],len(y_train) # sanity check: the number of 'sostok' occurrences should equal the number of training summaries

(39976, 39976)

In [None]:
# Rows with exactly two non-zero ids hold only the start and end tokens,
# meaning the summary itself is empty. Collect their row numbers.
ind_arr1 = [
    row for row, seq in enumerate(y_train)
    if np.count_nonzero(seq) == 2
]

# Remove those rows from the targets and from the matching inputs.
y_tr = np.delete(y_train, ind_arr1, axis=0)
x_tr = np.delete(x_train, ind_arr1, axis=0)

In [None]:
# Same filter for the test split: find rows that contain nothing but the
# start and end tokens (exactly two non-zero ids).
ind_arr2 = [
    row for row, seq in enumerate(y_test)
    if np.count_nonzero(seq) == 2
]

# The remaining rows form the validation arrays.
y_val = np.delete(y_test, ind_arr2, axis=0)
x_val = np.delete(x_test, ind_arr2, axis=0)

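The two cells above find summaries that contain only the start and end tokens (exactly two non-zero ids), i.e. summaries that are empty after preprocessing. These rows are removed from both the target and the input arrays with `np.delete()`, giving `x_tr`/`y_tr` and `x_val`/`y_val`. This ensures that the data used for training and evaluation contains only complete and meaningful sequences, which helps improve the quality and reliability of the model's training.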

In [None]:
# Third-party attention layer, copied from
# https://github.com/thushv89/attention_keras/blob/master/src/layers/attention.py

from tensorflow.python.keras import backend as K
logger = tf.get_logger()

class AttentionLayer(tf.keras.layers.Layer):
    """
    Bahdanau (additive) attention (https://arxiv.org/pdf/1409.0473.pdf).

    The layer takes a list [encoder_output_sequence, decoder_output_sequence]
    and returns (context_vectors, attention_weights), one entry per decoder
    time step. It introduces three trainable weights: W_a, U_a and V_a.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W_a, U_a and V_a from the feature sizes of the two inputs.

        input_shape is a list of two shapes: index 0 is the encoder sequence,
        index 1 is the decoder sequence. Position 2 of each is the feature size.
        """
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.

        # W_a: (encoder_dim, encoder_dim), applied to the encoder sequence.
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # U_a: (decoder_dim, encoder_dim), applied to one decoder step.
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # V_a: (encoder_dim, 1), reduces each position to a scalar score.
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs):
        """
        Compute context vectors and attention weights.

        inputs: [encoder_output_sequence, decoder_output_sequence]; must be a list.
        Returns the tuple (c_outputs, e_outputs): context vectors and
        attention weights produced by the two K.rnn passes below.
        """
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs

        logger.debug(f"encoder_out_seq.shape = {encoder_out_seq.shape}")
        logger.debug(f"decoder_out_seq.shape = {decoder_out_seq.shape}")

        def energy_step(inputs, states):
            """ Step function computing the attention weights for one decoder step.
            inputs: the decoder output at one time step, as fed by K.rnn
            states: list/tuple whose last element is the full encoder sequence
                    (passed to K.rnn as a constant)
            Returns the softmax-normalised weights over encoder positions,
            both as the step output and as the next state.
            """

            logger.debug("Running energy computation step")

            if not isinstance(states, (list, tuple)):
                raise TypeError(f"States must be an iterable. Got {states} of type {type(states)}")

            # The encoder sequence is the last entry of `states`.
            encoder_full_seq = states[-1]

            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch_size, en_seq_len, latent_dim
            W_a_dot_s = K.dot(encoder_full_seq, self.W_a)

            """ Computing hj.Ua """
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim

            logger.debug(f"U_a_dot_h.shape = {U_a_dot_h.shape}")

            """ tanh(S.Wa + hj.Ua) """
            # <= batch_size, en_seq_len, latent_dim (U_a_dot_h broadcasts along axis 1)
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)

            logger.debug(f"Ws_plus_Uh.shape = {Ws_plus_Uh.shape}")

            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            logger.debug(f"ei.shape = {e_i.shape}")

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function computing the context vector ci from the weights ei.
            inputs: attention weights for one decoder step
            states: list/tuple whose last element is the full encoder sequence
            Returns the weighted sum of encoder outputs, both as the step
            output and as the next state.
            """

            logger.debug("Running attention vector computation step")

            if not isinstance(states, (list, tuple)):
                raise TypeError(f"States must be an iterable. Got {states} of type {type(states)}")

            encoder_full_seq = states[-1]

            # Weight every encoder position and sum over positions.
            # <= batch_size, hidden_size
            c_i = K.sum(encoder_full_seq * K.expand_dims(inputs, -1), axis=1)

            logger.debug(f"ci.shape = {c_i.shape}")

            return c_i, [c_i]

        # we don't maintain states between steps when computing attention
        # attention is stateless, so we're passing a fake state for RNN step function
        fake_state_c = K.sum(encoder_out_seq, axis=1)  # <= (batch_size, latent_dim): same shape as one context vector
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # <= (batch_size, enc_seq_len): same shape as one weight vector

        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e], constants=[encoder_out_seq]
        )

        """ Computing context vectors """
        # The weights from the first pass are the input sequence of the second pass.
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c], constants=[encoder_out_seq]
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Shapes of the two outputs: context vectors, then attention weights.

        NOTE(review): the first shape reports the decoder feature size
        (input_shape[1][2]) as its last dimension, while the context vectors
        are sums of encoder outputs. The two presumably agree only when the
        encoder and decoder feature sizes are equal -- confirm before using
        different sizes.
        """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

# Attention-Based Sequence-to-Sequence (Seq2Seq) Model Building

In [None]:
# Reset backend state before building the model.
K.clear_session()

latent_dim = 300   # number of units in every LSTM layer
embed_dim = 100    # size of the word embedding vectors

### Encoder

In [None]:
# Encoder input: one row of max_len_text padded word ids.
e_in = Input(shape=(max_len_text,))

# Trainable word embeddings for the source vocabulary.
e_embed = Embedding(x_voc_size, embed_dim, trainable=True)(e_in)

# First encoder LSTM: returns the full output sequence and its final states.
e_lstm1 = LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True,
    dropout=0.4,
)
e_out1, h1_state, c1_state = e_lstm1(e_embed)

# Second encoder LSTM, stacked on the output sequence of the first.
e_lstm2 = LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True,
    dropout=0.4,
)
e_out, h2_state, c2_state = e_lstm2(e_out1)

#### Decoder

In [None]:
# Decoder input: a variable-length row of target word ids.
d_in = Input(shape=(None,))

# Trainable target-side embedding and the decoder LSTM.
embedding_dec = Embedding(y_voc_size, embed_dim, trainable=True)
d_lstm = LSTM(
    latent_dim,
    return_sequences=True,
    return_state=True,
    dropout=0.4,
)
d_embed = embedding_dec(d_in)

# The decoder starts from the final states of the second encoder LSTM.
d_out, d_fwd_state, d_bkw_state = d_lstm(
    d_embed,
    initial_state=[h2_state, c2_state],
)

### Attention

In [None]:
# Attention over the encoder outputs, computed for every decoder step.
a_layer = AttentionLayer(name='attention_layer')
a_out, a_states = a_layer([e_out, d_out])

# Join each decoder output with its attention context along the feature axis.
# This uses the imported `Concatenate` layer class; the lowercase name
# `concatenate` was never imported and raised NameError.
d_merge_input = Concatenate(axis=-1)([d_out, a_out])

# Softmax over the target vocabulary, applied at every time step.
d_dense   = TimeDistributed(Dense(y_voc_size, activation='softmax'))
d_out = d_dense(d_merge_input)

# Full training model: (source ids, target ids) -> per-step word probabilities.
model = Model([e_in, d_in], d_out)
model.summary()

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 300, 100)     6759300     ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 300, 300),   481200      ['embedding[0][0]']              
                                 (None, 300),                                                 

The above represents the architecture of our Seq2Seq model.

### Model Compiling

In [None]:
# The loss is 'sparse_categorical_crossentropy' with the 'rmsprop' optimizer.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
)

### Define checkpoints to save the model weights every time the validation loss improves

In [None]:
# Weight files are named after the validation loss they achieved.
filepath = "/content/drive/MyDrive/Weights/weights-improvement-{val_loss:.4f}.hdf5"

# Save only when 'val_loss' reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Stop after 3 epochs without improvement and restore the best weights.
early_stop = EarlyStopping(restore_best_weights=True, patience=3)

callbacks_list = [checkpoint, early_stop]

In [None]:
# Train and validate on the filtered arrays (x_tr/y_tr and x_val/y_val), from
# which summaries holding only start/end tokens were removed. The unfiltered
# x_train/y_train and x_test/y_test were used before, so the filter had no effect.
# Teacher forcing: the decoder input is the summary without its last id, and
# the target is the summary shifted left by one, with a trailing axis of size 1.
hist = model.fit(
    [x_tr, y_tr[:, :-1]],
    y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:, 1:],
    epochs=20,
    callbacks=callbacks_list,
    batch_size=128,
    validation_data=(
        [x_val, y_val[:, :-1]],
        y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:, 1:],
    ),
)

Epoch 1/20
Epoch 1: val_loss improved from inf to 3.09901, saving model to /content/drive/MyDrive/Weights/weights-improvement-3.0990.hdf5
Epoch 2/20
Epoch 2: val_loss improved from 3.09901 to 3.07881, saving model to /content/drive/MyDrive/Weights/weights-improvement-3.0788.hdf5
Epoch 3/20
Epoch 3: val_loss improved from 3.07881 to 3.04456, saving model to /content/drive/MyDrive/Weights/weights-improvement-3.0446.hdf5
Epoch 4/20
Epoch 4: val_loss improved from 3.04456 to 3.01522, saving model to /content/drive/MyDrive/Weights/weights-improvement-3.0152.hdf5
Epoch 5/20
Epoch 5: val_loss improved from 3.01522 to 2.98253, saving model to /content/drive/MyDrive/Weights/weights-improvement-2.9825.hdf5
Epoch 6/20
Epoch 6: val_loss improved from 2.98253 to 2.95650, saving model to /content/drive/MyDrive/Weights/weights-improvement-2.9565.hdf5
Epoch 7/20
Epoch 7: val_loss improved from 2.95650 to 2.92173, saving model to /content/drive/MyDrive/Weights/weights-improvement-2.9217.hdf5
Epoch 8/20

In [None]:
# Lookup tables taken from the fitted tokenizers.
rev_tgt_wrd_ind = y_tokenizer.index_word   # target id -> word
tgt_wrd_ind = y_tokenizer.word_index       # target word -> id
rev_src_wrd_ind = x_tokenizer.index_word   # source id -> word

In [None]:
def genSummary(input_seq):
    """Turn a padded target-id sequence back into text.

    Padding (0) and the ids of 'sostok' and 'eostok' are skipped. Every kept
    word is followed by one space, so a non-empty result ends with a space.
    """
    return ''.join(
        rev_tgt_wrd_ind[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
        and token_id != tgt_wrd_ind['sostok']
        and token_id != tgt_wrd_ind['eostok']
    )

#i = 0
#while i < len(input_seq):
#    if input_seq[i] != 0 and input_seq[i] != tgt_wrd_ind['sostok'] and input_seq[i] != tgt_wrd_ind['eostok']:
#        res_Summary = res_Summary + rev_tgt_wrd_ind[input_seq[i]] + ' '
#    i += 1
#return res_Summary

In [None]:
# Inference encoder: source ids -> encoder output sequence and final states.
# (`encoder_inputs` was undefined; the encoder input tensor is `e_in`.)
e_model = Model(inputs=e_in, outputs=[e_out, h2_state, c2_state])

# Placeholders fed at every decoding step: the previous decoder states and
# the full encoder output sequence.
d_state_input_h = Input(shape=(latent_dim,))
d_state_input_c = Input(shape=(latent_dim,))
d_hidden_state_input = Input(shape=(max_len_text,latent_dim))

# Reuse the trained decoder layers. (`embedding_layer` was undefined; the
# decoder embedding layer is `embedding_dec`.)
d_emb2 = embedding_dec(d_in)
d_out2, state_h2, state_c2 = d_lstm(d_emb2, initial_state=[d_state_input_h, d_state_input_c])

# Attention over the supplied encoder outputs. (`attn_out_inf` was undefined;
# the context tensor is `a_out_inf`.)
a_out_inf, a_states_inf = a_layer([d_hidden_state_input, d_out2])
d_inf_concat = Concatenate(axis=-1, name='concat')([d_out2, a_out_inf])

# Per-step word probabilities from the trained softmax layer.
d_out2  = d_dense(d_inf_concat)

# The inference decoder returns the word probabilities and the NEW decoder
# states (state_h2, state_c2). It used to return the encoder states
# (h2_state, c2_state), which do not depend on this model's inputs.
d_model = Model([d_in] + [d_hidden_state_input,d_state_input_h, d_state_input_c],[d_out2] + [state_h2, state_c2])

def decode_the_sequence(input):
    """Greedily decode a summary for one encoded input sequence.

    Args:
        input: batch of one padded source sequence, passed straight to
            ``e_model.predict`` (the caller reshapes it to ``(1, max_len_text)``).
            The name shadows the builtin but is kept for caller compatibility.

    Returns:
        The decoded words joined by spaces. Each word is prefixed with a space,
        so a non-empty result starts with a leading space.
    """
    # Fix: the inference encoder built in this cell is `e_model`; the previous
    # reference to `encoder_model` was an undefined name.
    # verbose=0 avoids one progress bar per predict call in the notebook output.
    e_out, e_h, e_c = e_model.predict(input, verbose=0)

    # Start decoding from the start-of-sequence token.
    tgt_seq = np.zeros((1, 1))
    tgt_seq[0, 0] = tgt_wrd_ind['sostok']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = d_model.predict([tgt_seq] + [e_out, e_h, e_c], verbose=0)

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # An id without a vocabulary entry (e.g. the padding id 0) used to raise
        # KeyError; treat it as end-of-sequence instead.
        sampled_token = rev_tgt_wrd_ind.get(sampled_token_index, 'eostok')

        # Stop on the end token or once the length limit is reached.
        if (sampled_token == 'eostok' or len(decoded_sentence.split()) >= (max_len_summary - 1)):
            stop_condition = True

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        # Feed the sampled token and the updated decoder states into the next step.
        tgt_seq = np.zeros((1, 1))
        tgt_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c
    return decoded_sentence

In [None]:
def genText(input):
    """Rebuild the source text from a sequence of source-vocabulary ids.

    Ids equal to 0 are skipped. Every other id is looked up in
    ``rev_src_wrd_ind`` and followed by one space, so a non-empty result
    ends with a trailing space.
    """
    pieces = []
    for token_id in input:
        if token_id == 0:
            continue
        pieces.append(rev_src_wrd_ind[token_id] + ' ')
    return ''.join(pieces)

### Summary generation using Seq2Seq Model

In [None]:
# Number of examples to decode and score. Decoding is slow (one predict call
# per generated token), so only a small prefix of the data is used.
N_EVAL_SAMPLES = 100

# Create lists to store reference and predicted summaries
reference_summaries = []
predicted_summaries = []

# NOTE(review): x_tr / y_tr look like the training split, so the scores computed
# from these summaries measure fit rather than generalization - confirm whether
# a held-out split should be used instead.
# Clamp to the data size so a dataset smaller than N_EVAL_SAMPLES does not raise
# IndexError.
n_eval = min(N_EVAL_SAMPLES, len(x_tr), len(y_tr))

# Generate reference and predicted summaries for each input text
for i in range(n_eval):
    input_text = genText(x_tr[i])  # Generate input text
    reference_summary = genSummary(y_tr[i])  # Generate reference summary
    predicted_summary = decode_the_sequence(x_tr[i].reshape(1, max_len_text))  # Generate predicted summary

    reference_summaries.append(reference_summary)
    predicted_summaries.append(predicted_summary)

    # Print input text, reference summary, and predicted summary for each example
    print("Text:", input_text)
    print("Reference summary:", reference_summary)
    print("Predicted summary:", predicted_summary, "\n")

Text: associated press published est october updated est october bishop fargo catholic diocese north dakota exposed potentially hundreds church members fargo grand forks jamestown hepatitis virus late september early october state health department issued advisory exposure anyone attended five churches took communion bishop john fargo catholic diocese north dakota exposed potentially hundreds church members fargo grand forks jamestown hepatitis state immunization program manager molly howell says risk low officials feel important alert people possible exposure diocese announced monday bishop john taking time diagnosed hepatitis diocese says contracted infection contaminated food attending conference newly ordained bishops italy last month symptoms hepatitis include fever tiredness loss appetite nausea abdominal discomfort fargo catholic diocese north dakota bishop located 
Reference summary: bishop john of north dakota is taking time off after being diagnosed he contracted the infectio

### Summary Evaluation using ROUGE-N and ROUGE-L

In [None]:
# Average ROUGE scores of the predicted summaries against the references.
# NOTE(review): assumes `rouge` is a `rouge.Rouge()` instance created in an
# earlier cell - it is not defined in this part of the notebook; confirm.
scores = rouge.get_scores(predicted_summaries, reference_summaries, avg=True)

# One line per metric: precision, recall and F1.
for metric_key, line_prefix in (
    ('rouge-1', "ROUGE-1: Precision ="),
    ('rouge-2', "ROUGE-2: Precision ="),
    ('rouge-l', "ROUGE-L: Precision ="),
):
    metric = scores[metric_key]
    print(line_prefix, metric['p'], "Recall =", metric['r'], "F1-score =", metric['f'])

ROUGE-1: Precision = 0.30277777777777737 Recall = 0.08689915060406449 F1-score = 0.13241783711910146
ROUGE-2: Precision = 0.0343809523809524 Recall = 0.01216131041031263 F1-score = 0.017493984562127834
ROUGE-L: Precision = 0.2806666666666663 Recall = 0.08130867961910827 F1-score = 0.12359396349067281
