<a href="https://colab.research.google.com/github/shahtvisha/TextSummarizationUsingDeepLearning/blob/main/TransformerEvaluationOfTextSummarisartion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter
import matplotlib.pyplot as plt

pubmed_data = pd.read_csv('/content/preprocessed_pubmed_dataset(1).xls')
pubmed_data.dropna(subset=['article', 'abstract'], inplace=True)
tokenizer = Tokenizer(num_words=20000)


In [34]:
pubmed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 462 entries, 0 to 471
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   article   462 non-null    object
 1   abstract  462 non-null    object
dtypes: object(2)
memory usage: 10.8+ KB


In [2]:
tokenizer.fit_on_texts(pubmed_data['article'])  # Assuming 'article' is your text column

# Convert text to sequences
max_length = 512  # Adjust based on your requirements
input_sequences = tokenizer.texts_to_sequences(pubmed_data['article'])
target_sequences = tokenizer.texts_to_sequences(pubmed_data['abstract'])  # Assuming 'abstract' as summary

# Pad sequences
X = pad_sequences(input_sequences, maxlen=max_length, padding='post')
y = pad_sequences(target_sequences, maxlen=max_length, padding='post')

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure that both input and output sequences are padded to max_length = 512
def pad_and_truncate_sequences(sequences, max_len=512):
    return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Example of tokenizing and padding the input and output sequences
input_sequences = tokenizer.texts_to_sequences(X_train)
output_sequences = tokenizer.texts_to_sequences(y_train)

# Pad and truncate the input and output sequences to max_length=512
input_sequences_padded = pad_and_truncate_sequences(input_sequences, max_len=512)
output_sequences_padded = pad_and_truncate_sequences(output_sequences, max_len=512)

# Shift the output sequences by one time step for training
output_sequences_shifted = output_sequences_padded[:, :-1]  # Remove last token
target_sequences = output_sequences_padded[:, 1:]  # Remove first token


In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, GRU, Dense

vocab_size = 20000  # Based on your tokenizer
embedding_dim = 128
latent_dim = 256  # Number of hidden units in the LSTM

# Define Encoder-Decoder architecture for LSTM-based Seq2Seq
def create_lstm_model():
    # Encoder
    encoder_inputs = Input(shape=(max_length,))
    encoder_embedding = Embedding(vocab_size, embedding_dim, input_length=max_length)(encoder_inputs)
    encoder_lstm, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_embedding)

    # Decoder
    decoder_inputs = Input(shape=(max_length,))
    decoder_embedding = Embedding(vocab_size, embedding_dim, input_length=max_length)(decoder_inputs)

    # Passing both hidden state (state_h) and cell state (state_c) as initial_state to the decoder
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=False)(
        decoder_embedding, initial_state=[state_h, state_c])

    outputs = Dense(vocab_size, activation='softmax')(decoder_lstm)

    model = Model([encoder_inputs, decoder_inputs], outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Create and compile the model
model = create_lstm_model()


In [11]:
from tensorflow.keras.layers import Bidirectional

def create_bilstm_model():
    # Encoder
    encoder_inputs = Input(shape=(max_length,))
    encoder_embedding = Embedding(vocab_size, embedding_dim, input_length=max_length)(encoder_inputs)
    encoder_bilstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(
        LSTM(latent_dim, return_state=True))(encoder_embedding)

    # Concatenate the forward and backward hidden and cell states
    state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
    state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

    # Decoder
    decoder_inputs = Input(shape=(max_length,))
    decoder_embedding = Embedding(vocab_size, embedding_dim, input_length=max_length)(decoder_inputs)
    decoder_lstm = LSTM(latent_dim * 2, return_sequences=True)(
        decoder_embedding, initial_state=[state_h, state_c])

    outputs = Dense(vocab_size, activation='softmax')(decoder_lstm)

    model = Model([encoder_inputs, decoder_inputs], outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(pubmed_data['article'], pubmed_data['abstract'], test_size=0.2)

In [31]:
def train_seq2seq_model(model, X_train, y_train, epochs=10, batch_size=64):
    # Tokenize the input and output sequences
    input_sequences = tokenizer.texts_to_sequences(X_train)
    output_sequences = tokenizer.texts_to_sequences(y_train)

    # Padding the sequences
    input_sequences_padded = pad_sequences(input_sequences, maxlen=max_length, padding='post')
    output_sequences_padded = pad_sequences(output_sequences, maxlen=max_length, padding='post')

    # Shift the output sequences by one time step for training
    output_sequences_shifted = output_sequences_padded[:, :-1]  # Remove last token
    target_sequences = output_sequences_padded[:, 1:]  # Remove first token

    # Pad the shifted output sequences to maintain the expected shape
    # Change maxlen to max_length -1 to match the expected target shape
    # The target sequence is one step shorter than the input/output sequence
    output_sequences_shifted = pad_sequences(output_sequences_shifted, maxlen=max_length - 1, padding='post')

    # Train the model
    history = model.fit([input_sequences_padded, output_sequences_shifted],
                    target_sequences,
                    epochs=10,
                    batch_size=64)
    return model, history

In [18]:

model.fit([X, y], y, batch_size=32, epochs=5, validation_split=0.2)  # Adjust epochs and batch size as needed


Epoch 1/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 27s/step - accuracy: 0.4902 - loss: 9.2470 - val_accuracy: 0.6662 - val_loss: 5.7676
Epoch 2/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m387s[0m 28s/step - accuracy: 0.6514 - loss: 4.8285 - val_accuracy: 0.6662 - val_loss: 3.1236
Epoch 3/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m370s[0m 27s/step - accuracy: 0.6547 - loss: 3.1940 - val_accuracy: 0.6662 - val_loss: 3.1244
Epoch 4/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 28s/step - accuracy: 0.6539 - loss: 3.0198 - val_accuracy: 0.6662 - val_loss: 2.8537
Epoch 5/5
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 27s/step - accuracy: 0.6507 - loss: 2.8217 - val_accuracy: 0.6662 - val_loss: 2.7084


<keras.src.callbacks.history.History at 0x7aa14473f760>

In [32]:
lstm_model = create_lstm_model()



In [33]:
trained_lstm_model, lstm_history = train_seq2seq_model(lstm_model, X_train, y_train)

Epoch 1/10


ValueError: Input 1 of layer "functional_11" is incompatible with the layer: expected shape=(None, 512), found shape=(None, 511)

In [41]:
!pip install datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GRU


pubmed_data = pd.read_csv('/content/preprocessed_pubmed_dataset(1).xls')

pubmed_data['article'] = pubmed_data['article'].fillna('').astype(str)
pubmed_data['abstract'] = pubmed_data['abstract'].fillna('').astype(str)

pubmed_data['article'] = pubmed_data['article'].replace(r'^\s*$', '<EMPTY>', regex=True)
pubmed_data['abstract'] = pubmed_data['abstract'].replace(r'^\s*$', '<EMPTY>', regex=True)

max_length = 512  # Based on our dataset's max token length
vocab_size = 20000  # Adjusted based on total unique tokens
embedding_dim = 128
hidden_units = 256
batch_size = 1
epochs = 5
max_iterations = 50

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(pubmed_data['article'])

article_sequences = tokenizer.texts_to_sequences(pubmed_data['article'])
abstract_sequences = tokenizer.texts_to_sequences(pubmed_data['abstract'])

article_padded = pad_sequences(article_sequences, maxlen=max_length, padding='post', truncating='post')
abstract_padded = pad_sequences(abstract_sequences, maxlen=max_length, padding='post', truncating='post')

X_train, X_test, y_train, y_test = train_test_split(article_padded, abstract_padded, test_size=0.2, random_state=42)

def build_lstm_model():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        LSTM(hidden_units, return_sequences=True),
        Dense(hidden_units, activation='relu'),
        Dense(vocab_size, activation='softmax')  # Output shape should be (batch_size, max_length, vocab_size)
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def build_bilstm_model():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        Bidirectional(LSTM(hidden_units, return_sequences=True)),  # Bi-directional LSTM
        Dense(hidden_units, activation='relu'),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def build_gru_model():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        GRU(hidden_units, return_sequences=True),  # Ensure GRU returns sequences
        Dense(hidden_units, activation='relu'),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def chunk_text(text, chunk_size=512):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

chunked_articles = [chunk_text(article) for article in article_padded]
chunked_abstracts = [chunk_text(abstract) for abstract in abstract_padded]

lstm_model = build_lstm_model()
bilstm_model = build_bilstm_model()
gru_model = build_gru_model()

lstm_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))
bilstm_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))
gru_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))



Epoch 1/5




[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m532s[0m 1s/step - accuracy: 0.6443 - loss: 4.2611 - val_accuracy: 0.6358 - val_loss: 3.1662
Epoch 2/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m565s[0m 1s/step - accuracy: 0.6465 - loss: 2.9433 - val_accuracy: 0.6368 - val_loss: 3.2043
Epoch 3/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m563s[0m 1s/step - accuracy: 0.6507 - loss: 2.7575 - val_accuracy: 0.6331 - val_loss: 3.1249
Epoch 4/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 1s/step - accuracy: 0.6585 - loss: 2.6357 - val_accuracy: 0.6330 - val_loss: 3.1576
Epoch 5/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 1s/step - accuracy: 0.6578 - loss: 2.6120 - val_accuracy: 0.6248 - val_loss: 3.1182
Epoch 1/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m763s[0m 2s/step - accuracy: 0.6485 - loss: 4.2177 - val_accuracy: 0.6357 - val_loss: 3.5715
Epoch 2/5
[1m378/378[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x7a224ca2ded0>

In [7]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [9]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=668b690803f54e72b48f6ad37faf3ea5ac2598591ef41214a0814ce32150ccb8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [12]:
import evaluate
from sklearn.metrics import f1_score
import numpy as np

rouge = evaluate.load("rouge")

def evaluate_model(model, X_test, y_test, tokenizer, max_iterations=50):
    predictions = []
    references = []

    for i in range(min(max_iterations, len(X_test))):
        # Predict  summary
        predicted = model.predict(np.expand_dims(X_test[i], axis=0))
        predicted_summary = np.argmax(predicted, axis=-1)[0]  # Get the predicted token indices
        predicted_summary = tokenizer.sequences_to_texts([predicted_summary])[0]  # Convert back to text

        true_summary = tokenizer.sequences_to_texts([y_test[i]])[0]

        predictions.append(predicted_summary)
        references.append(true_summary)

    rouge_scores = rouge.compute(predictions=predictions, references=references, rouge_types=["rouge1", "rouge2", "rouge3", "rougeL"], use_stemmer=True)

    # F1 score using sklearn's F1 score
    flattened_predictions = [tokenizer.texts_to_sequences([pred])[0] for pred in predictions]
    flattened_references = [tokenizer.texts_to_sequences([ref])[0] for ref in references]

    f1 = f1_score(
        [item for sublist in flattened_references for item in sublist],
        [item for sublist in flattened_predictions for item in sublist],
        average='macro'
    )

    return rouge_scores, f1


lstm_rouge, lstm_f1 = evaluate_model(lstm_model, X_test, y_test, tokenizer)
bilstm_rouge, bilstm_f1 = evaluate_model(bilstm_model, X_test, y_test, tokenizer)
gru_rouge, gru_f1 = evaluate_model(gru_model, X_test, y_test, tokenizer)

print("LSTM Results:")
print(f"ROUGE-1: {lstm_rouge['rouge1']:.4f}")
print(f"ROUGE-2: {lstm_rouge['rouge2']:.4f}")
print(f"ROUGE-3: {lstm_rouge['rouge3']:.4f}")
print(f"ROUGE-L: {lstm_rouge['rougeL']:.4f}")
print(f"F1 Score: {lstm_f1:.4f}\n")

print("BiLSTM Results:")
print(f"ROUGE-1: {bilstm_rouge['rouge1']:.4f}")
print(f"ROUGE-2: {bilstm_rouge['rouge2']:.4f}")
print(f"ROUGE-3: {bilstm_rouge['rouge3']:.4f}")
print(f"ROUGE-L: {bilstm_rouge['rougeL']:.4f}")
print(f"F1 Score: {bilstm_f1:.4f}\n")

print("GRU Results:")
print(f"ROUGE-1: {gru_rouge['rouge1']:.4f}")
print(f"ROUGE-2: {gru_rouge['rouge2']:.4f}")
print(f"ROUGE-3: {gru_rouge['rouge3']:.4f}")
print(f"ROUGE-L: {gru_rouge['rougeL']:.4f}")
print(f"F1 Score: {gru_f1:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 273ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 292ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 262ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 267ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 