In [None]:
import os
import tarfile
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from google.colab import drive
import nltk
# --- Data access ------------------------------------------------------------
# Attach Google Drive so the dataset archive stored there becomes readable.
drive.mount('/content/drive')

# Drive folder that holds the archive, and the archive inside it.
directory_path = '/content/drive/My Drive/'
file_path = directory_path + 'cnn_stories.tgz'

# Unpack the gzip-compressed tarball into the runtime's local data folder.
with tarfile.open(name=file_path, mode='r:gz') as tar:
    tar.extractall(path='/content/data/')

# --- Train / test split -----------------------------------------------------
# Story files used for this run; extend the list to use more of the corpus.
file_names = [
    '8278100e57ce63728df109f76b9888e7a918ad90.story',
    '827811c8e01692a37b10247f3a3cd9ebc1ece71b.story',
    # Add more file names here
]

# Hold out 20% of the listed files; the fixed seed makes the split repeatable.
train_files, test_files = train_test_split(
    file_names,
    test_size=0.2,
    random_state=42,
)

# Report how many files landed in each subset.
print("Number of training files:", len(train_files))
print("Number of testing files:", len(test_files))

# Preprocessing functions
def preprocess_content(content):
    """Convert one raw text into vocabulary ids.

    Args:
        content: Raw text of a single document.

    Returns:
        Whatever ``tokenizer.texts_to_sequences`` produces for this single
        document: a list holding one sequence of ids.

    Note:
        Uses the notebook-level ``tokenizer``; it only has to exist by the
        time this function is called, not when it is defined.
    """
    # Tokenize the content using NLTK
    tokens = nltk.word_tokenize(content)

    # Passing the token list wrapped in an outer list makes the tokenizer treat
    # it as one already-split document.
    sequences = tokenizer.texts_to_sequences([tokens])

    # The original body ended without a return statement, so every call
    # produced None and the computed sequences were discarded.
    return sequences


Mounted at /content/drive
Number of training files: 1
Number of testing files: 1


In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input,Activation,Dot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [None]:
# Dump every held-out story to the cell output so the raw text can be checked
# by eye before any preprocessing is applied.
stories_dir = "/content/data/cnn/stories/"

for file_name in test_files:
    file_path = stories_dir + file_name

    # Read the whole story into memory as text.
    with open(file_path, "r") as file:
        content = file.read()

    # Name, full text, then a rule separating this story from the next one.
    print("File:", file_name)
    print("Content:", content)
    print("----------------------------------")

File: 827811c8e01692a37b10247f3a3cd9ebc1ece71b.story
Content: Los Angeles (CNN) -- Buddy Holly finally got his star on Hollywood's Walk of Fame on Wednesday, which would have been the singer-songwriter's 75th birthday.

"It's never too late when you get a fantastic thing to happen," his widow, Maria Elena Holly, told CNN after unveiling the star on the sidewalk along Vine Street at the entrance to the historic Capitol Records building.

Holly was just 22 when he was killed in a plane crash, along with musicians Ritchie Valens and J.P. "The Big Bopper" Richardson.

"I'm saying now, my dear Buddy, you loved to go to the movies. You told me that one of your dreams was to write scores for movies and make your mark in Hollywood," his widow said during the ceremony. "Well, my dear, half of your dream unfortunately did not come true, but the other half did come true with a beautiful star on the Hollywood Walk of Fame."

Actor Gary Busey, who channeled Holly's voice and character in the 1978 m

The `BahdanauAttention` layer below implements additive (Bahdanau) attention, following the formula given in the abstract. The cell defining it executed successfully.

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over a sequence of encoder states.

    Every encoder position is scored with ``V(tanh(W_base_h(h) + W_base_s(s)))``,
    the scores are normalised with a softmax along the sequence axis, and the
    encoder states are summed using those weights.

    Args:
        attention_units: Output width of the two projection layers applied to
            the encoder states and to the decoder state.
        **kwargs: Standard Keras ``Layer`` keyword arguments (for example
            ``name`` or ``dtype``), forwarded to the base class. The original
            constructor did not accept them, so the layer could not be named.
    """

    def __init__(self, attention_units, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can rebuild an identically configured layer.
        self.attention_units = attention_units
        self.W_base_h = Dense(attention_units)
        self.W_base_s = Dense(attention_units)
        self.V = Dense(1)

    def call(self, h, s):
        """Compute the context vector and attention weights.

        Args:
            h: Encoder hidden states
                (shape: batch_size x max_sequence_length x encoder_units).
            s: Decoder state (shape: batch_size x decoder_units).

        Returns:
            A ``(context_vector, a)`` pair: the attention-weighted sum of ``h``
            over the sequence axis, and the attention weights themselves.
        """
        # Insert a length-1 sequence axis so the projected decoder state
        # broadcasts against every encoder position.
        s = tf.expand_dims(s, axis=1)

        # One unnormalised score per encoder position.
        e = self.V(tf.nn.tanh(self.W_base_h(h) + self.W_base_s(s)))

        # Normalise across the sequence axis, not across the trailing size-1 axis.
        a = tf.nn.softmax(e, axis=1)

        # Weighted sum of encoder states over the sequence axis.
        context_vector = tf.reduce_sum(a * h, axis=1)

        return context_vector, a

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update({"attention_units": self.attention_units})
        return config

In [None]:
# Initialize the Tokenizer class
# NOTE(review): no call to tokenizer.fit_on_texts appears anywhere in the
# visible notebook, so tokenizer.word_index looks like it is still empty when
# the later cells use it. That matches the model summary printed further down,
# where the embedding has 100 parameters (a 1 x 100 table). TODO confirm and
# fit the tokenizer on the story texts before converting anything.
tokenizer = Tokenizer()

# Convert the corpus to token sequences
# NOTE(review): train_files holds the *.story file names produced by the split
# above, not the story contents, so it is the names that get converted here.
token_sequences = tokenizer.texts_to_sequences(train_files)


In [None]:
from tensorflow.keras.layers import *

# Build the encoder, the decoder, and the attention / pointer-generator style
# tensors that the training cell later wires into a single Keras model.

# Longest entry of token_sequences plus one; used below as the Embedding
# input_length of both sub-models.
max_sequence_length = max(len(seq) for seq in token_sequences) + 1  # Add 1 for the padding

# Define the total number of words in the vocabulary
# (number of entries in tokenizer.word_index, plus one).
total_words = len(tokenizer.word_index) + 1

# Define the model architecture
embedding_dim = 100
decoder_units = 64
# NOTE(review): attention_units is not referenced again in this cell.
attention_units = 10


# Define the coverage loss weight
lambda_val = 0.1

# Encoder: embedding -> bidirectional LSTM (one output per timestep) -> softmax
# Dense of width total_words.
# NOTE(review): because of the final Dense, encoder_model.output is a
# total_words-wide softmax per timestep rather than the raw LSTM states; the
# attention code below consumes that output. Confirm this is intended.
encoder_model = Sequential([
    Embedding(total_words, embedding_dim, input_length=max_sequence_length-1),
    Bidirectional(LSTM(decoder_units, return_sequences=True)),
    Dense(total_words, activation='softmax')
])

# Print the summary of the encoder model
encoder_model.summary()

# Decoder: embedding -> LSTM (one output per timestep) -> softmax Dense of
# width total_words.
decoder_model = Sequential([
    Embedding(total_words, embedding_dim, input_length=max_sequence_length),
    LSTM(decoder_units, return_sequences=True),
    Dense(total_words, activation='softmax')
])

# Print the summary of the decoder model
decoder_model.summary()

# Define the coverage vector shape
coverage_vector_shape = (max_sequence_length-1, total_words)

# Define the coverage vector
# NOTE(review): created as all zeros and never assigned to in this cell, so
# every tf.minimum(attention, coverage_vector) below compares against zeros.
coverage_vector = tf.Variable(tf.zeros(coverage_vector_shape))

# Step-by-step attention over decoder timesteps.
# NOTE(review): neither result of this loop is used afterwards in this cell:
# attention_distributions is only appended to, and context_vector is
# re-assigned from the Dot-based attention further down.
context_vector = tf.zeros((1, decoder_units))  # Initial context vector
attention_distributions = []

for t in range(max_sequence_length - 1):
    # Score built from the decoder output at step t, the running context vector
    # and row t of the coverage vector.
    # NOTE(review): a brand-new Dense(1) layer is constructed on every
    # iteration, so the scoring weights are not shared between timesteps.
    attention_input = Concatenate(axis=-1)([decoder_model.output[:, t, :], context_vector, coverage_vector[t, :]])
    attention_scores = Dense(1)(attention_input)
    attention_scores = tf.squeeze(attention_scores, axis=-1)
    attention_distribution = Activation('softmax')(attention_scores)
    attention_distributions.append(attention_distribution)

    # Update context vector
    context_vector = tf.reduce_sum(tf.expand_dims(attention_distribution, axis=-1) * encoder_model.output, axis=1)

# Dot-product attention between decoder and encoder outputs (contracting axis 2
# of both), followed by a softmax. This is the attention used from here on.
attention = Dot(axes=[2, 2])([decoder_model.output, encoder_model.output])
attention = Activation('softmax')(attention)

# Calculate the coverage loss
# NOTE(review): this weighted value is overwritten by the second coverage_loss
# assignment further down, which omits lambda_val.
coverage_loss = tf.reduce_sum(tf.minimum(attention, coverage_vector), axis=1)
coverage_loss = lambda_val * tf.reduce_sum(coverage_loss, axis=-1)

# Calculate the context vector
context_vector = Dot(axes=[2, 1])([attention, encoder_model.output])

# Concatenate the context vector and decoder state
decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_model.output])

# One softmax Dense layer producing the distribution over the vocabulary.
# NOTE(review): vocabulary_distribution already comes out of a softmax, so the
# division by its own sum presumably changes nothing beyond rounding.
vocabulary_distribution = Dense(total_words, activation='softmax')(decoder_combined_context)
probability_distribution = vocabulary_distribution / tf.reduce_sum(vocabulary_distribution, axis=-1, keepdims=True)
# Generation gate: sigmoid output of a Dense(1) layer on the combined context.
pgen = Dense(1, activation='sigmoid')(decoder_combined_context)

# Generation and copy terms of the mixture.
# NOTE(review): despite its name, extended_vocabulary_distribution holds only
# the pgen-weighted vocabulary term; the copy term stays in copy_distribution
# and the two are never added together in this cell.
extended_vocabulary_distribution = pgen * probability_distribution
copy_distribution = (1 - pgen) * attention

# Calculate the coverage loss
# NOTE(review): replaces the lambda-weighted value computed earlier, and
# coverage_loss is not added to `loss` below.
coverage_loss = tf.reduce_sum(tf.minimum(attention, coverage_vector), axis=1)
lambda_val = 0.1  # Hyperparameter: coverage loss weight (same value as set above)


# Define the target word indices for each timestep
target_word_indices = tf.keras.Input(shape=(max_sequence_length,), name='target_word_indices')

# Calculate the loss for each timestep
loss_per_timestep = sparse_categorical_crossentropy(target_word_indices, vocabulary_distribution)
loss_per_timestep = tf.reduce_mean(loss_per_timestep, axis=-1)

# Calculate the overall loss for the whole sequence
# NOTE(review): a later cell defines a function that is also named `loss`, so
# in notebook order this tensor is replaced by that function.
loss = tf.reduce_mean(loss_per_timestep)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         100       
                                                                 
 bidirectional (Bidirectiona  (None, None, 128)        84480     
 l)                                                              
                                                                 
 dense (Dense)               (None, None, 1)           129       
                                                                 
Total params: 84,709
Trainable params: 84,709
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1, 100)            100       
                              



In [None]:
# Length limits for articles and summaries (presumably counted in tokens — TODO confirm).
# NOTE(review): neither constant is referenced anywhere else in the visible
# notebook; the padding helpers read max_sequence_length instead.
max_article_length = 400
max_summary_length = 120

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk

# Define the maximum sequence length
# NOTE(review): this re-binds max_sequence_length. In notebook order an earlier
# cell derived it from token_sequences and used it as the Embedding
# input_length of both models; the padding helpers below read this new value.
max_sequence_length = 100
import tensorflow as tf
import numpy as np

def loss(targets, predictions):
    """Mean squared error between ``targets`` and ``predictions``.

    If ``predictions`` has more than two dimensions, only index 0 of its third
    axis is compared with ``targets``; otherwise it is used as given.

    Args:
        targets: Values to compare against.
        predictions: Model output exposing ``shape.ndims``.

    Returns:
        The mean of the squared element-wise differences.
    """
    rank = predictions.shape.ndims
    # Drop the trailing axis so the operand lines up with two-dimensional targets.
    compared = predictions[:, :, 0] if rank > 2 else predictions

    squared_error = tf.square(targets - compared)
    return tf.reduce_mean(squared_error)

# Preprocessing helpers: text -> NLTK tokens -> vocabulary ids -> padded matrix.
def _texts_to_padded_ids(contents, padding, prefix=(), suffix=()):
    """Shared pipeline behind the three public preprocessing functions.

    Args:
        contents: Iterable of raw texts. A single string is treated as one
            document rather than being iterated character by character.
        padding: Padding side handed to ``pad_sequences`` ('pre' or 'post').
        prefix: Tokens placed before each document's tokens.
        suffix: Tokens placed after each document's tokens.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=max_sequence_length``,
        one row per document.
    """
    if isinstance(contents, str):
        contents = [contents]

    id_sequences = []
    for content in contents:
        tokens = [*prefix, *nltk.word_tokenize(content), *suffix]
        # The tokenizer receives a one-document batch, so its single result is
        # exactly the flattened id sequence the original code built by hand.
        id_sequences.append(tokenizer.texts_to_sequences([tokens])[0])

    return pad_sequences(id_sequences, maxlen=max_sequence_length, padding=padding)


def preprocess_encoder_inputs(contents):
    """Turn raw texts into encoder input ids, padded at the front.

    Args:
        contents: Iterable of raw texts (or a single text).

    Returns:
        Padded id matrix, one row per text, ``max_sequence_length`` columns.
    """
    return _texts_to_padded_ids(contents, padding='pre')


def preprocess_decoder_inputs(contents):
    """Turn raw texts into decoder input ids, padded at the end.

    A ``'<sos>'`` token is placed before each text's tokens prior to the id
    lookup. NOTE(review): it presumably only survives the lookup if the
    tokenizer's vocabulary contains it — confirm.

    Args:
        contents: Iterable of raw texts (or a single text).

    Returns:
        Padded id matrix, one row per text, ``max_sequence_length`` columns.
    """
    return _texts_to_padded_ids(contents, padding='post', prefix=['<sos>'])


def preprocess_target_word_indices(contents):
    """Turn raw texts into target ids, padded at the end.

    An ``'<eos>'`` token is placed after each text's tokens prior to the id
    lookup. NOTE(review): it presumably only survives the lookup if the
    tokenizer's vocabulary contains it — confirm.

    Args:
        contents: Iterable of raw texts (or a single text).

    Returns:
        Padded id matrix, one row per text, ``max_sequence_length`` columns.
    """
    return _texts_to_padded_ids(contents, padding='post', suffix=['<eos>'])



In [None]:
import nltk
nltk.download('punkt')  # Download the required NLTK data

# Folder the dataset archive was unpacked into; the cell that prints the test
# stories reads from this same location.
STORIES_DIR = "/content/data/cnn/stories/"


def _read_story(file_name):
    """Return the full text of one story file stored under STORIES_DIR."""
    with open(STORIES_DIR + file_name, "r", encoding="utf-8") as story_file:
        return story_file.read()


def _total_loss(targets, vocabulary_preds, extended_vocabulary_preds, copy_preds):
    """Sum the notebook-level `loss` of the targets against each model output."""
    return (loss(targets, vocabulary_preds)
            + loss(targets, extended_vocabulary_preds)
            + loss(targets, copy_preds))


# Define the model: two inputs (encoder ids, decoder ids), three outputs.
model = tf.keras.Model(inputs=[encoder_model.input, decoder_model.input],
                       outputs=[vocabulary_distribution, extended_vocabulary_distribution, copy_distribution])

# Compile with the configured optimizer instance. The original passed the
# string 'adagrad' here, which builds a second, default-configured optimizer
# and leaves the settings below unused by compile().
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.15, initial_accumulator_value=0.1)
model.compile(optimizer=optimizer, loss=[loss, loss, loss])

# Define the number of epochs
num_epochs = 10

# Define the early stopping criteria
early_stopping_patience = 5
no_improvement_count = 0
best_loss = float('inf')

# Running totals for token-level accuracy across all training steps.
total_examples = 0
total_correct_predictions = 0

# Training loop
for epoch in range(num_epochs):
    # Each training file is processed as a batch containing one story.
    for file_name in train_files:
        # The preprocessing helpers expect story text. The original handed
        # them the file name itself, so the model never saw any story content.
        batch = [_read_story(file_name)]

        encoder_inputs = preprocess_encoder_inputs(batch)
        decoder_inputs = preprocess_decoder_inputs(batch)
        target_word_indices = preprocess_target_word_indices(batch)

        # Forward pass under the tape so gradients can be taken afterwards.
        with tf.GradientTape() as tape:
            vocabulary_preds, extended_vocabulary_preds, copy_preds = model([encoder_inputs, decoder_inputs])
            loss_value = _total_loss(target_word_indices, vocabulary_preds,
                                     extended_vocabulary_preds, copy_preds)

        # Backpropagation with the global gradient norm clipped to 2.
        gradients = tape.gradient(loss_value, model.trainable_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, 2)  # Gradient clipping
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Token-level accuracy of the vocabulary output; padded positions are
        # included. The targets are cast so both operands share one integer type.
        batch_predictions = tf.argmax(vocabulary_preds, axis=-1)
        correct_predictions = tf.equal(
            batch_predictions, tf.cast(target_word_indices, batch_predictions.dtype))
        batch_accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

        # Both totals count token positions (the original mixed a row count
        # with a token count).
        total_examples += target_word_indices.size
        total_correct_predictions += tf.reduce_sum(tf.cast(correct_predictions, tf.float32)).numpy()

        # Print the training loss and accuracy for each batch
        print(f"Training loss: {float(loss_value):.4f}  accuracy: {float(batch_accuracy):.4f}")

    # Perform validation
    validation_loss = 0.0
    num_batches = 0

    for file_name in test_files:
        batch = [_read_story(file_name)]

        encoder_inputs_val = preprocess_encoder_inputs(batch)
        decoder_inputs_val = preprocess_decoder_inputs(batch)
        target_word_indices_val = preprocess_target_word_indices(batch)

        # No gradients are needed for validation, so no GradientTape here.
        vocabulary_preds_val, extended_vocabulary_preds_val, copy_preds_val = model([encoder_inputs_val, decoder_inputs_val])
        loss_value_val = _total_loss(target_word_indices_val, vocabulary_preds_val,
                                     extended_vocabulary_preds_val, copy_preds_val)

        validation_loss += loss_value_val.numpy()
        num_batches += 1

    # Average over validation files; skipped when there are none so an empty
    # test set does not raise ZeroDivisionError.
    if num_batches:
        validation_loss /= num_batches
    print(f"Validation loss: {validation_loss:.4f}")

    # Check for early stopping
    if validation_loss < best_loss:
        best_loss = validation_loss
        no_improvement_count = 0
    else:
        no_improvement_count += 1

    # Early stopping condition
    if no_improvement_count >= early_stopping_patience:
        print("Early stopping. No improvement in validation loss.")
        break


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training loss: 1.0527 
Validation loss: 1.0443
Training loss: 1.0443 
Validation loss: 1.0381
Training loss: 1.0381 
Validation loss: 1.0334
Training loss: 1.0334 
Validation loss: 1.0297
Training loss: 1.0297 
Validation loss: 1.0267
Training loss: 1.0267 
Validation loss: 1.0243
Training loss: 1.0243 
Validation loss: 1.0222
Training loss: 1.0222 
Validation loss: 1.0205
Training loss: 1.0205 
Validation loss: 1.0190
Training loss: 1.0190 
Validation loss: 1.0177


In [None]:
pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge

# Example reference and generated summaries
# NOTE(review): the string literals inside each list are adjacent with no commas
# between them, so Python joins them into ONE string; each list therefore holds
# exactly one element and the scorer below compares a single pair of texts.
# NOTE(review): the reference text is about a different story (Deion Sanders)
# than the "generated" text (Buddy Holly), so the resulting scores presumably
# measure overlap between two unrelated articles — confirm the pairing.
reference_texts = ["The estranged wife of former football star Deion Sanders was released from custody Tuesday in suburban Dallas and said she hasn't been given a fair shake over allegations that she attacked him \n."

"Pilar Sanders was arrested on domestic violence-related charges Monday night, hours after Sanders sent a series of bizarre tweets saying she assaulted him \n."

"I understand that I have very little chance at beating a Hall of Fame, two-sport man that everyone seems to love and adore, said Pilar Sanders, proclaiming her innocence \n."

"I'm a full-time mom, 100% for my children, she said tearfully. And I just haven't been given a fair shake. \n"

"The Collin County Sheriff's Office said Pilar Sanders, under an emergency protective order, is forbidden from returning to the couple's home for 61 days and cannot threaten or harass any member of the family \n."

"In one of the messages posted on his verified Twitter account, Deion Sanders posted a picture of what he said were his children filling out complaints to give to police in Texas \n."

"Pray for me and my kids now! They just witnessed their mother and a friend jump me in my room, the first tweet, posted at 6:15 p.m., read. She's going to jail n I'm pressing charges! \n"

"Two minutes later, Sanders tweeted again \n."

"I'm sad my boys witnessed this mess but I warned the police department here that she was gone try n harm me and my boys. This is on my mama! it said \n."

"Shortly after that, Sanders tweeted a picture that showed him and his two boys, 10 and 12, filling out paperwork \n."

"Filling out police reports now! Thank God for this platform to issue the Truth,the caption read \n."

"Pilar Sanders was booked into jail Monday night on suspicion of assault family violence, a misdemeanor, according to booking records at the Collin County Jail. Bail was set at $264 \n."

"I can tell you that there are two sides to every story, and the truth will come out in court, Larry Friedman, an attorney for Pilar Sanders, said Tuesday \n."

"Deion Sanders played for several NFL teams, including the San Francisco 49ers, the Dallas Cowboys and the Atlanta Falcons. He was inducted into the NFL Hall of Fame and works as an analyst for the NFL Network.\n"
"During much of his NFL career, he also was an outfielder with four Major League Baseball teams and played in a World Series with the Atlanta Braves \n."

"The NFL Network and Sanders' business manager, Constance Schwartz, declined to comment about the incident.\n"

"But a clearly emotional Sanders spoke to Dallas television station KXAS on Monday night and appealed for help\n."

"My kids, they are scared for their life, Sanders told the station. They just saw two women jump their dad in his own house, in his room, in my room. It's sad \n."

"I got locks on my doors right now,he added. Is somebody going to have to die? Is it going to be me before the court does something and get this woman out of my house? It's absurd.\n"

"The couple has three children together. Sanders has two other children from an earlier relationship.\n"

"The couple married in 1999 and starred in a reality show, Deion & Pilar Prime Time Love, that aired on the Oxygen network. The marriage, soured, however, and the two are in the midst of a bitter divorce \n."

"In February, Pilar Sanders filed a suit against her husband and his aunt, Laura Jones. She said the aunt attacked her in their 10-bedroom, 29,000-square-foot home in Prosper, Texas, while Deion Sanders watched \n."

"At the time, the athlete tweeted that his wife was the aggressor and the aunt was in the home merely to fix his phone \n."

"Pilar Sanders also filed a separate suit against her husband and his daughter, Deiondra, after she called her stepmother a gold-digging (expletive) and the number one gold digger of the year in Twitter posts \n."

"In the second suit, Pilar Sanders demanded $200 million in damages for libelous and slanderous comments. She claims that her husband endorsed Deiondras false statements and himself tweeted he was tired of all Pilars lies and foolishness.\n"
]


# Text scored as the "generated" summary.
# NOTE(review): this is typed in by hand and appears to be the Buddy Holly story
# that an earlier cell printed from the test set; no cell in the visible
# notebook produces it by running the model.
generated_texts = ["Buddy Holly finally got his star on Hollywoods Walk of Fame on Wednesday, which would have been the singer-songwriters 75th birthday \n."

"Its never too late when you get a fantastic thing to happen, his widow, Maria Elena Holly, told CNN after unveiling the star on the sidewalk along Vine Street at the entrance to the historic Capitol Records building \n."

"Holly was just 22 when he was killed in a plane crash, along with musicians Ritchie Valens and J.P. The Big Bopper Richardson \n."

"Im saying now, my dear Buddy, you loved to go to the movies. You told me that one of your dreams was to write scores for movies and make your mark in Hollywood, his widow said during the ceremony. Well, my dear, half of your dream unfortunately did not come true, but the other half did come true with a beautiful star on the Hollywood Walk of Fame. \n"

"Actor Gary Busey, who channeled Holly's voice and character in the 1978 movie The Buddy Holly Story, attended the dedication.\n"

"Hes here right now, Busey said. I feel his spirit in the air. It's beautiful."

"What would Holly be doing now if he were still alive?\n"

"Anything he wanted to, Busey said. Scoring movies, helping people in different countries who are in trouble, like writing a song for them and taking it over there and singing it to them.\n"

"Busey is working with T Bone Burnett on an album of Holly songs, which he said he would also perform in a tour\n."

"The Hollywood star ceremony was timed to coincide with this week's release of Listen to Me: Buddy Holly, a tribute album of his songs performed by 16 artists, including Ringo Starr, Stevie Nick, Brian Wilson, Jackson Browne, Chris Isaak, Linda Ronstadt and Lyle Lovett.\n"

"Phil Everly of the Everly Brothers, a contemporary of Holly, also attended the dedication."]

# Create the scorer and compare the generated text with the reference text;
# avg=True asks for one averaged result per metric.
rouge = Rouge()
scores = rouge.get_scores(generated_texts, reference_texts, avg=True)

# Report each metric's result dictionary (recall 'r', precision 'p', F-score 'f').
print("ROUGE scores:")
for metric_label, metric_key in (
    ("ROUGE-1", "rouge-1"),
    ("ROUGE-2", "rouge-2"),
    ("ROUGE-L", "rouge-l"),
):
    print(f"{metric_label}: {scores[metric_key]}")

ROUGE scores:
ROUGE-1: {'r': 0.12903225806451613, 'p': 0.23880597014925373, 'f': 0.16753926246100723}
ROUGE-2: {'r': 0.01889763779527559, 'p': 0.04054054054054054, 'f': 0.025778728208584182}
ROUGE-L: {'r': 0.12365591397849462, 'p': 0.22885572139303484, 'f': 0.16055845966868612}
