# In [None]:
import os
import tarfile
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from google.colab import drive
import nltk
from tensorflow.keras.layers import Concatenate, Dense, Dot, Embedding, LSTM, Bidirectional, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.losses import sparse_categorical_crossentropy
import numpy as np
from rouge import Rouge

# Mount Google Drive so the archive stored there becomes reachable
drive.mount('/content/drive')

# Root of the mounted Drive
directory_path = '/content/drive/My Drive/'

# Path to the .tgz archive on the mounted Drive
file_path = '/content/drive/My Drive/cnn_stories.tgz'

# Extract the contents of the .tgz file to '/content/data/'.
with tarfile.open(file_path, 'r:gz') as tar:
    if hasattr(tarfile, 'data_filter'):
        # The 'data' filter refuses members that would escape the target
        # directory (absolute paths, '..', links pointing outside) and
        # ignores special files, so a malformed archive cannot overwrite
        # arbitrary files on the runtime.
        tar.extractall('/content/data/', filter='data')
    else:
        # NOTE(review): this interpreter has no extraction filters, so
        # the archive contents are trusted as-is.
        tar.extractall('/content/data/')

# Story files to process
file_names = [
    '8278100e57ce63728df109f76b9888e7a918ad90.story',
    '827811c8e01692a37b10247f3a3cd9ebc1ece71b.story',
    # Add more file names here
]

# Hold out 20% of the files for testing; the fixed seed keeps the split
# reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 42}
train_files, test_files = train_test_split(file_names, **split_options)

# Report how many files ended up in each subset
for label, subset in (
    ("Number of training files:", train_files),
    ("Number of testing files:", test_files),
):
    print(label, len(subset))

# Keras tokenizer that later maps words to integer ids
tokenizer = Tokenizer()

# Preprocessing functions
def preprocess_content(content):
    """Split *content* into word tokens with NLTK.

    Args:
        content: Raw text to tokenize.

    Returns:
        The list of token strings produced by ``nltk.word_tokenize``.

    ``nltk.word_tokenize`` needs the Punkt tokenizer data, which is not
    installed on a fresh runtime. If it is missing, the data is
    downloaded once and tokenization is retried.
    """
    try:
        return nltk.word_tokenize(content)
    except LookupError:
        # Older NLTK releases look for 'punkt', newer ones for 'punkt_tab';
        # requesting a name a release does not know is harmless.
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        return nltk.word_tokenize(content)

# Preprocess the articles
#
# The archive is unpacked to '/content/data/' (the directory passed to
# tar.extractall), not to a 'data' folder on the mounted Drive, so the
# story files have to be read from there.
extracted_dir = '/content/data/'


def _index_files(root):
    """Return a dict mapping each file's base name to its path under *root*.

    The tree is walked recursively so that files are still found if the
    archive unpacked into nested sub-directories.
    """
    paths = {}
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Keep the first hit should a base name occur more than once.
            paths.setdefault(filename, os.path.join(dirpath, filename))
    return paths


def _load_articles(names, path_by_name):
    """Read and tokenize every file in *names*; return a list of token lists.

    Raises:
        FileNotFoundError: if a name has no entry in *path_by_name*.
    """
    articles = []
    for name in names:
        path = path_by_name.get(name)
        if path is None:
            raise FileNotFoundError(
                f"{name!r} was not found under {extracted_dir!r}"
            )
        # Explicit encoding so reading does not depend on the locale default.
        with open(path, 'r', encoding='utf-8') as file:
            articles.append(preprocess_content(file.read()))
    return articles


story_paths = _index_files(extracted_dir)
train_articles = _load_articles(train_files, story_paths)
test_articles = _load_articles(test_files, story_paths)

# Fit tokenizer on training data only, so test-only words stay out of the
# vocabulary
tokenizer.fit_on_texts(train_articles)

# Convert the articles to sequences of integer word ids
train_sequences = tokenizer.texts_to_sequences(train_articles)
test_sequences = tokenizer.texts_to_sequences(test_articles)

# Pad the sequences to ensure equal length. The length is taken from the
# longest training article; longer test articles are truncated by
# pad_sequences to that length.
max_sequence_length = max(len(seq) for seq in train_sequences)
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

# Define the model architecture
vocab_size = len(tokenizer.word_index) + 1  # +1: word ids start at 1, id 0 is used for padding
embedding_dim = 100
hidden_units = 64

# Encoder model: embeds the input ids and runs a bidirectional LSTM that
# returns one (2 * hidden_units)-wide vector per timestep.
encoder_inputs = tf.keras.Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(hidden_units, return_sequences=True))(encoder_embedding)

# Attention mechanism: one score per timestep, normalised with softmax over
# the time axis, then broadcast to the encoder feature width so it can be
# multiplied element-wise with the encoder outputs.
attention = Dense(1, activation='tanh')(encoder_lstm)
attention = tf.keras.layers.Flatten()(attention)
attention = Activation('softmax')(attention)
attention = tf.keras.layers.RepeatVector(hidden_units * 2)(attention)
attention = tf.keras.layers.Permute([2, 1])(attention)

# Apply attention weights and sum over time to get a single context vector
# of width 2 * hidden_units per example.
sent_representation = tf.keras.layers.multiply([encoder_lstm, attention])
sent_representation = tf.keras.layers.Lambda(lambda xin: tf.keras.backend.sum(xin, axis=-2))(sent_representation)

# Decoder model
decoder_inputs = tf.keras.Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)

# Condition the decoder on the encoder: the attention context vector is
# projected to the decoder's state size and used as the LSTM's initial
# hidden and cell state. Without this link the encoder and attention layers
# are not part of the graph that produces the output, so they receive no
# gradient and the encoder input is ignored.
decoder_state_h = Dense(hidden_units, activation='tanh')(sent_representation)
decoder_state_c = Dense(hidden_units, activation='tanh')(sent_representation)
decoder_lstm = LSTM(hidden_units, return_sequences=True)(
    decoder_embedding, initial_state=[decoder_state_h, decoder_state_c]
)
decoder_output = Dense(vocab_size, activation='softmax')(decoder_lstm)

# Compile the model. The sparse loss takes integer word ids as targets.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train the model
def _shift_right(sequences):
    """Return a copy of *sequences* moved one step right along the time axis.

    Position 0 is filled with id 0 and the last token of every row is
    dropped, so the result has the same shape as the input.
    """
    shifted = np.zeros_like(sequences)
    shifted[:, 1:] = sequences[:, :-1]
    return shifted


# Teacher forcing: at step t the decoder must see the target token of step
# t - 1. Feeding the unshifted target as decoder input would hand the model
# the answer for every position, reducing training to a trivial copy task.
train_decoder_inputs = _shift_right(train_sequences)
test_decoder_inputs = _shift_right(test_sequences)

model.fit(
    [train_sequences, train_decoder_inputs],
    train_sequences,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

# Evaluate the model
loss = model.evaluate([test_sequences, test_decoder_inputs], test_sequences)

print("Test Loss:", loss)

# Generate summaries for test articles.
# NOTE(review): the decoder is still fed the shifted test sequences here, so
# this is teacher-forced next-token prediction rather than free-running
# generation.
predicted_summaries = model.predict([test_sequences, test_decoder_inputs])

# Convert predicted summaries back to text: pick the most probable word id at
# every timestep for all examples at once, then map the ids back to words.
predicted_token_ids = np.argmax(predicted_summaries, axis=-1)
predicted_texts = tokenizer.sequences_to_texts(predicted_token_ids.tolist())

# Hard-coded reference/generated pair used to demonstrate the ROUGE computation
reference_texts = ["The estranged wife of former football star Deion Sanders was released from a Texas jail Friday, hours after she was booked on an assault charge, according to the Collin County Sheriff's Office."]
generated_texts = ["Deion Sanders' wife arrested on assault charge"]

# Score the generated texts against the references, averaged over all pairs
rouge = Rouge()
scores = rouge.get_scores(generated_texts, reference_texts, avg=True)

# Print one line per ROUGE variant
print("ROUGE scores:")
for label, key in (
    ("ROUGE-1", "rouge-1"),
    ("ROUGE-2", "rouge-2"),
    ("ROUGE-L", "rouge-l"),
):
    print(f"{label}: {scores[key]}")
