In [None]:
from google.colab import files

# files.upload() returns a dict mapping each uploaded file name to its raw bytes.
# Bind the result instead of leaving it as the cell's last expression; otherwise
# the notebook echoes the file contents (here: the Kaggle API key) into the saved
# cell output.
uploaded = files.upload()
print(f"Uploaded: {sorted(uploaded)}")

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# NOTE(review): `os` is imported here but not used in this cell.
import os
# Create the per-user Kaggle config directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the uploaded credentials file from the working directory into it.
!cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the credentials file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the CNN/DailyMail summarization dataset archive with the Kaggle CLI.
# The zip lands in the current working directory (/content in the recorded run).
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail

Dataset URL: https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
License(s): CC0-1.0
Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
100% 503M/503M [00:06<00:00, 40.3MB/s]
100% 503M/503M [00:06<00:00, 75.9MB/s]


In [None]:
import zipfile

# Extract with the standard library instead of shelling out to `unzip`:
# extractall() overwrites existing files without an interactive prompt (so the
# cell can be re-run safely) and raises a Python exception if the archive is
# missing or corrupt instead of silently continuing.
with zipfile.ZipFile('newspaper-text-summarization-cnn-dailymail.zip') as archive:
    archive.extractall('cnn_dailymail_data')


Archive:  newspaper-text-summarization-cnn-dailymail.zip
  inflating: cnn_dailymail_data/cnn_dailymail/test.csv  
  inflating: cnn_dailymail_data/cnn_dailymail/train.csv  
  inflating: cnn_dailymail_data/cnn_dailymail/validation.csv  


In [None]:
# Sanity check: list the top level of the extraction directory.
!ls cnn_dailymail_data

cnn_dailymail


In [None]:
!pip install tensorflow pandas numpy scikit-learn matplotlib nltk




In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import sent_tokenize
# Fetch NLTK's 'punkt' tokenizer data (a one-time download per environment).
# NOTE(review): presumably needed for the imported sent_tokenize, which is not
# called anywhere in the visible cells — confirm it is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the train and test splits.
# Optional row limits for quick debugging runs: set either value to a small
# integer (e.g. 1000) to read only the first N rows of that CSV. None reads
# the whole file, which is the behaviour of a plain pd.read_csv(path).
TRAIN_ROW_LIMIT = None
TEST_ROW_LIMIT = None

train_data = pd.read_csv(
    '/content/cnn_dailymail_data/cnn_dailymail/train.csv',
    nrows=TRAIN_ROW_LIMIT,
)
test_data = pd.read_csv(
    '/content/cnn_dailymail_data/cnn_dailymail/test.csv',
    nrows=TEST_ROW_LIMIT,
)

# Preprocess text
def preprocess_text(text):
    """Normalise one text cell: lowercase it and replace newlines with spaces.

    Args:
        text: The cell value. Normally a str; missing values (None / NaN / NA)
            and other scalars are tolerated so one bad CSV row does not abort
            a whole ``Series.apply``.

    Returns:
        The normalised string. Missing values become the empty string;
        other non-string scalars are converted with ``str()`` first.
    """
    if not isinstance(text, str):
        # pandas represents empty CSV cells as NaN (a float), which has no .lower().
        if pd.isna(text):
            return ''
        text = str(text)
    return text.lower().replace('\n', ' ')

# Normalise both text columns of the training frame (overwrites them in place).
for column in ('article', 'highlights'):
    train_data[column] = train_data[column].apply(preprocess_text)

# Longest article and longest summary, measured with len() on each string,
# i.e. in characters.
# NOTE(review): a character count is only an upper bound on the number of
# tokens a text produces — confirm this is the intended padding width.
max_article_len = max(map(len, train_data['article']))
max_summary_len = max(map(len, train_data['highlights']))


In [None]:
# Tokenizers: one vocabulary for articles, a separate one for summaries.
article_tokenizer = Tokenizer()
article_tokenizer.fit_on_texts(train_data['article'])
# +1 because Tokenizer word indices start at 1; index 0 is left for padding.
article_vocab_size = len(article_tokenizer.word_index) + 1

summary_tokenizer = Tokenizer()
summary_tokenizer.fit_on_texts(train_data['highlights'])
summary_word_count = len(summary_tokenizer.word_index) + 1

# Convert text to integer sequences. Every id produced here lies in
# 1..len(word_index), so it is already below the matching vocabulary size and
# needs no out-of-range clamping.
x_train = article_tokenizer.texts_to_sequences(train_data['article'])
y_train = summary_tokenizer.texts_to_sequences(train_data['highlights'])

# Add <START> and <END> markers to every summary. They take the two ids just
# past the tokenizer's own range, so they have no entry in
# summary_tokenizer.index_word and must be compared by id, not by word.
start_token = summary_word_count
end_token = summary_word_count + 1
y_train = [[start_token] + seq + [end_token] for seq in y_train]

# Final summary vocabulary size, including the two marker ids.
summary_vocab_size = summary_word_count + 2

# Pad to the longest *token* sequence. Character counts are far larger than
# token counts, which inflates every padded array and the encoder input for no
# benefit. max_summary_len is taken after adding <START>/<END>, so no summary
# is truncated by pad_sequences.
max_article_len = max(len(seq) for seq in x_train)
max_summary_len = max(len(seq) for seq in y_train)

x_train = pad_sequences(x_train, maxlen=max_article_len, padding='post')
y_train = pad_sequences(y_train, maxlen=max_summary_len, padding='post')


In [None]:
# Define Model: an LSTM encoder-decoder trained end to end.
# latent_dim is used both as the embedding width and as the LSTM unit count.
latent_dim = 256

# Encoder: embeds the padded article ids and keeps only the final LSTM states.
encoder_inputs = Input(shape=(max_article_len,))
# mask_zero=True makes index 0 (the padding value) a masked timestep.
enc_emb = Embedding(article_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
# state_h / state_c are handed to the decoder as its initial state below.
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Decoder: variable-length input (shape=(None,)), initialised from the encoder states.
# The embedding, LSTM and dense layer objects are kept in named variables so
# later cells can reuse them.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(summary_vocab_size, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder's own final states are discarded here.
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
# Softmax over the summary vocabulary at every decoder timestep.
decoder_dense = Dense(summary_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full Model: [article ids, decoder input ids] -> per-timestep token probabilities.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# The sparse loss takes integer token ids as targets (no one-hot encoding).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


In [None]:
# Teacher forcing: the decoder is fed each summary without its last position
# and is trained to predict the same summary shifted one step to the left.
decoder_input_data = y_train[:, :-1]
# Trailing axis of size 1 added to the shifted targets.
decoder_target_data = y_train[:, 1:, np.newaxis]

# Fit for 10 epochs, holding out 20% of the rows for validation.
model.fit(
    x=[x_train, decoder_input_data],
    y=decoder_target_data,
    validation_split=0.2,
    epochs=10,
    batch_size=64,
)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - loss: 7.5453 - val_loss: 7.5442
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 7.5421 - val_loss: 7.5426
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 7.5384 - val_loss: 7.5405
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 7.5338 - val_loss: 7.5372
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - loss: 7.5269 - val_loss: 7.5309
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - loss: 7.5148 - val_loss: 7.5164
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 7.4872 - val_loss: 7.4767
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 7.4121 - val_loss: 7.4077
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step

In [None]:
# Save the trained model.
# The ".keras" extension selects Keras' native single-file format
# (this is not the TensorFlow SavedModel directory format).
model.save("seq2seq_model_fixed.keras")

In [None]:
# Reload the model from the native ".keras" file (not SavedModel format).
# NOTE(review): this rebinds `model`, but the inference cells below are built
# from the layer objects created at model-definition time (encoder_inputs,
# decoder_lstm, decoder_dense, ...), not from this reloaded copy — confirm
# that is intended.
model = tf.keras.models.load_model("seq2seq_model_fixed.keras")


  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Define Encoder Model: article ids -> the encoder LSTM's final [h, c] states.
encoder_model = Model(encoder_inputs, [state_h, state_c])

# Define Decoder Model: one decoding call that takes the previous token(s)
# plus the previous LSTM states and returns token probabilities and new states.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
# NOTE(review): this input is not wired into any model in this cell; it is
# kept only so the name stays defined for any later cell that refers to it.
decoder_hidden_states_input = Input(shape=(None, latent_dim))

# The inference tensors get their own names. Rebinding state_h / state_c /
# decoder_outputs here would make a re-run of this cell build encoder_model
# from decoder tensors instead of the encoder's states.
inference_outputs, inference_state_h, inference_state_c = decoder_lstm(
    dec_emb_layer(decoder_inputs),
    initial_state=[decoder_state_input_h, decoder_state_input_c]
)
inference_outputs = decoder_dense(inference_outputs)
decoder_model = Model(
    [decoder_inputs, decoder_state_input_h, decoder_state_input_c],
    [inference_outputs, inference_state_h, inference_state_c]
)

In [None]:
# Decode Function
def decode_sequence(input_seq, max_decode_steps=None):
    """Greedily decode a summary for one encoded article.

    The encoder is run once to get the initial decoder states; the decoder is
    then called one token at a time, always feeding back the arg-max token,
    until it emits the end marker or the step budget is exhausted.

    Args:
        input_seq: Padded article token ids with a leading batch axis, in the
            form accepted by ``encoder_model.predict`` (the notebook passes a
            single article reshaped to ``(1, -1)``).
        max_decode_steps: Upper bound on decoder calls. Defaults to the
            notebook-level ``max_summary_len``.

    Returns:
        The decoded words joined by single spaces ('' if nothing was decoded).
    """
    if max_decode_steps is None:
        max_decode_steps = max_summary_len

    # verbose=0: predict() is called once per token, and its progress bar
    # would otherwise flood the cell output.
    states_value = list(encoder_model.predict(input_seq, verbose=0))
    target_seq = np.array([[start_token]])
    decoded_words = []

    # Bound the loop by decoder steps, not by emitted words: tokens without a
    # word (padding id 0, the start marker) would never advance a word count.
    for _ in range(max_decode_steps):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # The end marker is an id outside the tokenizer's vocabulary, so it has
        # no entry in index_word and must be detected by id.
        if sampled_token_index == end_token:
            break

        sampled_word = summary_tokenizer.index_word.get(sampled_token_index, '')
        if sampled_word:
            decoded_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return ' '.join(decoded_words)

In [None]:
# Prepare test data with the article tokenizer and padded width used for training.
x_test = article_tokenizer.texts_to_sequences(test_data['article'])
x_test = pad_sequences(x_test, maxlen=max_article_len, padding='post')

# Test on a few samples (fewer if the test frame has fewer rows).
NUM_PREVIEW_SAMPLES = 5
for i in range(min(NUM_PREVIEW_SAMPLES, len(x_test))):
    # decode_sequence expects a leading batch axis.
    input_article = x_test[i].reshape(1, -1)
    predicted_summary = decode_sequence(input_article)
    # x_test rows are positional, so look the article up by position as well;
    # a label lookup would pick the wrong row if the index is not 0..n-1.
    print(f"Article: {test_data['article'].iloc[i]}")
    print(f"Predicted Summary: {predicted_summary}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 58s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2