In [26]:
import re

import nltk
import pandas as pd
import tensorflow
from keras.layers import Attention, Concatenate, Dense, Embedding, Input, LSTM
from keras.models import Model
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources used by preprocess_text. Newer NLTK releases load
# the tokenizer data for word_tokenize from 'punkt_tab' instead of 'punkt', so
# both are requested to keep the notebook runnable across NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
  nltk.download(nltk_resource)

# Shared helpers reused by every preprocess_text call.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [1]:
# Mount Google Drive at /content/drive; the dataset and model paths used in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
def remove_missing_rows(dataset_path, cleaned_dataset_path):
  """Copy a CSV dataset, leaving out rows that lack a required field.

  Args:
    dataset_path: path of the CSV file to read.
    cleaned_dataset_path: path the filtered CSV is written to (no index column).

  A row is kept only if its 'summary', 'story' and 'id' values are all present.
  """
  required_columns = ['summary', 'story', 'id']

  raw_rows = pd.read_csv(dataset_path)
  complete_rows = raw_rows.dropna(subset=required_columns)
  complete_rows.to_csv(cleaned_dataset_path, index=False)

In [28]:
# Locations of the dataset at each pipeline stage:
# raw input -> rows with missing values removed -> text-preprocessed.
original_dataset_path = '/content/drive/MyDrive/Colab Notebooks/dataset/original_dataset.csv'
cleaned_dataset_path = '/content/drive/MyDrive/Colab Notebooks/dataset/cleaned_dataset.csv'
preprocessed_dataset_path = '/content/drive/MyDrive/Colab Notebooks/dataset/preprocessed_dataset.csv'

In [29]:
# stage 1: write a copy of the raw dataset without incomplete rows
remove_missing_rows(
  dataset_path=original_dataset_path,
  cleaned_dataset_path=cleaned_dataset_path,
)

In [30]:
def preprocess_text(text):
  """Normalise raw text into a space-separated string of processed tokens.

  Steps, in order: remove every character that is not a letter, digit or
  whitespace; lowercase and tokenize; drop English stop words; Porter-stem
  and then WordNet-lemmatize each remaining token.

  Args:
    text: the raw string to clean.

  Returns:
    The processed tokens joined by single spaces.
  """
  alphanumeric_only = re.sub(r'[^a-zA-Z\s\d]', '', text)
  words = word_tokenize(alphanumeric_only.lower())

  # filter, stem and lemmatize in one pass; stop words are matched on the
  # unstemmed token
  normalised = [
    lemmatizer.lemmatize(stemmer.stem(word))
    for word in words
    if word not in stop_words
  ]

  return ' '.join(normalised)

def preprocess_data_to_file(dataset_path, preprocessed_dataset_path):
  """Add preprocessed copies of the text columns and save the result as CSV.

  Args:
    dataset_path: path of the CSV file to read; must contain 'story' and
      'summary' columns.
    preprocessed_dataset_path: path the augmented CSV is written to (no
      index column).

  Each source column is passed value by value through preprocess_text and
  stored in a new 'preprocessed_*' column; the original columns are kept.
  """
  column_map = {
    'story': 'preprocessed_story',
    'summary': 'preprocessed_summary',
  }

  frame = pd.read_csv(dataset_path)
  for source_column, target_column in column_map.items():
    frame[target_column] = frame[source_column].apply(preprocess_text)
  frame.to_csv(preprocessed_dataset_path, index=False)

In [31]:
# stage 2: add the preprocessed text columns to the cleaned dataset
preprocess_data_to_file(
  dataset_path=cleaned_dataset_path,
  preprocessed_dataset_path=preprocessed_dataset_path,
)

In [32]:
def split_dataset(dataset_file,
                  output_dir='/content/drive/MyDrive/Colab Notebooks/dataset',
                  train_ratio=0.8,
                  val_ratio=0.1,
                  test_ratio=0.1,
                  random_state=42):
  """Split a CSV dataset into train, validation and test CSV files.

  Args:
    dataset_file: path of the CSV file to split.
    output_dir: directory that receives train_data.csv, val_data.csv and
      test_data.csv. Defaults to the Drive folder used by this notebook.
    train_ratio: fraction of rows assigned to the training set.
    val_ratio: fraction of rows assigned to the validation set.
    test_ratio: fraction of rows assigned to the test set.
    random_state: seed passed to train_test_split so the split is repeatable.

  Raises:
    ValueError: if any ratio is not positive or the ratios do not sum to 1.
  """
  ratios = (train_ratio, val_ratio, test_ratio)
  if min(ratios) <= 0 or abs(sum(ratios) - 1.0) > 1e-9:
    raise ValueError(
      'train_ratio, val_ratio and test_ratio must be positive and sum to 1, got %r' % (ratios,)
    )

  df = pd.read_csv(dataset_file)

  # first carve off the training rows, then divide what is left between
  # validation and test according to their relative sizes
  train_df, remaining_df = train_test_split(df, test_size=1 - train_ratio, random_state=random_state)
  val_df, test_df = train_test_split(
    remaining_df,
    test_size=test_ratio / (test_ratio + val_ratio),
    random_state=random_state,
  )

  for split_name, split_df in (('train', train_df), ('test', test_df), ('val', val_df)):
    split_df.to_csv(f'{output_dir}/{split_name}_data.csv', index=False)

# stage 3: write the train/validation/test CSV files
split_dataset(dataset_file=preprocessed_dataset_path)

In [33]:
# load data
# A text that preprocessing reduced to an empty string is read back from CSV
# as NaN, and the Keras Tokenizer expects strings. Rows without usable
# preprocessed text are therefore dropped on load, the same way
# remove_missing_rows treats missing raw fields.
text_columns = ['preprocessed_story', 'preprocessed_summary']
train_data = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/dataset/train_data.csv').dropna(subset=text_columns)
val_data = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/dataset/val_data.csv').dropna(subset=text_columns)
test_data = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/dataset/test_data.csv').dropna(subset=text_columns)

In [34]:
# tokenize input and target sequence
# the story vocabulary is fitted on the training split only
train_stories = train_data['preprocessed_story']
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(train_stories)
input_sequences = input_tokenizer.texts_to_sequences(train_stories)

In [35]:
# the summary vocabulary is fitted on the training split only
train_summaries = train_data['preprocessed_summary']
target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(train_summaries)
target_sequences = target_tokenizer.texts_to_sequences(train_summaries)

In [36]:
# pad the input and target sequence
# fixed lengths (in tokens) that every story / summary is padded or cut to
max_input_length, max_target_length = 3000, 100

# padding='post' appends the padding after the real tokens
input_sequences = pad_sequences(
  sequences=input_sequences,
  maxlen=max_input_length,
  padding='post',
)
target_sequences = pad_sequences(
  sequences=target_sequences,
  maxlen=max_target_length,
  padding='post',
)

In [37]:
# define seq2seq model with attention
# hyperparameters shared by the encoder and the decoder:
#   embedding_dim - dimensionality of the embedding vectors
#   hidden_units  - number of hidden units in each LSTM layer
embedding_dim, hidden_units = 100, 256

In [38]:
# encoder
encoder_inputs = Input(shape=(None, ))
encoder_embedding = Embedding(len(input_tokenizer.word_index) + 1, embedding_dim)(encoder_inputs)
# return_sequences=True makes encoder_outputs hold one hidden state per input
# time step, i.e. (batch, input_steps, hidden_units). The Attention layer
# further down receives encoder_outputs as its value and needs that time axis
# to attend over; with return_sequences=False it would only be given the
# final hidden state.
encoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# the final hidden and cell states are used to initialise the decoder LSTM
encoder_states = [state_h, state_c]

In [39]:
# decoder
target_vocab_size = len(target_tokenizer.word_index) + 1

decoder_inputs = Input(shape=(None, ))
decoder_embedding_layer = Embedding(target_vocab_size, embedding_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)

# the decoder emits a hidden state for every step and starts from the
# encoder's final states; its own final states are discarded here
decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

In [40]:
# attention
attention_layer = Attention()
# query = decoder hidden states, value = encoder outputs
context_vector = attention_layer([decoder_outputs, encoder_outputs])
# Join the context with the decoder states through a Keras layer instead of a
# raw tensorflow.concat call, so the graph is built only from Keras layers.
# Not every Keras version accepts raw TensorFlow ops on symbolic Keras tensors.
decoder_combined_context = Concatenate(axis=-1)([context_vector, decoder_outputs])
# project each decoder step onto a probability distribution over the target vocabulary
decoder_dense = Dense(len(target_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined_context)

In [41]:
# define model
# two inputs (story tokens, shifted summary tokens) -> per-step vocabulary probabilities
model = Model(
  inputs=[encoder_inputs, decoder_inputs],
  outputs=decoder_outputs,
)

In [42]:
# compile model
# the targets are integer token ids, hence the sparse variant of the loss
model.compile(
  loss='sparse_categorical_crossentropy',
  optimizer='adam',
)

In [None]:
# train model
batch_size = 64
epochs = 10

# Validate on the dedicated validation split (val_data) rather than holding
# out a further 20% of the training rows, so the whole training split is used
# for fitting. The validation texts are encoded with the tokenizers fitted on
# the training data and padded to the same lengths.
val_input_sequences = pad_sequences(
  input_tokenizer.texts_to_sequences(val_data['preprocessed_story']),
  maxlen=max_input_length,
  padding='post',
)
val_target_sequences = pad_sequences(
  target_tokenizer.texts_to_sequences(val_data['preprocessed_summary']),
  maxlen=max_target_length,
  padding='post',
)

# teacher forcing: the decoder is fed every summary token except the last and
# is trained to predict the same sequence shifted one step ahead
model.fit(
  [input_sequences, target_sequences[:, :-1]],
  target_sequences[:, 1:],
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(
    [val_input_sequences, val_target_sequences[:, :-1]],
    val_target_sequences[:, 1:],
  ),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7be6115d2b90>

In [None]:
# save model
model_path = '/content/drive/MyDrive/Colab Notebooks/models/seq2seq_attention_model.h5'
model.save(model_path)

In [43]:
# evaluation
# encode the test split with the tokenizers fitted on the training data and
# pad it to the same lengths as the training sequences
test_input_sequences = pad_sequences(
  input_tokenizer.texts_to_sequences(test_data['preprocessed_story']),
  maxlen=max_input_length,
  padding='post',
)
test_target_sequences = pad_sequences(
  target_tokenizer.texts_to_sequences(test_data['preprocessed_summary']),
  maxlen=max_target_length,
  padding='post',
)

In [None]:
# evaluate the model on test data
# same input/target shift as in training: feed all tokens but the last,
# score against the sequence moved one step ahead
test_decoder_inputs = test_target_sequences[:, :-1]
test_decoder_targets = test_target_sequences[:, 1:]
test_loss = model.evaluate([test_input_sequences, test_decoder_inputs], test_decoder_targets)

