In [2]:
import opendatasets as od

# Kaggle dataset that the loading cell further down reads its training file from.
DATASET_URL = 'https://www.kaggle.com/datasets/tongtrantiendung/rl-sentence-compression'

# force=True requests a fresh download even if a local copy is already present.
od.download(DATASET_URL, force=True)



Downloading rl-sentence-compression.zip to ./rl-sentence-compression


100%|██████████| 423M/423M [00:41<00:00, 10.7MB/s] 





In [6]:
import pandas as pd

# Location of the Gigaword training split inside the downloaded dataset folder.
train_path = 'rl-sentence-compression/rl-sentence-compression/rl-sentence-compression/data/train-data/gigaword/train.jsonl'

# lines=True: the file is parsed as JSON Lines, one record per line.
data = pd.read_json(train_path, lines=True)

In [7]:
# Keep only the first 1000 rows (positional slice) -- presumably to keep the
# experiments below quick; confirm before drawing conclusions from the results.
data = data.iloc[:1000]

In [8]:
from nltk.stem import WordNetLemmatizer

# Remove exact duplicate rows and rows with any missing value before cleaning.
data = data.drop_duplicates()
data = data.dropna()

stop_words = ['the', 'a', 'an', 'and', 'or', 'in', 'on', 'at', 'to']
# Set copy of the list above for O(1) membership checks in the per-token filter.
stop_word_set = frozenset(stop_words)

# NOTE(review): WordNetLemmatizer needs the NLTK 'wordnet' corpus to be installed
# locally (nltk.download('wordnet')) -- confirm it is available on a fresh machine.
lemmatizer = WordNetLemmatizer()


def clean_tokens(tokens):
    """Filter and lemmatize one tokenized sentence.

    The stage order matches the original cell-by-cell pipeline and matters:
    stop words are removed *before* lemmatization (so a lemma that happens to
    equal a stop word is kept), and pure-digit tokens are removed *after* it.

    Args:
        tokens: list of lower-cased, punctuation-free word strings.

    Returns:
        A new list of lemmatized tokens without stop words or digit-only tokens.
    """
    kept = (word for word in tokens if word not in stop_word_set)
    lemmas = (lemmatizer.lemmatize(word) for word in kept)
    return [word for word in lemmas if not word.isdigit()]


data['text'] = (
    data['text']
    .str.lower()
    # regex=True is required: since pandas 2.0 str.replace treats the pattern as
    # a literal string by default, which silently left all punctuation in place.
    # The raw string avoids the invalid-escape warning for \w and \s.
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.split()
    .apply(clean_tokens)
    # Re-join into a single whitespace-normalized string per row.
    .apply(' '.join)
)

# Rows were dropped above, so rebuild a contiguous 0..n-1 index.
data = data.reset_index(drop=True)


In [18]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
import numpy as np

# One seed for the train/validation split and for Keras (weight initialization,
# batch shuffling), so re-running the notebook from a fresh kernel repeats the
# same experiment instead of a different random one each time.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# 80/20 split of (cleaned text, reference summary) pairs.
X_train, X_val, y_train, y_val = train_test_split(
    data['text'], data['summary'], test_size=0.2, random_state=SEED
)

max_vocab = 10000  # vocabulary cap for each vectorizer and size of the softmax output
max_len = 100      # every sequence is padded/truncated to this many token ids

# The source text and the summaries each get their OWN vectorizer. Previously a
# single `vectorizer` name was rebound for the summaries, which discarded the
# vocabulary fitted on the input text and made it impossible to encode new text
# for the trained models. Both are adapted on the training split only.
text_vectorizer = TextVectorization(max_tokens=max_vocab, output_sequence_length=max_len)
text_vectorizer.adapt(X_train.values)

X_train_seq = text_vectorizer(X_train.values)
X_val_seq = text_vectorizer(X_val.values)

summary_vectorizer = TextVectorization(max_tokens=max_vocab, output_sequence_length=max_len)
summary_vectorizer.adapt(y_train.values)

y_train_seq = summary_vectorizer(y_train.values)
y_val_seq = summary_vectorizer(y_val.values)

# Backward-compatible alias: after this cell `vectorizer` has always referred
# to the summary-side vectorizer, so keep that binding for any later use.
vectorizer = summary_vectorizer

# Baseline: embedding -> two stacked SimpleRNN layers -> per-timestep softmax
# over the summary vocabulary. return_sequences=True keeps one output per input
# position, so the output has the same length (max_len) as the targets.
# NOTE(review): no padding mask is configured, so padded positions appear to
# count toward both loss and accuracy -- confirm this is intended before
# comparing accuracy numbers.
model = Sequential()
model.add(Embedding(max_vocab, 128))
model.add(SimpleRNN(128, return_sequences=True))
model.add(SimpleRNN(128, return_sequences=True))
model.add(Dense(max_vocab, activation='softmax'))

# Targets are integer token ids, hence the sparse categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

model.fit(X_train_seq, y_train_seq, validation_data=(X_val_seq, y_val_seq), epochs=10, batch_size=128)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x3175ad880>

In [24]:
# LSTM variant of the baseline: same embedding size, two stacked recurrent
# layers that each return the full sequence, and a per-timestep softmax over
# max_vocab token ids.
model_lstm = Sequential([
    Embedding(max_vocab, 128),
    LSTM(128, return_sequences=True),
    LSTM(128, return_sequences=True),
    Dense(max_vocab, activation='softmax'),
])

# Integer token-id targets -> sparse categorical cross-entropy.
model_lstm.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Trained on the same vectorized split as the other recurrent models.
model_lstm.fit(
    X_train_seq,
    y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=10,
    batch_size=128,
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1096a3b20>

In [25]:
# GRU variant of the baseline: same embedding size, two stacked recurrent
# layers that each return the full sequence, and a per-timestep softmax over
# max_vocab token ids.
model_gru = Sequential([
    Embedding(max_vocab, 128),
    GRU(128, return_sequences=True),
    GRU(128, return_sequences=True),
    Dense(max_vocab, activation='softmax'),
])

# Integer token-id targets -> sparse categorical cross-entropy.
model_gru.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Trained on the same vectorized split as the other recurrent models.
model_gru.fit(
    X_train_seq,
    y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=10,
    batch_size=128,
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x10b7553a0>