In [9]:
import os
import json
import pandas as pd
import re

In [2]:
os.listdir('train-data/gigaword')

['indices.npy', 'train.jsonl', 'val.jsonl']

In [3]:
data = pd.read_json('./train-data/gigaword/train.jsonl', lines=True)

In [None]:
data

In [5]:
data = data.sample(10000)

In [6]:
data.reset_index(drop=True, inplace=True)

In [None]:
data

In [10]:
def text_cleaner(text):
    newString = text.lower()
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)    
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString)    
    return newString

In [12]:
cleaned_text = []
for t in data['text']:
    cleaned_text.append(text_cleaner(t))
cleaned_text[0]

'a coalition of anti immigration and us workers groups launched a television campaign monday in a bid to the halt the entry of     million skilled workers into the united states  '

In [65]:
data['text'] = cleaned_text

In [14]:
cleaned_summary = []
for t in data['summary']:
    cleaned_summary.append(text_cleaner(t))
for i in range(len(cleaned_summary)):
    cleaned_summary[i] = '_START_ '+ cleaned_summary[i] + ' _END_'

In [66]:
data['summary'] = cleaned_summary

In [19]:
cleaned_summary[0]

'_START_ anti immigration us group pushes for cutbacks in tv campaign _END_'

In [21]:
len(cleaned_summary[0])

74

In [67]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data['text'], data['summary'], test_size=0.2, random_state=42)

In [29]:
len(X_train)

8000

In [31]:
import tensorflow as tf

In [69]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, GRU, LSTM, Dense, TextVectorization
from tensorflow.keras.optimizers import Adam

In [71]:
max_vocab = 10000
max_len = 100
vectorizer = TextVectorization(max_tokens=max_vocab, output_sequence_length=max_len)
vectorizer.adapt(X_train.values)
X_train_seq = vectorizer(X_train.values)
X_val_seq = vectorizer(X_val.values)

In [72]:
vectorizer = TextVectorization(max_tokens=max_vocab, output_sequence_length=max_len)
vectorizer.adapt(y_train.values)

y_train_seq = vectorizer(y_train.values)
y_val_seq = vectorizer(y_val.values)

In [33]:
print("Num GPUs Available:", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available: 1


In [76]:
model = Sequential()
model.add(Embedding(max_vocab, 128))
model.add(SimpleRNN(128, return_sequences=True))
model.add(SimpleRNN(128, return_sequences=True))
model.add(Dense(max_vocab, activation='softmax'))
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 128)         1280000   
                                                                 
 simple_rnn_16 (SimpleRNN)   (None, None, 128)         32896     
                                                                 
 simple_rnn_17 (SimpleRNN)   (None, None, 128)         32896     
                                                                 
 dense_3 (Dense)             (None, None, 10000)       1290000   
                                                                 
Total params: 2,635,792
Trainable params: 2,635,792
Non-trainable params: 0
_________________________________________________________________


In [77]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.002), metrics=['accuracy'])

model.fit(X_train_seq, y_train_seq, validation_data=(X_val_seq, y_val_seq), epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x154358d2640>

In [78]:
model_lstm = Sequential()
model_lstm.add(Embedding(max_vocab, 128))
model_lstm.add(LSTM(128, return_sequences=True))
model_lstm.add(LSTM(128, return_sequences=True))
model_lstm.add(Dense(max_vocab, activation='softmax'))

In [79]:
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.002), metrics=['accuracy'])

model_lstm.fit(X_train_seq, y_train_seq, validation_data=(X_val_seq, y_val_seq), epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1543a5e0490>

In [80]:
model_gru = Sequential()
model_gru.add(Embedding(max_vocab, 128))
model_gru.add(GRU(128, return_sequences=True))
model_gru.add(GRU(128, return_sequences=True))
model_gru.add(Dense(max_vocab, activation='softmax'))

In [81]:
model_gru.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.002), metrics=['accuracy'])

model_gru.fit(X_train_seq, y_train_seq, validation_data=(X_val_seq, y_val_seq), epochs=5, batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x154403f1be0>