# NLP - Summary Generator

Lots of example code taken from: https://machinelearningmastery.com/develop-character-based-neural-language-model-keras/

In [72]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
data = pd.read_csv("./processed_data/fantasy_book_data.csv")

In [6]:
summary_data = list(data['summaries'])
# summary_data[0]
print('Number of summaries: %d' % (len(summary_data)))

Number of summaries: 16559


In [82]:
summary_lengths = [len(s) for s in summary_data]
mean_chars = np.mean(np.array(summary_lengths))
stdev_chars = np.std(np.array(summary_lengths))
max_chars = max(summary_lengths)
min_chars = min(summary_lengths)
print("Min: {:.2f}".format(min_chars))
print("Max: {:.2f}".format(max_chars))
print("Mean: {:.2f}".format(mean_chars))
print("Stdev: {:.2f}".format(stdev_chars))

Min: 11.00
Max: 58019.00
Mean: 2511.11
Stdev: 2902.99


In [83]:
seq_length = 200

In [86]:
stops = stopwords.words('english')
# stops.extend('.,[,],(,),;,/,-,\',?,",:,<,>,n\'t,|,#,\'s,\",\'re,\'ve,\'ll,\'d,\'re'.split(','))
# stops.extend(',')

chars = []
unknown_label = 'UNK'

char_dict = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .!?:,\'%-\(\)/$|&;[]"'

for c in char_dict:
    chars.append(c)

chars = list(set(chars))

chars.insert(0, unknown_label)

mapping = dict((c, i) for i, c in enumerate(chars))

In [116]:
unknown_chars = []
summary_data_cleaned = []
# encoded_summary_data = []

sequences = []

for i, summary in enumerate(summary_data):
#     filtered_summary = " ".join([w for w in word_tokenize(summary) if not w.lower() in stops])
    filtered_summary = (summary).strip()
    if len(filtered_summary) < seq_length + 1:
        continue
    summary_chars = [c if c in chars else 'UNK' for c in summary]
    filtered_summary = "".join(summary_chars)
#     encoded_summary_chars = [mapping[c] for c in summary_chars]
    unknown_chars = list(set(unknown_chars + [c for c in summary if c not in chars]))
    summary_data_cleaned.append(summary_chars)

    for j in range(seq_length, len(summary_chars)):
        # select sequence of tokens
        seq = filtered_summary[j-seq_length:j+1]
        sequences.append(seq)
#     encoded_summary_data.append(encoded_summary_chars)
        
    if i >= 10:
        break

In [117]:
# len(summary_data_cleaned)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 60942


In [141]:
max_sequences = 1000
encoded_sequences = []
for i, s in enumerate(sequences):
    if i >= max_sequences:
        break
    encoded_seq = [mapping[c] for c in s]
    encoded_sequences.append(encoded_seq)

In [119]:
vocab_size = len(mapping)

In [142]:
encoded_sequences = np.array(encoded_sequences)

In [143]:
X, y = encoded_sequences[:,:-1], encoded_sequences[:,-1]

In [127]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

Using TensorFlow backend.


In [144]:
X = [to_categorical(x, num_classes=vocab_size) for x in X]
X = np.array(X)
y = to_categorical(y, num_classes=vocab_size)

In [145]:
X.shape

(1000, 200, 83)

In [146]:
model = Sequential()
model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, epochs=100, verbose=2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 75)                47700     
_________________________________________________________________
dense_1 (Dense)              (None, 83)                6308      
Total params: 54,008
Trainable params: 54,008
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
 - 7s - loss: 3.9580 - accuracy: 0.0770
Epoch 2/100
 - 6s - loss: 3.1201 - accuracy: 0.1590
Epoch 3/100
 - 6s - loss: 3.0470 - accuracy: 0.1590
Epoch 4/100
 - 6s - loss: 3.0362 - accuracy: 0.1590
Epoch 5/100
 - 7s - loss: 3.0291 - accuracy: 0.1590
Epoch 6/100
 - 6s - loss: 3.0142 - accuracy: 0.1590
Epoch 7/100
 - 6s - loss: 3.0024 - accuracy: 0.1590
Epoch 8/100
 - 6s - loss: 2.9890 - accuracy: 0.1590
Epoch 9/100
 - 6s - loss: 2.9737 - accuracy: 0.1590
Epoch 10/100
 - 6s - loss: 2.9617 - acc

<keras.callbacks.callbacks.History at 0x14b41259408>

In [147]:
model.save('model_trained.h5')

In [152]:
from keras.preprocessing.sequence import pad_sequences

# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=len(mapping))
        # predict character
        yhat = model.predict_classes(encoded, verbose=0)
        # reverse map integer to character
        out_char = ''
        for char, index in mapping.items():
            if index == yhat:
                out_char = char
                break
        # append to input
        in_text += char
    return in_text

ERROR! Session/line number was not unique in database. History logging moved to new session 825


In [158]:
input_text = summary_data[2999][:100]
print(input_text)

 Tommy Stubbins is waiting for Doctor Dolittle's return from the Moon and when the Doctor does so, h


In [159]:
print(generate_seq(model, mapping, 200, input_text, 20))

 Tommy Stubbins is waiting for Doctor Dolittle's return from the Moon and when the Doctor does so, higs and the pirmales
