<a href="https://colab.research.google.com/github/rawar/tensorflow-notebooks/blob/master/lstm_text_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Textgenerator based on news

Based on Enriques work of an [Word-level LSTM text generator](https://medium.com/coinmonks/word-level-lstm-text-generator-creating-automatic-song-lyrics-with-neural-networks-b8a1617104fb)

In [1]:
!pip install tf-nightly-gpu



In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

In [0]:
url = 'https://gist.githubusercontent.com/rawar/ae4cce269e29c2826163fbca60b544f4/raw/20594a832b44de4b4a5160fb2732399ba09a70e9/reco-content-data.csv'
text_corpus = pd.read_csv(url)

In [177]:
headlines_data_frame = text_corpus['title']
headlines_data_frame = headlines_data_frame.dropna()
headlines_data_frame.head()

0                             Google verkauft Motorola
1    Touchscreen im Winter: Handy-Handschuhe selber...
2    \Bei WhatsApp als Kontakt blockiert: So merkt ...
3    Paper: Facebook stellt neue App im Flipboard-S...
4               WhatsApp: mehr Privatsphäre für Nutzer
Name: title, dtype: object

In [178]:
#headlins_data_frame.tolist()[0]
#text_in_words = [w for w in headlins_data_frame.tolist()[0].split(' ') if w.strip() != '' or w == '\n']
text_in_words = headlins_data_frame.str.split(' ').tolist()
#text_in_words = headlins_data_frame.to_csv(None, header=False, index=False).split('\n')
print('Number of sentences:', len(text_in_words))

Number of sentences: 15751


In [179]:
print(text_in_words[15749:15750])

[['Fatburner-Training:', 'Diese', 'Übungen', 'sind', 'für', 'Männer']]


In [229]:
max_words_per_headline = 5
for example in text_in_words:
  new_max = len(example)
  if new_max > max:
    max_words_per_headline = new_max
    
print(max_words_per_headline)

5


In [0]:
words = set()
for sentence in text_in_words:
  for word in sentence:
    words.add(word)

In [187]:
len(words)

10654

In [0]:
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

In [0]:
SEQUENCE_LEN = 2
MIN_WORD_FREQUENCY = 1
STEP = 1
BATCH_SIZE = 32

In [0]:
sentences = []
next_words = []
ignored = 0
for sentence in text_in_words:
  for i in range(0, len(sentence) - SEQUENCE_LEN, STEP):
    sentences.append(sentence[i:i + SEQUENCE_LEN])
    next_words.append(sentence[i + SEQUENCE_LEN])

In [0]:
#print(sentences[0])

In [0]:
#print(next_words[0])

In [0]:
#print(sentences[2])

In [0]:
#print(next_words[2])

In [0]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    # shuffle at unison
    print('Shuffling sentences')

    tmp_sentences = []
    tmp_next_word = []
    for i in np.random.permutation(len(sentences_original)):
        tmp_sentences.append(sentences_original[i])
        tmp_next_word.append(next_original[i])

    cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
    x_train, x_test = tmp_sentences[:cut_index], tmp_sentences[cut_index:]
    y_train, y_test = tmp_next_word[:cut_index], tmp_next_word[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)

In [184]:
(sentences_train, next_words_train), (sentences_test, next_words_test) = shuffle_and_split_training_set(sentences, next_words)

Shuffling sentences
Size of training set = 98214
Size of test set = 2005


In [188]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(words), output_dim=1024),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=128)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(words)),
    tf.keras.layers.Activation('softmax')
])

W0512 15:29:23.394547 140615091943296 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0512 15:29:23.398768 140615091943296 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0512 15:29:23.414896 140615091943296 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init

In [0]:
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t] = word_indices[w]
            y[i] = word_indices[next_word_list[index % len(sentence_list)]]
            index = index + 1
        yield x, y

In [0]:
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [0]:
!mkdir checkpoints

In [0]:
file_path = "./checkpoints/lstm-text-gen-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-" \
                "loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}" % \
                (len(words), SEQUENCE_LEN, MIN_WORD_FREQUENCY)

In [0]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [0]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for i in range(max_words_per_headline):
            x_pred = np.zeros((1, SEQUENCE_LEN))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            sentence = sentence[1:]
            sentence.append(next_word)

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()


In [0]:
checkpoint = tf.keras.callbacks.ModelCheckpoint(file_path, monitor='val_acc', save_best_only=True)
print_callback = tf.keras.callbacks.LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=20)
callbacks_list = [checkpoint, print_callback, early_stopping]

In [0]:
examples_file = open('examples.txt', "w")

In [210]:
model.fit_generator(generator(sentences, next_words, BATCH_SIZE),
                    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
                    epochs=100,
                    callbacks=callbacks_list,
                    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


<tensorflow.python.keras.callbacks.History at 0x7fe2e7691d30>

In [213]:
!less examples.txt

7[?47h[?1h=
----- Generating text after Epoch: 0
----- Diversity:0.3
----- Generating with seed:
"auf YouTube"
auf YouTube
----- Generating text after Epoch: 0
----- Diversity:0.3
----- Generating with seed:
"Update belohnt"
Update belohnt euch bald mit mehr Werbung in den Vordergrund nerven mit Diamant- Display kommen im Camouflage-Look Das sind die neuen Funktionen für iPhone und A pple Watch einer Frau das Leben in Rot erscheinen erscheinen – als mehr Werbung  in den Stories nerven nerven für Schulen: Apple stellt AR und Notizen in den
----- Diversity:0.4
----- Generating with seed:
"Update belohnt"
Update belohnt euch bald mit mehr Werbung in den Stories nerven nerven auf dem S martphone: Vulkan und 4 mit mehr Werbung in den Stories nerven nerven für Schule n: Apple stellt neues neues Design und mehr Kraft Werbung in den Stories nerven  nerven in den Vordergrund nerven geht bei den Stories nerven nerven
----- Diversity:0.5
----- Generating with seed:
[K"

In [215]:
!ls -l checkpoints

total 2433480
-rw-r--r-- 1 root root 177988152 May 12 15:45 lstm-text-gen-epoch001-words10654-sequence2-minfreq1-loss1.9330-acc0.6770-val_loss1.5170-val_acc0.7207
-rw-r--r-- 1 root root 177988152 May 12 15:43 lstm-text-gen-epoch001-words10654-sequence2-minfreq1-loss2.6888-acc0.6056-val_loss2.5850-val_acc0.6205
-rw-r--r-- 1 root root 177988152 May 12 15:41 lstm-text-gen-epoch001-words10654-sequence2-minfreq1-loss3.4432-acc0.5528-val_loss4.0844-val_acc0.5719
-rw-r--r-- 1 root root 177988152 May 12 15:45 lstm-text-gen-epoch002-words10654-sequence2-minfreq1-loss1.3065-acc0.7536-val_loss1.0022-val_acc0.8026
-rw-r--r-- 1 root root 177988152 May 12 15:46 lstm-text-gen-epoch003-words10654-sequence2-minfreq1-loss0.9603-acc0.8018-val_loss0.7820-val_acc0.8309
-rw-r--r-- 1 root root 177988152 May 12 15:47 lstm-text-gen-epoch004-words10654-sequence2-minfreq1-loss0.7879-acc0.8247-val_loss0.6605-val_acc0.8477
-rw-r--r-- 1 root root 177988152 May 12 15:48 lstm-text-gen-epoch005-words10654-sequence2-mi

In [0]:
from google.colab import files
model_json = model.to_json()
with open("lstm-text-gen_acc88.json", "w") as json_file:
    json_file.write(model_json)

In [0]:
files.download("lstm-text-gen_acc88.json")

In [218]:
model.save_weights("lstm-text-gen_model.h5")
print("Saved model to disk")

Saved model to disk


In [221]:
!ls -lh

total 57M
drwxr-xr-x 2 root root 4.0K May 12 16:11 checkpoints
-rw-r--r-- 1 root root 118K May 12 16:26 examples.txt
-rw-r--r-- 1 root root 2.3K May 12 16:30 lstm-text-gen_acc88.json
-rw-r--r-- 1 root root  57M May 12 16:30 lstm-text-gen_model.h5
drwxr-xr-x 1 root root 4.0K May  8 16:22 sample_data


In [0]:
files.download('lstm-text-gen_model.h5')

In [241]:
seed_index = np.random.randint(len(sentences+sentences_test))
seed = (sentences+sentences_test)[seed_index]
word = 'Samsung'
for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
  sentence = seed
  for i in range(max_words_per_headline):
    x_pred = np.zeros((1, SEQUENCE_LEN))
    for t, word in enumerate(sentence):
      x_pred[0, t] = word_indices[word]
      preds = model.predict(x_pred, verbose=0)[0]
      next_index = sample(preds, diversity)
      next_word = indices_word[next_index]

      sentence = sentence[1:]
      sentence.append(next_word)
    
print(sentence)


['mit', 'Twitter']
