In [1]:
import json 
import string 
from bs4 import BeautifulSoup

import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
tokenizer = Tokenizer()

data = """Ich bin an diesem Ort so müde
Doch alles treibt mich zu ihm hin
Die Gesänge werden schweigen
Wo ich nicht war, komm ich nicht hin
Ich bin an diesem Ort so müde
Doch alles treibt mich zu ihm hin
Die Gesänge werden schweigen
Wo ich nicht war, komm ich nicht hin
Will ich doch weiter mich zerstreuen
Schlafend versuche ich mein Glück
Früher war auch nicht alles anders
Erinnerung is just a trick
Will ich doch weiter mich zerstreuen
Schlafend versuche ich mein Glück
Früher war auch nicht alles anders
Erinnerung is just a trick"""

corpus = data.lower().split("\n")
print(corpus)

['ich bin an diesem ort so müde', 'doch alles treibt mich zu ihm hin', 'die gesänge werden schweigen', 'wo ich nicht war, komm ich nicht hin', 'ich bin an diesem ort so müde', 'doch alles treibt mich zu ihm hin', 'die gesänge werden schweigen', 'wo ich nicht war, komm ich nicht hin', 'will ich doch weiter mich zerstreuen', 'schlafend versuche ich mein glück', 'früher war auch nicht alles anders', 'erinnerung is just a trick', 'will ich doch weiter mich zerstreuen', 'schlafend versuche ich mein glück', 'früher war auch nicht alles anders', 'erinnerung is just a trick']


In [13]:
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)

{'ich': 1, 'nicht': 2, 'doch': 3, 'alles': 4, 'mich': 5, 'hin': 6, 'war': 7, 'bin': 8, 'an': 9, 'diesem': 10, 'ort': 11, 'so': 12, 'müde': 13, 'treibt': 14, 'zu': 15, 'ihm': 16, 'die': 17, 'gesänge': 18, 'werden': 19, 'schweigen': 20, 'wo': 21, 'komm': 22, 'will': 23, 'weiter': 24, 'zerstreuen': 25, 'schlafend': 26, 'versuche': 27, 'mein': 28, 'glück': 29, 'früher': 30, 'auch': 31, 'anders': 32, 'erinnerung': 33, 'is': 34, 'just': 35, 'a': 36, 'trick': 37}
38


In [17]:
input_sequences = []
for line in corpus:
  token_list = tokenizer.texts_to_sequences([line])[0]
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    input_sequences.append(n_gram_sequence)

print(input_sequences[:5])

[[1, 8], [1, 8, 9], [1, 8, 9, 10], [1, 8, 9, 10, 11], [1, 8, 9, 10, 11, 12]]


In [21]:
max_seqeunce_len = max([len(x) for x in input_sequences])

input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_seqeunce_len, padding='pre'))
print(input_sequences[:5])

[[ 0  0  0  0  0  0  1  8]
 [ 0  0  0  0  0  1  8  9]
 [ 0  0  0  0  1  8  9 10]
 [ 0  0  0  1  8  9 10 11]
 [ 0  0  1  8  9 10 11 12]]


In [22]:
# feature and label segregation
xs, labels = input_sequences[:,:-1], input_sequences[:,-1]

In [24]:
# one-hot encodoing for predicted text as categorical value
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [28]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(total_words, 8))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(max_seqeunce_len-1)))
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
history = model.fit(xs, ys, epochs=1500, verbose=1)

Epoch 1/1500
Epoch 2/1500
Epoch 3/1500
Epoch 4/1500
Epoch 5/1500
Epoch 6/1500
Epoch 7/1500
Epoch 8/1500
Epoch 9/1500
Epoch 10/1500
Epoch 11/1500
Epoch 12/1500
Epoch 13/1500
Epoch 14/1500
Epoch 15/1500
Epoch 16/1500
Epoch 17/1500
Epoch 18/1500
Epoch 19/1500
Epoch 20/1500
Epoch 21/1500
Epoch 22/1500
Epoch 23/1500
Epoch 24/1500
Epoch 25/1500
Epoch 26/1500
Epoch 27/1500
Epoch 28/1500
Epoch 29/1500
Epoch 30/1500
Epoch 31/1500
Epoch 32/1500
Epoch 33/1500
Epoch 34/1500
Epoch 35/1500
Epoch 36/1500
Epoch 37/1500
Epoch 38/1500
Epoch 39/1500
Epoch 40/1500
Epoch 41/1500
Epoch 42/1500
Epoch 43/1500
Epoch 44/1500
Epoch 45/1500
Epoch 46/1500
Epoch 47/1500
Epoch 48/1500
Epoch 49/1500
Epoch 50/1500
Epoch 51/1500
Epoch 52/1500
Epoch 53/1500
Epoch 54/1500
Epoch 55/1500
Epoch 56/1500
Epoch 57/1500
Epoch 58/1500
Epoch 59/1500
Epoch 60/1500
Epoch 61/1500
Epoch 62/1500
Epoch 63/1500
Epoch 64/1500
Epoch 65/1500
Epoch 66/1500
Epoch 67/1500
Epoch 68/1500
Epoch 69/1500
Epoch 70/1500
Epoch 71/1500
Epoch 72/1500
E

In [32]:
# try an seen sequence
seed_text = "Ich bin an diesem"
token_list = tokenizer.texts_to_sequences([seed_text])[0]
token_list = pad_sequences([token_list], maxlen=max_seqeunce_len-1, padding='pre')

In [33]:
predicted = np.argmax(model.predict(token_list), axis=-1)
print(predicted)

[11]


In [34]:
for word, index in tokenizer.word_index.items():
  if index == predicted:
    print(word)
    break

ort


In [65]:
seed_text = "Svenja bin in Ort"
next_words = 5

print(tokenizer.word_index)

for _ in range(next_words):
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen=max_seqeunce_len, padding='pre')
  predicted = np.argmax(model.predict(token_list), axis=-1)
  
  output_word = ""
  for word, index in tokenizer.word_index.items():
    if index == predicted:
      output_word = word
      break
  
  seed_text += " " + output_word

print(seed_text)

{'ich': 1, 'nicht': 2, 'doch': 3, 'alles': 4, 'mich': 5, 'hin': 6, 'war': 7, 'bin': 8, 'an': 9, 'diesem': 10, 'ort': 11, 'so': 12, 'müde': 13, 'treibt': 14, 'zu': 15, 'ihm': 16, 'die': 17, 'gesänge': 18, 'werden': 19, 'schweigen': 20, 'wo': 21, 'komm': 22, 'will': 23, 'weiter': 24, 'zerstreuen': 25, 'schlafend': 26, 'versuche': 27, 'mein': 28, 'glück': 29, 'früher': 30, 'auch': 31, 'anders': 32, 'erinnerung': 33, 'is': 34, 'just': 35, 'a': 36, 'trick': 37}
Svenja bin in Ort zu weiter weiter so hin


In [60]:
# windowed sentences
window_size = 6
sentences = []
all_text = []

corpus = data.lower()
words = corpus.split(" ")
print(len(words))
print(words)

range_size = len(words) - max_seqeunce_len
for i in range(0, range_size):
  this_sentence = ""
  for word in range(0, window_size-1):
    word = words[i+word]
    this_sentence = this_sentence + word
    this_sentence = this_sentence + " "
  sentences.append(this_sentence)

81
['ich', 'bin', 'an', 'diesem', 'ort', 'so', 'müde\ndoch', 'alles', 'treibt', 'mich', 'zu', 'ihm', 'hin\ndie', 'gesänge', 'werden', 'schweigen\nwo', 'ich', 'nicht', 'war,', 'komm', 'ich', 'nicht', 'hin\nich', 'bin', 'an', 'diesem', 'ort', 'so', 'müde\ndoch', 'alles', 'treibt', 'mich', 'zu', 'ihm', 'hin\ndie', 'gesänge', 'werden', 'schweigen\nwo', 'ich', 'nicht', 'war,', 'komm', 'ich', 'nicht', 'hin\nwill', 'ich', 'doch', 'weiter', 'mich', 'zerstreuen\nschlafend', 'versuche', 'ich', 'mein', 'glück\nfrüher', 'war', 'auch', 'nicht', 'alles', 'anders\nerinnerung', 'is', 'just', 'a', 'trick\nwill', 'ich', 'doch', 'weiter', 'mich', 'zerstreuen\nschlafend', 'versuche', 'ich', 'mein', 'glück\nfrüher', 'war', 'auch', 'nicht', 'alles', 'anders\nerinnerung', 'is', 'just', 'a', 'trick']


In [61]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(total_words, 8))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(max_seqeunce_len-1, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(max_seqeunce_len-1)))
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [64]:
history = model.fit(xs, ys, epochs=1500, verbose=1)

Epoch 1/1500
Epoch 2/1500
Epoch 3/1500
Epoch 4/1500
Epoch 5/1500
Epoch 6/1500
Epoch 7/1500
Epoch 8/1500
Epoch 9/1500
Epoch 10/1500
Epoch 11/1500
Epoch 12/1500
Epoch 13/1500
Epoch 14/1500
Epoch 15/1500
Epoch 16/1500
Epoch 17/1500
Epoch 18/1500
Epoch 19/1500
Epoch 20/1500
Epoch 21/1500
Epoch 22/1500
Epoch 23/1500
Epoch 24/1500
Epoch 25/1500
Epoch 26/1500
Epoch 27/1500
Epoch 28/1500
Epoch 29/1500
Epoch 30/1500
Epoch 31/1500
Epoch 32/1500
Epoch 33/1500
Epoch 34/1500
Epoch 35/1500
Epoch 36/1500
Epoch 37/1500
Epoch 38/1500
Epoch 39/1500
Epoch 40/1500
Epoch 41/1500
Epoch 42/1500
Epoch 43/1500
Epoch 44/1500
Epoch 45/1500
Epoch 46/1500
Epoch 47/1500
Epoch 48/1500
Epoch 49/1500
Epoch 50/1500
Epoch 51/1500
Epoch 52/1500
Epoch 53/1500
Epoch 54/1500
Epoch 55/1500
Epoch 56/1500
Epoch 57/1500
Epoch 58/1500
Epoch 59/1500
Epoch 60/1500
Epoch 61/1500
Epoch 62/1500
Epoch 63/1500
Epoch 64/1500
Epoch 65/1500
Epoch 66/1500
Epoch 67/1500
Epoch 68/1500
Epoch 69/1500
Epoch 70/1500
Epoch 71/1500
Epoch 72/1500
E

In [66]:
seed_text = "Svenja bin in Ort"
next_words = 5

print(tokenizer.word_index)

for _ in range(next_words):
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen=max_seqeunce_len, padding='pre')
  predicted = np.argmax(model.predict(token_list), axis=-1)
  
  output_word = ""
  for word, index in tokenizer.word_index.items():
    if index == predicted:
      output_word = word
      break
  
  seed_text += " " + output_word

print(seed_text)

{'ich': 1, 'nicht': 2, 'doch': 3, 'alles': 4, 'mich': 5, 'hin': 6, 'war': 7, 'bin': 8, 'an': 9, 'diesem': 10, 'ort': 11, 'so': 12, 'müde': 13, 'treibt': 14, 'zu': 15, 'ihm': 16, 'die': 17, 'gesänge': 18, 'werden': 19, 'schweigen': 20, 'wo': 21, 'komm': 22, 'will': 23, 'weiter': 24, 'zerstreuen': 25, 'schlafend': 26, 'versuche': 27, 'mein': 28, 'glück': 29, 'früher': 30, 'auch': 31, 'anders': 32, 'erinnerung': 33, 'is': 34, 'just': 35, 'a': 36, 'trick': 37}
Svenja bin in Ort zu weiter weiter so hin
