# 1. Importing Libraries

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense




In [2]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.15.0-cp310-cp310-win_amd64.whl (2.1 kB)
Collecting tensorflow-intel==2.15.0
  Downloading tensorflow_intel-2.15.0-cp310-cp310-win_amd64.whl (300.9 MB)
     -------------------------------------- 300.9/300.9 MB 5.5 MB/s eta 0:00:00
Collecting keras<2.16,>=2.15.0
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 10.9 MB/s eta 0:00:00
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1
  Downloading gast-0.5.4-py3-none-any.whl (19 kB)
Collecting termcolor>=1.1.0
  Downloading termcolor-2.4.0-py3-none-any.whl (7.7 kB)
Collecting tensorflow-estimator<2.16,>=2.15.0
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl (441 kB)
     -------------------------------------- 442.0/442.0 kB 9.2 MB/s eta 0:00:00
Collecting tensorboard<2.16,>=2.15
  Downloading tensorboard-2.15.1-py3-none-any.whl (5.5

# 2. Loading data

In [64]:
with open('Drei-Meister-Balzac-Dickens-Dostojewski_Stefan-Zweig_Projekt-Gutenberg_36389-8.txt', 'r') as file:
    text = file.read()

# 3. Tokenizing the text to create sequence of words

In [65]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# 4. Creating input-output pairs

In [66]:
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# 5. Padding input sequences to have equal length

In [67]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences,maxlen=max_sequence_len, padding='pre'))

# 6. Splitting sequences into input and output

In [68]:
X = input_sequences[:,:-1]
y = input_sequences[:,-1]

In [69]:
len(input_sequences[:,:-1]) == len(input_sequences) # ????

True

# 7. Converting the output to one-hot encode vectors

In [70]:
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

# 8. Building the Model as RNN & FFNN

# !!! Choose 8.1 or 8.2. Don't run both !!!

## 8.1 building a RNN Architecture:

In [71]:
model = Sequential()
model.add(Embedding(total_words, 100,input_length = max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 15, 100)           1156800   
                                                                 
 lstm_1 (LSTM)               (None, 150)               150600    
                                                                 
 dense_12 (Dense)            (None, 11568)             1746768   
                                                                 
Total params: 3054168 (11.65 MB)
Trainable params: 3054168 (11.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


## 8.2 building a FFNN Architecture:

In [56]:
nr_unique_words = total_words

In [62]:
model = Sequential()
model.add(Embedding(total_words, 100,input_length = max_sequence_len-1))
# model.add(Dense(256, input_shape=(784,), activation="sigmoid"))
model.add(Dense(150, activation="sigmoid"))
model.add(Dense(total_words, activation="softmax"))
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 15, 100)           1156800   
                                                                 
 dense_10 (Dense)            (None, 15, 150)           15150     
                                                                 
 dense_11 (Dense)            (None, 15, 11568)         1746768   
                                                                 
Total params: 2918718 (11.13 MB)
Trainable params: 2918718 (11.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


# 9. Compile and Train the model

# !!! RUNNING RNN MODEL \/

In [72]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(X,y,epochs=100,verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.src.callbacks.History at 0x19ab7418df0>

# 10. Saving the model

In [73]:
model.save('de_3meister_RNN_model_v2_100epochs.h5')

# 11. Generating the next word predictions

In [78]:
from tensorflow.keras.models import load_model
# Load the saved model
model = load_model('de_3meister_LSTM_model_v2_100epochs.h5')  # Replace 'my_model.h5' with the path to your saved model

seed_text = "Es ist"
next_words = 2

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word

print(seed_text)

Es ist ein nur
