In [11]:
# Source: https://github.com/ryanmcdermott/trump-speeches/blob/master/speeches.txt

with open("speeches.txt", "r", encoding="utf-8") as file:
    contents = file.read()

# Discard every line that contains "SPEECH", strip surrounding whitespace
# from the remaining lines and glue them back into a single string.
contents = "\n".join(
    line.strip()
    for line in contents.split("\n")
    if "SPEECH" not in line
)

In [12]:
# http://www.nltk.org/

import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt')

# Split the whole corpus into word / punctuation tokens.
tokens = word_tokenize(contents)

# Every token is handed to the vectorizer as its own "document", so the
# fitted vocabulary is the set of distinct tokens (case is preserved).
#
# The pattern has to be "(.+)" rather than "(.*)": "(.*)" additionally yields
# an empty match at the end of each token, which adds a bogus "" entry to the
# vocabulary and pushes it one entry past max_features, so a real token is
# dropped.
cv = CountVectorizer(max_features=len(set(tokens)), lowercase=False, token_pattern=r"(.+)")
cv.fit(tokens)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older installations.
if hasattr(cv, "get_feature_names_out"):
    features = list(cv.get_feature_names_out())
else:
    features = cv.get_feature_names()

# Lookup tables in both directions: token -> index and index -> token.
word_to_int = {word: index for index, word in enumerate(features)}
int_to_word = {index: word for index, word in enumerate(features)}

[nltk_data] Downloading package punkt to /Users/jannis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Encode the corpus as a list of vocabulary indices; tokens that are not in
# the vocabulary are skipped.
tokens_transformed = []
for token in tokens:
    if token in word_to_int:
        tokens_transformed.append(word_to_int[token])

In [14]:
import numpy as np

seq_length = 40

# Number of complete (window, next-token) pairs that fit into the corpus.
n_windows = len(tokens_transformed) - seq_length

# X[k] holds seq_length consecutive token indices starting at position k ...
X = np.array([
    tokens_transformed[start:start + seq_length]
    for start in range(n_windows)
])
# ... and y[k] is the token index that directly follows that window.
y = np.array(tokens_transformed[seq_length:])

In [15]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding, Flatten

vocab_size = cv.max_features

# Next-token model: embed each of the seq_length input indices into 25
# dimensions, run them through a 50-unit LSTM and predict a softmax
# distribution over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 25, input_shape=(seq_length,)),
    LSTM(50, return_sequences=False),
    Dense(vocab_size, activation="softmax"),
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [16]:
from keras.utils import to_categorical

from keras.callbacks import ModelCheckpoint

# Training settings: two epochs, mini-batches of 32 samples and 20% of the
# samples held out for validation.
fit_options = {"epochs": 2, "batch_size": 32, "validation_split": 0.2}

# The targets are one-hot encoded over the vocabulary to match the
# categorical cross-entropy loss the model was compiled with.
model.fit(X, to_categorical(y, num_classes=cv.max_features), **fit_options)

Train on 164671 samples, validate on 41168 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1b82e0d5f8>

In [17]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 40, 25)            176525    
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                15200     
_________________________________________________________________
dense_2 (Dense)              (None, 7061)              360111    
Total params: 551,836
Trainable params: 551,836
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Persist the trained model to disk under the path "trump.mini.model".
model.save("trump.mini.model")

In [10]:
import tensorflowjs as tfjs
import numpy as np

# Export the Keras model through the tensorflowjs converter into the
# "data/minimodel" directory.
# NOTE(review): quantization_dtype=np.uint8 presumably stores the weights as
# 8-bit values to shrink the export -- confirm against the tensorflowjs
# version in use.
tfjs.converters.save_keras_model(model, "data/minimodel", quantization_dtype=np.uint8)