In [1]:
import numpy as np
import random
import json
import nltk
from nltk.stem.snowball import SnowballStemmer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# German Snowball stemmer; the preprocessing loops further down call
# snowball.stem() on every token.
snowball = SnowballStemmer("german")

# Read the question/answer definitions. Later cells access data["qa"] and,
# per entry, the keys "tag", "questions" and "answers".
with open("qa.json", encoding="utf8") as qa_file:
    data = json.load(qa_file)

In [2]:
# Flat containers filled while walking the intents:
#   words      - every token of every question (duplicates kept)
#   labels     - distinct intent tags in order of first appearance
#   questions  - raw question strings
#   answers_y  - intent tag of the question at the same position
words, labels, questions, answers_y = [], [], [], []

# Walk every intent and every example question attached to it.
for intent in data["qa"]:
    for pattern in intent["questions"]:
        tokenized_words = nltk.word_tokenize(pattern, language='german')
        words += tokenized_words

        # Keep question and tag aligned by appending both in lockstep.
        tag = intent["tag"]
        questions.append(pattern)
        answers_y.append(tag)

        # A tag only becomes a label once it has at least one question.
        if tag not in labels:
            labels.append(tag)

In [3]:
from nltk.corpus import stopwords
import string

training_x = []
training_y = []

# All-zero template row with one slot per label; copied for every question.
label_bag = [0] * len(labels)

# Translation table that deletes ASCII punctuation characters.
table = str.maketrans("", "", string.punctuation)

for question, answer_tag in zip(questions, answers_y):
    # tokenize + lowercase
    lowered = [tok.lower() for tok in nltk.word_tokenize(question, language='german')]
    # remove punctuation, then keep purely alphabetic or purely numeric tokens
    cleaned = (tok.translate(table) for tok in lowered)
    kept = [tok for tok in cleaned if tok.isalpha() or tok.isnumeric()]
    # stemming
    training_x.append([snowball.stem(tok) for tok in kept])

    # one-hot encode the label: 1 at the position of this question's tag
    one_hot = list(label_bag)
    one_hot[labels.index(answer_tag)] = 1
    training_y.append(one_hot)

training_y = np.array(training_y)

In [4]:
# Fit the vocabulary on the stemmed token lists.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(training_x)

# token -> integer id mapping learned above
word_index = tokenizer.word_index
print(len(word_index))

# Convert every token list to ids and pad each one to a fixed length of 300.
sequences = tokenizer.texts_to_sequences(training_x)
padded_sequences = pad_sequences(sequences, maxlen=300)
print(padded_sequences)

# One extra slot on top of the vocabulary: id 0 is what the padding consists of.
num_words = 1 + len(word_index)

100
[[  0   0   0 ...   0   0  37]
 [  0   0   0 ...   9  38  39]
 [  0   0   0 ...   0   0  40]
 ...
 [  0   0   0 ...   0  16   5]
 [  0   0   0 ...   0   0  99]
 [  0   0   0 ...   0  36 100]]


In [5]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, LSTM, Input, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.initializers import Constant
from keras.models import Model

# define model: embedding -> LSTM -> softmax over the intent labels
model = Sequential()
# 300-dimensional embeddings; input_length must equal the maxlen that is
# passed to pad_sequences for the training and test sequences (300).
model.add(Embedding(num_words, 300, input_length=300))
model.add(LSTM(64))
# One output unit per label, matching the one-hot rows in training_y.
model.add(Dense(len(labels), activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
model.summary()

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          30300     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_1 (Dense)              (None, 14)                910       
Total params: 124,650
Trainable params: 124,650
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
# Train on the padded question sequences against their one-hot intent labels,
# one sample per gradient step.
model.fit(
    x=padded_sequences,
    y=training_y,
    batch_size=1,
    epochs=100,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
 3/54 [>.............................] - ETA: 21s - loss: 0.0307 - acc: 1.0000

KeyboardInterrupt: 

In [7]:
# Debug output: show the stemmed token lists built from the training questions.
print(training_x)

[['hallo'], ['ist', 'jemand', 'da'], ['hi'], ['gut', 'tag'], ['hey'], ['moin'], ['servus'], ['wie', 'geht'], ['was', 'geht'], ['mir', 'geht', 'es', 'auch', 'gut'], ['mir', 'geht', 'es', 'sup'], ['es', 'geht', 'mir', 'gut'], ['gut'], ['sup'], ['ganz', 'ok'], ['nicht', 'so', 'gut'], ['es', 'geht', 'mir', 'schlecht'], ['schlecht'], ['wie', 'viel', 'sport'], ['wie', 'oft', 'sollt', 'ich', 'sport', 'mach'], ['wie', 'kann', 'ich', 'ein', 'sixpack', 'aufbau'], ['wie', 'bekommt', 'man', 'bauchmuskeln'], ['worauf', 'ist', 'zu', 'acht', 'wenn', 'man', 'mit', 'dem', 'krafttraining', 'anfangt'], ['ich', 'mocht', 'ins', 'fitnessstudio', 'worauf', 'muss', 'ich', 'acht'], ['was', 'ist', 'zu', 'beacht', 'wenn', 'man', 'mit', 'fitness', 'anfang', 'will'], ['was', 'ist', 'unt', 'gesund', 'ernahr', 'zu', 'versteh'], ['wie', 'muss', 'ich', 'mich', 'ernahr', 'um', 'gesund', 'zu', 'bleib'], ['gesund', 'nahrung'], ['wie', 'ernahr', 'ich', 'mich', 'gesund'], ['wie', 'lang', 'muss', 'ich', 'traini', 'um', 'erf

In [8]:
#testing

test = ["ich möchte gerne mit fitness anfangen"]
preprocessed_test = []

# Translation table that deletes ASCII punctuation; built once for all questions.
table = str.maketrans("", "", string.punctuation)

#Preprocessing (same steps as for the training questions)
for question in test:
    #tokenize
    tokens = nltk.word_tokenize(question, language='german')
    #lowercase
    tokens = [token.lower() for token in tokens]
    #remove punctuation
    stripped = [w.translate(table) for w in tokens]
    #remove non-alphabetic/non-numeric
    real_tokens = [token for token in stripped if token.isalpha() or token.isnumeric()]
    #stemming
    sequence = [snowball.stem(token) for token in real_tokens]

    preprocessed_test.append(sequence)

print(preprocessed_test)
test_samples_tokens = tokenizer.texts_to_sequences(preprocessed_test)
padded_samples = pad_sequences(test_samples_tokens, maxlen=300)

results = model.predict(x=padded_samples)

# Evaluate every test question on its own row of probabilities. Calling
# np.argmax on the whole 2-D result would index into the flattened batch and
# only be valid for a single sample.
for probabilities in results:
    result_index = np.argmax(probabilities)
    tag = labels[result_index]
    print(probabilities)
    print(tag)
    print(result_index)

    # Below 50 % confidence the bot admits it does not know instead of
    # answering with the (probably wrong) top intent.
    if probabilities[result_index] < 0.50:
        print("ich weiß nicht")
        continue

    # Look up the answers of the predicted intent; start from an empty list so
    # a missing tag cannot raise NameError or reuse a value from an earlier run.
    responses = []
    for t in data["qa"]:
        if t["tag"] == tag:
            responses = t["answers"]

    if responses:
        print(random.choice(responses))
    print(probabilities[result_index])

model.save('chatbot')

[['ich', 'mocht', 'gern', 'mit', 'fitness', 'anfang']]
[2.0884080e-03 4.9062946e-04 7.2286103e-04 2.0856734e-03 2.0017901e-03
 7.6918998e-03 8.8047105e-01 4.3495353e-02 1.3347347e-02 1.9683968e-04
 3.3437867e-02 1.3619683e-02 2.2670202e-04 1.2379009e-04]
anfangen
6
Wenn du mit dem Fitnesstraining beginnen möchtest, ist es sehr wichtig, dass du auf eine saubere Ausführung der Übungen achtest, um Verletzungen vorzubeugen. Einen für dich passenden Trainingsplan findest du unter ...de
0.88047105
