In [1]:
import json
import pandas as pd

with open("data/intents.json") as data_file:
    data = json.load(data_file)

In [3]:
text_input = []
intents = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        text_input.append(pattern)
        intents.append(intent['tag'])
        
df = pd.DataFrame({'text_input': text_input,
                   'intents': intents})

In [4]:
df.head()

Unnamed: 0,text_input,intents
0,Hai,salam
1,Hi,salam
2,Halo,salam
3,Apa Kabar,salam
4,Selamat Pagi,salam


In [5]:
df.intents.value_counts()

salam         11
bye            8
nama           6
kemampuan      5
KUA            5
pekerjaan      5
Berkas         3
pernikahan     3
rujuk          3
Name: intents, dtype: int64

In [7]:
import string

df.text_input = df.text_input.apply(lambda x: x.lower())

exclude = set(string.punctuation)
df.text_input = df.text_input.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [8]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

le = LabelEncoder()
y_train = le.fit_transform(df.intents)
y_train = to_categorical(y_train)

In [9]:
all_vocab = []
length = []

for idx, row in df.iterrows():
    sent = row['text_input']
    [all_vocab.append(i) for i in sent.split()]
    length.append(len(sent.split()))

In [10]:
len(all_vocab)

135

In [11]:
max(length)

7

In [12]:
len(set(all_vocab))

64

In [13]:
from tensorflow.keras.layers import TextVectorization

max_vocab_length = 86
max_length = 6

text_vectorization = TextVectorization(max_tokens=max_vocab_length,
                                       standardize='lower_and_strip_punctuation',
                                       split='whitespace',
                                       ngrams=None,
                                       output_mode='int',
                                       output_sequence_length=max_length
                                       )

In [14]:
text_vectorization.adapt(df.text_input)

In [15]:
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'apa',
 'kua',
 'aja',
 'lo',
 'kamu',
 'yang',
 'selamat',
 'ngapain',
 'mau',
 'lu',
 'bisa',
 'siapa',
 'rujuk',
 'nama',
 'kalo',
 'emang',
 'tuh',
 'tugas',
 'sih',
 'sape',
 'ok',
 'gimana',
 'di',
 'dah',
 'bawa',
 'wah',
 'tinggal',
 'syarat2',
 'siang',
 'si',
 'semoga',
 'saya',
 'sampai',
 'salam',
 'ping',
 'permisi',
 'perlu',
 'pekerjaan',
 'pagi',
 'p',
 'nya',
 'nikah',
 'menyenangkan',
 'menikah',
 'malam',
 'makasih',
 'lakukan',
 'lagi',
 'kerja',
 'kemampuan',
 'ke',
 'kabar',
 'jumpa',
 'hi',
 'harus',
 'harimu',
 'halo',
 'hai',
 'dibawa',
 'dadah',
 'daah',
 'bye',
 'berkas',
 'anda']

In [16]:
text_vectorization('halo saya mau menikah')

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([58, 33, 10, 45,  0,  0], dtype=int64)>

In [17]:
text_vectorization.get_vocabulary()[0]

''

In [18]:
from tensorflow.keras.layers import Embedding
embedding = Embedding(input_dim=max_vocab_length,
                      output_dim=16,
                      embeddings_initializer="uniform",
                      input_length=max_length)

In [19]:
import numpy as np
res_embed = embedding(np.array([[58, 33, 10, 45,  0,  0]]))
res_embed

<tf.Tensor: shape=(1, 6, 16), dtype=float32, numpy=
array([[[ 0.04194237,  0.0130688 ,  0.01129778, -0.01781829,
         -0.04763865,  0.03290497, -0.02823228,  0.00031004,
         -0.02198743, -0.00862699, -0.03788847,  0.02357319,
         -0.04832486,  0.0405406 ,  0.04538328,  0.03169156],
        [-0.03514776,  0.04857652, -0.01164766, -0.0059258 ,
          0.03125368,  0.01332663,  0.00911186, -0.03085229,
          0.03050241, -0.03810525,  0.03780134, -0.01485773,
          0.02375335,  0.04321754,  0.01261162,  0.01931432],
        [-0.00240798,  0.04827963,  0.04958204, -0.02234999,
         -0.00779647,  0.03184755, -0.00563807, -0.01604617,
          0.04825505, -0.03456446, -0.02288265, -0.01593714,
          0.04310013, -0.00767217, -0.01973579, -0.04475765],
        [ 0.04593131,  0.01069583,  0.04892487,  0.04448426,
         -0.04215707, -0.02442079, -0.0492838 ,  0.00950613,
          0.00554507, -0.04543059, -0.04565214,  0.02983906,
          0.03018907, -0.02684

In [20]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, LSTM
inputs = Input(shape=(1,), dtype='string')
x = text_vectorization(inputs)
x = embedding(x)
x = LSTM(12)(x)
outputs = Dense(9, activation='softmax')(x)
model_lstm = Model(inputs, outputs, name="LSTM_model")

In [21]:
model_lstm.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=["accuracy"])

In [22]:
model_lstm.fit(df.text_input,
               y_train,
               epochs=200,
               verbose=0)

<keras.callbacks.History at 0x15377353820>

In [23]:
model_lstm.evaluate(df.text_input, y_train)



[0.3025497496128082, 0.9591836929321289]

In [24]:
model_lstm.save("bot_model.tf")



INFO:tensorflow:Assets written to: bot_model.tf\assets


INFO:tensorflow:Assets written to: bot_model.tf\assets


In [25]:
import pickle
le_filename = open("label_encoder.pickle", "wb")
pickle.dump(le, le_filename)
le_filename.close()