## Persiapan

In [13]:
# Import library
import json
import nltk
from nltk.stem import WordNetLemmatizer
import random
import string
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense, Dropout # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore
from nltk.corpus import stopwords
from textblob import TextBlob 

nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

# Inisialisasi variabel
documents = []
patterns = []  # Daftar pola kalimat
words = []  # Daftar semua kata unik yang ditemukan dalam patterns
classes = []  # Kata semua tag intent unik 

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# JSON
# Konversi ke object python
with open("dataset.json", "r") as file:
    data = json.load(file)

print(data)

{'intents': [{'tag': 'perkenalan', 'patterns': ['hallo', 'hai', 'hi', 'hy', 'pagi', 'morning', 'siang', 'afternoon', 'sore', 'malam', 'bro', 'sis', 'kawan'], 'responses': ['Hai! AI-chan di sini. Mau tau informasi apa tentang Sahrul-sama?', 'Hallo! Saya AI-chan, salam kenal! Mau tau tentang informasi Sahrul-sama?']}, {'tag': 'salam', 'patterns': ['bye', 'good bye', 'selamat tinggal', 'sampai jumpa', 'see you', 'dadah'], 'responses': ['Bye! Semoga hari-harimu menyenangkan!', 'Dadahh! Senang bisa membantu, sampai jumpa!', 'Good bye! Sampai bertemu lagi!', 'Dahh, semoga harimu menyenangkan!', 'Sampai jumpa lagi! Semoga hari kamu luar biasa!']}, {'tag': 'pujian', 'patterns': ['keren', 'hebat', 'wow', 'mantap', 'kamu hebat!', 'kamu keren!', 'super'], 'responses': ['Sahrul-sama memang hebat!', 'Sahrul-sama memang keren, ganteng juga!', 'Saya tidak hebat, hanya Pahlawan Himmel dan Sahrul-sama yang hebat']}, {'tag': 'kata kasar', 'patterns': ['jelek', 'goblok', 'tolol', 'kamu jelek', 'kamu gobl

In [15]:
# Menghitung jumlah intents
jumlah_intents = len(data["intents"])

# Menampilkan jumlah intents
print("Jumlah intents:", jumlah_intents)

Jumlah intents: 28


## Pre Processing

In [16]:
# Spell Correction menggunakan TextBlob
def spell_correction(text):
    return str(TextBlob(text).correct())

In [17]:
for intent in data['intents']:  # Iterasi melalui array 'intents' dalam dataset
    for pattern in intent['patterns']:  # Iterasi melalui pola kalimat pada setiap intent
        # Koreksi ejaan menggunakan TextBlob
        corrected_pattern = spell_correction(pattern)
        
        # Tokenisasi setiap kata dalam pola kalimat yang telah dikoreksi
        w = nltk.word_tokenize(corrected_pattern)
        # Lemmatization dan filter kata umum menggunakan stopwords dari NLTK
        w = [lemmatizer.lemmatize(word.lower()) for word in w if word.lower() not in stop_words and word not in string.punctuation]  # Filter stopwords dan tanda baca
        words.extend(w)  # Menambahkan token yang ditemukan ke daftar 'words'
        patterns.append(corrected_pattern)

        # Menambahkan dokumen (token) dan label intent (tag) ke dalam corpus
        documents.append((w, intent['tag']))

        # Menambahkan tag intent ke dalam daftar 'classes' jika belum ada
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

In [18]:
# Sortir kata-kata untuk konsistensi
words = sorted(list(set(words)))
classes = sorted(list(set(classes)))

print("Jumlah documents = ", len(documents))
print(documents)

print("\nJumlah classes = ", len(classes))
print(classes)

print("\nUnique lemmatized words = ", len(words))
print(words)

Jumlah documents =  126
[(['hallo'], 'perkenalan'), (['had'], 'perkenalan'), (['hi'], 'perkenalan'), (['he'], 'perkenalan'), (['page'], 'perkenalan'), (['morning'], 'perkenalan'), (['sing'], 'perkenalan'), (['afternoon'], 'perkenalan'), (['sore'], 'perkenalan'), (['madam'], 'perkenalan'), (['brow'], 'perkenalan'), (['his'], 'perkenalan'), (['kazan'], 'perkenalan'), (['bye'], 'salam'), (['good', 'bye'], 'salam'), (['seaman', 'lingual'], 'salam'), (['sample', 'jump'], 'salam'), (['see', 'you'], 'salam'), (['madam'], 'salam'), (['keen'], 'pujian'), (['heat'], 'pujian'), (['now'], 'pujian'), (['mental'], 'pujian'), (['same', 'heat'], 'pujian'), (['same', 'keen'], 'pujian'), (['super'], 'pujian'), (['week'], 'kata kasar'), (['goblok'], 'kata kasar'), (['toll'], 'kata kasar'), (['same', 'week'], 'kata kasar'), (['same', 'goblok'], 'kata kasar'), (['one'], 'kalimat singkat'), (['sip'], 'kalimat singkat'), (['baiklah'], 'kalimat singkat'), (['ilya'], 'kalimat singkat'), (['okelah'], 'kalimat s

In [19]:
# TF-IDF
# Inisialisasi TfidfVectorizer untuk TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(patterns)  # Menghasilkan vektor fitur TF-IDF

In [20]:
# Simpan words dan classes untuk digunakan dalam klasifikasi
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb')) 
pickle.dump(words, open('words.pkl', 'wb'))
pickle.dump(classes, open('class.pkl', 'wb'))

## Training

In [21]:
# Persiapkan variabel
training = []
output_empty = [0] * len(classes)

for i, doc in enumerate(documents):
    # Membuat output untuk klasifikasi
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    training.append([X[i].toarray()[0], output_row])  # Menggunakan X[i] sebagai input vektor TF-IDF

In [22]:
# Acak fitur dan masukkan ke array
random.shuffle(training)
training = np.array(training, dtype=object)

# Membuat fitur dan target
train_x = list(training[:, 0])
train_y = list(training[:, 1])

## Build Model Menggunakan NN

In [23]:
model = Sequential()

model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

adam = Adam(learning_rate=0.001)  
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [24]:
# Fitting model dan simpan model 
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
model.save('model.h5')

Epoch 1/200
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0443 - loss: 3.3004  
Epoch 2/200
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0386 - loss: 3.3009   
Epoch 3/200
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0553 - loss: 3.2752   
Epoch 4/200
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0878 - loss: 3.2322   
Epoch 5/200
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1599 - loss: 3.1921   
Epoch 6/200
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1626 - loss: 3.1120
Epoch 7/200
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2076 - loss: 3.0367   
Epoch 8/200
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2941 - loss: 2.9229
Epoch 9/200
[1m26/26[0m [32m

