<a href="https://colab.research.google.com/github/salzakartika1802/PROJECT-DEEP-LEARNING/blob/main/done.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow



In [4]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Fungsi untuk membuat generator sequences
def generate_sequences(file_path, tokenizer, max_sequence_len, batch_size=64, total_words=None):
    with open(file_path, "r", encoding="utf-8") as file:
        X, y = [], []
        for line in file:
            line = line.strip()
            if not line:  # Lewati baris kosong
                continue

            token_list = tokenizer.texts_to_sequences([line])[0]
            for i in range(1, len(token_list)):
                n_gram_sequence = token_list[:i+1]
                n_gram_sequence = pad_sequences([n_gram_sequence], maxlen=max_sequence_len, padding='pre')[0]
                X.append(n_gram_sequence[:-1])
                y.append(n_gram_sequence[-1])

                if len(X) == batch_size:
                    yield np.array(X), to_categorical(y, num_classes=total_words)
                    X, y = [], []

        if X:  # Yield sisa data jika ada
            yield np.array(X), to_categorical(y, num_classes=total_words)

# Parameter utama
file_path = "datasetnovel.txt"
max_sequence_len = 50
batch_size = 64

# 1. Persiapan Data
# Tokenisasi teks
with open(file_path, "r", encoding="utf-8") as file:
    data = file.read()

# Inisialisasi tokenizer
tokenizer = Tokenizer(num_words=10000)  # Batasi kosakata hingga 10.000 kata
tokenizer.fit_on_texts([data])
total_words = len(tokenizer.word_index) + 1

# Hitung jumlah sequence yang valid untuk menentukan steps_per_epoch
sequence_count = 0
for line in data.split("\n"):
    line = line.strip()
    if not line:
        continue
    token_list = tokenizer.texts_to_sequences([line])[0]
    sequence_count += max(0, len(token_list) - 1)  # Setiap baris menyumbang sequence sebanyak len(token_list) - 1

steps_per_epoch = (sequence_count + batch_size - 1) // batch_size  # Pembulatan ke atas

# 2. Membangun Model
model = Sequential([
    Embedding(total_words, 64, input_length=max_sequence_len-1),
    LSTM(100),
    Dense(total_words, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 3. Melatih Model dengan Generator dan Early Stopping
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

model.fit(
    x=generate_sequences(file_path, tokenizer, max_sequence_len, batch_size=batch_size, total_words=total_words),
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    verbose=1,
    callbacks=[early_stopping]
)

# 4. Fungsi untuk Menghasilkan Teks Baru dengan Sampling
def sample_with_temperature(predictions, temperature=1.0):
    predictions = np.log(predictions + 1e-8) / temperature  # Tambahkan epsilon untuk stabilitas numerik
    exp_preds = np.exp(predictions)
    probabilities = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(probabilities), p=probabilities)

def generate_story(seed_text, next_words, max_sequence_len, temperature=1.0):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predictions = model.predict(token_list, verbose=0)[0]
        predicted = sample_with_temperature(predictions, temperature)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Input dari pengguna
seed_text = input("Masukkan kalimat pertama: ")
next_words = int(input("Masukkan jumlah kata yang ingin dihasilkan: "))

# Menghasilkan teks baru
generated_text = generate_story(seed_text, next_words=next_words, max_sequence_len=max_sequence_len, temperature=1.0)
print("Generated Story:")
print(generated_text)


Epoch 1/10
[1m9620/9620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m953s[0m 99ms/step - accuracy: 0.1943 - loss: 4.8631
Epoch 2/10
[1m9620/9620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 3/10
[1m9620/9620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 4/10
[1m9620/9620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00
Epoch 5/10
[1m9620/9620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3us/step - accuracy: 0.0000e+00 - loss: 0.0000e+00


  self.gen.throw(typ, value, traceback)


Masukkan kalimat pertama: once upon a time
Masukkan jumlah kata yang ingin dihasilkan: 500
Generated Story:
once upon a time there was a group rex was very happy that could use his colors to climb to help its family plants and chased all about her adventure on his picnic all what talking friends and always make anything enjoying her lemonade very lively friends she could get a glow look too fast rescues brighter and pull solar sammy up up to play at the town from his garden to the tools to glow one day on carly after her imagination shine and happy to climbed from his leaves fred's had astronauts again near new things speed speed fences over and make it played tricks on a hand rhymes and daisies the wand to coat over the tree it could spend stories and helped his hive and keep them but it made his friend letting freddy fishes chocolate plants with each special adventure eyelids every day safely up and could find sing after any sugar wobbly flying in the home chasing lilly's friends the