# **Import Modules**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# **Data Gathering**

In [None]:
# Location of the scraped CNN Indonesia articles (Colab upload path).
DATASET_PATH = "/content/dataset_timnas_cnn.csv"

# The CSV was saved together with its row index; reading that first column as
# the index avoids carrying a meaningless "Unnamed: 0" column around.
df = pd.read_csv(DATASET_PATH, index_col=0)

In [None]:
# Peek at the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,title,content,link,timestamp
0,Penyebab Kekalahan Timnas Indonesia 1-5 dari A...,Timnas Indonesiaharus menelan kekalahan telak ...,https://www.cnnindonesia.com/olahraga/20250321...,2025-03-21 18:52:47.888965
1,"Gagal Penalti, Kevin Diks Tebus dengan Assist ...",Kevin Dikssukses menebus kegagalannya mencetak...,https://www.cnnindonesia.com/olahraga/20250321...,2025-03-21 18:52:48.162710
2,Janji Eliano Reijnders Jelang Indonesia vs Bah...,Eliano Reijnders berjanji akan memberikan pena...,https://www.cnnindonesia.com/olahraga/20250321...,2025-03-21 18:52:48.492959
3,Netizen Puji Debut Ole Romeny: Idola Baru Timn...,Ole Romenymendapat pujian dari netizen usai me...,https://www.cnnindonesia.com/olahraga/20250321...,2025-03-21 18:52:48.770771
4,Struick Terpukul Usai Indonesia Kalah Telak 1-...,Rafael Struickmenyebut kekalahan yang berat us...,https://www.cnnindonesia.com/olahraga/20250321...,2025-03-21 18:52:49.095235


# **Data Cleaning**

In [None]:
# Only a subset of the articles is used so the runtime does not crash
# further down the notebook.
MAX_ARTICLES = 300

# Slice first, then copy: only the rows that are kept get duplicated, and the
# result is still independent of `df`.
data = df["content"].iloc[:MAX_ARTICLES].copy(deep=True)

In [None]:
# Patterns used by the cleaning pipeline below.
boilerplate_pattern = r'\[gambas:.*?\]|advertisement scroll to continue with content'
noise_pattern = re.compile(r'\b\d+\b|[@.,!?;:()\[\]{}\'"<>\/\\|`~#$%^&*+=_]|-')
spacing_pattern = re.compile(r'[ \t\r\f]+')


def _inner_sentences(parts):
    """Return the pieces between the first and the last one, or [] if there are none."""
    if not isinstance(parts, list) or len(parts) <= 1:
        return []
    return parts[1:-1]


def _join_lowercased(sentences):
    """Trim and lowercase every sentence, then put one sentence per line."""
    return "\n".join(piece.strip().lower() for piece in sentences)


data_content = (
    data.astype(str)
    # Remove ".com" before splitting so it is not mistaken for a sentence end.
    .str.replace(".com", "", regex=False)
    .str.split(".")
    .apply(_inner_sentences)
    .apply(_join_lowercased)
    # Drop embedded-media tags and the advertisement marker.
    .str.replace(boilerplate_pattern, '', regex=True)
    # Standalone numbers and punctuation become a space.
    .apply(lambda text: noise_pattern.sub(' ', text))
    # Collapse runs of blanks; "\n" is not in the class, so line breaks survive.
    .apply(lambda text: spacing_pattern.sub(' ', text))
    .str.strip()
)

In [None]:
# Join the articles with a newline, the same separator used between sentences
# inside an article. The n-gram step splits on '\n', so joining with a space
# would fuse the last sentence of one article with the first sentence of the
# next and create n-grams that span two unrelated articles.
full_sentence = data_content.str.cat(sep='\n')

In [None]:
# Show only the beginning of the corpus; displaying the whole string floods
# the notebook output.
full_sentence[:500]

'kekalahan ini menyisakan beberapa catatan penting yang menjadi penyebab hasil buruk tersebut\nberikut penyebab kekalahan indonesia saat dibantai australia \nbelum padu pemain baru bergabung terlalu singkat salah satu faktor yang mempengaruhi permainan timnas indonesia adalah kurangnya kekompakan tim\nsebagian besar pemain baru bergabung dengan skuad hanya tiga hingga empat hari sebelum pertandingan\nwaktu persiapan yang terbatas ini membuat koordinasi dan pemahaman antar pemain belum maksimal sehingga tak jarang terlihat kebingungannya dalam menerapkan strategi di lapangan\n \npertahanan yang kurang solid dan high pressing yang terlalu berisiko timnas indonesia juga menunjukkan pertahanan yang kurang solid\nkeputusan untuk bermain dengan tekanan tinggi atau high pressing justru menjadi bumerang karena sering kali pemain indonesia terjebak dan meninggalkan ruang kosong di pertahanan\nkondisi ini membuat australia dengan mudah memanfaatkan celah tersebut terutama ketika indonesia lebih 

# **Preprocessing Data**

In [None]:
# Learn the word -> integer index mapping from the whole cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=[full_sentence])

# One slot more than the number of distinct words: word indices start at 1,
# and index 0 is what pad_sequences fills with by default.
total_words = 1 + len(tokenizer.word_index)

In [None]:
# Every non-blank line of the corpus is one sentence.
sentences = [text for text in full_sentence.split('\n') if text.strip()]

# For each tokenized sentence emit all of its prefixes of length >= 2; the last
# token of each prefix later becomes the prediction target.
input_sequence = [
    tokens[:end]
    for tokens in tokenizer.texts_to_sequences(sentences)
    for end in range(2, len(tokens) + 1)
]

In [None]:
# Length of the longest n-gram prefix; every sequence is padded up to it.
max_sequence_len = max(map(len, input_sequence))

# Pad on the left so the target word always sits in the last column.
input_sequences = np.array(
    pad_sequences(input_sequence, maxlen=max_sequence_len, padding='pre')
)

In [None]:
# Features are every column except the last; the label is the last column.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [None]:
# One-hot encode the target word ids for categorical cross-entropy.
# The labels are taken from `input_sequences` rather than from `y` itself so
# the cell is idempotent: re-running it does not one-hot encode an array that
# is already one-hot encoded. to_categorical already returns a NumPy array, so
# no extra np.array wrap is needed.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

# **Modeling LSTM**

In [None]:
EMBEDDING_DIM = 100  # size of each learned word vector
LSTM_UNITS = 150     # width of the recurrent layer

# Next-word classifier: word ids -> embeddings -> LSTM -> softmax over the
# vocabulary. The input length is declared with an explicit Input layer because
# the `input_length` argument of Embedding is deprecated in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    Embedding(total_words, EMBEDDING_DIM),
    LSTM(LSTM_UNITS),
    Dense(total_words, activation='softmax')
])



In [None]:
EPOCHS = 100

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the History object so the loss/accuracy curves can be inspected later;
# assigning it also stops its repr from being printed as the cell output.
# verbose=2 logs one plain line per epoch instead of an animated progress bar,
# which keeps the saved notebook output short.
history = model.fit(X, y, epochs=EPOCHS, verbose=2)

Epoch 1/100
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 9ms/step - accuracy: 0.0305 - loss: 7.1475
Epoch 2/100
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 8ms/step - accuracy: 0.1038 - loss: 5.9607
Epoch 3/100
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.1871 - loss: 5.0453
Epoch 4/100
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.2495 - loss: 4.3683
Epoch 5/100
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.2988 - loss: 3.8283
Epoch 6/100
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - accuracy: 0.3533 - loss: 3.3840
Epoch 7/100
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - accuracy: 0.4081 - loss: 2.9919
Epoch 8/100
[1m2113/2113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step - accuracy: 0.4625 - loss: 2.6651
Epoch 9

<keras.src.callbacks.history.History at 0x7928fc133310>

In [None]:
def generate_text(model, tokenizer, seed_text, next_words=1, max_sequence_len=20):
    """Greedily extend ``seed_text`` with ``next_words`` predicted words.

    At every step the text produced so far is tokenized, left-padded to
    ``max_sequence_len - 1`` tokens, passed to the model, and the word with the
    highest predicted score is appended.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-word scores for the single padded sequence it is given.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` dict mapping integer ids to words.
        seed_text: Starting text; it is not modified, a new string is returned.
        next_words: Number of words to append.
        max_sequence_len: Sequence length used when the training data was
            padded; the model input is one token shorter.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        An id that is missing from ``tokenizer.index_word`` is rendered as
        ``"unknown"``.
    """
    for _ in range(next_words):
        # Re-tokenize the growing text; sequences that are too long are cut
        # from the front by pad_sequences, keeping the most recent words.
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded_sequence = pad_sequences(
            [token_list], maxlen=max_sequence_len - 1, padding='pre'
        )

        # verbose=0: otherwise a progress bar is printed for every single word.
        predictions = model.predict(padded_sequence, verbose=0)
        predicted_index = int(np.argmax(predictions, axis=-1)[0])

        output_word = tokenizer.index_word.get(predicted_index, "unknown")
        seed_text += " " + output_word

    return seed_text

In [None]:
# Generate 30 words after the seed word, using the padding length from training.
generate_text(
    model,
    tokenizer,
    seed_text="persiapan",
    next_words=30,
    max_sequence_len=max_sequence_len,
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49

'persiapan naturalisasi tim nasional indonesia itu sudah tampil penuh empat tetapi juga mencetak gol yang mendominasi lini tengah itu tim penting ini tampil impresif dengan pertandingan dan kompetisi tulis al ayam'