In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Contoh data
data = pd.read_csv('../data/selected_samples.csv')
# Membuat DataFrame
df = pd.DataFrame(data)

# Tokenisasi teks
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['lemmatized_text'])
sequences = tokenizer.texts_to_sequences(df['lemmatized_text'])
word_index = tokenizer.word_index

# Padding sequences
max_len = 100  # Panjang maksimal urutan
X_padded = pad_sequences(sequences, maxlen=max_len)

# Label multi-label
y = df[['pornografi', 'sara', 'radikalisme', 'pencemaran_nama_baik']].values

# Membagi data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.25, random_state=5)

# Membuat model LSTM dengan embedding layer
model = Sequential()
model.add(Input(shape=(max_len,)))
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=128, input_length=max_len))
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dense(4, activation='sigmoid'))  # 4 output untuk 4 label

# Kompilasi model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Melatih model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluasi model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')



Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 429ms/step - accuracy: 0.0972 - loss: 0.6855 - val_accuracy: 0.0800 - val_loss: 0.6669
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 194ms/step - accuracy: 0.1254 - loss: 0.6614 - val_accuracy: 0.0800 - val_loss: 0.6656
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 221ms/step - accuracy: 0.1076 - loss: 0.6518 - val_accuracy: 0.0800 - val_loss: 0.6533
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 229ms/step - accuracy: 0.1207 - loss: 0.5921 - val_accuracy: 0.0800 - val_loss: 0.6126
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 233ms/step - accuracy: 0.1477 - loss: 0.5425 - val_accuracy: 0.1200 - val_loss: 0.7664
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 178ms/step - accuracy: 0.2476 - loss: 0.5740 - val_accuracy: 0.1600 - val_loss: 0.5604
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━

In [18]:
from sklearn.metrics import classification_report
import numpy as np 

# Prediksi probabilitas untuk data uji
y_pred_prob = model.predict(X_test)

# Ubah probabilitas menjadi label kelas
y_pred = np.argmax(y_pred_prob, axis=1)

# Tampilkan classification report
print(classification_report(y_test.argmax(axis=1), y_pred))

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
              precision    recall  f1-score   support

           0       0.86      0.67      0.75        18
           1       1.00      0.05      0.09        22
           2       0.00      0.00      0.00         6
           3       0.09      0.75      0.17         4

    accuracy                           0.32        50
   macro avg       0.49      0.37      0.25        50
weighted avg       0.76      0.32      0.32        50

