In [76]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, InputLayer, Dropout
from keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [77]:
df = pd.read_csv("../data/preprocessing.csv")

In [78]:
df.head()

Unnamed: 0.1,Unnamed: 0,original_text,source,pornografi,sara,radikalisme,pencemaran_nama_baik,tokenized_text,stopword_tokenized,lemmatized_text
0,0,bukan cm spanduk prof video orasi mereka buku ...,twitter,0,0,1,0,"['bukan', 'cm', 'spanduk', 'prof', 'video', 'o...","['cm', 'spanduk', 'prof', 'video', 'orasi', 'b...",cm spanduk prof video orasi buku dll udh sngat...
1,1,memeqbeceq gy sangegatel yh tetekmemeky drnjng...,twitter,1,0,0,0,"['memeqbeceq', 'gy', 'sangegatel', 'yh', 'tete...","['memeqbeceq', 'gy', 'sangegatel', 'yh', 'tete...",memeqbeceq gy sangegatel yh tetekmemeky drnjng...
2,2,pertama kali denger lagunya enk banget in dan ...,instagram,0,0,0,0,"['pertama', 'kali', 'denger', 'lagunya', 'enk'...","['kali', 'denger', 'lagunya', 'enk', 'banget',...",kali denger lagunya enk banget in pngn praktek...
3,3,astajim ini pasti yang kasih penghargaan ke ib...,kaskus,0,0,0,0,"['astajim', 'ini', 'pasti', 'yang', 'kasih', '...","['astajim', 'kasih', 'penghargaan', 'sri', 'an...",astajim kasih penghargaan sri antek aseng wahy...
4,4,beda kalau disini kalau komplain lgs di bully ...,kaskus,0,0,0,0,"['beda', 'kalau', 'disini', 'kalau', 'komplain...","['beda', 'komplain', 'lgs', 'bully', 'ama', 'q...",beda komplain lgs bully ama quotgenkquot kl fr...


In [79]:
# Ambil teks dan label
texts = df['lemmatized_text'].astype(str).values
labels = df[['pornografi', 'sara', 'radikalisme', 'pencemaran_nama_baik']].values

In [80]:
# Tokenisasi teks
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [81]:
# Padding sequences
maxlen = 100
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

In [82]:
bowvecortizer = CountVectorizer()
bow_vector = bowvecortizer.fit_transform(df['lemmatized_text'])

In [83]:
x_bow = bow_vector.toarray()

In [84]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.1, random_state=10)


In [85]:
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (693, 100)
y_train shape: (693, 4)
X_test shape: (77, 100)
y_test shape: (77, 4)


In [86]:
max_len = X_train[1]

In [87]:
# Membuat model LSTM
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128))
model.add(Dense(4, activation='sigmoid'))  # 4 adalah jumlah label

# model = Sequential()
# model.add(Embedding(input_dim=5000, output_dim=128))  # input_dim adalah ukuran vocabulary, output_dim adalah dimensi embedding
# model.add(LSTM(128, return_sequences=True))  # return_sequences=True agar bisa menambahkan LSTM lain
# model.add(Dropout(0.5))  # Dropout untuk mencegah overfitting
# model.add(LSTM(64))  # LSTM kedua dengan ukuran layer 64
# model.add(Dense(4, activation='sigmoid'))  # 4 adalah jumlah label

In [88]:
# optimizer = Adam(learning_rate=0.001)

In [89]:
# # Membuat model LSTM
# model = Sequential()
# model.add(InputLayer(shape=(max_len,)))
# model.add(Dense(128, activation='relu'))  # Mengganti Embedding dengan Dense layer
# model.add(LSTM(128, return_sequences=False))
# model.add(Dense(4, activation='sigmoid'))  # 4 output untuk 4 label

In [90]:
# Kompilasi model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [91]:
# Melatih model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 107ms/step - accuracy: 0.2204 - loss: 0.6267 - val_accuracy: 0.1688 - val_loss: 0.5154
Epoch 2/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 89ms/step - accuracy: 0.1345 - loss: 0.4864 - val_accuracy: 0.1688 - val_loss: 0.5181
Epoch 3/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 90ms/step - accuracy: 0.1905 - loss: 0.4693 - val_accuracy: 0.5584 - val_loss: 0.4729
Epoch 4/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 90ms/step - accuracy: 0.4774 - loss: 0.4112 - val_accuracy: 0.4675 - val_loss: 0.3440
Epoch 5/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 94ms/step - accuracy: 0.4961 - loss: 0.2537 - val_accuracy: 0.5065 - val_loss: 0.3271
Epoch 6/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 101ms/step - accuracy: 0.5196 - loss: 0.1767 - val_accuracy: 0.5844 - val_loss: 0.3532
Epoch 7/10
[1m22/22[0m [32m━━

<keras.src.callbacks.history.History at 0x1acca4a6b90>

In [92]:
from sklearn.metrics import classification_report

# Prediksi probabilitas untuk data uji
y_pred_prob = model.predict(X_test)

# Ubah probabilitas menjadi label kelas
y_pred = np.argmax(y_pred_prob, axis=1)

# Tampilkan classification report
print(classification_report(y_test.argmax(axis=1), y_pred))


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
              precision    recall  f1-score   support

           0       0.93      0.51      0.66        51
           1       0.62      0.62      0.62         8
           2       0.42      1.00      0.59         5
           3       0.34      0.77      0.48        13

    accuracy                           0.60        77
   macro avg       0.58      0.73      0.59        77
weighted avg       0.77      0.60      0.62        77



In [93]:
new_texts = ["ih memek kamu bau banget aku ga suka deh, tapi gapapa jadi pengen aku entot"]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=maxlen)
predictions = model.predict(new_padded_sequences)
print(predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[[0.84180343 0.00194653 0.00116207 0.00560451]]


In [96]:
import pickle

with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)