## **BI-LSTM + TF-IDF**

---

Dataset tweet berasal dari penelitian: [Emotion dataset from Indonesian public opinion](https://www.sciencedirect.com/science/article/pii/S2352340922006588?via%3Dihub)

Dataset                              : [link Dataset](https://github.com/Ricco48/Emotion-Dataset-from-Indonesian-Public-Opinion)

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ast

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
print("Tensorflow version:", tf.__version__)

Tensorflow version: 2.16.1


## 1. Persiapan Dataset

In [3]:
import pandas as pd

# Path ke file dataset
train_dataset_path = '..\Dataset\Training\Clean_train_data_full.csv'

# Membaca dataset
train_df = pd.read_csv(train_dataset_path, sep=',', header=None, usecols=[1,2], skiprows=1)
train_df.columns = ['tweet', 'label']

# Tampilkan 10 baris pertama dataset
print(train_df.head(10))


                                               tweet  label
0           ['pagi', 'sudah', 'di', 'buat', 'emosi']  Anger
1  ['kok', 'stabilitas', 'negara', 'memange', 'ta...  Anger
2  ['sudah', 'lah', 'emosi', 'terus', 'liat', 'em...  Anger
3  ['aib', 'bodoh', 'benar', 'sebelum', 'kata', '...  Anger
4    ['dih', 'kamu', 'yang', 'menyebalkan', 'bodoh']  Anger
5  ['asli', 'malu', 'maluin', 'orang', 'indo', 't...  Anger
6                          ['drama', 'abg', 'tolol']  Anger
7  ['masih', 'emosi', 'sih', 'sama', 'katla', 'ke...  Anger
8  ['bangsat', 'tribute', 'no', 'bencana', 'no', ...  Anger
9  ['ingin', 'pergi', 'jauh', 'terus', 'teriak', ...  Anger


## 2. Preprocessing Data

In [4]:
import ast

# Mengubah label menjadi indeks numerik
def get_label_idx(label):
    if label == 'Anger':
        return 0
    if label == 'Joy':
        return 1
    if label == 'Fear':
        return 2
    if label == "Love":
        return 3
    if label == "Sad":
        return 4
    if label == "Neutral":
        return 5

train_df['label'] = train_df['label'].apply(get_label_idx)

# Fungsi untuk menggabungkan teks yang telah diproses
def join_text_list(texts):
    texts = ast.literal_eval(texts)
    return ' '.join([text for text in texts])

train_df["preprocessed"] = train_df["tweet"].apply(join_text_list)
print(train_df["preprocessed"].head())


0                             pagi sudah di buat emosi
1    kok stabilitas negara memange tahun negara tid...
2                      sudah lah emosi terus liat emyu
3    aib bodoh benar sebelum kata aib itu muncul te...
4                      dih kamu yang menyebalkan bodoh
Name: preprocessed, dtype: object


## 3. TF-IDF Vectorization

Referensi : (Sentiment Analysis Based on Deep Learning: A Comparative Study)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Inisialisasi TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=20000)  # Mengurangi fitur untuk menghemat memori

# Fit dan transform data teks
tfidf_data = vectorizer.fit_transform(train_df["preprocessed"])

# Konversi ke array
tfidf_data = tfidf_data.toarray()

print(tfidf_data.shape)


(7080, 16195)


## 4. Build and Compile Model

Referensi : https://github.com/janerjzou/sentiment_analysis_with_Bi-LSTM/blob/main/sentiment_analysis_with_Bi-LSTM.ipynb

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Reshape

# Membuat model dengan TF-IDF dan LSTM sesuai arsitektur yang diinginkan
model = Sequential([
    Dense(128, activation='relu', input_shape=(tfidf_data.shape[1],)),
    Reshape((128, 1)),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(6, activation='softmax')  # Menggunakan softmax untuk multi-kelas
])

# Kompilasi model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None


## 5. Pelatihan Model

In [7]:
%%time

from sklearn.model_selection import train_test_split
import tensorflow as tf

# Membagi dataset menjadi data pelatihan dan validasi
train_tfidf, valid_tfidf, train_labels, valid_labels = train_test_split(tfidf_data, train_df['label'], test_size=0.2, random_state=42)

# Mengubah data menjadi batch
batch_size = 32
train_data = tf.data.Dataset.from_tensor_slices((train_tfidf, train_labels)).batch(batch_size)
valid_data = tf.data.Dataset.from_tensor_slices((valid_tfidf, valid_labels)).batch(batch_size)

# Melatih model dengan batch
history = model.fit(train_data, epochs=10, 
                    validation_data=valid_data, verbose=2)


Epoch 1/10
177/177 - 57s - 324ms/step - accuracy: 0.2655 - loss: 1.7812 - val_accuracy: 0.2747 - val_loss: 1.7739
Epoch 2/10
177/177 - 42s - 237ms/step - accuracy: 0.2846 - loss: 1.7627 - val_accuracy: 0.2747 - val_loss: 1.7584
Epoch 3/10
177/177 - 43s - 245ms/step - accuracy: 0.2846 - loss: 1.7498 - val_accuracy: 0.2747 - val_loss: 1.7508
Epoch 4/10
177/177 - 51s - 291ms/step - accuracy: 0.2846 - loss: 1.7433 - val_accuracy: 0.2747 - val_loss: 1.7473
Epoch 5/10
177/177 - 41s - 229ms/step - accuracy: 0.2846 - loss: 1.7401 - val_accuracy: 0.2747 - val_loss: 1.7459
Epoch 6/10
177/177 - 42s - 235ms/step - accuracy: 0.2846 - loss: 1.7387 - val_accuracy: 0.2747 - val_loss: 1.7455
Epoch 7/10
177/177 - 38s - 216ms/step - accuracy: 0.2846 - loss: 1.7380 - val_accuracy: 0.2747 - val_loss: 1.7454
Epoch 8/10
177/177 - 40s - 223ms/step - accuracy: 0.2846 - loss: 1.7378 - val_accuracy: 0.2747 - val_loss: 1.7454
Epoch 9/10


KeyboardInterrupt: 

## 6. Evaluasi Model

In [1]:
import matplotlib.pyplot as plt

# Fungsi untuk plot grafik training history
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

# Plot grafik akurasi dan loss
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

# Evaluasi pada data validasi
from sklearn.metrics import classification_report

# Melakukan prediksi pada data validasi
y_pred = model.predict(valid_tfidf)
y_pred_classes = np.argmax(y_pred, axis=1)

# Mencetak classification report
print(classification_report(valid_labels, y_pred_classes))


NameError: name 'history' is not defined

## 7. Testing

In [None]:
import numpy as np

# Contoh teks sampel
sample_text = ["kecewa banget udah berbayar tapi gak bisa nonton tim uber main"]

# TF-IDF vektorisasi pada teks sampel
sample_tfidf = vectorizer.transform(sample_text).toarray()

# Melakukan prediksi dengan model
predictions = model.predict(sample_tfidf)
print(model.predict(sample_tfidf))

# Mendapatkan label dari prediksi
def get_label(predict):
    if predict == 0:
        return 'anger'
    if predict == 1:
        return 'joy'
    if predict == 2:
        return 'fear'
    if predict == 3:
        return 'love'
    if predict == 4:
        return 'sad'
    if predict == 5:
        return 'neutral'

# Print prediksi label
predicted_label = get_label(np.argmax(predictions))
print(predicted_label)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[[2.5055828e-02 1.1791671e-03 2.2522885e-05 8.0849772e-04 9.5384508e-01
  1.9088885e-02]]
sad
