# Library

In [1]:
import re
import string
import numpy as np
import pandas as pd
import unicodedata
import json
import swifter

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize

!pip install langdetect
from langdetect import detect

!pip install googletrans
from googletrans import Translator

!pip install Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

!pip install tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, Dropout, Dense, LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

import joblib
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lemil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lemil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lemil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lemil\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!




In [20]:
!pip install pydantic

Collecting pydantic
  Downloading pydantic-2.11.3-py3-none-any.whl.metadata (65 kB)
Collecting annotated-types>=0.6.0 (from pydantic)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.33.1 (from pydantic)
  Downloading pydantic_core-2.33.1-cp312-cp312-win_amd64.whl.metadata (6.9 kB)
Collecting typing-inspection>=0.4.0 (from pydantic)
  Downloading typing_inspection-0.4.0-py3-none-any.whl.metadata (2.6 kB)
Downloading pydantic-2.11.3-py3-none-any.whl (443 kB)
Downloading pydantic_core-2.33.1-cp312-cp312-win_amd64.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/2.0 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/2.0 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/2.0 MB 799.2 kB/s eta 0:00:02
   ---------- ----------------------------- 0.5/2.0 MB 799.2 kB/s 

# Importing Data

In [2]:
df = pd.read_csv('../../data/main_dataset/dataset_structured.csv')
data = df.copy()
data.head(20)

Unnamed: 0,sentimen,emosi,ulasan
0,Negative,Anger,bukan menyenangkan malah bikin kesal hp saya r...
1,Negative,Anger,kalo ngak niat bikin gamenya bagus hapus aja d...
2,Negative,Anger,"makin lama, makin gak jelas dri sblum di updat..."
3,Negative,Anger,semenjak update sangat sangat buruk setiap mai...
4,Negative,Anger,burik
5,Negative,Anger,5 turun ke 1 | narik padang - denpasar ! sudah...
6,Negative,Anger,bangkrut sodara.. Udah hapus aja aplikasinya d...
7,Negative,Anger,berita hoax kok di up..kenapa gak cek dan rice...
8,Negative,Anger,"Beritanya bikin hancur dunia Crypto, Market ta..."
9,Negative,Anger,beritanya ngawur tidak berdasar seakan-akan be...


# Preprocessing Text

In [3]:
import json
import re
import string
import unicodedata
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords

# Setup awal
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Slang dictionary
merged_slang_file = '../../data/slang/merged_slang_dict.json'

with open(merged_slang_file, 'r', encoding='utf-8') as f:
    slang_dict = json.load(f)

print(f"Jumlah entri dalam slang_dict: {len(slang_dict)}")

# Stopwords dictionary
stop_words = {
    "yang", "untuk", "dan", "di", "ke", "dari", "ini", "itu",
    "dengan", "atau", "tapi"
}

# Custom preprocessor
class IndoTextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, lowercase=True, remove_non_ascii=True, remove_punctuation=True,
                 remove_numbers=True, remove_stopwords=True, stemming=True,
                 remove_extra_spaces=True):
        self.lowercase = lowercase
        self.remove_non_ascii = remove_non_ascii
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.remove_stopwords = remove_stopwords
        self.stemming = stemming
        self.remove_extra_spaces = remove_extra_spaces

    def normalize_slang(self, text):
        tokens = text.split()
        return ' '.join(slang_dict.get(word, word) for word in tokens)

    def clean_text(self, text):
        if not isinstance(text, str) or len(text.strip()) == 0:
            return ""

        if self.lowercase:
            text = text.lower()

        text = re.sub(r"http\\S+|www\\S+|https\\S+", '', text)
        text = re.sub(r'<.*?>', '', text)

        if self.remove_non_ascii:
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

        if self.remove_punctuation:
            text = text.translate(str.maketrans('', '', string.punctuation))

        if self.remove_numbers:
            text = re.sub(r'\\d+', '', text)

        text = self.normalize_slang(text)

        tokens = text.split()

        if self.remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]

        if self.stemming:
            text = ' '.join(tokens)
            text = stemmer.stem(text)
            tokens = text.split()

        cleaned_text = ' '.join(tokens)

        if self.remove_extra_spaces:
            cleaned_text = re.sub(r'\\s+', ' ', cleaned_text).strip()

        if len(cleaned_text) <= 5 or re.fullmatch(r'(.)\\1{2,}', cleaned_text):
            return ""

        return cleaned_text

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.swifter.apply(self.clean_text)

# Pipeline
text_pipeline = Pipeline([
    ('preprocessing', IndoTextPreprocessor()),
    ('tfidf', TfidfVectorizer(max_features=5000))
])

Jumlah entri dalam slang_dict: 16470


In [None]:
tfidf_matrix = text_pipeline.fit_transform(data['ulasan'])
data['cleaned_ulasan'] = text_pipeline.named_steps['preprocessing'].transform(data['ulasan'])
data.to_csv('cleaned_reviews.csv', index=False)
joblib.dump(text_pipeline, '../../models_dump/sentiment_emotion_classification_dump/indo_text_pipeline.pkl')

In [None]:
data = pd.read_csv('cleaned_reviews.csv')
data

# Modelling

In [None]:
# 1. Load Data and Preprocessed Inputs
df = pd.read_csv("cleaned_reviews.csv")
df = df.dropna(subset=['cleaned_ulasan', 'emosi', 'sentimen'])
df = df[df['cleaned_ulasan'].str.strip() != ""]

texts = df['cleaned_ulasan'].astype(str).tolist()
emosi_labels = df['emosi'].tolist()
sentimen_labels = df['sentimen'].tolist()

# 2. Encode Labels
le_emosi = LabelEncoder()
y_emosi = le_emosi.fit_transform(emosi_labels)
num_emosi_classes = len(le_emosi.classes_)

le_sentimen = LabelEncoder()
y_sentimen = le_sentimen.fit_transform(sentimen_labels)
num_sentimen_classes = len(le_sentimen.classes_)

with open('../../models_dump/sentiment_emotion_classification_dump/le_emosi.pkl', 'wb') as f:
    pickle.dump(le_emosi, f)
with open('../../models_dump/sentiment_emotion_classification_dump/le_sentimen.pkl', 'wb') as f:
    pickle.dump(le_sentimen, f)

# 3. Tokenize and Pad
vocab_size = 10000
embedding_dim = 64
max_length = 120
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

with open('../../models_dump/sentiment_emotion_classification_dump/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# 4. Train-Test Split
X_train, X_val, y_train_emosi, y_val_emosi, y_train_sentimen, y_val_sentimen = train_test_split(
    padded_sequences, y_emosi, y_sentimen, test_size=0.2, random_state=42)

y_train = {'emosi_output': y_train_emosi, 'sentimen_output': y_train_sentimen}
y_val = {'emosi_output': y_val_emosi, 'sentimen_output': y_val_sentimen}

## NN

In [9]:
class MultiOutputDataset(Sequence):
    def __init__(self, X, y, batch_size=128, shuffle=True):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.X[batch_indices]
        batch_y = {
            'emosi_output': np.array(self.y['emosi_output'])[batch_indices],
            'sentimen_output': np.array(self.y['sentimen_output'])[batch_indices]
        }
        return batch_x, batch_y

    def on_epoch_end(self):
        self.indices = np.arange(len(self.X))
        if self.shuffle:
            np.random.shuffle(self.indices)

train_dataset = MultiOutputDataset(X_train, y_train, batch_size=128)
val_dataset = MultiOutputDataset(X_val, y_val, batch_size=128)

class AccuracyThresholdStop(tf.keras.callbacks.Callback):
    def __init__(self, threshold=0.96):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        emosi_acc = logs.get('emosi_output_accuracy')
        sentimen_acc = logs.get('sentimen_output_accuracy')
        if emosi_acc and sentimen_acc and emosi_acc >= self.threshold and sentimen_acc >= self.threshold:
            print(f"\n✅ Stopping early at epoch {epoch + 1} as accuracy threshold reached.")
            self.model.stop_training = True

input_layer = Input(shape=(max_length,))
x = Embedding(vocab_size, embedding_dim)(input_layer)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)

emosi_output = Dense(num_emosi_classes, activation='softmax', name='emosi_output')(x)
sentimen_output = Dense(num_sentimen_classes, activation='softmax', name='sentimen_output')(x)

model = Model(inputs=input_layer, outputs=[emosi_output, sentimen_output])
model.compile(
    optimizer='adam',
    loss={
        'emosi_output': 'sparse_categorical_crossentropy',
        'sentimen_output': 'sparse_categorical_crossentropy'
    },
    metrics={
        'emosi_output': 'accuracy',
        'sentimen_output': 'accuracy'
    }
)

model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=200,
    verbose=1,
    callbacks=[AccuracyThresholdStop()]
)

model.save('../../models_dump/sentiment_emotion_classification_dump/nn_multitask_model.h5')

Epoch 1/200


  self._warn_if_super_not_called()


[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 92ms/step - emosi_output_accuracy: 0.3178 - emosi_output_loss: 1.5508 - loss: 2.6527 - sentimen_output_accuracy: 0.3512 - sentimen_output_loss: 1.1010 - val_emosi_output_accuracy: 0.4014 - val_emosi_output_loss: 1.3265 - val_loss: 2.2448 - val_sentimen_output_accuracy: 0.5652 - val_sentimen_output_loss: 0.9172
Epoch 2/200
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 85ms/step - emosi_output_accuracy: 0.4174 - emosi_output_loss: 1.3145 - loss: 2.2206 - sentimen_output_accuracy: 0.5621 - sentimen_output_loss: 0.9034 - val_emosi_output_accuracy: 0.4221 - val_emosi_output_loss: 1.2866 - val_loss: 2.1550 - val_sentimen_output_accuracy: 0.5762 - val_sentimen_output_loss: 0.8686
Epoch 3/200
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 89ms/step - emosi_output_accuracy: 0.4561 - emosi_output_loss: 1.2444 - loss: 2.0741 - sentimen_output_accuracy: 0.6092 - sentimen_output_loss: 0.8306 



## Hybrid Method

In [10]:
feature_extractor = Model(inputs=model.input, outputs=model.layers[-3].output)
nn_features = feature_extractor.predict(padded_sequences)
nn_features = np.array(nn_features)

y_multi = np.column_stack((y_emosi, y_sentimen))

# Make Non-Negative Features for Naive Bayes
nn_features_non_neg = np.maximum(0, nn_features)

[1m641/641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step


In [11]:
svm_param_grid = {
    'estimator__C': [0.1, 1, 10],
    'estimator__kernel': ['linear', 'rbf'],
    'estimator__gamma': ['scale', 'auto']
}

svm = SVC(probability=True)
svm_multi = MultiOutputClassifier(svm)
svm_grid = GridSearchCV(svm_multi, svm_param_grid, cv=3, verbose=2, n_jobs=-1)
svm_grid.fit(nn_features, y_multi)

joblib.dump(svm_grid.best_estimator_, "../../models_dump/sentiment_emotion_classification_dump/hybrid_nn_svm_model_tuned.pkl")

Fitting 3 folds for each of 12 candidates, totalling 36 fits


['hybrid_nn_svm_model_tuned.pkl']

In [13]:
nn_features_non_neg = np.maximum(0, nn_features)

nb_param_grid = {
    'estimator__alpha': [0.1, 0.5, 1.0, 2.0]
}

nb = MultinomialNB()
nb_multi = MultiOutputClassifier(nb)
nb_grid = GridSearchCV(nb_multi, nb_param_grid, cv=3, verbose=2, n_jobs=-1)
nb_grid.fit(nn_features_non_neg, y_multi)

joblib.dump(nb_grid.best_estimator_, "../../models_dump/sentiment_emotion_classification_dump/hybrid_nn_nb_model_tuned.pkl")

Fitting 3 folds for each of 4 candidates, totalling 12 fits


['../models_dump/sentiment_emotion_classification_dump/hybrid_nn_nb_model_tuned.pkl']

## Basic Method

In [14]:
text_pipeline = joblib.load('../../models_dump/sentiment_emotion_classification_dump/indo_text_pipeline.pkl')
tfidf_features = text_pipeline.transform(df['cleaned_ulasan'])

Pandas Apply:   0%|          | 0/20504 [00:00<?, ?it/s]

In [15]:
svm_param_grid = {
    'estimator__C': [0.1, 1, 10],
    'estimator__kernel': ['linear', 'rbf'],
    'estimator__gamma': ['scale', 'auto']
}

svm = SVC(probability=True)
svm_multi = MultiOutputClassifier(svm)
svm_grid = GridSearchCV(svm_multi, svm_param_grid, cv=3, verbose=2, n_jobs=-1)
svm_grid.fit(tfidf_features, y_multi)

joblib.dump(svm_grid.best_estimator_, "../../models_dump/sentiment_emotion_classification_dump/standalone_svm_model_tuned.pkl")

Fitting 3 folds for each of 12 candidates, totalling 36 fits


['../models_dump/sentiment_emotion_classification_dump/standalone_svm_model_tuned.pkl']

In [16]:
nb_param_grid = {
    'estimator__alpha': [0.1, 0.5, 1.0, 2.0]
}

nb = MultinomialNB()
nb_multi = MultiOutputClassifier(nb)
nb_grid = GridSearchCV(nb_multi, nb_param_grid, cv=3, verbose=2, n_jobs=-1)
nb_grid.fit(tfidf_features, y_multi)

joblib.dump(nb_grid.best_estimator_, "../../models_dump/sentiment_emotion_classification_dump/standalone_nb_model_tuned.pkl")

Fitting 3 folds for each of 4 candidates, totalling 12 fits


['../models_dump/sentiment_emotion_classification_dump/standalone_nb_model_tuned.pkl']

# Old (Ga dipake)

In [None]:
# 1. Load and Preprocess Data
data = pd.read_csv("/content/cleaned_reviews (3).csv")  # Update path as needed
df = data.dropna(subset=['cleaned_ulasan', 'emosi', 'sentimen']).reset_index(drop=True)

texts = df['cleaned_ulasan'].astype(str).tolist()
emosi_labels = df['emosi'].tolist()
sentimen_labels = df['sentimen'].tolist()

# Label Encoding
le_emosi = LabelEncoder()
encoded_emosi = le_emosi.fit_transform(emosi_labels)
num_emosi_classes = len(le_emosi.classes_)

le_sentimen = LabelEncoder()
encoded_sentimen = le_sentimen.fit_transform(sentimen_labels)
num_sentimen_classes = len(le_sentimen.classes_)

# Tokenization
vocab_size = 10000
embedding_dim = 64
max_length = 120
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# 2. Train-Test Split
X_train, X_val, y_train_emosi, y_val_emosi, y_train_sentimen, y_val_sentimen = train_test_split(
    padded_sequences, encoded_emosi, encoded_sentimen, test_size=0.2, random_state=42)

y_train = {
    'emosi_output': y_train_emosi,
    'sentimen_output': y_train_sentimen
}
y_val = {
    'emosi_output': y_val_emosi,
    'sentimen_output': y_val_sentimen
}

# 3. Custom Data Generator
class MultiOutputDataset(Sequence):
    def __init__(self, X, y, batch_size=32, shuffle=True):
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        batch_indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = self.X[batch_indices]
        batch_y = {
            'emosi_output': self.y['emosi_output'][batch_indices],
            'sentimen_output': self.y['sentimen_output'][batch_indices],
        }
        return batch_x, batch_y

    def on_epoch_end(self):
        self.indices = np.arange(len(self.X))
        if self.shuffle:
            np.random.shuffle(self.indices)

# Create datasets
train_dataset = MultiOutputDataset(X_train, y_train, batch_size=32)
val_dataset = MultiOutputDataset(X_val, y_val, batch_size=32)

# 4. Build Multi-output Neural Network
input_layer = Input(shape=(max_length,))
x = Embedding(vocab_size, embedding_dim)(input_layer)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)

emosi_output = Dense(num_emosi_classes, activation='softmax', name='emosi_output')(x)
sentimen_output = Dense(num_sentimen_classes, activation='softmax', name='sentimen_output')(x)

model = Model(inputs=input_layer, outputs=[emosi_output, sentimen_output])
model.compile(
    optimizer='adam',
    loss={
        'emosi_output': 'sparse_categorical_crossentropy',
        'sentimen_output': 'sparse_categorical_crossentropy'
    },
    metrics={
        'emosi_output': 'accuracy',
        'sentimen_output': 'accuracy'
    }
)

# 5. Early Stopping Based on Accuracy
class AccuracyThresholdStop(tf.keras.callbacks.Callback):
    def __init__(self, threshold=0.96):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        emosi_acc = logs.get('emosi_output_accuracy')
        sentimen_acc = logs.get('sentimen_output_accuracy')
        if emosi_acc and sentimen_acc and emosi_acc >= self.threshold and sentimen_acc >= self.threshold:
            print(f"\n✅ Stopping early at epoch {epoch + 1} as accuracy threshold reached.")
            self.model.stop_training = True

# Train the model
model.fit(train_dataset, validation_data=val_dataset, epochs=200, verbose=1, callbacks=[AccuracyThresholdStop()])

# Save model and encoders
model.save('multitask_model.h5')
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
with open('le_emosi.pkl', 'wb') as f:
    pickle.dump(le_emosi, f)
with open('le_sentimen.pkl', 'wb') as f:
    pickle.dump(le_sentimen, f)

# 6. Feature Extraction from Neural Network
feature_extractor = Model(inputs=model.input, outputs=model.layers[-3].output)
features = feature_extractor.predict(padded_sequences)

# 7. Train Hybrid Models (NN + SVM, NN + Naive Bayes)
y_multi = np.column_stack((encoded_emosi, encoded_sentimen))

# SVM Hybrid
svm = SVC(kernel='linear', probability=True)
svm_multi = MultiOutputClassifier(svm)
svm_multi.fit(features, y_multi)
joblib.dump(svm_multi, "svm_multi_output_model.pkl")

# Naive Bayes Hybrid (use non-negative features)
nb = MultinomialNB()
nb_multi = MultiOutputClassifier(nb)
nb_multi.fit(np.maximum(0, features), y_multi)
joblib.dump(nb_multi, "nb_multi_output_model.pkl")

print("✅ All models saved: NN, NN+SVM, NN+NB with multi-output classification.")

Epoch 1/200


  self._warn_if_super_not_called()


[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 145ms/step - emosi_output_accuracy: 0.3320 - emosi_output_loss: 1.5048 - loss: 2.5692 - sentimen_output_accuracy: 0.3939 - sentimen_output_loss: 1.0644 - val_emosi_output_accuracy: 0.4539 - val_emosi_output_loss: 1.2863 - val_loss: 2.1658 - val_sentimen_output_accuracy: 0.5647 - val_sentimen_output_loss: 0.8795
Epoch 2/200
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 147ms/step - emosi_output_accuracy: 0.4953 - emosi_output_loss: 1.2246 - loss: 2.0410 - sentimen_output_accuracy: 0.6213 - sentimen_output_loss: 0.8164 - val_emosi_output_accuracy: 0.5095 - val_emosi_output_loss: 1.1734 - val_loss: 1.9449 - val_sentimen_output_accuracy: 0.6577 - val_sentimen_output_loss: 0.7715
Epoch 3/200
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 147ms/step - emosi_output_accuracy: 0.5638 - emosi_output_loss: 1.0935 - loss: 1.7844 - sentimen_output_accuracy: 0.7127 - sentimen_output_loss: 0.69



[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 35ms/step
✅ All models saved: NN, NN+SVM, NN+NB with multi-output classification.


# Testing (Old - Ga dipake)

In [None]:
import pickle
import joblib
import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re, string, unicodedata
from langdetect import detect
from googletrans import Translator
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Define IndoTextPreprocessor
translator = Translator()
factory = StemmerFactory()
stemmer = factory.create_stemmer()
slang_dict = {
    "gak": "tidak", "ga": "tidak", "nggak": "tidak", "gk": "tidak",
    "aja": "saja", "kalo": "kalau", "dgn": "dengan", "yg": "yang",
    "trs": "terus", "blm": "belum", "udh": "sudah"
}
stop_words = set(["yang", "untuk", "dan", "di", "ke", "dari", "ini", "itu", "dengan", "atau", "tapi"])

class IndoTextPreprocessor:
    def __init__(self):
        self.lowercase = True
        self.remove_non_ascii = True
        self.remove_punctuation = True
        self.remove_numbers = True
        self.remove_stopwords = True
        self.stemming = True
        self.remove_extra_spaces = True

    def normalize_slang(self, text):
        tokens = text.split()
        return ' '.join(slang_dict.get(word, word) for word in tokens)

    def clean_text(self, text):
        if not isinstance(text, str) or len(text.strip()) == 0:
            return ""
        try:
            lang = detect(text)
            if lang != "id":
                text = translator.translate(text, src=lang, dest="id").text
        except:
            pass
        if self.lowercase:
            text = text.lower()
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'<.*?>', '', text)
        if self.remove_non_ascii:
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        if self.remove_punctuation:
            text = text.translate(str.maketrans('', '', string.punctuation))
        if self.remove_numbers:
            text = re.sub(r'\d+', '', text)
        text = self.normalize_slang(text)
        tokens = text.split()
        if self.remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]
        if self.stemming:
            text = ' '.join(tokens)
            text = stemmer.stem(text)
            tokens = text.split()
        cleaned_text = ' '.join(tokens)
        if self.remove_extra_spaces:
            cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        if len(cleaned_text) <= 5 or re.fullmatch(r'(.)\1{2,}', cleaned_text):
            return ""
        return cleaned_text

    def transform(self, text_series):
        return text_series.apply(self.clean_text)

# Load Saved Components
tokenizer = pickle.load(open("tokenizer.pkl", "rb"))
le_emosi = pickle.load(open("le_emosi.pkl", "rb"))
le_sentimen = pickle.load(open("le_sentimen.pkl", "rb"))

nn_model = tf.keras.models.load_model("multitask_model.h5")
svm_multi = joblib.load("svm_multi_output_model.pkl")
nb_multi = joblib.load("nb_multi_output_model.pkl")

# Extract individual estimators
svm_emosi, svm_sentimen = svm_multi.estimators_
nb_emosi, nb_sentimen = nb_multi.estimators_

# Feature Extractor from NN Shared Layer
feature_extractor = tf.keras.Model(inputs=nn_model.input, outputs=nn_model.layers[-3].output)

# Final Hybrid Prediction Function (NN → Feature → SVM/NB)
def predict_all(raw_text_list, do_preprocessing=True):
    df = pd.Series(raw_text_list)

    if do_preprocessing:
        preprocessor = IndoTextPreprocessor()
        cleaned = preprocessor.transform(df).fillna("")
        cleaned = cleaned.apply(lambda x: x if x.strip() != "" else "kosong")
    else:
        cleaned = df.fillna("")

    # Tokenization
    sequences = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')

    # Feature Extraction
    features = feature_extractor.predict(padded, batch_size=32)

    # Predictions
    features_nb = np.maximum(0, features)  # For Naive Bayes
    emosi_preds_svm = svm_emosi.predict(features)
    sentimen_preds_svm = svm_sentimen.predict(features)
    emosi_preds_nb = nb_emosi.predict(features_nb)
    sentimen_preds_nb = nb_sentimen.predict(features_nb)

    return {
        "cleaned_texts": cleaned.tolist(),
        "svm": {
            "emosi": le_emosi.inverse_transform(emosi_preds_svm),
            "sentimen": le_sentimen.inverse_transform(sentimen_preds_svm)
        },
        "nb": {
            "emosi": le_emosi.inverse_transform(emosi_preds_nb),
            "sentimen": le_sentimen.inverse_transform(sentimen_preds_nb)
        },
        "y_pred_raw": {
            "svm": {"emosi": emosi_preds_svm, "sentimen": sentimen_preds_svm},
            "nb": {"emosi": emosi_preds_nb, "sentimen": sentimen_preds_nb}
        }
    }

# === Example Run ===
if __name__ == "__main__":
    raw_texts = [
        "Pelayanan sangat cepat dan ramah!",
        "Gak suka sama kualitasnya.",
        "Very disappointing, would not recommend."
    ]

    result = predict_all(raw_texts)

    print("Cleaned Texts:", result["cleaned_texts"])
    print("SVM Emosi:", result["svm"]["emosi"])
    print("SVM Sentimen:", result["svm"]["sentimen"])
    print("NB Emosi:", result["nb"]["emosi"])
    print("NB Sentimen:", result["nb"]["sentimen"])

  pass


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 759ms/step
Cleaned Texts: ['layan sangat cepat ramah', 'tidak suka sama kualitas', 'very disappointing would not recommend']
SVM Emosi: ['Happy' 'Neutral' 'Neutral']
SVM Sentimen: ['Positive' 'Neutral' 'Neutral']
NB Emosi: ['Happy' 'Sad' 'Neutral']
NB Sentimen: ['Positive' 'Neutral' 'Neutral']


# Pipeline for API/Testing

In [None]:
import pickle
import joblib
import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re, string, unicodedata
import chardet
from langdetect import detect
from googletrans import Translator
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from pydantic import BaseModel

# Text Preprocessing
translator = Translator()
factory = StemmerFactory()
stemmer = factory.create_stemmer()

merged_slang_file = '../../data/slang/merged_slang_dict.json'
with open(merged_slang_file, 'r', encoding='utf-8') as f:
    slang_dict = json.load(f)
print(f"Jumlah entri dalam slang_dict: {len(slang_dict)}")

stop_words = {
    "yang", "untuk", "dan", "di", "ke", "dari", "ini", "itu",
    "dengan", "atau", "tapi"
}
# stop_words = set(stopwords.words('indonesian'))
#Lebih bagus pake ini biar lengkap stopwordsnya (lupa ubah aja ini masih make yg atas)

class IndoTextPreprocessor:
    def __init__(self):
        self.lowercase = True
        self.remove_non_ascii = True
        self.remove_punctuation = True
        self.remove_numbers = True
        self.remove_stopwords = True
        self.stemming = True
        self.remove_extra_spaces = True

    def normalize_slang(self, text):
        tokens = text.split()
        return ' '.join(slang_dict.get(word, word) for word in tokens)

    def clean_text(self, text):
        if not isinstance(text, str) or len(text.strip()) == 0:
            return ""
        try:
            lang = detect(text)
            if lang != "id":
                text = translator.translate(text, src=lang, dest="id").text
        except:
            pass
        if self.lowercase:
            text = text.lower()
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'<.*?>', '', text)
        if self.remove_non_ascii:
            text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        if self.remove_punctuation:
            text = text.translate(str.maketrans('', '', string.punctuation))
        if self.remove_numbers:
            text = re.sub(r'\d+', '', text)
        text = self.normalize_slang(text)
        tokens = text.split()
        if self.remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]
        if self.stemming:
            text = ' '.join(tokens)
            text = stemmer.stem(text)
            tokens = text.split()
        cleaned_text = ' '.join(tokens)
        if self.remove_extra_spaces:
            cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        if len(cleaned_text) <= 5 or re.fullmatch(r'(.)\1{2,}', cleaned_text):
            return ""
        return cleaned_text

    def transform(self, text_series):
        return text_series.apply(self.clean_text)

# Load Components
try:
    tokenizer = pickle.load(open("../../models_dump/sentiment_emotion_classification_dump/tokenizer.pkl", "rb"))
    le_emosi = pickle.load(open("../../models_dump/sentiment_emotion_classification_dump/le_emosi.pkl", "rb"))
    le_sentimen = pickle.load(open("../../models_dump/sentiment_emotion_classification_dump/le_sentimen.pkl", "rb"))
    nn_model = tf.keras.models.load_model("../models_dump/sentiment_emotion_classification_dump/nn_multitask_model.h5")
    svm_multi = joblib.load("../../models_dump/sentiment_emotion_classification_dump/hybrid_nn_svm_model_tuned.pkl")
    nb_multi = joblib.load("../../models_dump/sentiment_emotion_classification_dump/hybrid_nn_nb_model_tuned.pkl")
    svm_basic = joblib.load("../../models_dump/sentiment_emotion_classification_dump/standalone_svm_model_tuned.pkl")
    nb_basic = joblib.load("../../models_dump/sentiment_emotion_classification_dump/standalone_nb_model_tuned.pkl")
except FileNotFoundError as e:
    raise RuntimeError(f"Model or tokenizer file missing: {e}")

# Extract individual models
svm_emosi, svm_sentimen = svm_multi.estimators_
nb_emosi, nb_sentimen = nb_multi.estimators_

# Feature Extractor
feature_extractor = tf.keras.Model(inputs=nn_model.input, outputs=nn_model.layers[-3].output)

# Prediction Logic
def predict_all(raw_text_list, do_preprocessing=True):
    if not isinstance(raw_text_list, list) or not all(isinstance(t, str) for t in raw_text_list):
        raise ValueError("Input must be a list of strings.")

    df = pd.Series(raw_text_list)

    if do_preprocessing:
        preprocessor = IndoTextPreprocessor()
        cleaned = preprocessor.transform(df).fillna("")
        cleaned = cleaned.apply(lambda x: x if x.strip() != "" else "kosong")
    else:
        cleaned = df.fillna("")

    # ---------- Tokenize and pad ----------
    sequences = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')

    # ---------- Feature extraction for hybrids ----------
    features = feature_extractor.predict(padded, batch_size=32)
    features_nb = np.maximum(0, features)

    # ---------- Hybrid Models ----------
    emosi_preds_svm = svm_emosi.predict(features)
    sentimen_preds_svm = svm_sentimen.predict(features)
    emosi_preds_nb = nb_emosi.predict(features_nb)
    sentimen_preds_nb = nb_sentimen.predict(features_nb)

    # ---------- Standalone TF-IDF Models ----------
    text_pipeline = joblib.load("../../models_dump/sentiment_emotion_classification_dump/indo_text_pipeline.pkl")
    tfidf_features = text_pipeline.transform(cleaned)

    standalone_preds_svm = svm_basic.predict(tfidf_features)
    standalone_preds_nb = nb_basic.predict(tfidf_features)

    emosi_preds_svm_basic = standalone_preds_svm[:, 0]
    sentimen_preds_svm_basic = standalone_preds_svm[:, 1]
    emosi_preds_nb_basic = standalone_preds_nb[:, 0]
    sentimen_preds_nb_basic = standalone_preds_nb[:, 1]

    # ---------- NN Only Model ----------
    nn_pred_emosi_prob, nn_pred_sentimen_prob = nn_model.predict(padded, batch_size=32)
    nn_pred_emosi = np.argmax(nn_pred_emosi_prob, axis=1)
    nn_pred_sentimen = np.argmax(nn_pred_sentimen_prob, axis=1)

    return {
        "cleaned_texts": cleaned.tolist(),

        "hybrid": {
            "svm": {
                "emosi": le_emosi.inverse_transform(emosi_preds_svm).tolist(),
                "sentimen": le_sentimen.inverse_transform(sentimen_preds_svm).tolist()
            },
            "nb": {
                "emosi": le_emosi.inverse_transform(emosi_preds_nb).tolist(),
                "sentimen": le_sentimen.inverse_transform(sentimen_preds_nb).tolist()
            }
        },

        "standalone": {
            "svm": {
                "emosi": le_emosi.inverse_transform(emosi_preds_svm_basic).tolist(),
                "sentimen": le_sentimen.inverse_transform(sentimen_preds_svm_basic).tolist()
            },
            "nb": {
                "emosi": le_emosi.inverse_transform(emosi_preds_nb_basic).tolist(),
                "sentimen": le_sentimen.inverse_transform(sentimen_preds_nb_basic).tolist()
            }
        },

        "nn_only": {
            "emosi": le_emosi.inverse_transform(nn_pred_emosi).tolist(),
            "sentimen": le_sentimen.inverse_transform(nn_pred_sentimen).tolist()
        },

        "y_pred_raw": {
            "hybrid": {
                "svm": {"emosi": emosi_preds_svm.tolist(), "sentimen": sentimen_preds_svm.tolist()},
                "nb": {"emosi": emosi_preds_nb.tolist(), "sentimen": sentimen_preds_nb.tolist()}
            },
            "standalone": {
                "svm": {"emosi": emosi_preds_svm_basic.tolist(), "sentimen": sentimen_preds_svm_basic.tolist()},
                "nb": {"emosi": emosi_preds_nb_basic.tolist(), "sentimen": sentimen_preds_nb_basic.tolist()}
            },
            "nn_only": {
                "emosi": nn_pred_emosi.tolist(),
                "sentimen": nn_pred_sentimen.tolist(),
                "emosi_proba": nn_pred_emosi_prob.tolist(),
                "sentimen_proba": nn_pred_sentimen_prob.tolist()
            }
        }
    }


In [None]:
if __name__ == "__main__":
    raw_texts = [
        "Pelayanan sangat cepat dan ramah!",
        "Gak suka sama kualitasnya.",
        "Terbaik! Makasih ya 😊",
        "Harga mahal dan kualitas buruk sekali.",
        "Jelek bgt, gasuka banget, marah nih",
        "Cinta banget sama aplikasi ini",
        "Bantu Info nya, Bagaimana Cara Menggantikan Kartu Yg Terdaftar Jika Hilang, Dengan Kartu Yg Baru.?"
    ]

    result = predict_all(raw_texts)

    for i in range(len(raw_texts)):
        print(f"\n📝 Original: {raw_texts[i]}")
        print(f"🧼 Cleaned:  {result['cleaned_texts'][i]}")

        print("🔀 NN Only:")
        print(f"   Emosi:    {result['nn_only']['emosi'][i]}")
        print(f"   Sentimen: {result['nn_only']['sentimen'][i]}")

        print("🔗 Hybrid (NN+SVM):")
        print(f"   Emosi:    {result['hybrid']['svm']['emosi'][i]}")
        print(f"   Sentimen: {result['hybrid']['svm']['sentimen'][i]}")

        print("🔗 Hybrid (NN+NB):")
        print(f"   Emosi:    {result['hybrid']['nb']['emosi'][i]}")
        print(f"   Sentimen: {result['hybrid']['nb']['sentimen'][i]}")

        print("🧠 Standalone SVM (TF-IDF):")
        print(f"   Emosi:    {result['standalone']['svm']['emosi'][i]}")
        print(f"   Sentimen: {result['standalone']['svm']['sentimen'][i]}")

        print("🧠 Standalone NB (TF-IDF):")
        print(f"   Emosi:    {result['standalone']['nb']['emosi'][i]}")
        print(f"   Sentimen: {result['standalone']['nb']['sentimen'][i]}")

In [25]:
df_clean = pd.read_csv("../cleaned_reviews.csv")
df_clean.head()

Unnamed: 0,sentimen,emosi,ulasan,cleaned_ulasan
0,Negative,Anger,bukan menyenangkan malah bikin kesal hp saya r...,bukan senang bahkan bikin kesal handphone saya...
1,Negative,Anger,kalo ngak niat bikin gamenya bagus hapus aja d...,kalau tidak niat bikin game bagus hapus saja s...
2,Negative,Anger,"makin lama, makin gak jelas dri sblum di updat...",makin lama makin tidak jelas belum baru game t...
3,Negative,Anger,semenjak update sangat sangat buruk setiap mai...,semenjak baru sangat sangat buruk tiap main ba...
4,Negative,Anger,burik,


In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

def get_metrics(y_true, y_pred):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average="weighted", zero_division=0),
        "Recall": recall_score(y_true, y_pred, average="weighted", zero_division=0),
        "F1-score": f1_score(y_true, y_pred, average="weighted", zero_division=0)
    }

def compare_models(y_true_emosi, y_true_sentimen, results):
    metrics = {
        "NN-only Emosi": get_metrics(y_true_emosi, results["y_pred_raw"]["nn_only"]["emosi"]),
        "NN-only Sentimen": get_metrics(y_true_sentimen, results["y_pred_raw"]["nn_only"]["sentimen"]),

        "NN-SVM Emosi": get_metrics(y_true_emosi, results["y_pred_raw"]["hybrid"]["svm"]["emosi"]),
        "NN-SVM Sentimen": get_metrics(y_true_sentimen, results["y_pred_raw"]["hybrid"]["svm"]["sentimen"]),

        "NN-NB Emosi": get_metrics(y_true_emosi, results["y_pred_raw"]["hybrid"]["nb"]["emosi"]),
        "NN-NB Sentimen": get_metrics(y_true_sentimen, results["y_pred_raw"]["hybrid"]["nb"]["sentimen"]),

        "TFIDF-SVM Emosi": get_metrics(y_true_emosi, results["y_pred_raw"]["standalone"]["svm"]["emosi"]),
        "TFIDF-SVM Sentimen": get_metrics(y_true_sentimen, results["y_pred_raw"]["standalone"]["svm"]["sentimen"]),

        "TFIDF-NB Emosi": get_metrics(y_true_emosi, results["y_pred_raw"]["standalone"]["nb"]["emosi"]),
        "TFIDF-NB Sentimen": get_metrics(y_true_sentimen, results["y_pred_raw"]["standalone"]["nb"]["sentimen"]),
    }

    df_metrics = pd.DataFrame(metrics)
    print("\n=== 📊 Model Performance Comparison ===")
    return df_metrics.round(4)


In [27]:
# Get raw texts from DataFrame
raw_texts = df_clean["cleaned_ulasan"].astype(str).tolist()

# Get true labels
y_true_emosi = df_clean["emosi"].tolist()
y_true_sentimen = df_clean["sentimen"].tolist()

# Encode true labels to match prediction format
y_true_emosi_enc = le_emosi.transform(y_true_emosi)
y_true_sentimen_enc = le_sentimen.transform(y_true_sentimen)

# Run prediction using all 5 models
results = predict_all(raw_texts, do_preprocessing=False)

# Compare all model performances
df_results = compare_models(y_true_emosi_enc, y_true_sentimen_enc, results)

# Display the comparison
print(df_results)





[1m661/661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step


  pass


[1m661/661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step

=== 📊 Model Performance Comparison ===
           NN-only Emosi  NN-only Sentimen  NN-SVM Emosi  NN-SVM Sentimen  \
Accuracy          0.8657            0.9047        0.8672           0.9021   
Precision         0.8717            0.9061        0.8741           0.9037   
Recall            0.8657            0.9047        0.8672           0.9021   
F1-score          0.8673            0.9047        0.8687           0.9021   

           NN-NB Emosi  NN-NB Sentimen  TFIDF-SVM Emosi  TFIDF-SVM Sentimen  \
Accuracy        0.8529          0.9160           0.8217              0.8530   
Precision       0.8652          0.9184           0.8229              0.8539   
Recall          0.8529          0.9160           0.8217              0.8530   
F1-score        0.8562          0.9165           0.8216              0.8530   

           TFIDF-NB Emosi  TFIDF-NB Sentimen  
Accuracy           0.7018             0.7596  
Precisio