# **Import Modules**

In [2]:
!pip install nlpaug transformers -q

In [36]:
import pandas as pd
import numpy as np
import tensorflow as tf
import warnings
import nlpaug.augmenter.word as naw

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, LSTM, Dense, Dropout,
                                     GlobalMaxPooling1D, Conv1D, GRU,
                                     Bidirectional, BatchNormalization,
                                     SpatialDropout1D, MaxPooling1D,
                                     Activation, GlobalAveragePooling1D)
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight

warnings.filterwarnings('ignore')

# **Memuat Data**

In [4]:
!wget --no-check-certificate https://raw.githubusercontent.com/rioooranteai/nlp-project/main/Analisis%20Sentimen%20-%20Coretax/Dataset/Dataset.xlsx -O "/content/Dataset.xlsx"

--2025-04-22 10:40:34--  https://raw.githubusercontent.com/rioooranteai/nlp-project/main/Analisis%20Sentimen%20-%20Coretax/Dataset/Dataset.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1240575 (1.2M) [application/octet-stream]
Saving to: ‘/content/Dataset.xlsx’


2025-04-22 10:40:35 (35.0 MB/s) - ‘/content/Dataset.xlsx’ saved [1240575/1240575]



In [5]:
df = pd.read_excel("/content/Dataset.xlsx", usecols=['full_text', 'sentiment'])
df.head()

Unnamed: 0,full_text,sentiment
0,hai kak mohon maaf atas ketidaknyamanannya apa...,neutral
1,hai kak mohon maaf atas ketidaknyamanannya apa...,neutral
2,hai kak apakah alamat kakak pindah ke wilayah ...,neutral
3,hhaaai sha open jasa ya ges lapor spt masa pph...,neutral
4,pajak yang dibayarkan lewat sakti seharusnya 3...,neutral


In [6]:
df.tail()

Unnamed: 0,full_text,sentiment
14576,gw curiga ini server sakti dipake buat coretax...,negative
14577,kak hari kringpajak coba buka dulu coretaxnya ...,neutral
14578,kayak pernah denger tentang coretax deh ngga a...,neutral
14579,setuju sangat setuju sebagai individu yang mem...,negative
14580,sering dapet pengkinian data kak seperti coretax,negative


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14581 entries, 0 to 14580
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   full_text  14580 non-null  object
 1   sentiment  14581 non-null  object
dtypes: object(2)
memory usage: 228.0+ KB


# **Split Data**

In [8]:
data = df.copy(deep=True)
data.dropna(inplace=True)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14580 entries, 0 to 14580
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   full_text  14580 non-null  object
 1   sentiment  14580 non-null  object
dtypes: object(2)
memory usage: 341.7+ KB


In [10]:
le = LabelEncoder()

data['sentiment'] = le.fit_transform(data['sentiment'])

In [11]:
X = data['full_text']
y = data['sentiment']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Handling Unbalance Data**

## **Nlpaug**

In [13]:
X_train_pos = X_train[y_train == 2]
X_train_pos.reset_index(drop=True, inplace=True)

In [14]:
aug_insert = naw.ContextualWordEmbsAug(
    model_path='indolem/indobertweet-base-uncased',
    model_type='bert',
    action='insert',
    device='cuda'
)

aug_subs = naw.ContextualWordEmbsAug(
    model_path='indolem/indobertweet-base-uncased',
    model_type='bert',
    action='substitute',
    device='cuda'
)

In [15]:
augmented_data = []

for data_text in X_train_pos:
    for _ in range(10):
        augmented_text_insert = aug_insert.augment(data_text)
        augmented_data.append(augmented_text_insert)

        augmented_text_subs = aug_subs.augment(data_text)
        augmented_data.append(augmented_text_subs)

In [16]:
X_train_nlpaug = pd.concat([X_train, pd.Series(augmented_data)], ignore_index=True)
y_train_nlpaug = pd.concat([y_train, pd.Series([2] * len(augmented_data))], ignore_index=True)

# **Tokenisasi**

In [17]:
all_words = []

for text in data['full_text']:
    words = text.lower().split()
    all_words.extend(words)

word_counts = Counter(all_words)

sorted_counts = word_counts.most_common()

total_freq = sum(word_counts.values())

cumulative = 0
num_words = 0

for word, count in sorted_counts:
    cumulative += count
    num_words += 1
    if cumulative / total_freq >= 0.95:
        break

print(f"Jumlah kata yang mencakup 95% dari total frekuensi: {num_words}")

Jumlah kata yang mencakup 95% dari total frekuensi: 6213


In [18]:
VOCAB_SIZE = 6213
MAX_LEN = 100
OOV_TOKEN = '<OOV>'
EMBEDDING_DIM = 100
BATCH_SIZE = 64

In [19]:
tk = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)
tk.fit_on_texts(X_train_nlpaug)

seq_train = tk.texts_to_sequences(X_train_nlpaug)
pad_train = pad_sequences(seq_train, maxlen=MAX_LEN, padding='post', truncating='post')

seq_test = tk.texts_to_sequences(X_test)
pad_test = pad_sequences(seq_test, maxlen=MAX_LEN, padding='post', truncating='post')

In [20]:
y_train_encode = tf.keras.utils.to_categorical(y_train_nlpaug, num_classes=3)
y_test_encode = tf.keras.utils.to_categorical(y_test, num_classes=3)

# **Pemodelan dengan Deep Learning**

In [50]:
classes = np.unique(y_train_nlpaug)
weights = compute_class_weight(class_weight='balanced',
                               classes=classes,
                               y=y_train_nlpaug)

class_weight = dict(zip(classes, weights))

In [51]:
STEPS_PER_EPOCH = int(len(pad_train) / BATCH_SIZE)

In [52]:
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=STEPS_PER_EPOCH*5,
    decay_rate=0.9,
    staircase=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

In [53]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

class Mycallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('accuracy')>0.92 and logs.get('val_accuracy')>0.92):
      print('\nAkurasi telah mencapai > 92%')
      self.model.stop_training = True

stop_train = Mycallback()

callbacks = [early_stopping, checkpoint, stop_train]

## **LSTM**

In [54]:
model = tf.keras.Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM,
                              input_length=MAX_LEN,
                              embeddings_regularizer=tf.keras.regularizers.l2(1e-5)),

    Bidirectional(LSTM(16, return_sequences=True)),

    GlobalAveragePooling1D(),

    Dense(8, activation='relu',
          kernel_regularizer=tf.keras.regularizers.l2(1e-4)),
    BatchNormalization(),
    Dropout(0.5),

    Dense(3, activation='softmax')
])

In [55]:
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

In [None]:
hist = model.fit(pad_train, y_train_encode,
                    batch_size=BATCH_SIZE,
                    class_weight=class_weight,
                    epochs=100,
                    validation_data=(pad_test, y_test_encode),
                    verbose=2,
                    callbacks=callbacks,
                    steps_per_epoch=STEPS_PER_EPOCH)

Epoch 1/100

Epoch 1: val_accuracy improved from -inf to 0.79973, saving model to best_model.h5




382/382 - 85s - 223ms/step - accuracy: 0.7356 - loss: 0.6857 - precision_2: 0.8917 - recall_2: 0.4969 - val_accuracy: 0.7997 - val_loss: 0.5437 - val_precision_2: 0.8249 - val_recall_2: 0.7689
Epoch 2/100

Epoch 2: val_accuracy did not improve from 0.79973
382/382 - 5s - 14ms/step - accuracy: 0.8333 - loss: 0.6407 - precision_2: 0.8889 - recall_2: 0.6667 - val_accuracy: 0.7901 - val_loss: 0.5585 - val_precision_2: 0.8070 - val_recall_2: 0.7627
Epoch 3/100

Epoch 3: val_accuracy improved from 0.79973 to 0.83162, saving model to best_model.h5




382/382 - 115s - 301ms/step - accuracy: 0.8166 - loss: 0.5245 - precision_2: 0.9166 - recall_2: 0.6014 - val_accuracy: 0.8316 - val_loss: 0.4585 - val_precision_2: 0.8433 - val_recall_2: 0.8196
Epoch 4/100

Epoch 4: val_accuracy did not improve from 0.83162
382/382 - 3s - 8ms/step - accuracy: 0.8611 - loss: 0.4841 - precision_2: 0.9565 - recall_2: 0.6111 - val_accuracy: 0.8316 - val_loss: 0.4589 - val_precision_2: 0.8434 - val_recall_2: 0.8200
Epoch 5/100


Evaluate Model

In [None]:
hist

In [1]:
y_test_probs = model.predict(pad_test)
y_pred = np.argmax(y_test_probs, axis=1)
y_true = np.argmax(y_test_encode, axis=1)

target_names = le.inverse_transform([0,1,2])

print(classification_report(y_true, y_pred, target_names=target_names))

NameError: name 'model' is not defined

## **CNN**

In [44]:
cnn_model = Sequential([
    # 1. Embedding + SpatialDropout
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM,
              input_length=MAX_LEN,
              embeddings_regularizer=tf.keras.regularizers.l2(1e-5)),
    # Reshape the output of the Embedding layer to 3D
    tf.keras.layers.Reshape((MAX_LEN, EMBEDDING_DIM)),  # Add this line
    SpatialDropout1D(0.3),

    # 2. Conv Block 1
    Conv1D(64, 3, padding='same', kernel_regularizer=tf.keras.regularizers.l2(1e-4)),
    BatchNormalization(),
    Activation('relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    # 3. Conv Block 2
    Conv1D(64, 5, padding='same', kernel_regularizer=tf.keras.regularizers.l2(1e-4)),
    BatchNormalization(),
    Activation('relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),

    # 4. Global Pooling
    GlobalAveragePooling1D(),

    # 5. Fully Connected
    Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4)),
    BatchNormalization(),
    Dropout(0.5),

    # 6. Output
    Dense(3, activation='softmax')
])

cnn_model.summary()

In [45]:
cnn_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [46]:
tf.config.run_functions_eagerly(True)

cnn_hist = cnn_model.fit(pad_train, y_train_encode,
                    epochs=100,
                    batch_size=BATCH_SIZE,
                    validation_data=(pad_test, y_test_encode),
                    verbose=2,
                    callbacks=callbacks,
                    steps_per_epoch=STEPS_PER_EPOCH)

Epoch 1/100


ValueError: Unknown variable: <Variable path=sequential_4/embedding_7/embeddings, shape=(6213, 100), dtype=float32, value=[[-0.00053514 -0.03063952  0.0261015  ... -0.01596267  0.02950436
   0.01930836]
 [ 0.01141304  0.02812502 -0.02740994 ... -0.0302422   0.03470713
   0.03468462]
 [-0.02912616  0.04708688  0.03848168 ... -0.02813857  0.01659379
  -0.03159082]
 ...
 [ 0.02955835  0.03898885  0.04103046 ...  0.02214685 -0.02832992
  -0.04777972]
 [ 0.00844231 -0.01098595  0.03203896 ...  0.012782    0.01895117
   0.04217687]
 [-0.04081379  0.04288638  0.01586945 ... -0.01320772  0.0298248
  -0.00310557]]>. This optimizer can only be called for the variables it was originally built with. When working with a new set of variables, you should recreate a new optimizer instance.

Evaluate Model

In [None]:
y_test_probs = model.predict(pad_test)
y_pred = np.argmax(y_test_probs, axis=1)
y_true = np.argmax(y_test_encode, axis=1)

target_names = le.inverse_transform([0,1,2])

print(classification_report(y_true, y_pred, target_names=target_names))

## **GRU**

In [70]:
gru_model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM,
                                 input_length=MAX_LEN,
                                 embeddings_regularizer=tf.keras.regularizers.l2(1e-5)),

    SpatialDropout1D(0.3),

    Bidirectional(tf.keras.layers.GRU(64, recurrent_dropout=0.2,
                                      reset_after=True)),

    Dense(64, activation='relu',
                kernel_regularizer=tf.keras.regularizers.l2(1e-4)),

    BatchNormalization(),

    Dropout(0.4),

    Dense(3, activation='softmax')
])

In [71]:
gru_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [72]:
gru_hist = gru_model.fit(pad_train, y_train_encode,
                    epochs=100,
                    batch_size=BATCH_SIZE,
                    validation_data=(pad_test, y_test_encode),
                    verbose=2,
                    callbacks=callbacks,
                    steps_per_epoch=STEPS_PER_EPOCH)

Epoch 1/100

Epoch 1: val_accuracy did not improve from 0.72497
182/182 - 97s - 532ms/step - accuracy: 0.8679 - loss: 0.3439 - val_accuracy: 0.5988 - val_loss: 0.8658
Epoch 2/100

Epoch 2: val_accuracy improved from 0.72497 to 0.82407, saving model to best_model.h5




182/182 - 90s - 495ms/step - accuracy: 0.9078 - loss: 0.2562 - val_accuracy: 0.8241 - val_loss: 0.4953
Epoch 3/100

Epoch 3: val_accuracy did not improve from 0.82407
182/182 - 14s - 79ms/step - accuracy: 0.9209 - loss: 0.2357 - val_accuracy: 0.8158 - val_loss: 0.5054
Epoch 4/100

Epoch 4: val_accuracy improved from 0.82407 to 0.82785, saving model to best_model.h5




182/182 - 103s - 564ms/step - accuracy: 0.9334 - loss: 0.1912 - val_accuracy: 0.8278 - val_loss: 0.4473
Epoch 5/100

Epoch 5: val_accuracy improved from 0.82785 to 0.84122, saving model to best_model.h5




182/182 - 89s - 491ms/step - accuracy: 0.9311 - loss: 0.1987 - val_accuracy: 0.8412 - val_loss: 0.4498
Epoch 6/100

Epoch 6: val_accuracy improved from 0.84122 to 0.84225, saving model to best_model.h5




182/182 - 19s - 105ms/step - accuracy: 0.9495 - loss: 0.1569 - val_accuracy: 0.8422 - val_loss: 0.4691
Epoch 7/100

Epoch 7: val_accuracy did not improve from 0.84225
182/182 - 147s - 810ms/step - accuracy: 0.9523 - loss: 0.1457 - val_accuracy: 0.8313 - val_loss: 0.4816
Epoch 8/100

Epoch 8: val_accuracy did not improve from 0.84225
182/182 - 91s - 500ms/step - accuracy: 0.9499 - loss: 0.1523 - val_accuracy: 0.8422 - val_loss: 0.4762
Epoch 9/100

Epoch 9: val_accuracy improved from 0.84225 to 0.84396, saving model to best_model.h5




182/182 - 14s - 75ms/step - accuracy: 0.9335 - loss: 0.1853 - val_accuracy: 0.8440 - val_loss: 0.4857
Epoch 10/100

Epoch 10: val_accuracy did not improve from 0.84396
182/182 - 95s - 523ms/step - accuracy: 0.9639 - loss: 0.1194 - val_accuracy: 0.8179 - val_loss: 0.5494
Epoch 11/100

Epoch 11: val_accuracy did not improve from 0.84396
182/182 - 90s - 496ms/step - accuracy: 0.9584 - loss: 0.1267 - val_accuracy: 0.8302 - val_loss: 0.5346
Epoch 12/100

Epoch 12: val_accuracy did not improve from 0.84396
182/182 - 14s - 75ms/step - accuracy: 0.9478 - loss: 0.1324 - val_accuracy: 0.8371 - val_loss: 0.5227
Epoch 13/100

Epoch 13: val_accuracy did not improve from 0.84396
182/182 - 92s - 504ms/step - accuracy: 0.9696 - loss: 0.1012 - val_accuracy: 0.8416 - val_loss: 0.5490
Epoch 14/100

Epoch 14: val_accuracy did not improve from 0.84396
182/182 - 90s - 495ms/step - accuracy: 0.9671 - loss: 0.1117 - val_accuracy: 0.8313 - val_loss: 0.5180
Epoch 14: early stopping
Restoring model weights from 

Evaluate

In [None]:
y_test_probs = model.predict(pad_test)
y_pred = np.argmax(y_test_probs, axis=1)
y_true = np.argmax(y_test_encode, axis=1)

target_names = le.inverse_transform([0,1,2])

print(classification_report(y_true, y_pred, target_names=target_names))