In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv("jago_reviews.csv")
print(df.columns)
df.head()

Index(['reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'appVersion'],
      dtype='object')


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,62258d58-0b85-4390-94ae-d367e5ce77fe,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,sering kasi bonus dan ada pinjaman online bung...,5,0,8.62.3,2025-04-11 09:30:10,,,8.62.3
1,b4aa5fc8-e35d-45eb-9fb8-2e2d87351aca,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,keren,5,0,8.62.3,2025-04-11 09:05:06,,,8.62.3
2,f0cda9ec-b9d3-40c1-9ec4-1228b4574fc8,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,saya suka..gampang...apalagi kalau bank nya ad...,5,0,8.52.0,2025-04-11 09:04:08,,,8.52.0
3,883161ef-d7b0-4dee-bf43-383ab3ea5a28,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,mengatur keuangan semakin mudah dan cepat..mak...,5,0,8.62.2,2025-04-11 07:02:57,,,8.62.2
4,c11771b4-8327-4cbc-95eb-0a6731d6312a,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,baru daftar kelogout masuk lagi udh ga bisa lo...,1,0,,2025-04-11 00:31:37,"Halo, Jagoan. Mohon maaf atas kendala yang ter...",2025-04-11 01:13:51,


In [3]:
def label_sentiment(score):
    if score >= 4:
        return 'positif'
    elif score == 3:
        return 'netral'
    else:
        return 'negatif'

df['label'] = df['score'].apply(label_sentiment)
print(df['label'].value_counts())

label
positif    6885
negatif    2733
netral      382
Name: count, dtype: int64


In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('indonesian'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # hapus simbol/angka
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

df['cleaned'] = df['content'].astype(str).apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rizqiamaliakartika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df[['content', 'cleaned']].sample(5)

Unnamed: 0,content,cleaned
8150,So far so good. Well done.,so far so good well done
1167,Ojk kerja kalian apaa? Ini apk sampah bebas ba...,ojk kerja apaa apk sampah bebas banget bayangi...
2221,Rekomendet,rekomendet
4590,buruk terlalu byk kantong tidak aman,buruk byk kantong aman
2884,"Sangat kecewa dengan bank jago, saya kemarin b...",kecewa bank jago kemarin menginstal aplikasiny...


**RANDOM FOREST**

In [6]:
import os

# Folder tujuan penyimpanan resource
nltk_data_dir = "/Users/rizqiamaliakartika/PYTHON/DICODING/PENGEMBANGAN/nltk_data"

# Pastikan folder ditambahkan ke path nltk
nltk.data.path.append(nltk_data_dir)

# Download dua resource ini ke lokasi yang sama
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)


[nltk_data] Downloading package punkt to /Users/rizqiamaliakartika/PYT
[nltk_data]     HON/DICODING/PENGEMBANGAN/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/rizqiamaliakartika
[nltk_data]     /PYTHON/DICODING/PENGEMBANGAN/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
from nltk.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text.lower())

df['tokens'] = df['cleaned'].apply(tokenize)

In [8]:
from gensim.models import Word2Vec

# Melatih model Word2Vec
model_w2v = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=1, workers=4)

In [9]:
import numpy as np

# Fungsi untuk mendapatkan vektor rata-rata dari sebuah kalimat
def get_vector(tokens, model_w2v):
    vector = np.zeros(100)  # Ukuran vektor yang digunakan
    count = 0
    for word in tokens:
        if word in model_w2v.wv:
            vector += model_w2v.wv[word]
            count += 1
    if count > 0:
        vector /= count
    return vector

# Menerapkan fungsi untuk mendapatkan vektor kalimat
df['vectors'] = df['tokens'].apply(lambda x: get_vector(x, model_w2v))

In [10]:
# Menyusun fitur (X) dan label (y)
X = np.array(df['vectors'].tolist())
y = df['label']

In [11]:
# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Menggunakan Random Forest untuk klasifikasi
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [13]:
# Prediksi pada data uji
y_pred = rf_model.predict(X_test)

# Evaluasi model
print("Akurasi: ", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Menampilkan Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Akurasi:  0.8245

Classification Report:
              precision    recall  f1-score   support

     negatif       0.71      0.77      0.74       575
      netral       0.00      0.00      0.00        78
     positif       0.88      0.90      0.89      1347

    accuracy                           0.82      2000
   macro avg       0.53      0.55      0.54      2000
weighted avg       0.80      0.82      0.81      2000


Confusion Matrix:
[[ 440    2  133]
 [  41    0   37]
 [ 137    1 1209]]


**LSTM**

In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned'], df['encoded_label'], test_size=0.3, random_state=42)

In [16]:
# Tokenisasi dan padding
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10000  # jumlah kata unik yang akan dipakai
max_len = 100      # panjang maksimal setiap input (padding)

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.6669 - loss: 0.7987 - val_accuracy: 0.6950 - val_loss: 0.7433
Epoch 2/5
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - accuracy: 0.6930 - loss: 0.7430 - val_accuracy: 0.6950 - val_loss: 0.7475
Epoch 3/5
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 31ms/step - accuracy: 0.6994 - loss: 0.7302 - val_accuracy: 0.6950 - val_loss: 0.7535
Epoch 4/5
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.6919 - loss: 0.7409 - val_accuracy: 0.6950 - val_loss: 0.7444
Epoch 5/5
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.6881 - loss: 0.7360 - val_accuracy: 0.6950 - val_loss: 0.7504


<keras.src.callbacks.history.History at 0x319048f50>

In [19]:
y_pred_prob = model.predict(X_test_pad)
y_pred = np.argmax(y_pred_prob, axis=1)

print("\nAkurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step

Akurasi: 0.68

Classification Report:
              precision    recall  f1-score   support

     negatif       0.00      0.00      0.00       845
      netral       0.00      0.00      0.00       115
     positif       0.68      1.00      0.81      2040

    accuracy                           0.68      3000
   macro avg       0.23      0.33      0.27      3000
weighted avg       0.46      0.68      0.55      3000


Confusion Matrix:
[[   0    0  845]
 [   0    0  115]
 [   0    0 2040]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
