# import library

In [53]:
!pip install sastrawi



In [54]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
!pip install imbalanced-learn # Install the imbalanced-learn package



In [81]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, LSTM, Dense, Dropout,
                                     Conv1D, MaxPooling1D, GlobalMaxPooling1D,
                                     SpatialDropout1D, Bidirectional)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from gensim.models import Word2Vec


# load data

In [58]:
df = pd.read_csv('/content/drive/MyDrive/SPK/DBS/Proyek sentimen/hasil_scraping_sotify.csv')
df

Unnamed: 0,userName,score,content
0,Pengguna Google,5,sangat cocok untuk aku yang suka dengerin lagu...
1,Pengguna Google,5,bagussss
2,Pengguna Google,5,bagus utk semua dengarin musik
3,Pengguna Google,5,good
4,Pengguna Google,5,Bagus
...,...,...,...
11995,Pengguna Google,5,aku suka banget sama lagu cocok banget.bisa hi...
11996,Pengguna Google,2,Ngeleg aja
11997,Pengguna Google,5,Lagunya lenkap lumayan lah
11998,Pengguna Google,5,"tak hidup klo tak spotify, tapi tolonglah turu..."


# Text PreProcessing

In [59]:
def clean_text(text):
    # Ubah ke huruf kecil
    text = text.lower()

    # Hapus angka
    text = re.sub(r'\d+', '', text)

    # Hapus tanda baca
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Hapus white space di awal dan akhir, dan extra space di tengah
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)

    # Hapus karakter khusus seperti emoji (opsional)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Hapus URL
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    # Hapus mention (@user) dan hashtag (#tag)
    text = re.sub(r'@\w+|#\w+', '', text)

    return text

# Terapkan ke kolom 'content'
df['content'] = df['content'].astype(str).apply(clean_text)

Fungsi preprocessing digunakan untuk membersihkan teks dengan mengubah semua huruf menjadi huruf kecil, menghapus angka, dan menghilangkan tanda baca. Tujuannya agar data teks lebih konsisten dan siap untuk diproses dalam analisis.

In [60]:
df

Unnamed: 0,userName,score,content
0,Pengguna Google,5,sangat cocok untuk aku yang suka dengerin lagu
1,Pengguna Google,5,bagussss
2,Pengguna Google,5,bagus utk semua dengarin musik
3,Pengguna Google,5,good
4,Pengguna Google,5,bagus
...,...,...,...
11995,Pengguna Google,5,aku suka banget sama lagu cocok bangetbisa hil...
11996,Pengguna Google,2,ngeleg aja
11997,Pengguna Google,5,lagunya lenkap lumayan lah
11998,Pengguna Google,5,tak hidup klo tak spotify tapi tolonglah turun...


# label

In [61]:
def get_sentiment(text):
    from textblob import TextBlob
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positif'
    elif polarity < 0:
        return 'negatif'
    else:
        return 'netral'

df['sentiment'] = df['content'].apply(get_sentiment)

In [62]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])

# model 1: SVM + TF-IDF + Split 80/20

In [63]:
# Encode label
y = LabelEncoder().fit_transform(df['label'])
X = df['content']

# Pisahkan data menjadi train dan test
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [64]:
# Preprocessing untuk model berbasis TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_raw)
X_test_tfidf = vectorizer.transform(X_test_raw)

# Terapkan SMOTE hanya pada data pelatihan
smote = SMOTE(random_state=42)
X_train_tfidf_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

print("Distribusi label setelah SMOTE:", Counter(y_train_smote))

Distribusi label setelah SMOTE: Counter({1: 8319, 2: 8319, 0: 8319})


In [65]:
# 1. Model SVM
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train_tfidf_smote, y_train_smote)
y_pred_svm = svm.predict(X_test_tfidf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("\nEvaluasi Model SVM:")
print(f"Accuracy: {accuracy_svm}")
print(classification_report(y_test, y_pred_svm, target_names=['negatif', 'netral', 'positif']))


Evaluasi Model SVM:
Accuracy: 0.9833333333333333
              precision    recall  f1-score   support

     negatif       0.96      0.63      0.76        41
      netral       0.98      1.00      0.99      2080
     positif       0.97      0.92      0.94       279

    accuracy                           0.98      2400
   macro avg       0.97      0.85      0.90      2400
weighted avg       0.98      0.98      0.98      2400



# model 2 : RF+ TF-IDF + Split 80/20

In [66]:
# 2. Model Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_tfidf_smote, y_train_smote)
y_pred_rf = rf.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("\nEvaluasi Model Random Forest:")
print(f"Accuracy: {accuracy_rf}")
print(classification_report(y_test, y_pred_rf, target_names=['negatif', 'netral', 'positif']))


Evaluasi Model Random Forest:
Accuracy: 0.9625
              precision    recall  f1-score   support

     negatif       1.00      0.51      0.68        41
      netral       0.96      1.00      0.98      2080
     positif       0.97      0.76      0.85       279

    accuracy                           0.96      2400
   macro avg       0.98      0.76      0.84      2400
weighted avg       0.96      0.96      0.96      2400



#  model 3 : LTSM

In [67]:
# Preprocessing untuk model LSTM
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train_raw)

X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
X_test_seq = tokenizer.texts_to_sequences(X_test_raw)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Konversi label ke one-hot encoding
y_train_cat = pd.get_dummies(y_train).values
y_test_cat = pd.get_dummies(y_test).values

In [68]:
model_lstm = Sequential([
    Embedding(input_dim=max_words, output_dim=100, input_length=max_len),
    LSTM(128, return_sequences=True, kernel_regularizer='l2'),
    Dropout(0.5),
    LSTM(64, kernel_regularizer='l2'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)
]

history = model_lstm.fit(
    X_train_pad, y_train_cat,
    epochs=15,
    batch_size=64,
    validation_data=(X_test_pad, y_test_cat),
    callbacks=callbacks
)

oss, accuracy_lstm = model_lstm.evaluate(X_test_pad, y_test_cat)
print("\nEvaluasi Model LSTM:")
print(f"Accuracy: {accuracy_lstm}")

Epoch 1/15




[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 427ms/step - accuracy: 0.8509 - loss: 2.2217 - val_accuracy: 0.8667 - val_loss: 0.5175 - learning_rate: 0.0010
Epoch 2/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 405ms/step - accuracy: 0.8686 - loss: 0.4889 - val_accuracy: 0.8667 - val_loss: 0.4523 - learning_rate: 0.0010
Epoch 3/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 407ms/step - accuracy: 0.8724 - loss: 0.4360 - val_accuracy: 0.8671 - val_loss: 0.3655 - learning_rate: 0.0010
Epoch 4/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 404ms/step - accuracy: 0.9126 - loss: 0.3206 - val_accuracy: 0.9479 - val_loss: 0.2430 - learning_rate: 0.0010
Epoch 5/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 451ms/step - accuracy: 0.9648 - loss: 0.1980 - val_accuracy: 0.9550 - val_loss: 0.2325 - learning_rate: 0.0010
Epoch 6/15
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

# MODEL 4 : CNN

In [69]:
# mode CNN
cnn_model = Sequential()
# Change input_dim to match max_words (10000)
cnn_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(3, activation='softmax'))

cnn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Callback early stopping (kalau belum didefinisikan)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Training CNN
# Use X_train_pad instead of X_train_padded
cnn_history = cnn_model.fit(
    X_train_pad, y_train,  # Changed to X_train_pad
    epochs=15,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping]
)

# Evaluasi CNN
# Use X_test_pad instead of X_test_padded
_, cnn_accuracy = cnn_model.evaluate(X_test_pad, y_test)  # Changed to X_test_pad
print(f'CNN Accuracy: {cnn_accuracy * 100:.2f}%')

Epoch 1/15
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 122ms/step - accuracy: 0.8486 - loss: 0.5809 - val_accuracy: 0.8724 - val_loss: 0.2934
Epoch 2/15
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 116ms/step - accuracy: 0.9144 - loss: 0.2763 - val_accuracy: 0.9656 - val_loss: 0.0941
Epoch 3/15
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 119ms/step - accuracy: 0.9815 - loss: 0.0602 - val_accuracy: 0.9797 - val_loss: 0.0622
Epoch 4/15
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 119ms/step - accuracy: 0.9886 - loss: 0.0287 - val_accuracy: 0.9812 - val_loss: 0.0622
Epoch 5/15
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 120ms/step - accuracy: 0.9905 - loss: 0.0219 - val_accuracy: 0.9818 - val_loss: 0.0685
Epoch 6/15
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 121ms/step - accuracy: 0.9945 - loss: 0.0175 - val_accuracy: 0.9766 - val_loss: 0.0943
[1m75/75

# RF+Word2Vec+Split 70/30

In [84]:
# 1. Tokenisasi data (pastikan X_raw adalah data mentah dan sudah dibersihkan sebelumnya)
# Assign X to X_raw since X contains the raw, cleaned text data
X_raw = X
tokenized_sentences = [sentence.split() for sentence in X_raw]

# 2. Latih model Word2Vec
w2v_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# 3. Fungsi untuk mendapatkan rata-rata vektor Word2Vec untuk setiap kalimat
def get_sentence_vector(tokens, model, vector_size=100):
    vec = np.zeros(vector_size)
    count = 0
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    if count > 0:
        vec /= count
    return vec

# 4. Ubah seluruh kalimat menjadi representasi vektor
X_w2v = np.array([get_sentence_vector(tokens, w2v_model, 100) for tokens in tokenized_sentences])

# 5. Split data (70/30)
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_w2v, y, test_size=0.3, random_state=42, stratify=y)

# 6. Latih Random Forest
rf_w2v = RandomForestClassifier(random_state=42)
rf_w2v.fit(X_train_w2v, y_train_w2v)

# 7. Prediksi
y_pred_rf_w2v = rf_w2v.predict(X_test_w2v)

# 8. Evaluasi
accuracy_rf_w2v = accuracy_score(y_test_w2v, y_pred_rf_w2v)
print("\nEvaluasi Model Random Forest (Word2Vec):")
print(f"Accuracy: {accuracy_rf_w2v * 100:.2f}%")
print(classification_report(y_test_w2v, y_pred_rf_w2v, target_names=['negatif', 'netral', 'positif']))


Evaluasi Model Random Forest (Word2Vec):
Accuracy: 93.75%
              precision    recall  f1-score   support

     negatif       0.67      0.03      0.06        61
      netral       0.94      0.99      0.97      3120
     positif       0.90      0.67      0.77       419

    accuracy                           0.94      3600
   macro avg       0.84      0.56      0.60      3600
weighted avg       0.93      0.94      0.93      3600



#  inference atau testing

In [73]:
def predict_text_class_svm_rf(text, vectorizer, model):
    # Bersihkan teks
    cleaned = clean_text(text)
    # TF-IDF transform
    tfidf_vector = vectorizer.transform([cleaned])
    # Prediksi kelas
    pred = model.predict(tfidf_vector)[0]
    # Mapping label
    label_map = {0: 'negatif', 1: 'netral', 2: 'positif'}
    return label_map[pred]

In [76]:
def predict_text_class_dl(text, tokenizer, model):
    # Bersihkan teks terlebih dahulu
    cleaned = clean_text(text)

    # Tokenisasi dan padding
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len)

    # Prediksi
    pred_proba = model.predict(padded)
    pred_class = np.argmax(pred_proba, axis=1)[0]

    # Mapping label numerik ke kategori
    label_map = {0: 'negatif', 1: 'netral', 2: 'positif'}
    return label_map[pred_class]

In [86]:
def predict_rf_word2vec(text, model_w2v, rf_model):
    cleaned = clean_text(text)
    tokens = cleaned.split()
    vec = get_sentence_vector(tokens, model_w2v)
    pred = rf_model.predict([vec])[0]
    label_map = {0: 'negatif', 1: 'netral', 2: 'positif'}
    return label_map[pred]

In [87]:
new_text = "Aplikasi ini biasa aja"
print("Prediksi SVM :", predict_text_class_svm_rf(new_text, vectorizer, svm))
print("Prediksi RF  :", predict_text_class_svm_rf(new_text, vectorizer, rf))
print("Prediksi LSTM:", predict_text_class_dl(new_text, tokenizer, model_lstm))
print("Prediksi CNN :", predict_text_class_dl(new_text, tokenizer, cnn_model))
print("Prediksi RF + Word2Vec:", predict_rf_word2vec(new_text, w2v_model, rf_w2v))

Prediksi SVM : netral
Prediksi RF  : netral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Prediksi LSTM: netral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Prediksi CNN : netral
Prediksi RF + Word2Vec: netral


In [78]:
!pip freeze > requirements.txt
from google.colab import files
files.download('requirements.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>