# Import Library

In [135]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from gensim.models import Word2Vec
from google.colab import drive
drive.mount('drive')


Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


# Load dataset

In [136]:
df = pd.read_csv('drive/MyDrive/ProyekAnalisisSentimen/dataset.csv')

# Download stopwords and tokenizer

In [137]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Define stopwords

In [138]:
stop_words = set(stopwords.words('indonesian'))

# Sentiment Lexicon

In [139]:
positive_words = {"bagus", "mantap", "puas", "baik", "hebat", "luar biasa", "menyenangkan"}
negative_words = {"buruk", "jelek", "kecewa", "tidak puas", "parah", "menyedihkan", "payah"}


# Function for cleaning text

In [140]:
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"\d+", "", text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

# Function to determine sentiment

In [141]:
def label_sentiment(text):
    words = set(text.split())
    pos_count = len(words & positive_words)
    neg_count = len(words & negative_words)
    if pos_count > neg_count:
        return "positive"
    elif neg_count > pos_count:
        return "negative"
    else:
        return "neutral"

# Apply preprocessing

In [142]:
df["clean_text"] = df["review"].astype(str).apply(clean_text)
df["sentiment"] = df["clean_text"].apply(label_sentiment)

# Drop empty rows

In [143]:
df = df[df["clean_text"].str.strip() != ""]

# Split data

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Handle class imbalance

In [145]:
# Pastikan jumlah data sebelum proses
assert len(X_train) == len(y_train), f"Jumlah tidak sama: X_train={len(X_train)}, y_train={len(y_train)}"

# 1. Konversi teks ke TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 2. Terapkan SMOTE hanya sekali
smote = SMOTE(random_state=42)
X_train_tfidf_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# 3. Pastikan hasil SMOTE tetap sinkron
print(f"Setelah SMOTE: X_train_tfidf_resampled={X_train_tfidf_resampled.shape}, y_train_resampled={y_train_resampled.shape}")


Setelah SMOTE: X_train_tfidf_resampled=(6369, 3741), y_train_resampled=(6369,)


# Train SVM Model

In [146]:
svm_model = SVC(kernel='linear', C=1.5)
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.9915110356536503
              precision    recall  f1-score   support

    negative       1.00      0.86      0.92        28
     neutral       0.99      1.00      1.00       531
    positive       1.00      0.97      0.98        30

    accuracy                           0.99       589
   macro avg       1.00      0.94      0.97       589
weighted avg       0.99      0.99      0.99       589



# Train Word2Vec Model

In [147]:
sentences = [text.split() for text in X_train]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

X_train_w2v = np.array([np.mean([w2v_model.wv[word] for word in text.split() if word in w2v_model.wv] or [np.zeros(100)], axis=0) for text in X_train])
X_test_w2v = np.array([np.mean([w2v_model.wv[word] for word in text.split() if word in w2v_model.wv] or [np.zeros(100)], axis=0) for text in X_test])

# Train Random Forest Model

In [148]:
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train_w2v, y_train)
y_pred_rf = rf_model.predict(X_test_w2v)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9490662139219015
              precision    recall  f1-score   support

    negative       1.00      0.04      0.07        28
     neutral       0.95      1.00      0.97       531
    positive       1.00      0.90      0.95        30

    accuracy                           0.95       589
   macro avg       0.98      0.65      0.66       589
weighted avg       0.95      0.95      0.93       589



# Tokenizer for LSTM

In [149]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=100)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=100)


# Prepare embedding matrix

In [150]:
embedding_matrix = np.zeros((5000, 100))
for word, i in tokenizer.word_index.items():
    if i < 5000 and word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# Encode labels

In [151]:
y_train_encoded = y_train.map({"negative": 0, "neutral": 1, "positive": 2})
y_test_encoded = y_test.map({"negative": 0, "neutral": 1, "positive": 2})

# Convert labels to categorical

In [152]:
y_train_encoded = to_categorical(y_train_encoded, num_classes=3)
y_test_encoded = to_categorical(y_test_encoded, num_classes=3)

# Build LSTM Model

In [153]:
lstm_model = Sequential([
    Embedding(5000, 100, weights=[embedding_matrix], input_length=100, trainable=False),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(3, activation='softmax')
])

lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



# Train LSTM Model

In [154]:
lstm_model.fit(X_train_seq, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test_seq, y_test_encoded))

y_pred_lstm = np.argmax(lstm_model.predict(X_test_seq), axis=1)
y_test_labels = np.argmax(y_test_encoded, axis=1)
print("LSTM Accuracy:", accuracy_score(y_test_labels, y_pred_lstm))


Epoch 1/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 558ms/step - accuracy: 0.8551 - loss: 0.6156 - val_accuracy: 0.9015 - val_loss: 0.3484
Epoch 2/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 523ms/step - accuracy: 0.9074 - loss: 0.3479 - val_accuracy: 0.9015 - val_loss: 0.3418
Epoch 3/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 508ms/step - accuracy: 0.9019 - loss: 0.3511 - val_accuracy: 0.9015 - val_loss: 0.3409
Epoch 4/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 486ms/step - accuracy: 0.8966 - loss: 0.3582 - val_accuracy: 0.9015 - val_loss: 0.3319
Epoch 5/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 561ms/step - accuracy: 0.9064 - loss: 0.3400 - val_accuracy: 0.9015 - val_loss: 0.3367
Epoch 6/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 509ms/step - accuracy: 0.9008 - loss: 0.3559 - val_accuracy: 0.9015 - val_loss: 0.3315
Epoch 7/10
[1m74/74[