# Jupyter Notebook: Sentiment and Emotion Classifier

# Instalasi Modul

In [11]:
import sys
import subprocess

# Daftar modul yang diperlukan
required_modules = [
    'pandas', 'numpy', 'scikit-learn', 'imbalanced-learn',
    'Sastrawi', 'matplotlib', 'seaborn'
]

# Fungsi untuk menginstal modul jika belum ada
def install_modules(modules):
    for module in modules:
        try:
            __import__(module)
        except ImportError:
            print(f"Installing {module}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", module])

# Jalankan instalasi
install_modules(required_modules)
print("All required modules are installed.")

Installing scikit-learn...
Installing imbalanced-learn...
All required modules are installed.


# Import Library

In [12]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from multiprocessing import Pool

# Load Dataset

In [13]:
# Load dataset
df = pd.read_csv('dataset/dataset_structured.csv')

# Pemeriksaan kolom
expected_columns = ['sentimen', 'emosi', 'ulasan']
if not all(col in df.columns for col in expected_columns):
    raise ValueError(f"Dataset harus memiliki kolom: {expected_columns}")

print("Dataset loaded successfully.")
print("Kolom dataset:", df.columns.tolist())
print("Contoh data pertama:\n", df.head())

Dataset loaded successfully.
Kolom dataset: ['sentimen', 'emosi', 'ulasan']
Contoh data pertama:
    sentimen  emosi                                             ulasan
0  Negative  Anger  bukan menyenangkan malah bikin kesal hp saya r...
1  Negative  Anger  kalo ngak niat bikin gamenya bagus hapus aja d...
2  Negative  Anger  makin lama, makin gak jelas dri sblum di updat...
3  Negative  Anger  semenjak update sangat sangat buruk setiap mai...
4  Negative  Anger                                              burik


# preprocessing Teks

In [14]:
# Kamus normalisasi slang bahasa Indonesia
slang_dict = {
    'gk': 'tidak', 'gak': 'tidak', 'bgt': 'banget', 'burik': 'jelek',
    'anjs': 'anjing', 'goblok': 'bodoh', 'kontol': 'kasar', 'sialan': 'kasar'
}

def normalize_slang(text):
    words = text.split()
    return ' '.join(slang_dict.get(word, word) for word in words)

# Inisialisasi stemmer Sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def preprocess_text(text):
    # Ubah ke huruf kecil
    text = text.lower()
    # Hapus kode transaksi
    text = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', text)
    # Hapus tanda baca dan karakter khusus
    text = re.sub(r'[^\w\s]', '', text)
    # Normalisasi slang
    text = normalize_slang(text)
    # Stemming
    text = stemmer.stem(text)
    return text

# Terapkan preprocessing pada kolom 'ulasan'
df['Cleaned_Review'] = df['ulasan'].apply(preprocess_text)
print("Preprocessing selesai. Contoh data setelah preprocessing:\n", df[['ulasan', 'Cleaned_Review']].head())

Preprocessing selesai. Contoh data setelah preprocessing:
                                               ulasan  \
0  bukan menyenangkan malah bikin kesal hp saya r...   
1  kalo ngak niat bikin gamenya bagus hapus aja d...   
2  makin lama, makin gak jelas dri sblum di updat...   
3  semenjak update sangat sangat buruk setiap mai...   
4                                              burik   

                                      Cleaned_Review  
0  bukan senang malah bikin kesal hp saya realme ...  
1  kalo ngak niat bikin gamenya bagus hapus aja d...  
2  makin lama makin tidak jelas dri sblum di upda...  
3  semenjak update sangat sangat buruk tiap main ...  
4                                              jelek  


# Pemisahan Data

In [22]:
# Memisahkan data untuk dua tugas:
# - Klasifikasi sentimen (menggunakan `sentimen` sebagai target).
# - Klasifikasi emosi (menggunakan `emosi` sebagai target).
# Data dibagi 80% training dan 20% testing dengan stratifikasi.

# Untuk sentimen
X_sentiment = df['Cleaned_Review']
y_sentiment = df['sentimen']
X_train_sent, X_test_sent, y_train_sent, y_test_sent = train_test_split(
    X_sentiment, y_sentiment, test_size=0.2, random_state=42, stratify=y_sentiment
)

# Untuk emosi
X_emotion = df['Cleaned_Review']
y_emotion = df['emosi']
X_train_emo, X_test_emo, y_train_emo, y_test_emo = train_test_split(
    X_emotion, y_emotion, test_size=0.2, random_state=42, stratify=y_emotion
)

print("Data sentimen - Training samples:", len(X_train_sent))
print("Data sentimen - Testing samples:", len(X_test_sent))
print("Data emosi - Training samples:", len(X_train_emo))
print("Data emosi - Testing samples:", len(X_test_emo))

Data sentimen - Training samples: 16920
Data sentimen - Testing samples: 4230
Data emosi - Training samples: 16920
Data emosi - Testing samples: 4230


# Pembuatan Pipeline

In [23]:
# Membuat pipeline untuk kedua tugas:
# - Ekstraksi fitur menggunakan `TfidfVectorizer`.
# - Penanganan ketidakseimbangan kelas menggunakan `SMOTE`.
# - Model SVM dengan parameter probabilitas.

#%%
# Pipeline untuk sentimen
sentiment_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('smote', SMOTE(random_state=42)),
    ('svm', SVC(probability=True))
])

# Pipeline untuk emosi
emotion_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('smote', SMOTE(random_state=42)),
    ('svm', SVC(probability=True))
])

print("Pipeline untuk sentimen dan emosi telah dibuat.")

Pipeline untuk sentimen dan emosi telah dibuat.


# Tuning Hyperparameter

In [30]:
# Melakukan Grid Search untuk menemukan parameter terbaik.
# **Pastikan sel "Pembuatan Pipeline" sudah dijalankan untuk mendefinisikan `sentiment_pipeline` dan `emotion_pipeline`.**

#%%
# Pemeriksaan apakah pipeline tersedia
try:
    sentiment_pipeline
    emotion_pipeline
    print("Sentiment and emotion pipelines are defined.")
except NameError:
    raise NameError("Pipeline not defined. Please run the 'Pembuatan Pipeline' cell first.")

param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
    'tfidf__max_features': [3000, 5000]
}

# Grid Search untuk sentimen
grid_search_sent = GridSearchCV(sentiment_pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search_sent.fit(X_train_sent, y_train_sent)
print("Best parameters for sentiment:", grid_search_sent.best_params_)

# Grid Search untuk emosi
grid_search_emo = GridSearchCV(emotion_pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search_emo.fit(X_train_emo, y_train_emo)
print("Best parameters for emotion:", grid_search_emo.best_params_)


Sentiment and emotion pipelines are defined.


3919.74s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
3919.75s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
3919.77s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
3919.80s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
3919.81s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
3919.83s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
3919.85s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
3919.85s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make th

ValueError: 
All the 60 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 339, in _fit
    self._validate_steps()
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 230, in _validate_steps
    raise TypeError(
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SMOTE(random_state=42)' (type <class 'imblearn.over_sampling._smote.base.SMOTE'>) doesn't


# Evaluasi Model

In [31]:
# Mengevaluasi model pada data testing:
# - Classification report untuk precision, recall, dan F1-score.
# - Confusion matrix untuk visualisasi kesalahan klasifikasi.

# Sentimen
y_pred_sent = grid_search_sent.predict(X_test_sent)
print("Classification Report for Sentiment:")
print(classification_report(y_test_sent, y_pred_sent))

# Emosi
y_pred_emo = grid_search_emo.predict(X_test_emo)
print("Classification Report for Emotion:")
print(classification_report(y_test_emo, y_pred_emo))

# Visualisasi Confusion Matrix untuk Sentimen
cm_sent = confusion_matrix(y_test_sent, y_pred_sent)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_sent, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Sentiment Classification')
plt.savefig('cm_sentiment.png')
plt.show()

# Visualisasi Confusion Matrix untuk Emosi
cm_emo = confusion_matrix(y_test_emo, y_pred_emo)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_emo, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Emotion Classification')
plt.savefig('cm_emotion.png')
plt.show()

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# Simpan Model

In [None]:
with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(grid_search_sent.best_estimator_, f)

with open('emotion_model.pkl', 'wb') as f:
    pickle.dump(grid_search_emo.best_estimator_, f)

print("Model telah disimpan sebagai 'sentiment_model.pkl' dan 'emotion_model.pkl'.")

# Prediksi Ulasan Baru

In [None]:
# Fungsi untuk memprediksi sentimen dan emosi dari ulasan baru.
# Contoh ulasan: "Aplikasi ini burik banget, gk bisa login!"

def predict_new_review(review):
    cleaned_review = preprocess_text(review)
    sentiment = grid_search_sent.best_estimator_.predict([cleaned_review])[0]
    emotion = grid_search_emo.best_estimator_.predict([cleaned_review])[0]
    return {'Sentiment': sentiment, 'Emotion': emotion}

# Contoh penggunaan
new_review = "Aplikasi ini burik banget, gk bisa login!"
prediction = predict_new_review(new_review)
print("Prediction for new review:", prediction)