# Jupyter Notebook: Sentiment and Emotion Classifier

# Instalasi Modul

In [6]:
import sys
import subprocess

# Daftar modul yang diperlukan
required_modules = [
    'pandas', 'numpy', 'scikit-learn', 'imbalanced-learn',
    'Sastrawi', 'matplotlib', 'seaborn'
]

# Fungsi untuk menginstal modul jika belum ada
def install_modules(modules):
    for module in modules:
        try:
            __import__(module)
        except ImportError:
            print(f"Installing {module}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", module])

# Jalankan instalasi
install_modules(required_modules)
print("All required modules are installed.")

Installing scikit-learn...



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m


Installing imbalanced-learn...
All required modules are installed.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m


# Import Library

In [18]:
# Import Library
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline  # Ganti import Pipeline
from imblearn.over_sampling import SMOTE
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from multiprocessing import Pool
from imblearn.pipeline import Pipeline  # Ganti dengan ini

# Load Dataset

In [8]:
# Load dataset
df = pd.read_csv('../dataset/dataset_structured.csv')

# Pemeriksaan kolom
expected_columns = ['sentimen', 'emosi', 'ulasan']
if not all(col in df.columns for col in expected_columns):
    raise ValueError(f"Dataset harus memiliki kolom: {expected_columns}")

print("Dataset loaded successfully.")
print("Kolom dataset:", df.columns.tolist())
print("Contoh data pertama:\n", df.head())

Dataset loaded successfully.
Kolom dataset: ['sentimen', 'emosi', 'ulasan']
Contoh data pertama:
    sentimen  emosi                                             ulasan
0  Negative  Anger  bukan menyenangkan malah bikin kesal hp saya r...
1  Negative  Anger  kalo ngak niat bikin gamenya bagus hapus aja d...
2  Negative  Anger  makin lama, makin gak jelas dri sblum di updat...
3  Negative  Anger  semenjak update sangat sangat buruk setiap mai...
4  Negative  Anger                                              burik


# preprocessing Teks

In [10]:
# Import library yang diperlukan
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Load slang dictionary dari file
slang_df = pd.read_csv('../dataset/slang_indo.csv', header=None, names=['slang', 'formal'])
slang_dict = dict(zip(slang_df['slang'], slang_df['formal']))

# Load stopwords dari file
with open('../dataset/stopwords-id.txt', 'r') as f:
    stopwords = set(f.read().splitlines())

# Tambahkan slang tambahan yang belum ada di file
additional_slang = {
    'burik': 'jelek',
    'anjs': 'anjing',
    'goblok': 'bodoh',
    'kontol': 'kasar',
    'sialan': 'kasar',
    'anjg': 'anjing',
    'bgs': 'bagus',
    'bgt': 'banget',
    'bnyk': 'banyak',
    'bsk': 'besok',
    'byk': 'banyak',
    'dmn': 'dimana',
    'gmn': 'gimana',
    'jd': 'jadi',
    'jg': 'juga',
    'klo': 'kalau',
    'kyk': 'seperti',
    'lg': 'lagi',
    'mau': 'ingin',
    'mngkn': 'mungkin',
    'msh': 'masih',
    'nggak': 'tidak',
    'ngga': 'tidak',
    'pdhl': 'padahal',
    'pny': 'punya',
    'sbnrnya': 'sebenarnya',
    'sdh': 'sudah',
    'skrg': 'sekarang',
    'sm': 'sama',
    'spt': 'seperti',
    'sy': 'saya',
    'tdk': 'tidak',
    'tp': 'tapi',
    'udh': 'sudah',
    'utk': 'untuk',
    'yg': 'yang'
}
slang_dict.update(additional_slang)

# Inisialisasi stemmer Sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def normalize_slang(text):
    """Normalisasi slang words ke bentuk formal"""
    words = text.split()
    return ' '.join(slang_dict.get(word, word) for word in words)

def remove_repeated_words(text):
    """Menghapus kata yang diulang berurutan"""
    words = text.split()
    result = []
    for i in range(len(words)):
        if i == 0 or words[i] != words[i-1]:
            result.append(words[i])
    return ' '.join(result)

def preprocess_text(text):
    """
    Fungsi untuk melakukan preprocessing teks
    """
    # Ubah ke huruf kecil
    text = text.lower()
    
    # Hapus kode transaksi
    text = re.sub(r'\b\d+[a-zA-Z]+\d+\b', '', text)
    
    # Hapus URL
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Hapus tanda baca dan karakter khusus
    text = re.sub(r'[^\w\s]', ' ', text)
    
    # Hapus angka
    text = re.sub(r'\d+', '', text)
    
    # Hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Normalisasi slang
    text = normalize_slang(text)
    
    # Hapus kata yang diulang
    text = remove_repeated_words(text)
    
    # Hapus stopwords
    words = text.split()
    words = [word for word in words if word not in stopwords]
    text = ' '.join(words)
    
    # Stemming
    text = stemmer.stem(text)
    
    return text

# Terapkan preprocessing pada kolom 'ulasan'
print("Memulai preprocessing teks...")
df['Cleaned_Review'] = df['ulasan'].apply(preprocess_text)
print("Preprocessing selesai. Contoh data setelah preprocessing:")
print(df[['ulasan', 'Cleaned_Review']].head())

Memulai preprocessing teks...
Preprocessing selesai. Contoh data setelah preprocessing:
                                              ulasan  \
0  bukan menyenangkan malah bikin kesal hp saya r...   
1  kalo ngak niat bikin gamenya bagus hapus aja d...   
2  makin lama, makin gak jelas dri sblum di updat...   
3  semenjak update sangat sangat buruk setiap mai...   
4                                              burik   

                                      Cleaned_Review  
0  senang bikin kesal hp realme c ngeblank hitam ...  
1  niat bikin gamenya bagus hapus gamenya narik m...  
2  sblum update game update suka main bug tlong b...  
3  semenjak update buruk main bareng putus sinyal...  
4                                              jelek  


# Pemisahan Data

In [14]:
# Pemisahan Data
print("Memulai pemisahan data...")

# Untuk sentimen
X_sentiment = df['Cleaned_Review']  # Menggunakan kolom yang sudah di-preprocess
y_sentiment = df['sentimen']
X_train_sent, X_test_sent, y_train_sent, y_test_sent = train_test_split(
    X_sentiment, y_sentiment, test_size=0.2, random_state=42, stratify=y_sentiment
)

# Untuk emosi
X_emotion = df['Cleaned_Review']  # Menggunakan kolom yang sudah di-preprocess
y_emotion = df['emosi']
X_train_emo, X_test_emo, y_train_emo, y_test_emo = train_test_split(
    X_emotion, y_emotion, test_size=0.2, random_state=42, stratify=y_emotion
)

print("Data sentimen - Training samples:", len(X_train_sent))
print("Data sentimen - Testing samples:", len(X_test_sent))
print("Data emosi - Training samples:", len(X_train_emo))
print("Data emosi - Testing samples:", len(X_test_emo))


Memulai pemisahan data...
Data sentimen - Training samples: 16920
Data sentimen - Testing samples: 4230
Data emosi - Training samples: 16920
Data emosi - Testing samples: 4230


# Pembuatan Pipeline

In [15]:
# Membuat pipeline untuk kedua tugas:
# - Ekstraksi fitur menggunakan `TfidfVectorizer`.
# - Penanganan ketidakseimbangan kelas menggunakan `SMOTE`.
# - Model SVM dengan parameter probabilitas.

#%%
# Pipeline untuk sentimen
sentiment_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('smote', SMOTE(random_state=42)),
    ('svm', SVC(probability=True))
])

# Pipeline untuk emosi
emotion_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('smote', SMOTE(random_state=42)),
    ('svm', SVC(probability=True))
])

print("Pipeline untuk sentimen dan emosi telah dibuat.")

Pipeline untuk sentimen dan emosi telah dibuat.


# Tuning Hyperparameter

In [19]:
# Tuning Hyperparameter
print("\nMemulai tuning hyperparameter...")

# Definisikan parameter grid
param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
    'tfidf__max_features': [3000, 5000]
}

# Grid Search untuk sentimen
print("Tuning model sentimen...")
grid_search_sent = GridSearchCV(
    sentiment_pipeline, 
    param_grid, 
    cv=5, 
    scoring='f1_weighted', 
    n_jobs=-1,
    verbose=1
)

# Fit model sentimen
grid_search_sent.fit(X_train_sent, y_train_sent)
print("Best parameters for sentiment:", grid_search_sent.best_params_)
print("Best score for sentiment:", grid_search_sent.best_score_)

# Grid Search untuk emosi
print("\nTuning model emosi...")
grid_search_emo = GridSearchCV(
    emotion_pipeline, 
    param_grid, 
    cv=5, 
    scoring='f1_weighted', 
    n_jobs=-1,
    verbose=1
)

# Fit model emosi
grid_search_emo.fit(X_train_emo, y_train_emo)
print("Best parameters for emotion:", grid_search_emo.best_params_)
print("Best score for emotion:", grid_search_emo.best_score_)


Memulai tuning hyperparameter...
Tuning model sentimen...
Fitting 5 folds for each of 12 candidates, totalling 60 fits


ValueError: 
All the 60 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 562, in _fit
    self._validate_steps()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/pipeline.py", line 339, in _validate_steps
    raise TypeError(
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SMOTE(random_state=42)' (type <class 'imblearn.over_sampling._smote.base.SMOTE'>) doesn't


# Evaluasi Model

In [None]:
# Mengevaluasi model pada data testing:
# - Classification report untuk precision, recall, dan F1-score.
# - Confusion matrix untuk visualisasi kesalahan klasifikasi.

# Sentimen
y_pred_sent = grid_search_sent.predict(X_test_sent)
print("Classification Report for Sentiment:")
print(classification_report(y_test_sent, y_pred_sent))

# Emosi
y_pred_emo = grid_search_emo.predict(X_test_emo)
print("Classification Report for Emotion:")
print(classification_report(y_test_emo, y_pred_emo))

# Visualisasi Confusion Matrix untuk Sentimen
cm_sent = confusion_matrix(y_test_sent, y_pred_sent)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_sent, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Sentiment Classification')
plt.savefig('cm_sentiment.png')
plt.show()

# Visualisasi Confusion Matrix untuk Emosi
cm_emo = confusion_matrix(y_test_emo, y_pred_emo)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_emo, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix for Emotion Classification')
plt.savefig('cm_emotion.png')
plt.show()

# Simpan Model

In [None]:
with open('sentiment_model.pkl', 'wb') as f:
    pickle.dump(grid_search_sent.best_estimator_, f)

with open('emotion_model.pkl', 'wb') as f:
    pickle.dump(grid_search_emo.best_estimator_, f)

print("Model telah disimpan sebagai 'sentiment_model.pkl' dan 'emotion_model.pkl'.")

# Prediksi Ulasan Baru

In [None]:
# Fungsi untuk memprediksi sentimen dan emosi dari ulasan baru.
# Contoh ulasan: "Aplikasi ini burik banget, gk bisa login!"

def predict_new_review(review):
    cleaned_review = preprocess_text(review)
    sentiment = grid_search_sent.best_estimator_.predict([cleaned_review])[0]
    emotion = grid_search_emo.best_estimator_.predict([cleaned_review])[0]
    return {'Sentiment': sentiment, 'Emotion': emotion}

# Contoh penggunaan
new_review = "Aplikasi ini burik banget, gk bisa login!"
prediction = predict_new_review(new_review)
print("Prediction for new review:", prediction)