In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [36]:
def preprocess_text(text):
  '''
    Normalise one raw text sample before it is vectorised.

    Two steps are applied:
    - lowercasing
    - removing punctuation: every character that is neither a word
      character (letter, digit, underscore) nor whitespace is deleted

    Punctuation is deleted, not replaced by a space, so "teman-teman"
    becomes "temanteman" and "33,3" becomes "333".

    Args:
      text: the raw text. Missing values (None or float NaN, which pandas
        produces for empty cells) are accepted and yield an empty string;
        any other non-string value is converted with str() first.

    Returns:
      The lowercased text with punctuation removed.
  '''
  if not isinstance(text, str):
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
      return ''
    text = str(text)
  text = text.lower()
  text = re.sub(r'[^\w\s]', '', text)
  return text

def evaluate_model(y_test, y_pred, model_name):
  '''
    Print and return the evaluation metrics of one model.

    Accuracy is computed directly; precision, recall and F1 use weighted
    averaging across classes. The full classification report is printed
    after the summary lines.

    Args:
      y_test: true labels.
      y_pred: labels predicted by the model.
      model_name: name shown in the printed header.

    Returns:
      Tuple (accuracy, precision, recall, f1).
  '''
  print(f"Evaluation Metrics for {model_name}:")
  accuracy = accuracy_score(y_test, y_pred)
  # The three remaining scorers share a signature, so they are evaluated in
  # one pass, in the order precision -> recall -> F1.
  precision, recall, f1 = (
      scorer(y_test, y_pred, average='weighted')
      for scorer in (precision_score, recall_score, f1_score)
  )

  summary_lines = (
      f"Accuracy: {accuracy:.4f}",
      f"Precision: {precision:.4f}",
      f"Recall: {recall:.4f}",
      f"F1-Score: {f1:.4f}",
      "\nDetailed classification report:",
  )
  for summary_line in summary_lines:
    print(summary_line)
  print(classification_report(y_test, y_pred))
  print("-----------------------------------------------------")

  return accuracy, precision, recall, f1

In [4]:
# Load the train and test splits. Both files are tab-separated and have no
# header row, so the column names are supplied explicitly.
data_train = pd.read_csv(
    "train_preprocess.tsv",
    sep="\t",
    header=None,
    names=['text', 'label'],
)
data_test = pd.read_csv(
    "test_preprocess.tsv",
    sep="\t",
    header=None,
    names=['text', 'label'],
)

# Add a cleaned copy of the raw text next to the original column.
data_train = data_train.assign(clean_text=data_train['text'].apply(preprocess_text))
data_test = data_test.assign(clean_text=data_test['text'].apply(preprocess_text))

# Encode the string labels as integers. The encoder is fitted on the training
# labels only and then reused unchanged for the test labels.
le = LabelEncoder()

data_train = data_train.assign(label_encoded=le.fit_transform(data_train['label']))
data_test = data_test.assign(label_encoded=le.transform(data_test['label']))

In [25]:
# Features are the cleaned text; targets are the integer-encoded labels.
x_train, y_train = data_train['clean_text'], data_train['label_encoded']
x_test, y_test = data_test['clean_text'], data_test['label_encoded']

# Turn the text into Bag-of-Words count vectors with CountVectorizer, capped
# at 5000 features. The vectorizer is fitted on the training text only and
# then applied as-is to the test text.
vectorizer = CountVectorizer(max_features=5000)
x_train_bow = vectorizer.fit_transform(x_train)
x_test_bow = vectorizer.transform(x_test)

In [44]:
# 1. XGBoost Classifier
# Train on the Bag-of-Words training matrix, predict the test matrix, then
# print the metrics and keep them for the comparison table.
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss').fit(x_train_bow, y_train)
y_pred_xgb = xgb_model.predict(x_test_bow)
xgb_acc, xgb_pre, xgb_rec, xgb_f1 = evaluate_model(
    y_test=y_test,
    y_pred=y_pred_xgb,
    model_name="XGBoost",
)

Evaluation Metrics for XGBoost:
Accuracy: 0.7500
Precision: 0.7771
Recall: 0.7500
F1-Score: 0.7345

Detailed classification report:
              precision    recall  f1-score   support

           0       0.67      0.94      0.79       204
           1       0.83      0.34      0.48        88
           2       0.85      0.74      0.79       208

    accuracy                           0.75       500
   macro avg       0.79      0.67      0.69       500
weighted avg       0.78      0.75      0.73       500

-----------------------------------------------------


In [38]:
# 2. Random Forest Classifier
# 100 trees with a fixed random_state; trained and evaluated on the same
# Bag-of-Words matrices as the other models.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train_bow, y_train)
y_pred_rf = rf_model.predict(x_test_bow)
rf_acc, rf_pre, rf_rec, rf_f1 = evaluate_model(
    y_test=y_test,
    y_pred=y_pred_rf,
    model_name="Random Forest",
)

Evaluation Metrics for Random Forest:
Accuracy: 0.6780
Precision: 0.7178
Recall: 0.6780
F1-Score: 0.6574

Detailed classification report:
              precision    recall  f1-score   support

           0       0.60      0.92      0.73       204
           1       0.77      0.26      0.39        88
           2       0.81      0.62      0.70       208

    accuracy                           0.68       500
   macro avg       0.73      0.60      0.61       500
weighted avg       0.72      0.68      0.66       500

-----------------------------------------------------


In [39]:
# 3. Support Vector Machine (SVM)
# SVC with its default settings; trained and evaluated on the same
# Bag-of-Words matrices as the other models.
svm_model = SVC().fit(x_train_bow, y_train)
y_pred_svm = svm_model.predict(x_test_bow)
svm_acc, svm_pre, svm_rec, svm_f1 = evaluate_model(
    y_test=y_test,
    y_pred=y_pred_svm,
    model_name="SVM",
)

Evaluation Metrics for SVM:
Accuracy: 0.7280
Precision: 0.7681
Recall: 0.7280
F1-Score: 0.7155

Detailed classification report:
              precision    recall  f1-score   support

           0       0.64      0.97      0.77       204
           1       0.78      0.40      0.53        88
           2       0.89      0.63      0.74       208

    accuracy                           0.73       500
   macro avg       0.77      0.67      0.68       500
weighted avg       0.77      0.73      0.72       500

-----------------------------------------------------


In [49]:
# Collect the metrics of the three models into one table, one row per model.
# The mapping is named `eval_metrics` rather than `dict`, so the built-in
# `dict` type is not shadowed for the rest of the kernel session.
eval_metrics = {'Model': ['XGB Classifier', 'RF Classifier', 'SVM Classifier'],
                'Accuracy': [xgb_acc, rf_acc, svm_acc],
                'Precision': [xgb_pre, rf_pre, svm_pre],
                'Recall': [xgb_rec, rf_rec, svm_rec],
                'F1': [xgb_f1, rf_f1, svm_f1]}

eval_df = pd.DataFrame(eval_metrics)
# Highest accuracy first. This is the cell's last expression, so the sorted
# table is displayed; `eval_df` itself keeps its original row order.
eval_df.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,XGB Classifier,0.75,0.777105,0.75,0.734484
2,SVM Classifier,0.728,0.768092,0.728,0.715462
1,RF Classifier,0.678,0.717768,0.678,0.657352


In [50]:
# Predict with the XGB Classifier on a few hand-written sample sentences.

text = [
    'elektabilitas ipul - puti sebesar 33,3 persen .',
    'jangan pernah pesan melalui ya teman-teman . mengecewakan ! villa - nya kotor , semua peralatan nya tidak layak pakai',
    'ini nih tempat makan yang dicari kalau lagi ingin makan burger , french fries , dan ice cream cone . atau kalau lagi buru-buru .',
]

# Apply the same cleaning and the already-fitted vectorizer used for training.
processed_text = list(map(preprocess_text, text))
text_bow = vectorizer.transform(processed_text)

# Predict the encoded labels, then map them back to the original label strings.
prediction = xgb_model.predict(text_bow)
sentiment = le.inverse_transform(prediction)

# Show the raw text, the encoded prediction and the decoded label side by side.
data = {
    'Text': text,
    'Label encoded': prediction,
    'Sentiment': sentiment,
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Text,Label encoded,Sentiment
0,"elektabilitas ipul - puti sebesar 33,3 persen .",1,neutral
1,jangan pernah pesan melalui ya teman-teman . m...,0,negative
2,ini nih tempat makan yang dicari kalau lagi in...,2,positive
