In [4]:
# Sentiment Analysis
# with bag of words as the feature and implemented using 3 different classical ML algorithm

In [5]:
import nltk
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [6]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shidq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shidq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
train_data = pd.read_csv('datasets/train_preprocess.tsv', sep='\t', header=None, names=['text', 'label'])
test_data = pd.read_csv('datasets/test_preprocess.tsv', sep='\t', header=None, names=['text', 'label'])
validation_data = pd.read_csv('datasets/valid_preprocess.tsv', sep='\t', header=None, names=['text', 'label'])

In [8]:
X_train = train_data.drop(columns=['label'])
y_train = train_data['label']

X_test = test_data.drop(columns=['label'])
y_test = test_data['label']

X_val = validation_data.drop(columns=['label'])
y_val = validation_data['label']

In [9]:
print(train_data.head())

                                                text     label
0  warung ini dimiliki oleh pengusaha pabrik tahu...  positive
1  mohon ulama lurus dan k212 mmbri hujjah partai...   neutral
2  lokasi strategis di jalan sumatera bandung . t...  positive
3  betapa bahagia nya diri ini saat unboxing pake...  positive
4  duh . jadi mahasiswa jangan sombong dong . kas...  negative


In [10]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def preprocess(sent):
    # menghapus semua token nonletter
    remove_nonletters = re.sub("[^a-zA-Z]", " ", sent)
    # mengubah semua token agar menjadi lowercase
    lower = nltk.word_tokenize(remove_nonletters.lower())
    # menghilangkan stopwords
    stop_words = set(stopwords.words('indonesian'))
    # stemming token
    stopwords_removed = [stemmer.stem(token) for token in lower if not token in stop_words]
    return " ".join(stopwords_removed)

In [59]:
preprocess("memainkan")

'main'

In [12]:
train_data['cleaned'] = train_data['text'].apply(preprocess)
test_data['cleaned'] = test_data['text'].apply(preprocess)
validation_data['cleaned'] = validation_data['text'].apply(preprocess)

# train_data['cleaned'].head()
# test_data['cleaned'].head()
# validation_data['cleaned'].head()

In [13]:
# feature extraction menggunakan Bag of Words dengan menghitung frekuensi kemunculan kata dalam teks dan mengubahnya menjadi vektor angka

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_data['cleaned']).toarray()
test_features = vectorizer.transform(test_data['cleaned']).toarray()
val_features = vectorizer.transform(validation_data['cleaned']).toarray()

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model_name, y_true, y_pred):
    print(f"Evaluating {model_name} Model")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred, average='weighted'))
    print("Recall:", recall_score(y_true, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_true, y_pred, average='weighted'))
    print()

In [15]:
from sklearn.linear_model import LogisticRegression

print("Training Softmax Regression...")
softmax_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500)
softmax_reg.fit(train_features, y_train)

Training Softmax Regression...


In [16]:
softmax_pred = softmax_reg.predict(test_features)
evaluate_model("Softmax Regression", y_test, softmax_pred)

Evaluating Softmax Regression Model
Accuracy: 0.736
Precision: 0.7345994473930293
Recall: 0.736
F1 Score: 0.728736079617032



In [17]:
from sklearn.neural_network import MLPClassifier

print("Training MLP (Neural Network)...")
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, alpha=0.0001)
mlp.fit(train_features, train_data['label'])

Training MLP (Neural Network)...


In [18]:
mlp_pred = mlp.predict(test_features)
evaluate_model("MLP", y_test, mlp_pred)

Evaluating MLP Model
Accuracy: 0.68
Precision: 0.6752411714241514
Recall: 0.68
F1 Score: 0.6768642758341864



In [19]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_val_encoded = label_encoder.transform(y_val)


In [20]:
from xgboost import XGBClassifier

print("Training XGBoost...")
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(train_features, y_train_encoded)

Training XGBoost...


In [21]:
xgb_pred = xgb.predict(test_features)

xgb_pred_labels = label_encoder.inverse_transform(xgb_pred)
evaluate_model("XGBoost", y_test, xgb_pred_labels)

Evaluating XGBoost Model
Accuracy: 0.688
Precision: 0.7285472880061116
Recall: 0.688
F1 Score: 0.6617940145502645



In [32]:
from sklearn.naive_bayes import MultinomialNB

print("Training Naive Bayes...")
nb = MultinomialNB()
nb.fit(train_features, train_data['label'])

Training Naive Bayes...


In [33]:
nb_pred = nb.predict(test_features)
evaluate_model("Naive Bayes", y_test, nb_pred)

Evaluating Naive Bayes Model
Accuracy: 0.648
Precision: 0.6694141921593254
Recall: 0.648
F1 Score: 0.6259488288288289



In [24]:
print(softmax_reg.classes_)

['negative' 'neutral' 'positive']


In [61]:
print("Prediksi probabilitas Softmax Regression:")
pred_softmax = softmax_reg.predict_proba(vectorizer.transform([preprocess('makanan di sini enak sekali. saya suka!')]))
print('negative score: ' + str(pred_softmax[0][0]) + "\nneutral score: " + str(pred_softmax[0][1]) + "\npositive score: " + str(pred_softmax[0][2]))

Prediksi probabilitas Softmax Regression:
negative score: 0.27046574309135657
neutral score: 0.003936441569325418
positive score: 0.725597815339318


In [26]:
print(mlp.classes_)

['negative' 'neutral' 'positive']


In [63]:
print("Prediksi probabilitas MLP:")
pred_mlp = mlp.predict_proba(vectorizer.transform([preprocess('makanan di sini enak sekali. saya suka!')]))
print('negative score: ' + str(pred_mlp[0][0]) + "\nneutral score: " + str(pred_mlp[0][1]) + "\npositive score: " + str(pred_mlp[0][2]))

Prediksi probabilitas MLP:
negative score: 0.07417802838188653
neutral score: 0.0018544066038416874
positive score: 0.9239675650142718


In [43]:
print(label_encoder.classes_)

['negative' 'neutral' 'positive']


In [54]:
print("Prediksi probabilitas XGBoost:")
pred_xgb = xgb.predict_proba(vectorizer.transform([preprocess('makanan di sini enak sekali. saya suka!')]))
print('negative score: ' + str(pred_xgb[0][0]) + "\nneutral score: " + str(pred_xgb[0][1]) + "\npositive score: " + str(pred_xgb[0][2]))

Prediksi probabilitas XGBoost:
negative score: 0.98493284
neutral score: 0.014920079
positive score: 0.00014713347


In [34]:
print(nb.classes_)

['negative' 'neutral' 'positive']


In [50]:
print("Prediksi probabilitas Naive Bayes:")
pred_nb = nb.predict_proba(vectorizer.transform([preprocess('makanan di sini enak sekali. saya suka!')]))
print('negative score: ' + str(pred_nb[0][0]) + "\nneutral score: " + str(pred_nb[0][1]) + "\npositive score: " + str(pred_nb[0][2]))

Prediksi probabilitas Naive Bayes:
negative score: 0.6043436503356656
neutral score: 0.3956194573557446
positive score: 3.6892308586609945e-05
