In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle
import numpy as np




In [3]:

data = pd.read_csv('train_preprocess.tsv.txt', delimiter='\t', header=None, names=['review', 'sentiment'])


In [34]:
data

Unnamed: 0,sentiment,cleaned_review
0,positive,warung ini dimiliki oleh pengusaha pabrik tahu...
1,neutral,mohon ulama lurus dan k mmbri hujjah partai ap...
2,positive,lokasi strategis di jalan sumatera bandung tem...
3,positive,betapa bahagia nya diri ini saat unboxing pake...
4,negative,duh jadi mahasiswa jangan sombong dong kasih k...
...,...,...
10995,positive,tidak kecewa
10996,positive,enak rasa masakan nya apalagi kepiting yang me...
10997,neutral,hormati partaipartai yang telah berkoalisi
10998,negative,pagi pagi di tol pasteur sudah macet parah bik...


In [4]:

def clean_and_replace_alay(text):
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data['cleaned_review'] = data['review'].apply(clean_and_replace_alay)

In [5]:

data_preprocessed = data['cleaned_review'].tolist()

print(data_preprocessed[:5])

['warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung tahu berkualitas dipadu keahlian memasak dipadu kretivitas jadilah warung yang menyajikan menu utama berbahan tahu ditambah menu umum lain seperti ayam semuanya selera indonesia harga cukup terjangkau jangan lewatkan tahu bletoka nya tidak kalah dengan yang asli dari tegal', 'mohon ulama lurus dan k mmbri hujjah partai apa yang harus diwlh agar suara islam tidak pecahpecah', 'lokasi strategis di jalan sumatera bandung tempat nya nyaman terutama sofa di lantai paella nya enak sangat pas dimakan dengan minum bir dingin appetiser nya juga enakenak', 'betapa bahagia nya diri ini saat unboxing paket dan barang nya bagus menetapkan beli lagi', 'duh jadi mahasiswa jangan sombong dong kasih kartu kuning segala belajar dulu yang baik tidak usahlah ikutikut politik nanti sudah selesai kuliah nya mau ikut politik juga tidak telat dasar mahasiswa']


In [8]:
# 5. Split Data
X = data['cleaned_review']
y = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0 if x == 'negative' else 2)

# feature extraction
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Save the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [15]:
# 7. Split dataset le training and testing
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# 8. Training model MLPclassifier
model = MLPClassifier()
model.fit(X_train, y_train)

print('Training selesai')

Training selesai


In [16]:
# evaluasi model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

# Save the trained model
with open('mlp_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the processed data
data.to_csv('cleaned_data.csv', index=False)

print(f'Data has been cleaned and exported to /mnt/data/cleaned_data.csv')

              precision    recall  f1-score   support

           0       0.79      0.79      0.79       680
           1       0.87      0.90      0.89      1281
           2       0.79      0.64      0.71       239

    accuracy                           0.84      2200
   macro avg       0.82      0.78      0.79      2200
weighted avg       0.84      0.84      0.84      2200

Accuracy: 0.8386363636363636
Data has been cleaned and exported to /mnt/data/cleaned_data.csv


In [18]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold


X = X_vectorized  
y = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0 if x == 'negative' else 2)

kf = KFold(n_splits=5, random_state=42, shuffle=True)

accuracies = []

for iteration, (train_index, test_index) in enumerate(kf.split(X), start=1):
    data_train, data_test = X[train_index], X[test_index]
    target_train, target_test = y.iloc[train_index], y.iloc[test_index]
    
    clf = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=10, alpha=1e-4, solver='adam', random_state=1)
    clf.fit(data_train, target_train)
    
    preds = clf.predict(data_test)
    
    accuracy = accuracy_score(target_test, preds)
    
    print(f"Training ke-{iteration}")
    print(classification_report(target_test, preds))
    print("======================================")
    
    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)

print("\n\nRata-rata Accuracy: ", average_accuracy)




Training ke-1
              precision    recall  f1-score   support

           0       0.80      0.81      0.81       680
           1       0.89      0.91      0.90      1281
           2       0.80      0.70      0.75       239

    accuracy                           0.85      2200
   macro avg       0.83      0.81      0.82      2200
weighted avg       0.85      0.85      0.85      2200





Training ke-2
              precision    recall  f1-score   support

           0       0.81      0.82      0.81       706
           1       0.90      0.90      0.90      1274
           2       0.74      0.74      0.74       220

    accuracy                           0.86      2200
   macro avg       0.82      0.82      0.82      2200
weighted avg       0.86      0.86      0.86      2200





Training ke-3
              precision    recall  f1-score   support

           0       0.82      0.78      0.80       682
           1       0.88      0.92      0.90      1303
           2       0.83      0.76      0.80       215

    accuracy                           0.86      2200
   macro avg       0.84      0.82      0.83      2200
weighted avg       0.86      0.86      0.86      2200





Training ke-4
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       698
           1       0.89      0.90      0.89      1273
           2       0.81      0.70      0.75       229

    accuracy                           0.85      2200
   macro avg       0.83      0.80      0.81      2200
weighted avg       0.85      0.85      0.85      2200

Training ke-5
              precision    recall  f1-score   support

           0       0.78      0.84      0.81       670
           1       0.92      0.90      0.91      1285
           2       0.82      0.74      0.78       245

    accuracy                           0.86      2200
   macro avg       0.84      0.83      0.83      2200
weighted avg       0.86      0.86      0.86      2200



Rata-rata Accuracy:  0.8564545454545455




In [29]:
# Load the vectorizer and the model
with open('vectorizer.pkl', 'rb') as f:
    count_vect = pickle.load(f)

with open('mlp_model.pkl', 'rb') as f:
    model = pickle.load(f)
original_text = """
pakai kartu kredit bca tidak untung malah rugi besar
"""


cleaned_text = clean_and_replace_alay(original_text)


text_vectorized = count_vect.transform([cleaned_text])

result = model.predict(text_vectorized)[0]


print("Sentimen")

if result == 0 :
    print("Negative")
elif result == 1 :
    print("Positive")
elif result == 2 :
    print("Neutral")


Sentimen
Negative


In [28]:
data

Unnamed: 0,review,sentiment,cleaned_review
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung ini dimiliki oleh pengusaha pabrik tahu...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus dan k mmbri hujjah partai ap...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis di jalan sumatera bandung tem...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia nya diri ini saat unboxing pake...
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,duh jadi mahasiswa jangan sombong dong kasih k...
...,...,...,...
10995,tidak kecewa,positive,tidak kecewa
10996,enak rasa masakan nya apalagi kepiting yang me...,positive,enak rasa masakan nya apalagi kepiting yang me...
10997,hormati partai-partai yang telah berkoalisi,neutral,hormati partaipartai yang telah berkoalisi
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative,pagi pagi di tol pasteur sudah macet parah bik...


In [33]:
data

Unnamed: 0,sentiment,cleaned_review
0,positive,warung ini dimiliki oleh pengusaha pabrik tahu...
1,neutral,mohon ulama lurus dan k mmbri hujjah partai ap...
2,positive,lokasi strategis di jalan sumatera bandung tem...
3,positive,betapa bahagia nya diri ini saat unboxing pake...
4,negative,duh jadi mahasiswa jangan sombong dong kasih k...
...,...,...
10995,positive,tidak kecewa
10996,positive,enak rasa masakan nya apalagi kepiting yang me...
10997,neutral,hormati partaipartai yang telah berkoalisi
10998,negative,pagi pagi di tol pasteur sudah macet parah bik...
