In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Membaca Data dari File Excel
df = pd.read_excel('/content/sample_data/DATA UNTUK NAIVE BAYES.xlsx')

# 2. Preprocessing Data
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # menghapus URL
    text = re.sub(r'@\w+', '', text)     # menghapus mention
    text = re.sub(r'#\w+', '', text)     # menghapus hashtag
    text = re.sub(r'[^\w\s]', '', text)  # menghapus tanda baca
    text = text.lower()                  # konversi teks ke huruf kecil
    return text

# Pastikan semua nilai dalam kolom 'Isi Tweet' adalah string
df['Isi Tweet'] = df['Isi Tweet'].astype(str)
df['cleaned_tweet'] = df['Isi Tweet'].apply(clean_text)

# 3. Mengganti label sentimen manual (Anda dapat menggunakan dataset yang sudah diberi label)
# Misalnya, kita membuat kolom 'sentimen' untuk keperluan contoh ini
# 1 untuk positif, 0 untuk netral, -1 untuk negatif
# Silakan ganti dengan metode pelabelan otomatis atau manual sesuai kebutuhan
df['sentimen'] = np.random.choice([1, 0, -1], size=len(df))  # Contoh acak

# 4. Ekstraksi Fitur dengan TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_tweet']).toarray()
y = df['sentimen']

# 5. Melatih Model Naive Bayes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)

# 6. Menguji Model dan Mengevaluasi Hasil
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Menyimpan hasil prediksi ke dalam file
df_test = df.iloc[y_test.index]
df_test['predicted_sentimen'] = y_pred
df_test.to_excel('predicted_tweets.xlsx', index=False)


Accuracy: 0.32616940581542353
Classification Report:
               precision    recall  f1-score   support

          -1       0.30      0.28      0.29       259
           0       0.32      0.45      0.37       253
           1       0.36      0.26      0.30       279

    accuracy                           0.33       791
   macro avg       0.33      0.33      0.32       791
weighted avg       0.33      0.33      0.32       791

Confusion Matrix:
 [[ 72 121  66]
 [ 76 113  64]
 [ 89 117  73]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predicted_sentimen'] = y_pred
