In [19]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [20]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon') 

analyzer = SentimentIntensityAnalyzer()

# Load data
df = pd.read_csv('whatsapp_reviews.csv')
df['content'] = df['content'].fillna('') 

def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'positif'
    elif scores['compound'] <= -0.05:
        return 'negatif'
    else:
        return 'netral'

df['score'] = df['content'].apply(get_sentiment)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [21]:

# --- Pra-pemrosesan Data ---
def preprocess_text(text):
    text = text.lower() # Case folding
    text = re.sub(r'[^a-z\s]', '', text) # Hapus angka dan tanda baca
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')] # Hapus stopwords
    stemmer = PorterStemmer() # Atau WordNetLemmatizer untuk lemmatization
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

# --- Pelabelan Data (Contoh menggunakan VADER untuk bahasa Inggris) ---
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

def get_sentiment_vader(text):
    scores = analyzer.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'positif'
    elif scores['compound'] <= -0.05:
        return 'negatif'
    else:
        return 'netral'

# Pra-pemrosesan
df['processed_content'] = df['content'].apply(preprocess_text)

# Pelabelan
df['sentiment'] = df['processed_content'].apply(get_sentiment_vader)

X = df['processed_content']
y = df['sentiment']

In [22]:
 #--- Skema Percobaan 1: SVM, TF-IDF, 80/20 ---
print("\n--- Skema 1: SVM, TF-IDF, 80/20 ---")
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000) 
X_train_tfidf_1 = tfidf_vectorizer.fit_transform(X_train_1)
X_test_tfidf_1 = tfidf_vectorizer.transform(X_test_1)

# Pelatihan SVM
svm_model_1 = SVC(kernel='linear')
svm_model_1.fit(X_train_tfidf_1, y_train_1)
y_pred_svm_1 = svm_model_1.predict(X_test_tfidf_1)

print("Akurasi Skema 1:", accuracy_score(y_test_1, y_pred_svm_1))
print("Classification Report Skema 1:\n", classification_report(y_test_1, y_pred_svm_1))



--- Skema 1: SVM, TF-IDF, 80/20 ---
Akurasi Skema 1: 0.94
Classification Report Skema 1:
               precision    recall  f1-score   support

     negatif       0.86      0.74      0.80       223
      netral       0.92      0.98      0.95       626
     positif       0.97      0.96      0.96      1151

    accuracy                           0.94      2000
   macro avg       0.92      0.89      0.90      2000
weighted avg       0.94      0.94      0.94      2000



In [23]:
# --- Skema Percobaan 2: Random Forest, Word2Vec, 80/20 (Optimized) ---
print("\n--- Skema 2: Random Forest, Word2Vec, 80/20 (Optimized) ---")
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

tokenized_sentences_train = [text.split() for text in X_train_2]
tokenized_sentences_test = [text.split() for text in X_test_2]

word2vec_model = Word2Vec(
    tokenized_sentences_train,
    vector_size=300,        
    window=10,             
    min_count=1,            
    workers=4,
    sg=1,                  
    epochs=20,             
    negative=10             
)

# Fungsi untuk membuat vektor dokumen dari vektor kata-kata
def document_vector(word2vec_model, doc):
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    if not doc:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)

X_train_w2v_2 = np.array([document_vector(word2vec_model, sentence) for sentence in tokenized_sentences_train])
X_test_w2v_2 = np.array([document_vector(word2vec_model, sentence) for sentence in tokenized_sentences_test])

# Filter keluar baris yang masih mengandung NaN (jika ada dokumen yang benar-benar kosong setelah pemrosesan)
nan_rows_train = np.isnan(X_train_w2v_2).any(axis=1)
if np.any(nan_rows_train):
    X_train_w2v_2 = X_train_w2v_2[~nan_rows_train]
    y_train_2 = y_train_2[~nan_rows_train]
    print(f"Menghapus {np.sum(nan_rows_train)} baris dengan NaN di training set Word2Vec.")

nan_rows_test = np.isnan(X_test_w2v_2).any(axis=1)
if np.any(nan_rows_test):
    X_test_w2v_2 = X_test_w2v_2[~nan_rows_test]
    y_test_2 = y_test_2[~nan_rows_test]
    print(f"Menghapus {np.sum(nan_rows_test)} baris dengan NaN di testing set Word2Vec.")

# Pelatihan Random Forest dengan Hyperparameter Tuning (menggunakan RandomizedSearchCV)
param_dist = {
    'n_estimators': [200, 300, 400, 500, 600],
    'max_depth': [15, 25, 35, None],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2', 0.7, 0.8, 0.9]
}

rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)

random_search_rf = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_dist,
    n_iter=10,  # Jumlah iterasi hyperparameter tuning
    cv=2,       # K-folds cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy'
)

random_search_rf.fit(X_train_w2v_2, y_train_2)

print("\nBest parameters for Random Forest (Skema 2):", random_search_rf.best_params_)
print("Best cross-validation accuracy (Skema 2):", random_search_rf.best_score_)

best_rf_model_2 = random_search_rf.best_estimator_
y_pred_rf_2_optimized = best_rf_model_2.predict(X_test_w2v_2)

print("Akurasi Skema 2 (Optimized - Final):", accuracy_score(y_test_2, y_pred_rf_2_optimized))
print("Classification Report Skema 2 (Optimized - Final):\n", classification_report(y_test_2, y_pred_rf_2_optimized))



--- Skema 2: Random Forest, Word2Vec, 80/20 (Optimized) ---
Fitting 2 folds for each of 10 candidates, totalling 20 fits

Best parameters for Random Forest (Skema 2): {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.8, 'max_depth': 35}
Best cross-validation accuracy (Skema 2): 0.8601083395848963
Akurasi Skema 2 (Optimized - Final): 0.858
Classification Report Skema 2 (Optimized - Final):
               precision    recall  f1-score   support

     negatif       0.66      0.55      0.60       223
      netral       0.86      0.85      0.85       626
     positif       0.89      0.92      0.90      1151

    accuracy                           0.86      2000
   macro avg       0.80      0.77      0.79      2000
weighted avg       0.85      0.86      0.86      2000



In [24]:
# --- Skema Percobaan 3: Random Forest, TF-IDF, 70/30 ---
print("\n--- Skema 3: Random Forest, TF-IDF, 70/30 ---")
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# TF-IDF
tfidf_vectorizer_3 = TfidfVectorizer(max_features=5000)
X_train_tfidf_3 = tfidf_vectorizer_3.fit_transform(X_train_3)
X_test_tfidf_3 = tfidf_vectorizer_3.transform(X_test_3)

# Pelatihan Random Forest
rf_model_3 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_3.fit(X_train_tfidf_3, y_train_3)
y_pred_rf_3 = rf_model_3.predict(X_test_tfidf_3)

print("Akurasi Skema 3:", accuracy_score(y_test_3, y_pred_rf_3))
print("Classification Report Skema 3:\n", classification_report(y_test_3, y_pred_rf_3))



--- Skema 3: Random Forest, TF-IDF, 70/30 ---
Akurasi Skema 3: 0.934
Classification Report Skema 3:
               precision    recall  f1-score   support

     negatif       0.91      0.72      0.80       334
      netral       0.88      0.99      0.93       939
     positif       0.97      0.94      0.96      1727

    accuracy                           0.93      3000
   macro avg       0.92      0.88      0.90      3000
weighted avg       0.94      0.93      0.93      3000

