In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from collections import Counter
from IPython.display import display, HTML

# Load the dataset
file_path = 'comments_data.csv'
data_new = pd.read_csv(file_path)

# Enhanced text preprocessing function
def clean_text_enhanced(text):
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (#hashtag)
    text = re.sub(r'#\w+', '', text)
    # Remove leading and trailing spaces
    text = text.strip()
    # Hilangkan kata-kata spesifik seperti 'di', 'ke', 'dari', 'yang', 'bang', dan sejenisnya
    text = re.sub(r'\b(di|ke|dari|yang|bang|dan|atau|untuk|dengan|pada|oleh|ini|itu|saja|juga|karena|tetapi|agar|sehingga|adalah|seperti|namun|meskipun|walaupun|bahkan|hanya|masih|sudah|belum|akan|telah|dalam|luar|antara|tanpa|setelah|sebelum|hingga|sampai|sebab|akibat|maka|jadi|kalau|jika|bila|apabila|supaya|meski|walau|serta|atau|dan|pun|lagi|malah|justru|apalagi|bahwa|oleh|untuk|dengan|pada|di|ke|dari|yang|bang|yg|nya|ada|sama|buat|aja|ya)\b', '', text)
    return text

# Clean the comment column
data_new['Cleaned_Comment'] = data_new['Comment'].apply(clean_text_enhanced)

# Enhanced sentiment labeling with more precise keywords
def sentiment_label_enhanced(comment):
    # Positive sentiment based on specific keywords
    positive_keywords = ['baik', 'bagus', 'suka', 'terbaik', 'mantap', 'luar biasa', 'senang', 'puas', 'hebat', 'keren', 'indah', 'menyenangkan', 'asik', 'top', 'cool', 'cakep', 'ciamik', 'murah', 'terjangkau', 'ekonomis', 'worth', 'worth it', 'berharga', 'amazing', 'fantastic', 'superb', 'excellent', 'perfect', 'happy', 'love', 'great', 'awesome', 'laris', 'murah', 'terlaris', 'diskon', 'promo', 'hemat', 'spesial', 'affordable', 'bagus banget', 'sangat puas', 'sangat suka', 'sangat baik', 'sangat keren', 'sangat mantap', 'sangat indah', 'sangat menyenangkan', 'sangat luar biasa', 'sangat worth it', 'sangat murah', 'sangat terjangkau', 'sangat ekonomis', 'sangat hebat', 'sangat amazing', 'sangat fantastic', 'sangat perfect', 'sangat happy', 'sangat love', 'sangat great', 'sangat awesome', 'sangat top', 'sangat cool', 'sangat ciamik', 'sangat spesial', 'sangat affordable', 'sangat worth', 'sangat worth it', 'sangat berharga']
    negative_keywords = ['buruk', 'jelek', 'tidak suka', 'gagal', 'tidak puas', 'kekecewaan', 'menyedihkan', 'parah', 'benci', 'sampah', 'payah', 'kecewa', 'zonk', 'hancur', 'nyesek', 'lebay', 'mahal', 'overpriced', 'tidak terjangkau']
    # Check for positive keywords
    if any(word in comment for word in positive_keywords):
        return 2  # Positive
    
    # Check for negative keywords
    elif any(word in comment for word in negative_keywords):
        return 0  # Negative
    
    # Neutral sentiment for all others
    else:
        return 1  # Neutral

# Apply sentiment labels to the 'Cleaned_Comment' column
data_new['Sentiment'] = data_new['Cleaned_Comment'].apply(sentiment_label_enhanced)

# Display the first few rows of the updated data with cleaned comments and sentiment labels
data_new[['Cleaned_Comment', 'Sentiment']].head()

# Display the total count of each sentiment label
sentiment_counts = data_new['Sentiment'].value_counts()
print("Sentiment Counts:")
print(sentiment_counts)

# Display the DataFrame as a scrollable table
display(HTML(data_new.to_html(index=False, max_rows=10, max_cols=7, notebook=True)))



Sentiment Counts:
Sentiment
1    11821
2     2920
0      259
Name: count, dtype: int64


Timestamp,Username,VideoID,Comment,Date,Cleaned_Comment,Sentiment
2025-04-07T16:03:13Z,@mochamedadama2567,qlKDYBZysoc,"2025, Infinix note 50",2025-04-07T16:03:13Z,infinix note,1
2025-03-29T18:07:25Z,@EkomargoriskiEkomargoriski,qlKDYBZysoc,Setia,2025-03-29T18:07:25Z,setia,1
2025-01-29T13:36:05Z,@afdhalaljibran1845,qlKDYBZysoc,pengguna 3 tahun yang lalu hadir nih wkwk awet ...,2025-01-29T13:36:05Z,pengguna tahun lalu hadir nih wkwk awet cas k...,1
2024-11-08T09:38:37Z,@belutwakwaw1017,qlKDYBZysoc,Nonton 2024 dulu 3 jutaan skrang dbwah 2 jutaan...,2024-11-08T09:38:37Z,nonton dulu jutaan skrang dbwah jutaan spek ...,2
2024-10-05T09:25:10Z,@donypras6150,qlKDYBZysoc,Mendang mending TECNO,2024-10-05T09:25:10Z,mendang mending tecno,1
...,...,...,...,...,...,...
2022-04-26T22:47:59Z,@muhamadsaiful9570,zp4QcvyvQi0,Bang coba review invinix not 11 nfc,2022-04-26T22:47:59Z,coba review invinix not nfc,1
2022-04-26T06:31:35Z,@yusniarzen,zp4QcvyvQi0,Infinix yg ada 2 speker selain ini apa ya,2022-04-26T06:31:35Z,infinix speker selain apa,1
2022-04-26T03:10:40Z,@inxgee,zp4QcvyvQi0,"bang coba aktifin DTS audionya, soalnya speaker...",2022-04-26T03:10:40Z,coba aktifin dts audionya soalnya speakernya h...,1
2022-05-14T02:04:47Z,@nitanovilia647,zp4QcvyvQi0,Caranya gimana ya,2022-05-14T02:04:47Z,caranya gimana,1


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X = data_new['Cleaned_Comment']
y = data_new['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization for feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()


# SVM with TF-IDF

In [3]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# SVM with TF-IDF Vectorizer
svm_model = make_pipeline(TfidfVectorizer(max_features=10000), SVC(kernel='linear'))

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
train_accuracy_svm = svm_model.score(X_train, y_train)
test_accuracy_svm = accuracy_score(y_test, y_pred_svm)

train_accuracy_svm, test_accuracy_svm


(0.9825833333333334, 0.9773333333333334)

# Count Vectorizer dan Naive Bayes

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# Tokenisasi dan pembersihan teks (sudah dilakukan sebelumnya)
# Split the data into training and testing sets
X = data_new['Cleaned_Comment']
y = data_new['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive Bayes with CountVectorizer
nb_model = make_pipeline(CountVectorizer(max_features=10000), MultinomialNB())

# Train the model
nb_model.fit(X_train, y_train)

# Make predictions
y_pred_nb = nb_model.predict(X_test)

# Evaluate the model
train_accuracy_nb = nb_model.score(X_train, y_train)
test_accuracy_nb = accuracy_score(y_test, y_pred_nb)

# Print the accuracies
print(f"Training Accuracy: {train_accuracy_nb * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy_nb * 100:.2f}%")


Training Accuracy: 96.98%
Testing Accuracy: 94.30%


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# Tokenisasi dan pembersihan teks (sudah dilakukan sebelumnya)
# Split the data into training and testing sets
X = data_new['Cleaned_Comment']
y = data_new['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag of Words with Logistic Regression
logreg_model = make_pipeline(CountVectorizer(max_features=10000), LogisticRegression(max_iter=1000))

# Train the model
logreg_model.fit(X_train, y_train)

# Make predictions
y_pred_logreg = logreg_model.predict(X_test)

# Evaluate the model
train_accuracy_logreg = logreg_model.score(X_train, y_train)
test_accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

# Print the accuracies
print(f"Training Accuracy: {train_accuracy_logreg * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy_logreg * 100:.2f}%")


Training Accuracy: 99.17%
Testing Accuracy: 97.83%
