In [None]:
import pandas as pd
import re
import nltk
# Pastikan library Sastrawi sudah terinstall: pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords

# Download stopwords Bahasa Indonesia (hanya diperlukan jika belum pernah diunduh)
nltk.download('stopwords')

# Inisialisasi stopwords dan stemmer
stop_words = set(stopwords.words('indonesian'))
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def remove_stopwords_and_stem(text):
    """
    Fungsi untuk menghapus stopwords dan melakukan stemming pada teks.
    """
    # Tokenisasi (pisahkan berdasarkan spasi)
    tokens = text.split()
    # Hapus stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lakukan stemming pada setiap token
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return ' '.join(stemmed_tokens)

# --- Load Dataset ---
file_path = "./dataset/recommendation/test_comments_with_predictions.csv"
dataset = pd.read_csv(file_path)

# Tampilkan beberapa baris awal dan informasi dataset
print(dataset.head())
dataset.info()

# --- Cek Kondisi Dataset ---
print("\n❓ Jumlah Missing Values:")
print(dataset.isnull().sum())

print("\n♻️ Jumlah Duplikat:", dataset.duplicated(subset=['comment']).sum())

# --- Pembersihan Dataset ---
# Hapus baris dengan komentar null
cleaneddataset = dataset.dropna(subset=['comment'])
# Hapus baris dengan komentar duplikat
cleaneddataset = cleaneddataset.drop_duplicates(subset=['comment'], keep='first')

# Case folding: ubah semua teks ke huruf kecil
cleaneddataset['comment'] = cleaneddataset['comment'].str.lower()
cleaneddataset['predicted_label'] = cleaneddataset['predicted_label'].str.lower()

# Remove special characters (sesuaikan dengan kebutuhan)
cleaneddataset['comment'] = cleaneddataset['comment'].str.replace(r'[^\w\s]', '', regex=True)

# Remove extra whitespace
cleaneddataset['comment'] = cleaneddataset['comment'].str.strip()

# --- Tambahan: Stopwords Removal & Stemming ---
cleaneddataset['comment'] = cleaneddataset['comment'].apply(remove_stopwords_and_stem)

# Tampilkan hasil cleaning
print("✅ Shape setelah cleaning:", cleaneddataset.shape)
print("✅ Jumlah Missing Values baru:", cleaneddataset.isnull().sum())
print("\n📝 Contoh Data Bersih:")
print(cleaneddataset[['comment']].head(3))

# Simpan data hasil cleaning (opsional)
# cleaneddataset.to_csv("./dataset/cleaned_test_comments.csv", index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rfahr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   no                                            comment predicted_label
0   1  Aku cuma pake sunscreen dan krim ini doang, al...         positif
1   2  Simpen dulu di keranjang kuning, nanti check out.         negatif
2   3  Muka kusamku jadi segeran setelah rutin pake k...         positif
3   4  Aku cuma pake sunscreen dan krim ini doang, al...         positif
4   5  Masih menjadi misteri kenapa Dr. Fay bisa seba...         negatif
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   no               1000 non-null   int64 
 1   comment          990 non-null    object
 2   predicted_label  1000 non-null   object
dtypes: int64(1), object(2)
memory usage: 23.6+ KB

❓ Jumlah Missing Values:
no                  0
comment            10
predicted_label     0
dtype: int64

♻️ Jumlah Duplikat: 946
✅ Shape setelah cleaning: (53, 3)
✅ Jumlah Missing 

In [2]:
print("✅ Shape setelah cleaning:", cleaneddataset.shape)
print("✅ Jumlah Missing Values baru:", cleaneddataset.isnull().sum())
print("✅ Jumlah data duplikat setelah pembersihan:", cleaneddataset.duplicated(subset=['comment']).sum())
print("\n📝 Contoh Data Bersih:")
print(cleaneddataset[['comment']].head(3))

✅ Shape setelah cleaning: (53, 3)
✅ Jumlah Missing Values baru: no                 0
comment            0
predicted_label    0
dtype: int64
✅ Jumlah data duplikat setelah pembersihan: 0

📝 Contoh Data Bersih:
                                             comment
0  pake sunscreen krim doang alhamdulillah tahun ...
1                  simpen keranjang kuning check out
2                 muka kusam segeran rutin pake krim


In [3]:
cleaneddataset.head()

Unnamed: 0,no,comment,predicted_label
0,1,pake sunscreen krim doang alhamdulillah tahun ...,positif
1,2,simpen keranjang kuning check out,negatif
2,3,muka kusam segeran rutin pake krim,positif
4,5,misteri dr fay bagus,negatif
5,6,banyakin produk kayak gin anak kuliah minim bu...,positif


In [4]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

# **Mapping Label (Negatif, Netral, Positif) ke angka**
label_encoder = LabelEncoder()
cleaneddataset['label'] = label_encoder.fit_transform(cleaneddataset['predicted_label'])

# Lihat mapping label
print("Mapping label:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

Mapping label: {'negatif': 0, 'netral': 1, 'positif': 2}


In [5]:
# Lihat mapping label
print("Mapping label:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# **Tokenisasi menggunakan BERT**
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Tokenisasi komentar menggunakan tokenizer BERT
tokens = tokenizer(
    cleaneddataset['comment'].tolist(),  # Data teks
    padding=True,                # Padding agar panjang sama
    truncation=True,             # Potong teks jika terlalu panjang
    max_length=128,              # Maksimal panjang token
    return_tensors="pt"          # Format PyTorch
)

# Simpan label dalam tensor
labels = torch.tensor(cleaneddataset['label'].values)

# Cetak contoh tokenisasi
print("Contoh tokenized input:", tokens["input_ids"][0])

Mapping label: {'negatif': 0, 'netral': 1, 'positif': 2}
Contoh tokenized input: tensor([    2,  3125,  2840, 21296,  5101, 12054,  5742,   262,  2137,  7239,
            3,     0,     0,     0,     0,     0,     0,     0,     0,     0])


In [11]:
from collections import Counter

# Filter komentar negatif
neg_comments = cleaneddataset[cleaneddataset['predicted_label'] == "positif"]['comment']

# Tokenisasi manual (pisahkan kata)
all_words = " ".join(neg_comments.astype(str)).split()
word_counts = Counter(all_words)

# Tampilkan 10 kata negatif paling sering muncul
print("🔴 Kata yang sering muncul di komentar negatif:")
print(word_counts.most_common(10))


🔴 Kata yang sering muncul di komentar negatif:
[('udah', 19), ('banget', 19), ('pake', 13), ('hasil', 11), ('gak', 9), ('krim', 6), ('cocok', 6), ('bagus', 6), ('2', 6), ('produk', 5)]


In [28]:
synonym_dict = {
    **{word: "tidak" for word in {"gk", "gak", "tdk", "ga", "nggak"}},
    **{word: "cerah" for word in {"berkilau", "glowing"}},
    **{word: "dari" for word in {"dr"}}
}


def replace_synonyms(text):
    words = text.split()
    words = [synonym_dict.get(word, word) for word in words]  # Ganti kata jika ada di dictionary
    return " ".join(words)

cleaneddataset['comment'] = cleaneddataset['comment'].apply(replace_synonyms)

In [30]:
cleaneddataset.head(50)

Unnamed: 0,no,comment,predicted_label,label
0,1,pake sunscreen krim doang alhamdulillah tahun ...,positif,2
1,2,simpen keranjang kuning check out,negatif,0
2,3,muka kusam segeran rutin pake krim,positif,2
4,5,misteri dari fay bagus,negatif,0
5,6,banyakin produk kayak gin anak kuliah minim bu...,positif,2
6,7,udah check out kak kemarin udah abis cocok banget,positif,2
7,8,orang tidak percaya kalo beli online kali beli...,negatif,0
8,9,pake alhamdulillah hasil udah liat,positif,2
9,10,hasil nyata bagus banget,positif,2
11,12,cocok banget makasih,positif,2


In [25]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Kata sebelum stemming
text = "gak suka banget hasilnya tidak sesuai ekspektasi"

# Lakukan stemming
stemmed_text = stemmer.stem(text)
print(stemmed_text)  # Output: "gak suka banget hasil sesuai ekspektasi"


gak suka banget hasil tidak sesuai ekspektasi
