In [None]:
# Startup banner, printed before the (slow) imports so the user can see that
# the script has begun executing.
print("PROGRAM DIMULAI")

import pandas as pd
import re, string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation

# Fetch the NLTK stop-word corpus that the preprocessing step reads later on.
nltk.download("stopwords")

# ==================== LOAD DATA ====================

print("MEMUAT DATA...")

grab = pd.read_csv(r"C:\Users\Dell\Downloads\grab_2500.csv.csv")
gojek = pd.read_csv(r"C:\Users\Dell\Downloads\gojek_2500.csv.csv")

# Keep only the review column of each source, expose it under the shared name
# "text", and renumber the rows from 0 so positional and label access agree.
grab = grab[["Ulasan"]].rename(columns={"Ulasan": "text"}).reset_index(drop=True)
gojek = gojek[["content"]].rename(columns={"content": "text"}).reset_index(drop=True)

# Cast every review to str and keep at most its first 500 characters.
grab["text"] = grab["text"].astype(str).str[:500]
gojek["text"] = gojek["text"].astype(str).str[:500]

# ==================== PREPROCESS ====================

print("PREPROCESSING...")

# Indonesian stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words("indonesian"))

# Sastrawi stemmer instance shared by every call to clean_text.
stemmer = StemmerFactory().create_stemmer()

# Informal spellings mapped to the word they should be treated as.
# NOTE: clean_text collapses runs of three or more identical characters before
# it consults this table, so the "asikkk" key can never match as written.
normalisasi = {
    **dict.fromkeys(("gk", "ga", "nggak"), "tidak"),
    **dict.fromkeys(("bgt", "bgtt"), "banget"),
    "mantul": "mantap",
    "asikkk": "asik",
}

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call to clean_text.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one raw review into a space-separated string of stemmed words.

    Steps, in order: lowercase; collapse any character repeated three or more
    times in a row into a single occurrence; replace digit runs and non-ASCII
    runs with a space; delete ASCII punctuation; then, for each
    whitespace-separated word, apply the ``normalisasi`` mapping, discard stop
    words and words of three characters or fewer, and stem what is left.

    Args:
        text: The raw review text (expected to be a ``str``).

    Returns:
        The cleaned text. Cleaning is best effort: if any ordinary error occurs
        (for example ``text`` is not a string) an empty string is returned.

    Raises:
        KeyboardInterrupt, SystemExit: These are deliberately NOT swallowed,
            so a long preprocessing run can still be interrupted.
    """
    try:
        text = text.lower()
        text = re.sub(r'(.)\1{2,}', r'\1', text)
        text = re.sub(r'\d+', ' ', text)
        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
        text = text.translate(_PUNCT_TABLE)

        normalised = (normalisasi.get(kata, kata) for kata in text.split())
        hasil = [
            stemmer.stem(kata)
            for kata in normalised
            if kata not in stop_words and len(kata) > 3
        ]
        return " ".join(hasil)
    except Exception:
        # Keep the original best-effort contract for ordinary failures, but do
        # not use a bare ``except:``, which would also trap Ctrl-C / kernel
        # interrupts and make the caller's loop impossible to stop.
        return ""

# Clean every Grab review, reporting progress every 200 rows.
print("PREPROCESSING GRAB...")
grab_clean = []
for i, ulasan in enumerate(grab["text"]):
    grab_clean.append(clean_text(ulasan))
    if i % 200 == 0:
        print("Grab:", i)
grab["clean"] = grab_clean

# Same procedure for the Gojek reviews.
print("PREPROCESSING GOJEK...")
gojek_clean = []
for i, ulasan in enumerate(gojek["text"]):
    gojek_clean.append(clean_text(ulasan))
    if i % 200 == 0:
        print("Gojek:", i)
gojek["clean"] = gojek_clean

# ==================== SENTIMENT ====================

# Opinion-word lexicons used by label_sentimen. Each entry is compared against
# whole whitespace-separated tokens of the text being labelled.
positif = ["bagus","mantap","cepat","ramah","puas","baik"]
negatif = ["jelek","lama","buruk","kecewa","mahal","batal"]


def label_sentimen(text):
    """Label a text as "Positif", "Negatif" or "Netral" using the lexicons.

    The score is the number of distinct ``positif`` words present minus the
    number of distinct ``negatif`` words present. Words are matched against
    whole tokens, not substrings, so that e.g. "alamat" or "selamat" does not
    count as an occurrence of "lama".

    Args:
        text: Space-separated text to score.

    Returns:
        "Positif" if the score is above zero, "Negatif" if below zero,
        otherwise "Netral" (including for empty text).
    """
    tokens = set(text.split())
    skor = sum(kata in tokens for kata in positif)
    skor -= sum(kata in tokens for kata in negatif)

    if skor > 0:
        return "Positif"
    if skor < 0:
        return "Negatif"
    return "Netral"

# Tag every cleaned review in both datasets with its lexicon-based label.
for ulasan_df in (grab, gojek):
    ulasan_df["sentimen"] = ulasan_df["clean"].apply(label_sentimen)

# ==================== WORDCLOUD ====================

def show_wordcloud(data, title):
    """Display a word cloud built from every text in *data*.

    Args:
        data: Iterable of strings; they are joined with single spaces into one
            document before the cloud is generated.
        title: Title shown above the figure.
    """
    combined_text = " ".join(data)
    cloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
    ).generate(combined_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(title)
    plt.show()

# One word cloud per dataset, built from the cleaned reviews.
for cleaned_reviews, figure_title in (
    (grab["clean"], "WordCloud Grab"),
    (gojek["clean"], "WordCloud Gojek"),
):
    show_wordcloud(cleaned_reviews, figure_title)

# ==================== N-GRAM ====================

def plot_ngram(data, title, n=4, top_k=10):
    """Plot a bar chart of the most frequent word n-grams in *data*.

    N-grams are counted separately within each text, so no n-gram ever
    combines the last words of one review with the first words of the next.

    Args:
        data: Iterable of space-separated strings (one per review).
        title: Title shown above the chart.
        n: Number of words per n-gram. Defaults to 4.
        top_k: How many of the most frequent n-grams to plot. Defaults to 10.
    """
    counts = Counter()
    for dokumen in data:
        # Texts with fewer than ``n`` tokens simply contribute no n-grams.
        counts.update(ngrams(dokumen.split(), n))
    top_ngrams = counts.most_common(top_k)

    labels = [" ".join(gram) for gram, _ in top_ngrams]
    values = [jumlah for _, jumlah in top_ngrams]

    plt.figure(figsize=(10,5))
    plt.bar(labels, values)
    plt.xticks(rotation=75)
    plt.title(title)
    plt.show()

# One top-10 4-gram chart per dataset, built from the cleaned reviews.
for cleaned_reviews, figure_title in (
    (grab["clean"], "Top 10 4-Gram Grab"),
    (gojek["clean"], "Top 10 4-Gram Gojek"),
):
    plot_ngram(cleaned_reviews, figure_title)

# ==================== CLUSTERING ====================

# Each corpus gets its own vectorizer. Re-fitting a single shared vectorizer on
# the Gojek reviews would replace its vocabulary, so the column indices of
# X_grab would no longer correspond to the vectorizer's feature names.
vectorizer_grab = TfidfVectorizer(max_features=3000)
vectorizer_gojek = TfidfVectorizer(max_features=3000)
X_grab = vectorizer_grab.fit_transform(grab["clean"])
X_gojek = vectorizer_gojek.fit_transform(gojek["clean"])

# The same estimator is fitted twice; each fit_predict call starts from scratch,
# so the labels of one dataset do not depend on the other. After these two
# lines ``kmeans`` holds the model fitted on the Gojek matrix.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)

grab["cluster"] = kmeans.fit_predict(X_grab)
gojek["cluster"] = kmeans.fit_predict(X_gojek)

print("\nDistribusi Cluster Grab:")
print(grab["cluster"].value_counts())

print("\nDistribusi Cluster Gojek:")
print(gojek["cluster"].value_counts())

# ==================== TOPIC MODELING ====================

# NOTE(review): LDA is usually fitted on raw term counts rather than TF-IDF
# weights - consider a CountVectorizer here; confirm before changing.
lda = LatentDirichletAllocation(n_components=3, random_state=42)

lda.fit(X_grab)
# The topic model was fitted on the Grab matrix, so its columns must be named
# with the Grab vocabulary.
feature_names = vectorizer_grab.get_feature_names_out()

def show_topics(model, feature_names, n_words=8):
    """Print the highest-weighted terms of every topic in *model*.

    Args:
        model: Object exposing ``components_``, iterated one row per topic;
            each row must support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_words: Number of terms printed per topic.

    Terms are printed in ascending weight order, so the strongest term of each
    topic appears last on its line.
    """
    for number, weights in enumerate(model.components_, start=1):
        print(f"\nTopik {number}:")
        top_indices = weights.argsort()[-n_words:]
        terms = [feature_names[idx] for idx in top_indices]
        print(", ".join(terms))

# Report the terms of each topic held by the fitted LDA model.
print("\nTopik Grab:")
show_topics(model=lda, feature_names=feature_names)

# ==================== VISUAL SENTIMEN ====================

def plot_sentiment(df, title):
    """Draw a bar chart of how many rows carry each sentiment label.

    Args:
        df: DataFrame with a "sentimen" column.
        title: Title shown above the chart.
    """
    per_label = df["sentimen"].value_counts()
    labels, heights = per_label.index, per_label.values

    plt.figure(figsize=(6, 4))
    plt.bar(labels, heights)
    plt.ylabel("Jumlah Data")
    plt.title(title)
    plt.show()

# One sentiment distribution chart per dataset.
for ulasan_df, figure_title in (
    (grab, "Distribusi Sentimen Grab"),
    (gojek, "Distribusi Sentimen Gojek"),
):
    plot_sentiment(ulasan_df, figure_title)

# ==================== SAVE FILE ====================

# Write both annotated datasets (text, clean, sentimen and cluster columns) to
# CSV without the row index.
grab.to_csv(r"C:\Users\Dell\Downloads\grab_final.csv", index=False)
gojek.to_csv(r"C:\Users\Dell\Downloads\gojek_final.csv", index=False)

# The separator was a mis-encoded em dash ("â€”", UTF-8 bytes read as
# cp1252); restored to the intended character.
print("\nSELESAI TOTAL — FILE TERSIMPAN")


PROGRAM DIMULAI


ModuleNotFoundError: No module named 'Sastrawi'