In [None]:
# ==========================================
# CELL 1: INSTALL & IMPORT
# ==========================================
!pip install pandas textblob plotly wordcloud matplotlib requests nltk

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import json
import requests
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK data used by this notebook. The 'stopwords' corpus provides
# the list that is later read through stopwords.words(...).
nltk.download('stopwords')
# 'punkt' is NLTK tokenizer data. No visible cell calls an NLTK tokenizer directly;
# presumably it is fetched for TextBlob — TODO confirm it is still required.
nltk.download('punkt')

In [None]:
# CELL 2: LOAD DATA, PREPROCESSING & ANALYSIS
import os

# Discard any DataFrame left over from an earlier run of this cell; the plotting
# cells only check whether `df` exists, so a stale one must not survive a reload.
if 'df' in globals():
    del df

# === DATA SOURCE CONFIGURATION ===
# Point this at either a local file path or a URL.
# Local example: source_path = "data/yt_comment_ferry.json"
source_path = "https://github.com/rhnrafif/datamining_1/blob/main/data/yt_comment_ferry.json"

def get_raw_url(github_url):
    """Convert a GitHub "blob" page URL into its raw-content URL.

    A URL such as ``https://github.com/<user>/<repo>/blob/<ref>/<path>`` is
    rewritten to ``https://raw.githubusercontent.com/<user>/<repo>/<ref>/<path>``.
    Any URL that does not contain both ``github.com`` and ``/blob/`` is returned
    unchanged.

    Args:
        github_url: URL string to convert.

    Returns:
        The raw-content URL, or ``github_url`` itself when no rewrite applies.
    """
    if 'github.com' in github_url and '/blob/' in github_url:
        # Normalise the optional "www." prefix first; otherwise the host would
        # become the non-existent "www.raw.githubusercontent.com".
        raw_url = github_url.replace('://www.github.com', '://github.com', 1)
        # Replace only the FIRST occurrence of each marker: the host appears once,
        # and a later "/blob/" is part of the file path (a directory named "blob")
        # that must be preserved.
        raw_url = raw_url.replace('github.com', 'raw.githubusercontent.com', 1)
        return raw_url.replace('/blob/', '/', 1)
    return github_url

# --- 1. STOPWORD SETUP (WORDS TO REMOVE) ---
# NLTK's Indonesian stopword list is extended with informal / slang tokens.
custom_slang = {
    # abbreviations and informal spellings
    'yg', 'gak', 'ga', 'kalo', 'kl', 'bgt', 'dr', 'dlm', 'tdk', 'jd',
    'jgn', 'sdh', 'aja', 'n', 't', 'ny', 'sy', 'gw', 'nya',
    # pronouns and common function words
    'aku', 'saya', 'kamu', 'dia', 'ini', 'itu', 'dan', 'di', 'ke',
    'dari', 'yang', 'pada', 'untuk', 'ada', 'apa',
    # forms of address and channel-specific filler
    'bang', 'kak', 'min', 'gan', 'sis', 'bro', 'video',
    # discourse particles
    'dong', 'sih', 'kok', 'deh', 'mah', 'kan', 'tuh',
}
stop_words = set(stopwords.words('indonesian')) | custom_slang

def clean_text(text, stopword_set=None):
    """Normalise a raw comment into a space-separated string of content words.

    Steps: lowercase, replace every character that is neither a word character
    nor whitespace (punctuation, emoji, symbols) with a space, delete digit
    runs, then drop stopwords.

    Args:
        text: The raw comment. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty comment.
        stopword_set: Optional collection of words to remove. Defaults to the
            module-level ``stop_words`` set.

    Returns:
        The cleaned text, with tokens joined by single spaces ("" if nothing
        is left).
    """
    # A missing comment must not turn into the literal tokens "none" / "nan".
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    active_stopwords = stop_words if stopword_set is None else stopword_set

    text = str(text).lower()
    # Substitute a space (not the empty string) so words separated only by
    # punctuation or an emoji, e.g. "bagus,tapi", are not fused into one token.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)

    return " ".join(word for word in text.split() if word not in active_stopwords)


# Load the comment data, clean it, score sentiment and publish the result as `df`.
# Any failure is reported as a single "ERROR: ..." line. `df` is only assigned once
# every step has succeeded, so the later cells (which test for the existence of
# `df`) never run on a half-processed frame.
try:
    print(f"Memeriksa sumber data: {source_path}")
    text_data = ""

    # --- LOAD DATA (URL vs LOCAL FILE) ---
    if source_path.startswith('http'):
        print("Terdeteksi sebagai URL. Mengunduh data...")
        raw_url = get_raw_url(source_path)
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(raw_url, timeout=30)
        if response.status_code != 200:
            raise Exception(f"Gagal download! Status: {response.status_code}")
        text_data = response.text
    else:
        print("Terdeteksi sebagai File Lokal. Membuka file...")
        if not os.path.exists(source_path):
            # Fall back to the first file with the same name found anywhere
            # below the current working directory.
            for root, dirs, files in os.walk("."):
                if os.path.basename(source_path) in files:
                    source_path = os.path.join(root, os.path.basename(source_path))
                    break
        with open(source_path, 'r', encoding='utf-8') as f:
            text_data = f.read()

    # --- PARSING JSON ---
    # Parse the payload as-is first. The trailing-comma repair below is a blunt
    # regex that also matches inside string values (e.g. a comment containing
    # ", ]"), so it is applied only when strict parsing has already failed.
    try:
        json_data = json.loads(text_data)
    except json.JSONDecodeError:
        text_data_fixed = re.sub(r',\s*]', ']', text_data)
        text_data_fixed = re.sub(r',\s*}', '}', text_data_fixed)
        json_data = json.loads(text_data_fixed)

    comments_df = pd.DataFrame(json_data)
    print(f"Data Valid! Berhasil memuat {len(comments_df)} baris.")

    # --- PREPROCESSING ---
    print("Melakukan pembersihan teks (Cleaning)...")
    # The cleaned text goes into a separate 'text_clean' column; 'text' is kept intact.
    comments_df['text_clean'] = comments_df['text'].apply(clean_text)

    # --- SENTIMENT ANALYSIS ---
    print("Menjalankan analisis sentimen...")
    # Sentiment is scored on the original 'text' column so sentence structure is
    # preserved; the word cloud uses 'text_clean' instead.
    # NOTE(review): TextBlob's default polarity lexicon is English — confirm the
    # scores are meaningful for Indonesian comments.

    def get_score(text):
        """Return TextBlob's polarity for `text`, or 0 if scoring fails."""
        try:
            return TextBlob(str(text)).sentiment.polarity
        except Exception:
            # Best effort: one unscorable comment must not abort the whole run.
            # (`except Exception` rather than a bare `except` so that
            # KeyboardInterrupt still stops the cell.)
            return 0

    def get_label(score):
        """Map a polarity score to 'Positif' / 'Negatif' / 'Netral' (±0.05 band)."""
        if score > 0.05:
            return 'Positif'
        elif score < -0.05:
            return 'Negatif'
        else:
            return 'Netral'

    comments_df['score'] = comments_df['text'].apply(get_score)
    comments_df['label'] = comments_df['score'].apply(get_label)

    # --- DATE FORMATTING ---
    if 'published_at' in comments_df.columns:
        comments_df['published_at'] = pd.to_datetime(comments_df['published_at'])
        comments_df['date_only'] = comments_df['published_at'].dt.date
    else:
        # No timestamp available: stamp every row with today's date so the
        # time-series cell still has a 'date_only' column to group on.
        comments_df['date_only'] = pd.Timestamp.now().date()

    # Publish the fully processed frame under the name the other cells expect.
    df = comments_df

    print("Selesai! Data bersih ada di kolom 'text_clean'.")
    print(df[['text', 'text_clean', 'label']].head())  # preview raw vs. cleaned text

except Exception as e:
    print(f"ERROR: {e}")

In [None]:

# CELL 3: PIE CHART & BAR CHART

# Draws the sentiment-share donut chart and, when a 'video_title' column exists,
# a stacked bar chart for the five videos with the most comments.
if 'df' in locals():
    # One colour per sentiment label, shared by both charts in this cell.
    sentiment_colors = {'Positif':'#00CC96', 'Negatif':'#EF553B', 'Netral':'#AB63FA'}

    # 1. Interactive pie chart (hole=0.4 renders it as a donut)
    fig_pie = px.pie(df, names='label', title='Proporsi Sentimen Komentar',
                     color='label',
                     color_discrete_map=sentiment_colors,
                     hole=0.4)
    fig_pie.show()

    # 2. Bar chart (top videos)
    if 'video_title' in df.columns:
        video_counts = df.groupby(['video_title', 'label']).size().reset_index(name='jumlah')
        # Keep only the five videos with the most comments overall.
        top_videos = df['video_title'].value_counts().nlargest(5).index
        video_counts_filtered = video_counts[video_counts['video_title'].isin(top_videos)]

        fig_bar = px.bar(video_counts_filtered, x="jumlah", y="video_title", color="label",
                         title="Sentimen pada 5 Video Terpopuler", orientation='h',
                         color_discrete_map=sentiment_colors)
        fig_bar.show()
else:
    # The original literal was mojibake (UTF-8 bytes of the warning sign decoded
    # with the wrong codec); restored to the intended emoji.
    print("⚠️ Dataframe belum terbentuk. Cek Cell 2.")

In [None]:
# CELL 4: TIME SERIES (TREND)
# Line chart of the number of comments per day, one line per sentiment label.
if 'df' in locals():
    # Count rows for every (date, label) combination.
    timeline = (
        df.groupby(['date_only', 'label'])
          .size()
          .reset_index(name='count')
    )

    line_options = dict(
        x='date_only',
        y='count',
        color='label',
        title='Tren Sentimen dari Waktu ke Waktu',
        markers=True,
        color_discrete_map={'Positif':'#00CC96', 'Negatif':'#EF553B', 'Netral':'#AB63FA'},
    )
    fig_line = px.line(timeline, **line_options)

    # Axis captions.
    fig_line.update_xaxes(title_text='Tanggal')
    fig_line.update_yaxes(title_text='Jumlah Komentar')
    fig_line.show()

In [None]:
# CELL 5: SCATTER PLOT (LIKES VS SENTIMENT)
# Scatter plot of sentiment score against like count; skipped entirely when the
# data has no 'likes' column.
if 'df' in locals() and 'likes' in df.columns:
    # Only request hover columns that actually exist. The guard above checks
    # 'likes' alone, so a dataset without 'author' would otherwise make
    # px.scatter fail.
    hover_columns = [col for col in ('text', 'author') if col in df.columns]

    fig_scat = px.scatter(df, x="score", y="likes",
                          color="label", size="likes",
                          hover_data=hover_columns,  # hover over a point to read the comment
                          title="Hubungan Skor Sentimen vs Jumlah Likes",
                          color_discrete_map={'Positif':'#00CC96', 'Negatif':'#EF553B', 'Netral':'#AB63FA'})

    fig_scat.update_layout(xaxis_title="Skor Sentimen (-1 Negatif s.d 1 Positif)", yaxis_title="Jumlah Likes")
    fig_scat.show()

In [None]:
# CELL 6: WORD CLOUD (BERSIH)
# Renders a word cloud from the cleaned comment text, then lists the three
# highest- and three lowest-scoring comments.
if 'df' in locals():
    # The emoji in this cell's messages were mojibake in the original literals
    # (UTF-8 bytes decoded with the wrong codec); restored to the intended characters.
    print("⏳ Generating WordCloud dari teks bersih...")

    # Concatenate every cleaned comment into one string.
    text_combined = " ".join(df['text_clean'].astype(str).tolist())

    # Joining rows that are all empty still yields a string of spaces, so test
    # for real content rather than length; WordCloud.generate raises when it
    # receives no words.
    if text_combined.strip():
        wordcloud = WordCloud(width=800, height=400,
                              background_color='white',
                              colormap='viridis',
                              min_font_size=10).generate(text_combined)

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.title("Word Cloud: Topik Pembicaraan Utama")
        plt.show()

        # Bonus: show the most extreme comments. Restrict to columns that are
        # present so a dataset without 'author' does not raise a KeyError.
        extreme_columns = [col for col in ('author', 'text', 'score') if col in df.columns]

        print("\n🔍 KOMENTAR PALING POSITIF:")
        display(df.sort_values(by='score', ascending=False).head(3)[extreme_columns])

        print("\n🔍 KOMENTAR PALING NEGATIF:")
        display(df.sort_values(by='score', ascending=True).head(3)[extreme_columns])

    else:
        print("⚠️ Data teks kosong setelah dibersihkan.")