[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/romiaprilian7406/sisi-gelap-dunia-saham-sentiment-analysis/blob/main/notebooks/sisi_gelap_dunia_saham_comments.ipynb)

# Import Library

In [1]:
import pandas as pd
import re
import warnings

from googleapiclient.discovery import build
from google.colab import userdata
from tqdm import tqdm

# Global Configuration

In [2]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Extract Function

In [3]:
def clean_scraped_text(text):
    if not isinstance(text, str): return ""
    # Ganti enter dengan spasi
    text = text.replace('\n', ' ').replace('\r', ' ')
    return text.strip()

def fetch_comments_recursive(video_id, max_results=1000):
    try:
        api_key = userdata.get('YOUTUBE_API_KEY')
        youtube = build('youtube', 'v3', developerKey=api_key)
    except Exception as e:
        print("Gagal load API Key. Pastikan sudah ada di Secrets")
        return pd.DataFrame()

    comments_data = []
    next_page_token = None

    pbar = tqdm(total=max_results, desc="Fetching Comments & Replies", unit="comm")

    while len(comments_data) < max_results:
        try:
            # Request ke API
            request = youtube.commentThreads().list(
                part="snippet,replies", # Minta 'replies' juga
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token,
                textFormat="plainText"
            )
            response = request.execute()

            for item in response.get('items', []):
                # 1. Ambil Komentar Utama (Parent)
                top_comment = item['snippet']['topLevelComment']['snippet']
                parent_id = item['id']

                comments_data.append({
                    'id': parent_id,
                    'parent_id': None, # Ini komentar utama
                    'author': top_comment.get('authorDisplayName'),
                    'text': clean_scraped_text(top_comment.get('textDisplay')),
                    'like_count': top_comment.get('likeCount', 0),
                    'reply_count': item['snippet'].get('totalReplyCount', 0),
                    'published_at': top_comment.get('publishedAt'),
                    'type': 'comment'
                })

                # 2. Ambil Balasan (Replies) jika ada
                if 'replies' in item:
                    for reply in item['replies']['comments']:
                        reply_snip = reply['snippet']
                        comments_data.append({
                            'id': reply['id'],
                            'parent_id': parent_id, # Link ke komentar utama
                            'author': reply_snip.get('authorDisplayName'),
                            'text': clean_scraped_text(reply_snip.get('textDisplay')),
                            'like_count': reply_snip.get('likeCount', 0),
                            'reply_count': 0, # Balasan tidak punya 'totalReplyCount' di API ini
                            'published_at': reply_snip.get('publishedAt'),
                            'type': 'reply' # Penanda bahwa ini balasan
                        })

                if len(comments_data) >= max_results:
                    break

            pbar.update(len(response.get('items', []))) # Update progress

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break

        except Exception as e:
            print(f"\nError: {e}")
            break

    pbar.close()

    return pd.DataFrame(comments_data)

In [4]:
VIDEO_ID = '4aVRwQutljg'
MAX_COMMENTS = 2000

# Execution

In [5]:
df_complete = fetch_comments_recursive(VIDEO_ID, MAX_COMMENTS)

print(f"\nTotal Data Terambil: {len(df_complete)}")
print(f"Komentar Utama: {len(df_complete[df_complete['type']=='comment'])}")
print(f"Balasan (Replies): {len(df_complete[df_complete['type']=='reply'])}")

Fetching Comments & Replies:  39%|███▉      | 783/2000 [00:02<00:03, 336.35comm/s]



Total Data Terambil: 1018
Komentar Utama: 783
Balasan (Replies): 235


# Export Comments

In [6]:
filename = "sisi_gelap_dunia_saham_comments.csv"
df_complete.to_csv(filename, index=False)

print(f"Dataset comments disimpan: {filename}")

Dataset comments disimpan: sisi_gelap_dunia_saham_comments.csv
