In [9]:
import re

import googleapiclient.discovery
import pandas as pd

from config import API_KEY

# Fail fast at import time: every API call below needs a non-empty key,
# so stop here rather than on the first request.
if not API_KEY:
    raise ValueError("API_KEY not found in config.py")

def extract_video_id(url):
    """Return the video ID embedded in a YouTube URL.

    Recognised forms, tried in this order:

    * a ``v=`` query parameter (``youtube.com/watch?v=<id>``)
    * short links (``youtu.be/<id>``)
    * path-style links (``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``)

    The ID is the run of letters, digits, ``-`` and ``_`` that follows the
    marker, so trailing ``&``/``?`` parameters, ``#`` fragments, slashes and
    surrounding whitespace never end up in the result.

    Args:
        url: URL string, e.g. as pasted by a user.

    Returns:
        The video ID as a string.

    Raises:
        ValueError: If no non-empty video ID can be found in ``url``.
    """
    patterns = (
        # Anchor on "?" / "&" (or start of string) so that parameters which
        # merely end in "v" (e.g. "rev=") are not mistaken for the video ID.
        r"(?:^|[?&])v=([A-Za-z0-9_-]+)",
        r"youtu\.be/([A-Za-z0-9_-]+)",
        r"/(?:shorts|embed|live)/([A-Za-z0-9_-]+)",
    )
    # Pasted URLs frequently carry stray leading/trailing whitespace.
    text = url.strip()
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    raise ValueError("Invalid YouTube URL")

def fetch_comments(video_url, max_comments=500):
    """Fetch up to ``max_comments`` top-level comments for a YouTube video.

    Pages through the YouTube Data API v3 ``commentThreads.list`` endpoint
    and collects the plain-text body of each thread's top-level comment
    (replies are not collected).

    Args:
        video_url: URL of the video; its ID is extracted with
            ``extract_video_id``.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        A list of at most ``max_comments`` comment strings, in the order
        the API returned them.

    Raises:
        ValueError: If ``video_url`` is not a recognised YouTube URL
            (propagated from ``extract_video_id``).
    """
    page_limit = 100  # page size the original code requested on every call

    video_id = extract_video_id(video_url)

    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=API_KEY
    )

    comments = []
    next_page_token = None

    while len(comments) < max_comments:
        # Ask only for what is still missing so the last page is not
        # over-fetched and then thrown away.
        remaining = max_comments - len(comments)
        response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=min(page_limit, remaining),
            pageToken=next_page_token,
            textFormat="plainText",
        ).execute()

        # A response without an "items" key is treated as an empty page
        # instead of raising KeyError.
        comments.extend(
            item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            for item in response.get("items", [])
        )

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break  # no further pages available

    return comments[:max_comments]

# Script driver: prompt for a video URL, download its comments with the
# default limit of fetch_comments, and persist them as a one-column CSV.
video_url = input("Enter YouTube video URL: ")
comments = fetch_comments(video_url)

# One row per comment; index=False keeps the row index out of the file.
df = pd.DataFrame(comments, columns=["comment"])
df.to_csv("comments.csv", index=False)

print(f"Saved {len(df)} comments to comments.csv")


Saved 500 comments to comments.csv


I intentionally collected raw YouTube comments and observed heavy spam, repetition, and multilingual noise. Instead of changing data collection, I handled these issues in preprocessing by filtering low-information comments, removing spam patterns, and restricting the model to a single language for the initial version.