## Topic

In [1]:
pip install youtube_transcript_api

Collecting youtube_transcript_api
  Downloading youtube_transcript_api-1.2.2-py3-none-any.whl.metadata (24 kB)
Downloading youtube_transcript_api-1.2.2-py3-none-any.whl (485 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m485.0/485.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube_transcript_api
Successfully installed youtube_transcript_api-1.2.2


In [2]:
import json
import csv
import os
from datetime import datetime
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi

In [3]:
import shutil

from google.colab import drive

# Mount Google Drive at /content/drive; later cells copy exported files
# to paths underneath this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# CONFIG
# ========================
# The YouTube Data API key is read from the environment so it is never stored
# in the notebook. Set it before running this cell, e.g.
#   os.environ["YOUTUBE_API_KEY"] = "<your key>"
# NOTE(review): a key was previously hardcoded on this line; treat it as
# leaked and rotate it in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not API_KEY:
    print("WARNING: YOUTUBE_API_KEY is not set; YouTube Data API calls will fail.")
VIDEO_ID = "bPsAKX1ocTs"
PLAYLIST_ID = "PLGeL0tgf1NADxLzzVARqadY1H-fNXXkDw"
# Yakult Video: https://www.youtube.com/watch?v=bPsAKX1ocTs
# y-HLsEIyDjI

# https://www.youtube.com/watch?v=cuoM9amPt-I&list=PLGeL0tgf1NADxLzzVARqadY1H-fNXXkDw

### Get data for a single video

In [None]:
# FUNCTION: get metadata video
# ========================
def get_video_info(api_key, video_id):
    """Look up one video through the YouTube Data API v3 `videos.list` call.

    Args:
        api_key: Developer key passed to the API client.
        video_id: ID of the video to look up.

    Returns:
        The first entry of the response's "items" list, or None when that
        list is empty.
    """
    client = build("youtube", "v3", developerKey=api_key)

    # Resource parts requested for the video, sent as one comma-separated value.
    requested_parts = ",".join([
        "snippet",
        "statistics",
        "contentDetails",
        "status",
        "player",
        "recordingDetails",
        "topicDetails",
    ])
    items = client.videos().list(part=requested_parts, id=video_id).execute()["items"]
    return items[0] if items else None

In [None]:
# FUNCTION: get transcript
# ========================
def get_transcript(video_id):
    """Return the video's transcript as a single space-joined string.

    English is requested first, then Vietnamese. Both library interfaces are
    supported: releases that expose the instance method `fetch` (whose
    snippets carry a `.text` attribute) and older releases that only have
    the static `get_transcript` helper (which yields dicts with a "text"
    key). The static helper is missing from the 1.2.2 release this notebook
    installs, so calling it unconditionally always failed silently.

    Args:
        video_id: ID of the YouTube video.

    Returns:
        The transcript text, or None when no transcript could be fetched.
        Failures are reported on stdout instead of being swallowed silently.
    """
    languages = ['en', 'vi']
    try:
        if hasattr(YouTubeTranscriptApi, "fetch"):
            snippets = YouTubeTranscriptApi().fetch(video_id, languages=languages)
            return " ".join(snippet.text for snippet in snippets)
        # Fallback for older library versions without the instance API.
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
        return " ".join(line["text"] for line in transcript)
    except Exception as e:
        # Best effort: a missing transcript must not stop the pipeline,
        # but the reason should be visible to whoever runs the notebook.
        print(f"Transcript unavailable for {video_id}: {type(e).__name__}: {e}")
        return None

In [None]:
# MAIN PIPELINE
# ========================
def main():
    """Fetch metadata and transcript for VIDEO_ID and save them to disk.

    The record is written to data/<YYYY-MM-DD>/video_data.json and
    data/<YYYY-MM-DD>/video_data.csv, relative to the working directory.

    Returns:
        A (save_dir, json_path, csv_path) tuple describing what was written,
        or None when get_video_info finds no video for VIDEO_ID.
    """
    # 1. get metadata
    video_info = get_video_info(API_KEY, VIDEO_ID)
    if not video_info:
        print("‚ùå There is no video.")
        return None

    snippet = video_info["snippet"]
    statistics = video_info.get("statistics", {})
    content_details = video_info.get("contentDetails", {})

    data = {
        "video_id": VIDEO_ID,
        "title": snippet.get("title"),
        "description": snippet.get("description"),
        "publishedAt": snippet.get("publishedAt"),
        "channelId": snippet.get("channelId"),
        "channelTitle": snippet.get("channelTitle"),
        "tags": snippet.get("tags", []),
        "viewCount": statistics.get("viewCount"),
        "likeCount": statistics.get("likeCount"),
        "commentCount": statistics.get("commentCount"),
        "duration": content_details.get("duration"),
        "caption": content_details.get("caption"),
    }

    # 2. get transcript
    transcript_text = get_transcript(VIDEO_ID)
    data["transcript"] = transcript_text if transcript_text else "No transcript available"

    # Create folder to save data by date
    # ========================
    today = datetime.today().strftime("%Y-%m-%d")
    save_dir = os.path.join("data", today)
    os.makedirs(save_dir, exist_ok=True)

    json_path = os.path.join(save_dir, "video_data.json")
    csv_path = os.path.join(save_dir, "video_data.csv")

    # Save JSON
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    # Save CSV (a single row; the dict keys become the header)
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=data.keys())
        writer.writeheader()
        writer.writerow(data)

    print(f"‚úÖ Saved data in folder: {save_dir}")
    return save_dir, json_path, csv_path

if __name__ == "__main__":
    # Bind the output paths in the notebook namespace: the Drive-copy cell
    # that follows reads save_dir, json_path and csv_path as globals.
    saved_paths = main()
    if saved_paths is not None:
        save_dir, json_path, csv_path = saved_paths

‚úÖ Saved data in folder: data/2025-09-25


In [None]:
# Rebuild the date-stamped paths that main() writes to. Inside main() these
# are local variables, so reading them from the notebook namespace raises
# NameError on a fresh kernel.
save_dir = os.path.join("data", datetime.today().strftime("%Y-%m-%d"))
json_path = os.path.join(save_dir, "video_data.json")
csv_path = os.path.join(save_dir, "video_data.csv")

# Specify the destination folder in your Google Drive
drive_save_dir = os.path.join('/content/drive/My Drive/', save_dir)

# Create the directory in Google Drive if it doesn't exist
os.makedirs(drive_save_dir, exist_ok=True)

# Copy the saved files to Google Drive
for saved_file in (json_path, csv_path):
    shutil.copy(saved_file, drive_save_dir)

print(f"‚úÖ Copied data to Google Drive folder: {drive_save_dir}")

### Get data for every video in a playlist

In [5]:
# FUNCTION: get videoId in playlist
# ========================
def get_videos_from_playlist(api_key, playlist_id):
    """Collect the video IDs of all items in a playlist.

    Pages through `playlistItems.list` (50 items per request) until the
    response no longer carries a "nextPageToken".

    Args:
        api_key: Developer key passed to the API client.
        playlist_id: ID of the playlist to enumerate.

    Returns:
        A list of video ID strings in the order the API returned them.
    """
    client = build("youtube", "v3", developerKey=api_key)
    video_ids = []
    page_token = None
    has_more_pages = True

    while has_more_pages:
        page = client.playlistItems().list(
            part="contentDetails",
            playlistId=playlist_id,
            maxResults=50,
            pageToken=page_token,
        ).execute()

        video_ids.extend(entry["contentDetails"]["videoId"] for entry in page["items"])

        # A missing or empty token means this was the last page.
        page_token = page.get("nextPageToken")
        has_more_pages = bool(page_token)

    return video_ids

In [6]:
# FUNCTION: get metadata video
# ========================
def get_video_info(api_key, video_id):
    """Fetch one video's resource via the YouTube Data API v3 `videos.list`.

    Requests the snippet, statistics, contentDetails and status parts.

    Args:
        api_key: Developer key passed to the API client.
        video_id: ID of the video to look up.

    Returns:
        The first element of the response's "items" list, or None when the
        list is empty.
    """
    service = build("youtube", "v3", developerKey=api_key)
    lookup = service.videos().list(
        id=video_id,
        part="snippet,statistics,contentDetails,status",
    )
    matches = lookup.execute()["items"]
    if matches:
        return matches[0]
    return None

In [7]:
# FUNCTION: get transcript
# ========================
def get_transcript(video_id):
    """Fetch a video's transcript and flatten it to one string.

    Languages are tried in the order English, Vietnamese. The instance
    method `fetch` is used when the installed youtube_transcript_api offers
    it; otherwise the older static `get_transcript` helper is called. The
    static helper is absent from the 1.2.2 release this notebook installs,
    and the previous bare `except:` hid the resulting AttributeError.

    Args:
        video_id: ID of the YouTube video.

    Returns:
        The transcript segments joined by single spaces, or None if the
        transcript could not be retrieved (the reason is printed).
    """
    preferred_languages = ['en', 'vi']
    try:
        if hasattr(YouTubeTranscriptApi, "fetch"):
            fetched = YouTubeTranscriptApi().fetch(video_id, languages=preferred_languages)
            segments = [snippet.text for snippet in fetched]
        else:
            # Older library versions return a list of dicts with a "text" key.
            legacy = YouTubeTranscriptApi.get_transcript(video_id, languages=preferred_languages)
            segments = [line["text"] for line in legacy]
        return " ".join(segments)
    except Exception as exc:
        # `except Exception` rather than a bare `except:` so KeyboardInterrupt
        # can still stop a long playlist crawl.
        print(f"Transcript unavailable for {video_id}: {type(exc).__name__}: {exc}")
        return None

In [11]:
# MAIN PIPELINE
# ========================
def main():
    """Crawl every video in PLAYLIST_ID and export the collected records.

    For each video ID returned by get_videos_from_playlist, metadata is read
    via get_video_info and a transcript via get_transcript. Videos for which
    get_video_info returns None are skipped. The records are written to
    data/<YYYY-MM-DD>/playlist_data.json and playlist_data.csv, and the files
    that were actually written are then copied to Google Drive.
    """
    video_ids = get_videos_from_playlist(API_KEY, PLAYLIST_ID)
    print(f"üîπ Found {len(video_ids)} video in playlist")

    all_data = []

    for idx, vid in enumerate(video_ids, 1):
        print(f"‚û°Ô∏è Crawl video {idx}/{len(video_ids)}: {vid}")
        video_info = get_video_info(API_KEY, vid)
        if not video_info:
            continue

        snippet = video_info["snippet"]
        statistics = video_info.get("statistics", {})
        content_details = video_info.get("contentDetails", {})

        data = {
            "video_id": vid,
            "title": snippet.get("title"),
            "description": snippet.get("description"),
            "publishedAt": snippet.get("publishedAt"),
            "channelId": snippet.get("channelId"),
            "channelTitle": snippet.get("channelTitle"),
            "tags": snippet.get("tags", []),
            "viewCount": statistics.get("viewCount"),
            "likeCount": statistics.get("likeCount"),
            "commentCount": statistics.get("commentCount"),
            "duration": content_details.get("duration"),
            "caption": content_details.get("caption"),
        }

        transcript_text = get_transcript(vid)
        data["transcript"] = transcript_text if transcript_text else "No transcript available"

        all_data.append(data)

    # Create folder to save data by date
    # ========================
    today = datetime.today().strftime("%Y-%m-%d")
    save_dir = os.path.join("data", today)
    os.makedirs(save_dir, exist_ok=True)

    json_path = os.path.join(save_dir, "playlist_data.json")
    csv_path = os.path.join(save_dir, "playlist_data.csv")

    # Track what was really written so the Drive copy below never tries to
    # copy a file that does not exist.
    saved_files = []

    # Save JSON
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=4)
    saved_files.append(json_path)

    # Save CSV (skipped when nothing was crawled: there is no header to derive)
    if all_data:
        keys = all_data[0].keys()
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(all_data)
        saved_files.append(csv_path)

    print(f"‚úÖ Saved data in folder: {save_dir}")

    # Specify the destination folder in your Google Drive
    drive_save_dir = os.path.join('/content/drive/My Drive/Research/Brand Image Detection/Code', save_dir)

    # Create the directory in Google Drive if it doesn't exist
    os.makedirs(drive_save_dir, exist_ok=True)

    # Copy the saved files to Google Drive
    for saved_file in saved_files:
        shutil.copy(saved_file, drive_save_dir)

    print(f"‚úÖ Copied data to Google Drive folder: {drive_save_dir}")

if __name__ == "__main__":
    main()

üîπ Found 59 video in playlist
‚û°Ô∏è Crawl video 1/59: cuoM9amPt-I
‚û°Ô∏è Crawl video 2/59: 86MV7fCVTs4
‚û°Ô∏è Crawl video 3/59: JjnmcSTHQYk
‚û°Ô∏è Crawl video 4/59: i459dVnvTi0
‚û°Ô∏è Crawl video 5/59: 909igbIvzCA
‚û°Ô∏è Crawl video 6/59: w1tuKtHLeEM
‚û°Ô∏è Crawl video 7/59: ftPJ6kTSRNc
‚û°Ô∏è Crawl video 8/59: pogJoL8oXsY
‚û°Ô∏è Crawl video 9/59: pllzbS_vcRk
‚û°Ô∏è Crawl video 10/59: VYc-ZwQiEl0
‚û°Ô∏è Crawl video 11/59: 9ZQMlI-QBuE
‚û°Ô∏è Crawl video 12/59: P55VKCNxdoA
‚û°Ô∏è Crawl video 13/59: MtANUOOjuPo
‚û°Ô∏è Crawl video 14/59: x0yXt21tTjA
‚û°Ô∏è Crawl video 15/59: iS1ibDDozvI
‚û°Ô∏è Crawl video 16/59: kSv6ZxkFzGE
‚û°Ô∏è Crawl video 17/59: vmBCH5YDjFA
‚û°Ô∏è Crawl video 18/59: mYqlmeg6u98
‚û°Ô∏è Crawl video 19/59: rvTR4TsYxjk
‚û°Ô∏è Crawl video 20/59: -iOm8ydhftk
‚û°Ô∏è Crawl video 21/59: cmflrVLacio
‚û°Ô∏è Crawl video 22/59: ktCeW0Yy75w
‚û°Ô∏è Crawl video 23/59: kjhgDOHHd8Q
‚û°Ô∏è Crawl video 24/59: 9BTj8tQOjjg
‚û°Ô∏è Crawl video 25/59: _3r7k3D1pps
‚û°Ô∏è Crawl video 26/59: W