In [1]:
import pandas as pd
import re
import os
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

# API Key and YouTube API Configuration
API_KEY = "api_key"  # Replace with your actual API key
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# Build the YouTube API client
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)


In [3]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

def get_video_transcript(video_id, language="ko"):
    """
    Fetch transcript for a given video ID in the specified language (default: Korean).
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        return [entry["text"] for entry in transcript]
    except TranscriptsDisabled:
        print(f"Transcript is disabled for video ID: {video_id}")
        return None  # Indicate no transcript available
    except NoTranscriptFound:
        print(f"No transcript found for language '{language}' for video ID: {video_id}")
        return None  # Indicate no transcript available
    except Exception as e:
        print(f"Transcript error for video ID {video_id}: {e}")
        return None


def search_videos_with_transcripts(query, published_after, published_before, max_results=10, order="viewCount"):
    """
    Search YouTube for videos by query and date range, ensuring each video has a transcript.
    """
    request = youtube.search().list(
        part="snippet",
        q=query,
        type="video",
        maxResults=50,  # Fetch more videos to ensure we can filter for transcripts
        order=order,
        publishedAfter=published_after,
        publishedBefore=published_before
    )
    response = request.execute()

    videos = []
    for item in response.get("items", []):
        video_id = item["id"]["videoId"]
        title = item["snippet"]["title"]
        publish_date = item["snippet"]["publishedAt"]

        # Check for transcript availability
        transcript = get_video_transcript(video_id, language="ko")
        if transcript:
            videos.append({
                "video_id": video_id,
                "title": title,
                "publish_date": publish_date,
                "transcript": transcript  # Save the transcript here for optimization
            })

        # Stop when we have enough videos with transcripts
        if len(videos) >= max_results:
            break

    if not videos:
        print("No videos with transcripts found.")
    return videos


def get_video_details_with_comments(video_id):
    """
    Get video details including views, likes, and up to 1000 comments.
    """
    # Fetch video details
    video_request = youtube.videos().list(
        part="statistics,snippet",
        id=video_id
    )
    video_response = video_request.execute()

    # Extract video details
    video_data = video_response["items"][0]
    details = {
        "views": int(video_data["statistics"].get("viewCount", "0")),
        "likes": video_data["statistics"].get("likeCount", "N/A"),
        "comments_count": video_data["statistics"].get("commentCount", "N/A"),
    }

    # Fetch up to 1000 comments
    comments = []
    next_page_token = None

    while len(comments) < 1000:
        try:
            comment_request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=50,  # Maximum allowed per request
                pageToken=next_page_token,
                order="relevance"
            )
            comment_response = comment_request.execute()
            for comment_item in comment_response.get("items", []):
                top_comment = comment_item["snippet"]["topLevelComment"]["snippet"]
                comments.append({
                    "comment_text": top_comment["textDisplay"],
                    "comment_likes": int(top_comment["likeCount"]),
                    "comment_date" : top_comment["publishedAt"]
                })
            next_page_token = comment_response.get("nextPageToken")
            if not next_page_token:  # Stop if there are no more pages
                break
        except Exception as e:
            print(f"Error fetching comments for video ID {video_id}: {e}")
            break

    # Sort comments by likes in descending order
    sorted_comments = sorted(comments, key=lambda x: x["comment_likes"], reverse=True)
    details["comments"] = sorted_comments[:1000]  # Limit to 1000 comments

    return details

In [14]:
def fetch_videos_and_save(query, published_after, published_before, max_results):
    """
    Fetch videos and save details, comments, and transcripts to a CSV file.
    
    Parameters:
    - query: Search query string
    - published_after: Start date for video search
    - published_before: End date for video search
    - max_results: Maximum number of videos to fetch
    """
    # Search videos with transcripts
    videos = search_videos_with_transcripts(
        query=query,
        published_after=published_after,
        published_before=published_before,
        max_results=max_results,
        order="viewCount"
    )

    # Enrich videos with details (comments, views, etc.)
    video_details = []
    for video in videos:
        # Fetch detailed video data (views, likes, comments)
        detailed_data = get_video_details_with_comments(video["video_id"])

        # Merge the data from search_videos_with_transcripts with get_video_details_with_comments
        details = {
            "title": video["title"],
            "publish_date": video["publish_date"],
            "video_id": video["video_id"],
            "transcript": video["transcript"],  # Keep the transcript
            **detailed_data  # Include detailed data (views, likes, comments)
        }

        video_details.append(details)

    # Sort videos by views in descending order
    video_details.sort(key=lambda x: x["views"], reverse=True)

    # Prepare data for DataFrame
    data_transcript = []

    df_comment = pd.DataFrame()

    for details in video_details:
        label_comment = [] 
        filtered_comments = [comment for comment in details["comments"] if comment["comment_likes"] > 10]
        for _ in range(len(filtered_comments)):
            label_comment.append(details["video_id"])
        
        data_comment = (pd.DataFrame({
            "Video_ID" : label_comment,
            "comment_text" : [comment["comment_text"] for comment in filtered_comments],
            "comment_likes" : [comment["comment_likes"] for comment in filtered_comments],
            "comment_date" : [comment["comment_date"] for comment in filtered_comments]
        }))

        df_comment = pd.concat([df_comment, data_comment], ignore_index=True)

        transcript_text = "|".join(details["transcript"]) if isinstance(details["transcript"], list) else details["transcript"]

        # Append to data
        data_transcript.append({
            "Title": details["title"],
            "Video_ID": details["video_id"],
            "Publish Date": details["publish_date"],
            "Views": details["views"],
            "Likes": details["likes"],
            "Comments_Count": details["comments_count"],
            "Transcript": transcript_text
        })

    # Create DataFrame
    df_transcript = pd.DataFrame(data_transcript)

    # Save to CSV
    transcript_file = f"{query}_youtube_transcript_{published_after}_to_{published_before}.csv"
    comment_file = f"{query}_youtube_comment_{published_after}_to_{published_before}.csv"
    df_transcript.to_csv(transcript_file, index=False, encoding="utf-8-sig")
    df_comment.to_csv(comment_file, index=False, encoding="utf-8-sig")

    print(f"Data saved to {df_transcript}")
    print(f"Data saved to {df_comment}")

In [17]:
# Define time periods
time_periods = [
    {"published_after": "2018-10-19T00:00:00Z", "published_before": "2020-10-19T23:59:59Z"},
    {"published_after": "2020-10-20T00:00:00Z", "published_before": "2020-10-21T23:59:59Z"},
    {"published_after": "2020-10-22T00:00:00Z", "published_before": "2020-12-22T23:59:59Z"}
]

# Iterate through time periods and fetch videos
query = "아이린"
max_results = 10

for period in time_periods:
    fetch_videos_and_save(
        query=query,
        published_after=period["published_after"],
        published_before=period["published_before"],
        max_results=max_results
    )

Transcript is disabled for video ID: 0OJ4aTCrGyE
Transcript is disabled for video ID: wCWoUUWwdqg
Transcript is disabled for video ID: uVXu2McmpYs
No transcript found for language 'ko' for video ID: PHp3wWbR9cw
Transcript is disabled for video ID: 3tYJTkdQMgA
Transcript is disabled for video ID: 9UD0MSOE3X0
No transcript found for language 'ko' for video ID: ChhCpSVrjvU
No transcript found for language 'ko' for video ID: o91OUaj4gm8
No transcript found for language 'ko' for video ID: noK5jMN9UmA
Transcript is disabled for video ID: bHreEWWGGZ8
No transcript found for language 'ko' for video ID: 7CV25uBj2bw
No transcript found for language 'ko' for video ID: zasD3mYNSsU
No transcript found for language 'ko' for video ID: a39GyyirOkg
No transcript found for language 'ko' for video ID: YQVuin19bLg
No transcript found for language 'ko' for video ID: gLgJe2yzStU
No transcript found for language 'ko' for video ID: QfCSlz3dRDI
No transcript found for language 'ko' for video ID: KfpUkFSJwLs
Tr

In [16]:
# Define time periods
time_periods = [
    {"published_after": "2012-12-04T00:00:00Z", "published_before": "2014-12-04T23:59:59Z"},
    {"published_after": "2014-12-05T00:00:00Z", "published_before": "2014-12-07T23:59:59Z"},
    {"published_after": "2014-12-08T00:00:00Z", "published_before": "2015-01-08T23:59:59Z"}
]

# Iterate through time periods and fetch videos
query = "조현아"
max_results = 10

for period in time_periods:
    fetch_videos_and_save(
        query=query,
        published_after=period["published_after"],
        published_before=period["published_before"],
        max_results=max_results
    )

No transcript found for language 'ko' for video ID: -PKsgTvtIzg
No transcript found for language 'ko' for video ID: fwUgWQ6cqjY
Transcript is disabled for video ID: YtgkWnZRurE
No transcript found for language 'ko' for video ID: cssrX7UtuII
Transcript is disabled for video ID: XvsrR_MmZfo
Transcript is disabled for video ID: lhNCwHhwXCU
No transcript found for language 'ko' for video ID: 9Cn40uBP17I
Data saved to                                                Title     Video_ID  \
0             [M/V] 너는 나를 (feat. 조현아 of 어반자카파) - 김진표  XIxm1rt1rks   
1                                     River - 어반 자카파  1XQGoi5zWIM   
2  크루시픽스 크릭 Reunion (feat. The Quiett. Verbal Jin...  OUtrbc7oids   
3      [13.09.18] 어반자카파 조현아 - 사랑이 제일 낫더라 @봄여름가을겨울의 숲  FZrvYHw427Y   
4                                 조현아—사는게 아니야 （2013）  InuQB6gacAo   
5  131003 Block.B SHOWCASE - (박경 Solo) 언제 어디서 무엇을...  QhbTh-_pvgc   
6  [MV] Kim jo han(김조한) _ The stars, The moon, Al...  8TfVWKFclfQ   
7                어반자카파(Urban Za