# In [None]:  (Jupyter cell prompt left over from the notebook export)

import os
from datetime import datetime

import googleapiclient.discovery
import pandas as pd

# Collect top-level YouTube comments for every video matching one search
# query, then save two equally sized random samples (one per date window)
# as CSV files.

# --- YouTube API setup -------------------------------------------------------
api_service_name = "youtube"
api_version = "v3"

# The API key is read from the environment instead of being hard-coded, so the
# credential is never stored in this file.
# NOTE(review): if a key was ever committed in an earlier revision of this
# file, it should be revoked and replaced.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY")
if not DEVELOPER_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API key."
    )

SEARCH_QUERY = "Balen Shah"  # Replace with your topic
MAX_COMMENT_PAGES_PER_VIDEO = 5  # at most 5 requests of up to 100 comments each

youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=DEVELOPER_KEY
)

# --- Step 1: Search for videos on the topic ----------------------------------
# A single loop handles both the first page and every follow-up page; the
# first request is sent with pageToken=None, exactly like a token-less call.
video_ids = []
search_page_token = None
while True:
    search_response = youtube.search().list(
        part="snippet",
        q=SEARCH_QUERY,
        type="video",
        maxResults=50,  # Set maxResults to 50 (the limit)
        pageToken=search_page_token,
    ).execute()
    video_ids.extend(
        item['id']['videoId'] for item in search_response.get('items', [])
    )
    search_page_token = search_response.get('nextPageToken')
    if not search_page_token:
        break

# Drop repeated IDs (keeping first-seen order) so that no video's comments are
# downloaded, and later counted, more than once.
video_ids = list(dict.fromkeys(video_ids))

# --- Step 2: Fetch comments for each video -----------------------------------
all_comments = []

for video_id in video_ids:
    # The page token belongs to one video only. It must start from None for
    # every video; otherwise a token left over from the previous video (page
    # cap reached, or an error mid-pagination) would be sent with this one.
    comment_page_token = None
    try:
        for _ in range(MAX_COMMENT_PAGES_PER_VIDEO):
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=100,  # Max 100 per request
                pageToken=comment_page_token,
            ).execute()

            for item in response.get('items', []):
                comment = item['snippet']['topLevelComment']['snippet']
                all_comments.append([
                    video_id,
                    comment['authorDisplayName'],
                    comment['publishedAt'],
                    comment['likeCount'],
                    comment['textDisplay'],
                ])

            # Continue with the next page of this video, if there is one.
            comment_page_token = response.get('nextPageToken')
            if not comment_page_token:
                break
    except Exception as e:
        # Best effort: report the failure and move on to the next video.
        # Rows already collected for this video are kept.
        print(f"Error fetching comments for video {video_id}: {e}")

# --- Step 3: Organize data ----------------------------------------------------
df = pd.DataFrame(
    all_comments,
    columns=['video_id', 'author', 'published_at', 'like_count', 'text'],
)

# Parse timestamps as UTC (unparseable values become NaT), then drop the
# timezone so they can be compared with the naive datetimes below.
df['published_at'] = pd.to_datetime(
    df['published_at'], errors='coerce', utc=True
).dt.tz_localize(None)

# Date ranges (timezone-naive); the end dates are meant to be inclusive.
range1_start = datetime(2021, 5, 1)
range1_end = datetime(2022, 5, 31)
range2_start = datetime(2023, 5, 1)
range2_end = datetime(2024, 5, 31)

# Each end date is midnight at the START of that day, so "<= end" would drop
# nearly every comment posted on the final day. Comparing against the
# following midnight (exclusive) keeps the whole end day.
one_day = pd.Timedelta(days=1)
df_range1 = df[
    (df['published_at'] >= range1_start)
    & (df['published_at'] < pd.Timestamp(range1_end) + one_day)
]
df_range2 = df[
    (df['published_at'] >= range2_start)
    & (df['published_at'] < pd.Timestamp(range2_end) + one_day)
]

# Balance the data: sample both windows down to the size of the smaller one.
# A fixed random_state keeps the sample reproducible.
min_size = min(len(df_range1), len(df_range2))
if min_size == 0:
    print("Warning: at least one date range has no comments; the saved datasets will be empty.")
df_range1 = df_range1.sample(n=min_size, random_state=42)
df_range2 = df_range2.sample(n=min_size, random_state=42)

# Save to CSV
df_range1.to_csv('/content/drive/My Drive/Youtube Comments/comments_2021_2022.csv', index=False)
df_range2.to_csv('/content/drive/My Drive/Youtube Comments/comments_2023_2024.csv', index=False)

print("Balanced datasets saved.")
