In [6]:
# pip install google-api-python-client

# NOTE: API keys that were pasted here have been removed. Never store
# credentials in a notebook: load them from an environment variable or a
# secrets manager, and rotate any key that was previously committed.

In [5]:
import csv
import os
import googleapiclient.discovery
import pandas as pd
from datetime import timedelta

# Function to get YouTube API service
def get_youtube_service():
    """Build a YouTube Data API v3 client.

    The developer key is read from the ``YOUTUBE_API_KEY`` environment
    variable so that no credential is stored in the notebook itself.

    Returns:
        The client object produced by ``googleapiclient.discovery.build``.

    Raises:
        RuntimeError: If ``YOUTUBE_API_KEY`` is unset or blank.
    """
    api_service_name = "youtube"
    api_version = "v3"
    developer_key = os.environ.get("YOUTUBE_API_KEY", "").strip()
    if not developer_key:
        # Fail early with an actionable message instead of sending an
        # unauthenticated request.
        raise RuntimeError(
            "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API key."
        )

    return googleapiclient.discovery.build(
        api_service_name, api_version, developerKey=developer_key)

# Function to get videos
def get_videos(youtube, query, max_results=50, date='2019-01-01T00:00:00Z'):
    """Search for videos matching ``query`` and return the raw search items.

    The search is restricted to ``type="video"``, to items published after
    ``date`` and uses ``relevanceLanguage='en'``. Results are collected page
    by page via ``nextPageToken`` until ``max_results`` items have been
    gathered or the API stops returning pages. Each page is a separate API
    request, so large ``max_results`` values issue several requests.

    Args:
        youtube: Client object exposing ``search().list(...).execute()``.
        query: Search text passed as ``q``.
        max_results: Upper bound on the number of items returned. Values
            of zero or less return an empty list without calling the API.
        date: RFC 3339 timestamp passed as ``publishedAfter``.

    Returns:
        A list of at most ``max_results`` search result items (dicts).
    """
    # search.list accepts at most 50 items per page; larger totals need paging.
    page_limit = 50
    search_response = []
    page_token = None

    while len(search_response) < max_results:
        params = dict(
            q=query,
            part="id",
            type="video",
            publishedAfter=date,
            maxResults=min(page_limit, max_results - len(search_response)),
            relevanceLanguage='en',
        )
        if page_token:
            params["pageToken"] = page_token

        response = youtube.search().list(**params).execute()
        items = response.get('items', [])
        search_response.extend(items)

        page_token = response.get('nextPageToken')
        # Stop when there is no further page, or a page came back empty
        # (guards against looping forever on a token that yields nothing).
        if not page_token or not items:
            break

    return search_response[:max_results]

# Function to get the highest resolution thumbnail URL
def get_highest_resolution_thumbnail(thumbnails):
    """Pick the URL of the best available thumbnail.

    Args:
        thumbnails: Mapping from size name to a dict containing ``'url'``.

    Returns:
        The ``'url'`` of the first size present when checked in the order
        maxres, standard, high, medium, default; ``None`` if none is present.
    """
    preferred_order = ('maxres', 'standard', 'high', 'medium', 'default')
    best = next((size for size in preferred_order if size in thumbnails), None)
    if best is None:
        return None
    return thumbnails[best]['url']

# Function to get video details by IDs and filter out Shorts
def get_video_details(youtube, video_ids):
    """Fetch details for the given videos, keeping only those >= 60 seconds.

    IDs are sent in comma-joined batches of 50. Each returned item carries
    the ``snippet``, ``statistics`` and ``contentDetails`` parts. Items with
    no ``duration`` in ``contentDetails`` are reported and skipped; items
    whose parsed duration is under 60 seconds (treated as Shorts) are
    dropped silently.

    Args:
        youtube: Client object exposing ``videos().list(...).execute()``.
        video_ids: Sequence of video ID strings.

    Returns:
        A list of the video item dicts that passed the duration filter.
    """
    batch_size = 50
    min_seconds = 60
    kept = []

    for start in range(0, len(video_ids), batch_size):
        id_batch = video_ids[start:start + batch_size]
        response = youtube.videos().list(
            part="snippet,statistics,contentDetails",
            id=",".join(id_batch)
        ).execute()

        for item in response.get('items', []):
            content = item['contentDetails']
            if 'duration' not in content:
                print(f"Skipping video {item['id']} due to missing duration data.")
                continue

            # The duration is an ISO 8601 string; convert it before comparing.
            length = parse_duration(content['duration'])
            if length.total_seconds() >= min_seconds:
                kept.append(item)

    return kept

# Function to parse ISO 8601 duration to timedelta
def parse_duration(duration):
    """Convert a duration string into a ``datetime.timedelta``.

    Parsing is delegated to ``pandas.to_timedelta``. Any exception raised
    while parsing or converting is printed and a zero-length timedelta is
    returned instead.

    Args:
        duration: Duration text, e.g. the ISO 8601 value from the API.

    Returns:
        The parsed ``timedelta``, or ``timedelta(0)`` on failure.
    """
    try:
        parsed = pd.to_timedelta(duration)
        result = parsed.to_pytimedelta()
    except Exception as e:
        print(f"Error parsing duration: {e}")
        result = timedelta(0)
    return result

# Function to get channel details
def get_channel_details(youtube, channel_id):
    """Return the subscriber count reported for a channel.

    Args:
        youtube: Client object exposing ``channels().list(...).execute()``.
        channel_id: ID of the channel to look up.

    Returns:
        The ``subscriberCount`` value exactly as it appears in the API
        response, or ``0`` when the response contains no channel, no
        ``statistics`` part, or no ``subscriberCount`` field.
    """
    request = youtube.channels().list(
        part="statistics",
        id=channel_id
    )
    response = request.execute()

    # 'items' may be absent altogether when nothing matches, so do not index
    # the response directly.
    items = response.get('items') or []
    if not items:
        return 0
    return items[0].get('statistics', {}).get('subscriberCount', 0)

# Function to get category names
def get_video_categories(youtube):
    """Build a lookup from video category ID to category title.

    The categories are requested for region code ``"US"``.

    Args:
        youtube: Client object exposing
            ``videoCategories().list(...).execute()``.

    Returns:
        A dict mapping each item's ``id`` to its ``snippet.title``.
    """
    response = youtube.videoCategories().list(
        part="snippet",
        regionCode="US"  # Modify if needed
    ).execute()
    return {entry['id']: entry['snippet']['title'] for entry in response['items']}

# Function to save data to a CSV file
def save_to_csv(data, filename, folder):
    """Write the collected rows to ``folder/filename`` as UTF-8 CSV.

    The folder is created if it does not exist. A fixed header row is
    written first, followed by every row in ``data``; an existing file at
    the same path is overwritten. The destination path is printed at the end.

    Args:
        data: Iterable of row sequences matching the header columns.
        filename: Name of the CSV file to create.
        folder: Directory in which to place the file.
    """
    header = ["video_id", "title", "thumbnail_url", "views", "likes", "comments", "subscriber_count", "date_posted", "duration", "tags", "category"]

    os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)

    # newline='' lets the csv module control line endings itself.
    with open(filepath, mode='w', newline='', encoding='utf-8') as file:
        out = csv.writer(file)
        out.writerow(header)
        out.writerows(data)

    print(f"Data saved to {filepath}")

# Main function
def main():
    """Collect video metadata for every query in the query CSV.

    Prompts for the row to start from, reads ``youtube_search_queries.csv``
    (which must have a ``query`` column), and for each remaining row searches
    for videos, fetches their details, and writes one CSV per query into
    the ``query_youtube_data`` folder. Output is saved after every query, so
    an interrupted run can be resumed by entering the bracketed number of the
    first query that was not saved.

    Raises:
        ValueError: If the entered start row is not an integer or is negative.
    """
    youtube = get_youtube_service()

    # The value is used directly as a positional offset into the query table
    # (and matches the "[n]" prefix of the saved files), so it is 0-based.
    start_row = int(input("Enter the row number to start from (0-based index, 0 = first query): "))
    if start_row < 0:
        # A negative offset would silently slice from the end of the table.
        raise ValueError("The start row must be 0 or greater.")

    if (start_row == 0):
        print("Starting from the beginning")

    # Read the CSV file with queries
    df = pd.read_csv('youtube_search_queries.csv')

    # Get all video categories
    video_categories = get_video_categories(youtube)

    # Specify the folder where files will be saved
    folder_name = "query_youtube_data"

    # Subscriber counts are looked up once per channel instead of once per
    # video; many results share a channel.
    subscriber_cache = {}

    # Characters that are path separators or invalid in file names on common
    # platforms; they are replaced so the query text cannot alter the path.
    unsafe_chars = '\\/:*?"<>|'

    # Loop through each query in the CSV, starting from the specified row
    for index, row in df.iloc[start_row:].iterrows():
        query = row['query']
        if pd.isna(query) or not str(query).strip():
            print(f"Skipping row {index}: empty query")
            continue
        print(f"Fetching data for query: {query}")

        # Fetch videos for the current query
        videos = get_videos(youtube, query, max_results=500)

        # Extract video IDs
        video_ids = [item['id']['videoId'] for item in videos if 'videoId' in item['id']]

        # Get video details and filter out Shorts
        video_details = get_video_details(youtube, video_ids)

        # Prepare data to save
        data = []
        for item in video_details:
            snippet = item['snippet']
            statistics = item['statistics']

            video_id = item['id']
            title = snippet['title']
            thumbnail_url = get_highest_resolution_thumbnail(snippet['thumbnails'])
            views = statistics.get('viewCount', 0)
            likes = statistics.get('likeCount', 0)
            comments = statistics.get('commentCount', 0)
            date_posted = snippet['publishedAt']
            duration = item['contentDetails']['duration']
            duration_seconds = parse_duration(duration).total_seconds()
            tags = snippet.get('tags', [])
            category = video_categories.get(snippet.get('categoryId'), "Unknown")

            # Fetch channel subscriber count (cached per channel)
            channel_id = snippet['channelId']
            if channel_id not in subscriber_cache:
                subscriber_cache[channel_id] = get_channel_details(youtube, channel_id)
            subscriber_count = subscriber_cache[channel_id]

            data.append([video_id, title, thumbnail_url, views, likes, comments, subscriber_count, date_posted, duration_seconds, tags, category])

        # Save to CSV for the current query in the specified folder
        safe_query = "".join(
            "_" if ch in unsafe_chars or ord(ch) < 32 else ch
            for ch in str(query)
        )
        save_to_csv(data, f"[{index}] - {safe_query}.csv", folder_name)

# Entry point: run main() when this code is executed directly rather than
# imported as a module.
if __name__ == "__main__":
    main()


Starting from the beginning
Fetching data for query: popular YouTube challenges
Data saved to query_youtube_data/[0] - popular YouTube challenges.csv
Fetching data for query: luxury car reviews
Data saved to query_youtube_data/[1] - luxury car reviews.csv
Fetching data for query: cooking hacks
Data saved to query_youtube_data/[2] - cooking hacks.csv
Fetching data for query: how to become a pilot
Data saved to query_youtube_data/[3] - how to become a pilot.csv
Fetching data for query: truck modification tutorials


KeyboardInterrupt: 