In [5]:
import pandas as pd
import requests
import time
from google.colab import drive

# Mount Google Drive so the CSV files below are reachable from this runtime
drive.mount('/content/drive')

# File paths in Google Drive: the search results to read, and the per-video
# details file that process_batches() appends to
search_results_file_path = '/content/drive/My Drive/Capstone Project 2/search_results.csv'
video_info_file_path = '/content/drive/My Drive/Capstone Project 2/search_results_video_info.csv'

# Load CSV file containing new video IDs
search_results_df = pd.read_csv(search_results_file_path)

# Extract the unique video IDs. Blank cells are read by pandas as NaN (a
# float), which would make ','.join(...) raise TypeError, so drop them and
# force everything that is left to str.
video_ids_all = set(search_results_df['Video ID'].dropna().astype(str))

# If there are no new video IDs, stop here. Raising SystemExit ends the script
# (or the current notebook cell) without asking the kernel to shut down the
# way calling exit() does inside IPython.
if not video_ids_all:
    print("No new video IDs to fetch.")
    raise SystemExit

# All video IDs as one comma-separated string (not used by the batching code
# in this cell; kept because later cells may read it)
video_id_string = ','.join(video_ids_all)

# Batch size: at most 50 IDs per request. The guard above guarantees the set
# is non-empty, so this is never zero.
batch_size = min(len(video_ids_all), 50)

# Total number of batches (ceiling division of the ID count by batch_size)
num_batches = ((len(video_ids_all) - 1) // batch_size) + 1

# Function to fetch data from URL synchronously with retries
def fetch_data_with_retries(url, retries=3, timeout=30):
    """Fetch *url* and return its decoded JSON body, retrying on failure.

    Args:
        url: The URL to request with HTTP GET.
        retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON response of the first attempt that returns HTTP 200,
        or ``None`` if every attempt fails.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response.json()
            print("Request failed. Retrying...")
        except (requests.RequestException, ValueError) as e:
            # RequestException covers connection errors and timeouts;
            # ValueError covers a 200 response whose body is not valid JSON.
            print(f"Error occurred: {e}. Retrying...")
        # Small delay before the next attempt; pointless after the last one.
        if attempt < retries - 1:
            time.sleep(1)
    print("Failed after multiple retries. Skipping this batch.")
    return None

# Helper: flatten one API item into a CSV row
def _build_video_row(item):
    """Return the CSV row (a dict keyed by column name) for one video item.

    Missing sections and missing fields fall back to empty values, except
    ``item['id']``, which is required and raises ``KeyError`` if absent.
    """
    snippet = item.get('snippet', {})
    content_details = item.get('contentDetails', {})
    statistics = item.get('statistics', {})
    live_streaming_details = item.get('liveStreamingDetails', {})
    status = item.get('status', {})

    return {
        'Video ID': item['id'],
        # Allows looking up channel information in the future
        'Channel ID': snippet.get('channelId', ''),
        'Channel Title': snippet.get('channelTitle', ''),
        # Tags help people find videos, or search for a particular kind of video
        'Tags': ','.join(snippet.get('tags', [])),
        # Lets me see whether the length of the video matters for popularity
        'Duration': content_details.get('duration', ''),
        # Similar to movie ratings
        'Content Rating': content_details.get('contentRating', ''),
        # A subset of content rating
        'YT Rating': content_details.get('contentRating', {}).get('ytRating', ''),
        # YouTube has a kids app; this decides whether the video is on it
        'Made For Kids': status.get('madeForKids', ''),
        # This generally was empty
        'Self Declared Made For Kids': status.get('selfDeclaredMadeForKids', ''),
        # The main thing I am looking for, views
        'View Count': statistics.get('viewCount', ''),
        # Thumbs-up clicks; supposedly feeds the algorithm and trending page
        'Like Count': statistics.get('likeCount', ''),
        # Tells me whether the video was streamed / premiered live
        'Stream Start Time': live_streaming_details.get('actualStartTime', ''),
        # Helps me figure out whether the YouTube Premiere function was used
        'Stream End Time': live_streaming_details.get('actualEndTime', ''),
        # When the video was created in UTC-0
        'Published At': snippet.get('publishedAt', ''),
        'Video Title': snippet.get('title', ''),
        'Video Description': snippet.get('description', ''),
        # What quality the video is
        'Video Definition': content_details.get('definition', ''),
        # Whether captions are available
        'Video Captions': content_details.get('caption', ''),
        # Whether the video is licensed https://support.google.com/youtube/answer/2797468?hl=en
        'Video Licensed': content_details.get('licensedContent', ''),
        # Normal video or 360 video
        'Video Projection': content_details.get('projection', ''),
        # 2d or 3d
        'Video Dimension': content_details.get('dimension', ''),
    }


# Main function to process batches synchronously
def process_batches():
    """Fetch details for every video ID in batches and append them to the CSV.

    Reads the module-level ``video_ids_all``, ``batch_size``, ``num_batches``
    and ``video_info_file_path``. For each batch it requests the video details,
    keeps only the videos that have no stream start time (i.e. were not
    streamed), and appends those rows to ``video_info_file_path``. A batch
    whose request fails is skipped. Progress is printed after every batch.
    """
    import os  # local import: only needed for the header check below

    # Materialize the set once; converting it inside the loop would copy all
    # IDs again for every batch.
    all_video_ids = list(video_ids_all)
    total_videos = len(all_video_ids)

    for batch_index in range(num_batches):
        start_index = batch_index * batch_size
        end_index = min(start_index + batch_size, total_videos)

        # Comma-separated IDs for the current batch
        video_id_string_batch = ','.join(all_video_ids[start_index:end_index])

        # Construct URL for batch request
        url = f"https://yt.lemnoslife.com/noKey/videos?part=snippet,contentDetails,statistics,liveStreamingDetails,status&id={video_id_string_batch}"

        # Fetch data from URL synchronously with retries
        data = fetch_data_with_retries(url)
        if data is not None and 'items' in data:
            # Keep a row only if Stream Start Time is empty. The field falls
            # back to '' when absent, so test for emptiness rather than
            # `is None` (which would never match and drop every video).
            batch_rows = [
                row
                for row in map(_build_video_row, data['items'])
                if not row['Stream Start Time']
            ]

            if batch_rows:
                # Write the header only when the output file is new or empty,
                # so a failed first batch or a re-run in append mode neither
                # loses nor duplicates it.
                write_header = (
                    not os.path.exists(video_info_file_path)
                    or os.path.getsize(video_info_file_path) == 0
                )
                pd.DataFrame(batch_rows).to_csv(
                    video_info_file_path, mode='a', index=False, header=write_header
                )

        # Videos still to be processed after this batch (never negative)
        remaining_videos = max(total_videos - ((batch_index + 1) * batch_size), 0)

        # Print remaining videos
        print(f"Batch {batch_index + 1} completed. {remaining_videos} videos remaining.")

# Run every batch and report how long the whole run took (wall-clock seconds)
start_time = time.time()
process_batches()
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"Total processing time: {elapsed_seconds} seconds.")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Batch 1 completed. 226276 videos remaining.
Batch 2 completed. 226226 videos remaining.
Batch 3 completed. 226176 videos remaining.


KeyboardInterrupt: 