In [None]:
#@title Installing Packages
# Installs the `mysqlclient` distribution (the MySQL driver imported as `MySQLdb`
# in the next cell) into the notebook runtime.
from IPython.display import clear_output
!pip install mysqlclient
# Remove the pip installation log from the cell output once the install finishes.
clear_output()

In [None]:
#@title Importing Packages and UDFs

import csv
import os
import googleapiclient.discovery
from googleapiclient.errors import HttpError
import time
import pandas as pd
import datetime
import MySQLdb
from google.colab import files

def get_cnx():
    """Open a new MySQL connection for the scraper's bookkeeping database.

    Connection settings are read from the environment variables
    ``MYSQL_HOST``, ``MYSQL_USER``, ``MYSQL_PASSWORD`` and ``MYSQL_DB`` so that
    credentials do not have to live in the notebook. When a variable is not
    set, the value previously hard-coded here is used, so existing setups keep
    working unchanged.

    Returns:
        The connection object returned by ``MySQLdb.connect``. The caller owns
        it and is responsible for closing it.

    Raises:
        MySQLdb.Error: propagated from ``MySQLdb.connect`` when the connection
            cannot be established.
    """
    return MySQLdb.connect(
        host=os.environ.get('MYSQL_HOST', '**********'),
        user=os.environ.get('MYSQL_USER', '*********'),
        password=os.environ.get('MYSQL_PASSWORD', '*******@'),
        db=os.environ.get('MYSQL_DB', '*******'),
    )

def insert_records(cnx, query, records):
    """Run ``query`` once per entry of ``records`` and commit the result.

    If the supplied connection raises ``MySQLdb.OperationalError`` (for
    example because it went stale during a long scrape), the write is retried
    exactly once on a fresh connection obtained from ``get_cnx()``.

    Args:
        cnx: An open MySQLdb connection. It is owned by the caller and is
            never closed here.
        query: Parameterised SQL statement passed to ``cursor.executemany``.
        records: Iterable of parameter sets, one per execution of ``query``.

    Raises:
        MySQLdb.OperationalError: if the retry on the fresh connection also
            fails.
    """
    # Materialise once so a one-shot iterable is still intact for the retry.
    rows = list(records)

    def _write(connection):
        # Execute and commit on ``connection``, always releasing the cursor.
        cursor = connection.cursor()
        try:
            cursor.executemany(query, rows)
            connection.commit()
        finally:
            cursor.close()

    try:
        _write(cnx)
    except MySQLdb.OperationalError:
        # The replacement connection is local to this function, so nobody else
        # can close it: release it here instead of leaking it.
        replacement = get_cnx()
        try:
            _write(replacement)
        finally:
            replacement.close()

class YoutubeScraper:
    """Searches YouTube for a keyword and records matching videos in a CSV file.

    The search is split by region and by publication-date window, and rotates
    through a list of API keys when the API answers with HTTP 403. Video IDs
    already stored in the ``urls`` database table are skipped.
    """

    # Region codes tried in order. The United Kingdom's ISO 3166-1 alpha-2
    # code is "GB"; "UK" is not a valid code for the regionCode parameter.
    REGIONS = ('IN', 'US', 'AU', 'GB', 'CA', 'NZ', 'IE')

    # First year covered by the publication-date windows.
    START_YEAR = 2010

    # ((start_month, start_day), (end_month, end_day)) for the three
    # non-overlapping windows each year is split into.
    _YEAR_WINDOWS = (((1, 1), (4, 30)), ((5, 1), (8, 31)), ((9, 1), (12, 31)))

    # Pause after every result page / after switching to the next date window.
    PAGE_DELAY_SECONDS = 2
    WINDOW_DELAY_SECONDS = 8

    # Number of consecutive non-403 API errors tolerated before giving up.
    MAX_CONSECUTIVE_ERRORS = 3

    def build_youtube_client(self, api_keys):
        """Builds the YouTube Data API v3 client object for one API key."""
        return googleapiclient.discovery.build("youtube", "v3", developerKey=api_keys)

    def generate_published_dates(self, now=None):
        """Build the publication-date windows used to partition the search.

        Every year from ``START_YEAR`` up to the current year is split into
        three non-overlapping windows. Windows that start in the future are
        omitted and the window containing today ends today.

        Args:
            now: Optional ``datetime.datetime`` or ``datetime.date`` to treat
                as "today"; defaults to the current local time.

        Returns:
            ``(published_before, published_after)``: two equally long lists of
            ``'YYYY-MM-DD'`` strings, newest window first, where index ``i`` of
            both lists describes the same window.
        """
        if now is None:
            now = datetime.datetime.now()
        today = now.date() if isinstance(now, datetime.datetime) else now

        published_before = []
        published_after = []
        for year in range(self.START_YEAR, today.year + 1):
            for (start_month, start_day), (end_month, end_day) in self._YEAR_WINDOWS:
                window_start = datetime.date(year, start_month, start_day)
                if window_start > today:
                    break
                window_end = min(datetime.date(year, end_month, end_day), today)
                published_after.append(window_start.isoformat())
                published_before.append(window_end.isoformat())

        return published_before[::-1], published_after[::-1]

    def _load_known_video_ids(self):
        """Return the set of video IDs already stored in the ``urls`` table."""
        cnx = get_cnx()
        try:
            cursor = cnx.cursor()
            try:
                cursor.execute("SELECT video_id from urls;")
                return {row[0] for row in cursor.fetchall()}
            finally:
                cursor.close()
        finally:
            cnx.close()

    def _checkpoint(self, videos, saved_count, keyword):
        """Append the rows collected since the last checkpoint to the CSV.

        Returns the new number of rows that have been written, to be passed
        back in as ``saved_count`` on the next call.
        """
        if len(videos) > saved_count:
            self.save_to_csv(videos[saved_count:], keyword)
        return len(videos)

    def retrieve_videos(self, keyword, limit, api_keys_list):
        """Retrieves YouTube videos with the given keyword using the API keys in the list.

        Args:
            keyword: Search query.
            limit: Maximum number of videos to collect.
            api_keys_list: Non-empty sequence of API keys. The next key is
                used whenever the API answers with HTTP 403.

        Returns:
            ``(videos, video_ids)`` where ``videos`` is a list of
            ``[link, title, description, video_id]`` rows (only videos with a
            non-empty description) and ``video_ids`` is the set of all new
            video IDs whose details were fetched.

        Raises:
            IndexError: if ``api_keys_list`` is empty.

        Side effects:
            Rows collected so far are appended to ``<keyword>_videos.csv``
            when an API error occurs or when every region has been searched.
            Callers should still save the returned rows and de-duplicate the
            file afterwards.
        """
        if not api_keys_list:
            raise IndexError("api_keys_list must contain at least one YouTube API key")

        published_Before, published_After = self.generate_published_dates()
        regions = self.REGIONS
        known_ids = self._load_known_video_ids()

        key_index = 0
        region_index = 0
        window_index = 0
        youtube = self.build_youtube_client(api_keys_list[key_index])

        videos = []
        video_ids = set()
        saved_count = 0
        page_num = 0
        page_token = None  # None-valued parameters are omitted from the request
        consecutive_errors = 0

        while len(videos) < limit:
            try:
                search_response = youtube.search().list(
                    part="snippet,id",
                    q=keyword,
                    regionCode=regions[region_index],
                    type="playlist,video",
                    publishedAfter=f'{published_After[window_index]}T00:00:00Z',
                    publishedBefore=f'{published_Before[window_index]}T23:59:59Z',
                    maxResults=min(50, limit - len(videos)),
                    pageToken=page_token,
                ).execute()

                # Playlist results carry no "videoId" and are skipped here.
                new_ids = []
                for search_result in search_response.get("items", []):
                    video_id = search_result.get("id", {}).get("videoId")
                    if (video_id
                            and video_id not in video_ids
                            and video_id not in known_ids
                            and video_id not in new_ids):
                        new_ids.append(video_id)

                # An empty id list would be rejected by the API, so only ask
                # for details when the page produced something new.
                if new_ids:
                    video_details = youtube.videos().list(
                        part="id,snippet",
                        id=",".join(new_ids),
                    ).execute()

                    # Mark IDs as seen only after the details call succeeded,
                    # so a failed call can be retried without losing them.
                    video_ids.update(new_ids)

                    for video_result in video_details.get("items", []):
                        snippet = video_result["snippet"]
                        video_description = snippet["description"]
                        if video_description:
                            video_id = video_result['id']
                            video_link = f"https://www.youtube.com/watch?v={video_id}"
                            videos.append([video_link, snippet["title"], video_description, video_id])

                consecutive_errors = 0
                page_num += 1
                print(f"Scraped page {page_num} ", "videos--> ", len(videos))

                if len(videos) >= limit:
                    print(f"{limit} videos successfully scraped. Exiting...")
                    break

                page_token = search_response.get("nextPageToken")
                if not page_token:
                    print("Current YouTube object has been exhausted. No more pages to search!")
                    page_token = None
                    window_index += 1
                    if window_index >= len(published_Before):
                        # All date windows done for this region: move on.
                        window_index = 0
                        region_index += 1
                        if region_index >= len(regions):
                            print("No More Regions Existing...")
                            saved_count = self._checkpoint(videos, saved_count, keyword)
                            break
                    time.sleep(self.WINDOW_DELAY_SECONDS)

                time.sleep(self.PAGE_DELAY_SECONDS)

            except HttpError as error:
                saved_count = self._checkpoint(videos, saved_count, keyword)
                print(f"Error retrieving video data: {error}")

                if error.resp.status == 403:
                    key_index += 1
                    if key_index >= len(api_keys_list):
                        print("All API keys have been exhausted. Exiting...")
                        break
                    # Report the key's position only; the key itself is a secret.
                    print(f"API has been changed . Now videos are Retrieving with key {key_index + 1} of {len(api_keys_list)}")
                    youtube = self.build_youtube_client(api_keys_list[key_index])
                    continue

                # Any other error is retried a bounded number of times, since
                # repeating the identical request forever cannot make progress.
                consecutive_errors += 1
                if consecutive_errors >= self.MAX_CONSECUTIVE_ERRORS:
                    print("Giving up after repeated API errors. Exiting...")
                    break
                time.sleep(self.PAGE_DELAY_SECONDS)

        return videos, video_ids

    def remove_duplicates(self, keyword):
        """Rewrite ``<keyword>_videos.csv`` without duplicate or incomplete rows.

        Every column is read as text and only empty cells count as missing,
        so titles such as "NA" and numeric-looking video IDs are preserved.

        Returns:
            ``(file_name, row_count)`` for the rewritten file.
        """
        file_name = f"{keyword}_videos.csv"
        df = pd.read_csv(file_name, dtype=str, keep_default_na=False, na_values=[''])
        df = df.drop_duplicates()
        df = df.dropna()
        df.to_csv(file_name, index=False)
        return file_name, df.shape[0]

    def save_to_csv(self, videos, keyword):
        """Append ``videos`` rows to ``<keyword>_videos.csv``.

        The header row is written first when the file does not exist yet or
        exists but is empty.
        """
        file_name = f"{keyword}_videos.csv"
        needs_header = not os.path.exists(file_name) or os.path.getsize(file_name) == 0
        with open(file_name, "a", newline="", encoding="utf-8") as csv_file:
            writer = csv.writer(csv_file)
            if needs_header:
                writer.writerow(["Link", "Title", "Description", "video_id"])
            writer.writerows(videos)

# Define function to scrape YouTube videos
def completeVidoes_scraper(api_keys, keyword, num_videos):
    """Scrape videos for ``keyword``, store them in a CSV and download the file.

    Steps: retrieve up to ``num_videos`` videos, append them to
    ``<keyword>_videos.csv``, de-duplicate that file, record the newly seen
    video IDs in the ``urls`` table, then trigger a Colab download of the CSV.

    Args:
        api_keys: List of YouTube Data API keys handed to the scraper.
        keyword: Search query; also used to name the CSV file.
        num_videos: Maximum number of videos to collect.
    """
    youtube_scraper = YoutubeScraper()

    videos, video_ids = youtube_scraper.retrieve_videos(keyword, num_videos, api_keys)
    youtube_scraper.save_to_csv(videos, keyword)
    filename, total_records = youtube_scraper.remove_duplicates(keyword)

    if video_ids:
        # executemany expects one parameter sequence per row, so wrap each ID
        # in a 1-tuple instead of passing the bare strings of the set.
        id_rows = [(video_id,) for video_id in sorted(video_ids)]
        insert_query = 'INSERT INTO urls (video_id) VALUES(%s);'
        cnx = get_cnx()
        try:
            insert_records(cnx, insert_query, id_rows)
        finally:
            # This connection is opened here, so it is released here too.
            cnx.close()

    print(f"Compeleted! Total of {total_records} URLs have Been Saved in {filename}.\nFile is Downloading...")
    files.download(filename)

In [None]:
#@title Run Function

if __name__ == "__main__":
    # Local import: getpass is only needed by this interactive entry point.
    import getpass

    # Prompt user to enter keyword, number of videos, and the API keys
    keyword = input("Enter Keyword...")
    num_videos = int(input("Enter Number of Videos..."))

    # Suggest using up to 4 API keys for scraping up to 10,000 videos
    if num_videos > 10000:
        print("For scraping more than 10,000 videos, it's recommended to use up to 4 API keys.")

    # getpass keeps the keys out of the saved notebook output. Several keys
    # can be supplied separated by commas; they are used in the given order.
    raw_keys = getpass.getpass("Enter API Keys (comma separated)...")
    api_keys = [key.strip() for key in raw_keys.split(",") if key.strip()]
    if not api_keys:
        raise ValueError("At least one YouTube API key is required.")

    print("Wait, we will update you...")

    completeVidoes_scraper(api_keys, keyword, num_videos)