In [325]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

import pandas as pd
import datetime
import isodate

In [326]:
youtube = build('youtube', 'v3', developerKey=api_key)

In [327]:
#load gz_df
all_df = pd.read_csv('new_youtube_video.csv')

all_df.head()

Unnamed: 0,video_id,title,description,tags,category_id,view_count,like_count,dislike_count,comment_count,duration,...,channel_description,published_at,channel_creation_date,duration_seconds,weighted_views,length_category,upload_day,upload_hour,title_length,description_length
0,2XLnR2HlmBU,Compact Verbot: Faeser massiv unter Druck!,Prof. Vosgerau übt massive Kritik an Faesers C...,[],27,71231,11255,0,953,PT10M32S,...,"Ich erstelle Videos über Dinge, die mich beweg...",2013-11-28T10:40:22Z,2013-11-28T10:40:22Z,,,,,,,
1,SjnWd_j7wZQ,ن سوشل میڈیا کی PTI خاتون MNA کی جعلی گندی وڈی...,Today's Punjabi Vlog: https://www.youtube.com/...,"['shahbaz gill', 'shehbaz gill', 'urdu vlog', ...",25,191438,22657,0,2894,PT25M17S,...,This channel covers News & Current Affairs\n\n...,2019-06-02T20:41:32Z,2019-06-02T20:41:32Z,,,,,,,
2,XFTuRBkb8r8,Thailand Vlog / DAY 7,,[],1,30639,7874,0,147,PT1M1S,...,Hey there!\nMy name is Nirami :) \nI´m gonna t...,2012-07-30T09:27:07Z,2012-07-30T09:27:07Z,,,,,,,
3,ahIN1hCdjeQ,Buying a Gaming PC from Facebook Marketplace I...,Check out the ZimaBlade below!\nZimaBlade Offi...,[],28,43212,7229,0,411,PT11M30S,...,"Hi, I'm Andy! I own a computer repair shop in ...",2014-10-15T21:30:10Z,2014-10-15T21:30:10Z,,,,,,,
4,IM7SROtdvlo,Drivers stuck between L.A. and Las Vegas as I-...,An overturned big rig and resulting hazmat sit...,"['video', 'news']",25,152033,1427,0,657,PT5M2S,...,"KTLA 5 in Los Angeles covers breaking news, we...",2006-06-13T05:19:22Z,2006-06-13T05:19:22Z,,,,,,,


In [328]:
#needed functions
def parse_duration(duration):
    # Parse ISO 8601 duration format and return the duration in seconds
    duration = isodate.parse_duration(duration)
    return duration.total_seconds()


# @retry(retry=retry_if_exception_type(HttpError), wait=wait_exponential(multiplier=1, min=4, max=60), stop=stop_after_attempt(5))
def get_videos_published_last_two_years():
    video_ids = []
    two_years_ago = (datetime.datetime.utcnow() - datetime.timedelta(days=2*365)).isoformat() + 'Z'
    next_page_token = None

    while True:
        request = youtube.search().list(
            part='id,snippet',
            publishedAfter=two_years_ago,
            type='video',
            maxResults=50,
            safeSearch = 'none',
            regionCode = 'IE',  #US, UK, NG, IE, AU, NZ,SA,CA,GH, GB, BE
            q='-shorts', # Excluding videos with 'shorts' in title/description
            relevanceLanguage='en', # Ensuring relevance to English language
            pageToken=next_page_token
        )
        response = request.execute()

        ids_to_check = []
        for item in response['items']:
            ids_to_check.append(item['id']['videoId'])

        if ids_to_check:
            videos_request = youtube.videos().list(
                part='contentDetails',
                id=','.join(ids_to_check)
            )
            videos_response = videos_request.execute()

            for item in videos_response['items']:
                duration = item['contentDetails']['duration']
                # Convert ISO 8601 duration to seconds
                duration_seconds = parse_duration(duration)
                if duration_seconds >= 60:  # Filter out videos less than 60 seconds
                    video_ids.append(item['id'])

        next_page_token = response.get('nextPageToken')

        # If there are no more pages, break out of the loop
        if next_page_token is None:
            break

    return video_ids


#get video details
# @retry(retry=retry_if_exception_type(HttpError), wait=wait_exponential(multiplier=1, min=4, max=60), stop=stop_after_attempt(5))
def get_video_details(video_id):
    request = youtube.videos().list(part="snippet,contentDetails,statistics", id=video_id)
    response = request.execute()
    return response['items'][0] if response['items'] else None

# @retry(retry=retry_if_exception_type(HttpError), wait=wait_exponential(multiplier=1, min=4, max=60), stop=stop_after_attempt(5))
def get_category_name(category_id):
    request = youtube.videoCategories().list(part="snippet", id=category_id)
    response = request.execute()
    return response['items'][0]['snippet']['title'] if response['items'] else None


# @retry(retry=retry_if_exception_type(HttpError), wait=wait_exponential(multiplier=1, min=4, max=60), stop=stop_after_attempt(5))
def get_channel_details(video_id):
    # Get the video details
    video_request = youtube.videos().list(
        part='snippet',
        id=video_id
    )
    video_response = video_request.execute()

    if not video_response['items']:
        print('No video found with the provided ID.')
        return None

    channel_id = video_response['items'][0]['snippet']['channelId']

    # Get the channel details
    channel_request = youtube.channels().list(
        part='snippet,statistics',
        id=channel_id
    )
    channel_response = channel_request.execute()

    if not channel_response['items']:
        print('No channel found for the provided video ID.')
        return None

    channel_details = channel_response['items'][0]
    channel_info = {
        'channel_id': channel_details['id'],
        'channel_name': channel_details['snippet']['title'],
        'subscriber_count': int(channel_details['statistics'].get('subscriberCount', 0)),
        'view_count': int(channel_details['statistics'].get('viewCount', 0)),
        'video_count': int(channel_details['statistics'].get('videoCount', 0)),
        'description': channel_details['snippet']['description'],
        'published_at': channel_details['snippet']['publishedAt']
    }

    return channel_info

In [329]:
# Example video IDs
#video_ids = ['dQw4w9WgXcQ', 'kJQP7kiw5Fk']
video_ids = get_videos_published_last_two_years()

# Collect data for each video
data = []
category_ids = []
for video_id in video_ids:
    details = get_video_details(video_id)
    # channel_info = get_channel_details(video_id)
    if details:
        data.append({
            'video_id': video_id,
            'title': details['snippet']['title'],
            'description': details['snippet']['description'],
            'tags': details['snippet'].get('tags', []),
            'category_id': details['snippet']['categoryId'],
            'view_count': details['statistics'].get('viewCount',0),
            'like_count': details['statistics'].get('likeCount', 0),
            'dislike_count': details['statistics'].get('dislikeCount', 0),
            'comment_count': details['statistics'].get('commentCount', 0),
            'duration': details['contentDetails']['duration'],
            'is_for_kids': details['contentDetails'].get('madeForKids', False),
            'publish_time': details['snippet']['publishedAt']
        })
        category_ids.append(details['snippet']['categoryId'])
# df.to_csv('youtube_data.csv', index=False) 

In [330]:
#youtube channel details 
# - present subscriber count
# - segment - technology & science, lifestyle etc.  - details['snippet']['categoryId'] - but only categoryId
# - for kids or not
# - some information about the thumbnail
#

In [331]:
df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,video_id,title,description,tags,category_id,view_count,like_count,dislike_count,comment_count,duration,is_for_kids,publish_time
0,GDKSL5eFbNA,I Survived 350 Days in FIFA!!!,In today's video I attempt to manage a team to...,"[davidmc, david mc youtube, youtube davidmc, d...",17,3067740,25071,0,648,PT14M47S,False,2022-08-10T14:00:08Z
1,ayq68MKkQaA,"We ADOPTED a GIRL, But My LITTLE BROTHER Gets ...","We ADOPTED a GIRL, But My LITTLE BROTHER Gets ...","[adam b, adamb, we adopted, adopted, adopted p...",24,1110906,15943,0,1690,PT11M15S,False,2022-11-16T18:00:04Z
2,EJyoz_ypkyY,How to Avoid Mobilization in Russia,How to avoid Mobilization in Russia\n#mobiliza...,"[Avoiding russian mobilization, How to avoid M...",23,1360805,45210,0,4393,PT2M7S,False,2022-10-20T07:00:10Z
3,15-RlewrpDs,｢ GCMV 」House of Memories | TW,Part 1: https://www.youtube.com/watch?v=M7bxVB...,"[Gacha, Oc, Gacha Club, Illustration, Art, Dra...",1,890675,28377,0,513,PT4M20S,False,2023-05-26T12:43:25Z
4,CJNzSV-_Gzg,RAIN - ANGEL X HUSK - (Hazbin Hotel Comic Dub),⭐️⭐️⭐️ SUPPORT US ON PATREON FOR EXCLUSIVE CON...,"[hazfans unlimited, hazfans, Helluva Boss, Haz...",1,304142,9104,0,110,PT4M33S,False,2022-09-16T19:00:08Z
5,HKPphqh9CPk,This Lawyer Dated Her Client... and HELPED Him...,Barry Titus and Keegan Harroz fell in love in ...,"[barry titus, keegan harroz, strange, creepy, ...",24,1429820,64097,0,5586,PT20M6S,False,2022-08-26T19:00:10Z
6,wGm0deLFPrk,What Fills Your Heart?,What Fills Your Heart with the Derry Girls’ Sa...,"[Visit Ireland, Ireland, Northern Ireland, The...",19,660883,532,0,46,PT1M55S,False,2023-03-20T14:50:57Z
7,E09dBXzLTyw,seriously... WTF are you doing to your bodies?!,wtf are these women (and men.. Elon i'm lookin...,"[ozempic, kim kardashian weight loss, kim kard...",26,1194056,40107,0,2582,PT12M19S,False,2023-03-09T19:00:04Z
8,0DfS0f_eEBI,We GROUNDED him on holiday...,We GROUNDED him on holiday...\n\nOrder my DEBU...,"[adam b, adamb, grounded, grounded prank, adam...",24,514115,7816,0,494,PT12M32S,False,2023-03-01T18:00:00Z
9,1pWmHeikZxQ,Not a Toy: World's Scariest Aircraft | Last M...,There's no aircraft in the world quite like th...,"[messerschmitt, komet, footage, joseph pohs, a...",24,3477936,65829,0,3278,PT11M29S,False,2023-05-12T20:00:02Z


In [332]:
more_data = []
for video_id,category_id in zip(video_ids,category_ids):
    #details = get_video_details(video_id)
    category_name = get_category_name(category_id)
    channel_info = get_channel_details(video_id)
    if details:
        more_data.append({
            'category_name': category_name,
            'video_id': video_id,
            'channel_id': channel_info['channel_id'],
            'channel_name': channel_info['channel_name'],
            'channel_view_count': channel_info.get('viewCount', 0) if channel_info else None,
            'subscriber_count': channel_info['subscriber_count'],
            'channel_video_count':channel_info['video_count'],
            'channel_description': channel_info['description'],
            'published_at': channel_info['published_at'],
            'channel_creation_date': channel_info['published_at'] if channel_info else None,
            
        })

In [333]:
other_df = pd.DataFrame(more_data)
other_df.head()

Unnamed: 0,category_name,video_id,channel_id,channel_name,channel_view_count,subscriber_count,channel_video_count,channel_description,published_at,channel_creation_date
0,Sports,GDKSL5eFbNA,UCOici5uLklRoOciuPM7Rs4w,DavidMC,0,605000,153,I like playing football and filming it lol\n\n...,2016-07-17T21:36:14Z,2016-07-17T21:36:14Z
1,Entertainment,ayq68MKkQaA,UC4_Ssc1v_P5SrrFjbVG821Q,Adam B,0,4220000,991,"Hey everyone, it’s Adam B! \n\nYou can catch m...",2012-07-29T17:37:53Z,2012-07-29T17:37:53Z
2,Comedy,EJyoz_ypkyY,UCzb-6smlTg5UPirLdsdQ_cQ,Foil Arms and Hog,0,983000,622,"FOIL, ARMS AND HOG!\n\nWe are Sean Finegan (Fo...",2008-10-26T11:57:30Z,2008-10-26T11:57:30Z
3,Film & Animation,15-RlewrpDs,UCYU0PXd3_k9ckF7ZuhmlSxQ,KC SafireWolf,0,444000,61,"Hi, I'm KC SafireWolf! \nI make Gacha=\n☞︎ Mus...",2019-04-27T07:03:44Z,2019-04-27T07:03:44Z
4,Film & Animation,CJNzSV-_Gzg,UCAgj5t73HmhMp3jCwfblH3Q,Hazfans Unlimited,0,92000,594,❤️FOLLOW US:❤️\n🧡 DISCORD (+18): https://disco...,2022-05-01T14:42:02.660275Z,2022-05-01T14:42:02.660275Z


In [334]:
#merge the 2 dataframes together
gz_df = pd.merge(df, other_df, on="video_id")

gz_df.head()

Unnamed: 0,video_id,title,description,tags,category_id,view_count,like_count,dislike_count,comment_count,duration,...,publish_time,category_name,channel_id,channel_name,channel_view_count,subscriber_count,channel_video_count,channel_description,published_at,channel_creation_date
0,GDKSL5eFbNA,I Survived 350 Days in FIFA!!!,In today's video I attempt to manage a team to...,"[davidmc, david mc youtube, youtube davidmc, d...",17,3067740,25071,0,648,PT14M47S,...,2022-08-10T14:00:08Z,Sports,UCOici5uLklRoOciuPM7Rs4w,DavidMC,0,605000,153,I like playing football and filming it lol\n\n...,2016-07-17T21:36:14Z,2016-07-17T21:36:14Z
1,ayq68MKkQaA,"We ADOPTED a GIRL, But My LITTLE BROTHER Gets ...","We ADOPTED a GIRL, But My LITTLE BROTHER Gets ...","[adam b, adamb, we adopted, adopted, adopted p...",24,1110906,15943,0,1690,PT11M15S,...,2022-11-16T18:00:04Z,Entertainment,UC4_Ssc1v_P5SrrFjbVG821Q,Adam B,0,4220000,991,"Hey everyone, it’s Adam B! \n\nYou can catch m...",2012-07-29T17:37:53Z,2012-07-29T17:37:53Z
2,EJyoz_ypkyY,How to Avoid Mobilization in Russia,How to avoid Mobilization in Russia\n#mobiliza...,"[Avoiding russian mobilization, How to avoid M...",23,1360805,45210,0,4393,PT2M7S,...,2022-10-20T07:00:10Z,Comedy,UCzb-6smlTg5UPirLdsdQ_cQ,Foil Arms and Hog,0,983000,622,"FOIL, ARMS AND HOG!\n\nWe are Sean Finegan (Fo...",2008-10-26T11:57:30Z,2008-10-26T11:57:30Z
3,15-RlewrpDs,｢ GCMV 」House of Memories | TW,Part 1: https://www.youtube.com/watch?v=M7bxVB...,"[Gacha, Oc, Gacha Club, Illustration, Art, Dra...",1,890675,28377,0,513,PT4M20S,...,2023-05-26T12:43:25Z,Film & Animation,UCYU0PXd3_k9ckF7ZuhmlSxQ,KC SafireWolf,0,444000,61,"Hi, I'm KC SafireWolf! \nI make Gacha=\n☞︎ Mus...",2019-04-27T07:03:44Z,2019-04-27T07:03:44Z
4,CJNzSV-_Gzg,RAIN - ANGEL X HUSK - (Hazbin Hotel Comic Dub),⭐️⭐️⭐️ SUPPORT US ON PATREON FOR EXCLUSIVE CON...,"[hazfans unlimited, hazfans, Helluva Boss, Haz...",1,304142,9104,0,110,PT4M33S,...,2022-09-16T19:00:08Z,Film & Animation,UCAgj5t73HmhMp3jCwfblH3Q,Hazfans Unlimited,0,92000,594,❤️FOLLOW US:❤️\n🧡 DISCORD (+18): https://disco...,2022-05-01T14:42:02.660275Z,2022-05-01T14:42:02.660275Z


In [335]:
#concat more and more to gz_df
all_df = pd.concat([all_df,gz_df])

In [336]:
#save gz_df to disk
all_df.to_csv('new_youtube_video.csv',index=False)

In [337]:
# def main():
#     video_ids = ['YOUR_VIDEO_IDS']  # Replace with your list of video IDs
#     try:
#         data = collect_video_data(video_ids)
#         print(f"Collected data for {len(data)} videos.")
#         for video_data in data:
#             print(video_data)
#     except HttpError as e:
#         print(f"An error occurred: {e}")

# if __name__ == '__main__':
#     main()


In [338]:
all_df.shape

(14863, 28)

In [339]:
all_df.video_id.nunique()

8805