In [9]:
# api_key api_key = "AIzaSyAlFcC7d6_JfcskrdnP7eQt2yOe4Esln44"
# search_word: "search word"
# Video Categroies: Education ("27"), Science & Technologie ("28"), People & Blogs ("22"), Howto & Style ("26")
# published: "YYYY-MM-DD"
# location: "latitude, longitute"
# radius: "miles"

def youtube_search(api_key, search_word, category_ids, published_after, published_before, location, radius):

    from googleapiclient.discovery import build
    import pandas as pd
    from IPython.display import JSON
    from pprint import pprint
    
    api_key = api_key
    
    api_service_name = "youtube"
    api_version = "v3"
        
    # API client   
    youtube = build(
        api_service_name, api_version, developerKey=api_key)
             
    category_ids = category_ids
    published_after = published_after  # Start of time range
    published_before = published_before  # End of time range


    # Calling youtube.search function
    all_results = []  
    
    for category_id in category_ids:
        # Initial request for each category
        request = youtube.search().list(
            part="snippet",
            location = location,
            locationRadius = radius,
            type="video",
            order="date",
            maxResults=50,
            q=search_word,  
            videoCategoryId=category_id,  
            publishedAfter=published_after,
            publishedBefore=published_before
        )
        response = request.execute()
        all_results.extend(response['items'])  
    
        # Pagination
        while 'nextPageToken' in response:
            request = youtube.search().list(
                part="snippet",
                location = "38.399588, -77.795914",
                locationRadius = "500mi",
                type="video",
                order="date",
                maxResults=50,
                q="health",
                videoCategoryId=category_id,
                publishedAfter=published_after,
                publishedBefore=published_before,
                pageToken=response['nextPageToken']  # Use the next page token
            )
            response = request.execute()
            all_results.extend(response['items'])  # Add paginated results to the list
    
    # Extracting videoIDs
    all_ids = [entry['id']['videoId'] for entry in all_results]

    # Iterating through the 'all_vids' list in steps of 50
    split_vids = []
    for i in range(0, len(all_ids), 50):
        split_vids.append(all_ids[i:i + 50])


    # Calling youtube.video function, looping through each list of video IDs
    responses = []
    for video_ids in split_vids:
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=video_ids
        )
        response = request.execute()
        responses.append(response)

    # Lists to store extracted data
    channel_title_list = []
    title_list = []
    published_at_list = []
    tags_list = []
    like_count_list = []
    view_count_list = []
    comment_count_list = []
    
    # Looping through each response in the list of responses
    for response in responses:
        # Accessing the 'items' in each response
        for item in response['items']:
            # Extracting items out of JSON
            channel_title = item['snippet']['channelTitle']
            title = item['snippet']['title']
            published_at = item['snippet']['publishedAt']
            tags = item['snippet'].get('tags', [])  # Use .get to handle missing tags
    
            # Appending the items into lists
            channel_title_list.append(channel_title)
            title_list.append(title)
            published_at_list.append(published_at)
            tags_list.append(tags)
    
            # Extracting statistics
            statistics = item.get('statistics', {})
            like_count = statistics.get('likeCount', 'N/A')  # Use .get to handle missing keys
            view_count = statistics.get('viewCount', 'N/A')  # Use .get to handle missing keys
            comment_count = statistics.get('commentCount', 'N/A')  # Use .get to handle missing keys
    
            # Appending statistics into lists
            like_count_list.append(like_count)
            view_count_list.append(view_count)
            comment_count_list.append(comment_count)

    # like count and view count ratio
    # ratio between 0 and 1, the higer the better the ratio, if > 0 likes are mor than views 
    # equation = 1 - ((view_count - like_count) / view_count)
    # basically the percentage of viewers that liked the video
    
    view_count_list_int = [float(count) if count != 'N/A' else 0 for count in view_count_list]
    like_count_list_int = [float(count) if count != 'N/A' else 0 for count in like_count_list]
    view_like_ratios = []
    
    for view_count, like_count in zip(view_count_list_int, like_count_list_int):
        if view_count != 0:
            view_like_ratio = (1 - ((view_count - like_count) / view_count))
        else:
            view_like_ratio = 0
        view_like_ratios.append(view_like_ratio)

    view_like_ratios_round = [round(num, 2) for num in view_like_ratios]
    
   # Creating dictionary and pd table
    data = {
        'title': title_list,
        'channel_title': channel_title_list,
        'like_count': like_count_list,
        'view_count': view_count_list,
        'comment_count': comment_count_list,
        'view_like_ratio': view_like_ratios_round,
        'published_at': published_at_list
    }
    
    df = pd.DataFrame(data)
    df = df.sort_values(by='published_at', ascending=True)
    
    return df



In [15]:
def data_cleaning(df):

    import pandas as pd
    
    video_table['published_at'] = pd.to_datetime(video_table['published_at'])
    video_table['view_like_ratio'] = pd.to_numeric(video_table['view_like_ratio'], errors='coerce')
    video_table['view_count'] = pd.to_numeric(video_table['view_count'], errors='coerce')
    video_table['comment_count'] = pd.to_numeric(video_table['comment_count'], errors='coerce')
    
    # Extract year and month
    video_table['year_month'] = video_table['published_at'].dt.to_period('M')  # This gives 'YYYY-MM' format
    
    # Group by year and month, then summarize columns using sum, mean, etc.
    summary = video_table.groupby('year_month').agg({
        'title': 'count',
        'view_count': 'sum',
        'view_like_ratio': 'mean',
        'comment_count': 'sum',
    })
    
    return summary

In [13]:
#def youtube_search(api_key, search_word, category_ids, published_after, published_before, location, radius)


video_table = youtube_search("AIzaSyDuZoyV4OxjjEp6f62_-GITNZRY33hm17U","Gesundheit",['28', '22', '26', '27'],"2020-01-01T00:00:00Z","2023-09-03T00:00:00Z","51.326350,10.623423","500mi")

video_table

Unnamed: 0,title,channel_title,like_count,view_count,comment_count,view_like_ratio,published_at
500,UNC Lenoir Health Care Heart Failure Program,Jim Ware,0,20,0,0.00,2020-06-30T22:08:29Z
499,YoungDo Smart Bluetooth Scale Step Your Health...,Mr.Noir'sReviews,29,3479,19,0.01,2020-07-01T17:15:56Z
498,"Sight + Sound Bites: Untreated Hearing Loss, S...",Eye & Ear Foundation of Pittsburgh,,126,,0.00,2020-07-06T13:36:49Z
497,"Wealth Healing Gemstone Collection ""ABUNDANCE,...",Cosmic Cuts Crystals,7,938,2,0.01,2020-07-08T23:14:18Z
496,Amazonite COURAGE HEALTH & GOOD LUCK Healing G...,Cosmic Cuts Crystals,17,726,4,0.02,2020-07-16T21:10:58Z
...,...,...,...,...,...,...,...
504,Marinas Massage Garage! Gesundheit und Kosmeti...,Marina Holtermann,0,5,0,0.00,2023-08-31T17:38:15Z
503,Gesundheitswissen #faktencheck #gesundheit #my...,Annelina Waller,29,2914,1,0.01,2023-09-01T14:57:02Z
502,PTA gesucht.Werde Teil unseres Teams! #pta #ge...,Bihlplatz-Apotheke,5,95,0,0.05,2023-09-01T16:48:55Z
501,Wieviel RedBull ok?🤯#gesundheit #gesund #trink...,Olga Dreams,3,203,0,0.01,2023-09-02T12:44:09Z


In [16]:
data_cleaning(video_table)

  video_table['year_month'] = video_table['published_at'].dt.to_period('M')  # This gives 'YYYY-MM' format


Unnamed: 0_level_0,title,view_count,view_like_ratio,comment_count
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06,1,20,0.0,0.0
2020-07,8,12340,0.01375,30.0
2020-08,8,12506,0.0125,80.0
2020-09,5,12734,0.042,0.0
2020-10,15,6557,0.018,29.0
2020-11,24,224532,0.017917,151.0
2020-12,12,20228,0.035,64.0
2021-01,18,427391,0.062222,1181.0
2021-02,19,14456,0.026842,45.0
2021-03,17,220225,0.027059,643.0
