In [1]:
# Libraries Required
import pandas as pd
from googleapiclient.discovery import build
import matplotlib.pyplot as plt
import seaborn as sns
import isodate
import csv

In [2]:
# Youtube V3 API key
API_KEY = 'AIzaSyBCQY_bqe2-2Gh6NUXzIITeEr1JQZg9C9U'

In [3]:
# Function to get the trending videos of 5 specific regions AU(Australia), CA(Canada), GB(United Kingdom), IN(India), US(United States) and saving then in CSV files
def get_trending_videos(api_key, region_codes=['AU', 'CA', 'GB', 'IN', 'US'], max_results=200):
    # Initialize an empty dictionary to store videos by region
    videos_by_region = {region: [] for region in region_codes}

    # Youtube API connection build object
    youtube = build('youtube', 'v3', developerKey=api_key)

    for youtube_region_code in region_codes:
        # Getting youtube categories Names
        categories_response = youtube.videoCategories().list(part='snippet', regionCode=youtube_region_code).execute()
        category_map = {item['id']: item['snippet']['title'] for item in categories_response.get('items', [])}

        # Fetching the most popular videos
        request = youtube.videos().list(
            part='snippet,contentDetails,statistics',
            chart='mostPopular',
            regionCode=youtube_region_code,
            maxResults=50
        )

        # Paginating through the results getting the details and storing them in the list
        while request and len(videos_by_region[youtube_region_code]) < max_results:
            response = request.execute()
            for item in response['items']:
                video_details = {
                    'video_id': item['id'],
                    'title': item['snippet']['title'],
                    'description': item['snippet']['description'],
                    'published_at': item['snippet']['publishedAt'],
                    'channel_id': item['snippet']['channelId'],
                    'channel_title': item['snippet']['channelTitle'],
                    'category_id': item['snippet']['categoryId'],
                    'category_name': category_map.get(item['snippet']['categoryId'], 'Unknown'),
                    'tags': item['snippet'].get('tags', []),
                    'duration': item['contentDetails']['duration'],
                    'definition': item['contentDetails']['definition'],
                    'caption': item['contentDetails'].get('caption', 'false'),
                    'view_count': item['statistics'].get('viewCount', 0),
                    'like_count': item['statistics'].get('likeCount', 0),
                    'dislike_count': item['statistics'].get('dislikeCount', 0),
                    'favorite_count': item['statistics'].get('favoriteCount', 0),
                    'comment_count': item['statistics'].get('commentCount', 0)
                }
                videos_by_region[youtube_region_code].append(video_details)

            # Getting the next page token
            request = youtube.videos().list_next(request, response)

        # Write the data to a CSV file
        filename = f"trending_videos_{youtube_region_code}.csv"
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = video_details.keys()
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(videos_by_region[youtube_region_code])

    # Returning the videos by region
    return videos_by_region

# Calling the function to get the trending videos
trending_videos = get_trending_videos(API_KEY)

In [4]:
# Reading the CSV file for AU(Australia) region
trending_videos_AU = pd.read_csv('trending_videos_AU.csv')
trending_videos_AU.head()

Unnamed: 0,video_id,title,description,published_at,channel_id,channel_title,category_id,category_name,tags,duration,definition,caption,view_count,like_count,dislike_count,favorite_count,comment_count
0,wxZP0bdXYUU,Robert Whittaker knocks out Ikram Aliskerov in...,Watch as Robert Whittaker knocks out Ikram Ali...,2024-06-22T21:44:48Z,UCO4AcsPKEkIqDmbeiZLfd1A,ESPN MMA,17,Sports,"['Alexander Volkov', 'Sergey Pavlovich', 'robe...",PT1M3S,hd,False,209500,4426,0,0,829
1,U0Mq3mJdwh4,MEGA BOXES ARE BACK!!!,This episode of Brawl Talk brings back the mos...,2024-06-22T15:00:18Z,UCooVYzDxdwTtGYAkcPmOgOw,Brawl Stars,20,Gaming,"['brawl stars', 'supercell game', 'megabox', '...",PT8M53S,hd,True,23946146,1282935,0,0,163795
2,Sfpr_S8nVLA,My soccer shoe breaks world records and my shi...,Get a free 14-day trial of Odoo's all-in-one b...,2024-06-22T15:00:22Z,UCJLZe_NoiG0hT7QCX_9vmqw,I did a thing,24,Entertainment,[],PT21M30S,hd,False,1763660,103764,0,0,3980
3,Z7B7PpTOpDE,Aussie Tries American Fast Food for the First ...,Join me as I try American fast food for the fi...,2024-06-23T07:30:15Z,UCecAIXPb5KTJz5BFnUzlTaA,Spanian,24,Entertainment,"['Spanian', 'action bronson', 'american', 'bes...",PT40M38S,hd,False,235263,8382,0,0,1700
4,bmzFk5-TT3w,Hiring a DJ for a Bikies Funeral PRANK,Thanks to Danny Rant's & Billboard for helping...,2024-06-22T21:00:24Z,UCEpHkpv4_CgZIEadjjOv4jA,Misfit Minds,22,People & Blogs,[],PT12M44S,hd,False,119239,7574,0,0,365


In [5]:
# Reading the csv file for CA(Canada) region
trending_videos_CA = pd.read_csv('trending_videos_CA.csv')
trending_videos_CA.head()

Unnamed: 0,video_id,title,description,published_at,channel_id,channel_title,category_id,category_name,tags,duration,definition,caption,view_count,like_count,dislike_count,favorite_count,comment_count
0,U0Mq3mJdwh4,MEGA BOXES ARE BACK!!!,This episode of Brawl Talk brings back the mos...,2024-06-22T15:00:18Z,UCooVYzDxdwTtGYAkcPmOgOw,Brawl Stars,20,Gaming,"['brawl stars', 'supercell game', 'megabox', '...",PT8M53S,hd,True,23946146,1282943,0,0,163795
1,luWWiJWergk,Gm 6: Panthers @ Oilers 6/21 | NHL Highlights ...,Extended highlights of the Florida Panthers at...,2024-06-22T03:19:07Z,UCqFMzb-4AUf6WAIbl132QKA,NHL,17,Sports,"['Edmonton Oilers', 'Edmonton Oilers vs. Flori...",PT9M,hd,True,577577,6750,0,0,1495
2,q8mC7u028Mw,I TURNED MY HOUSE INTO A WATERPARK,Are you Team Salish or Team Jordan? VOTE FOR U...,2024-06-22T14:00:08Z,UCKaCalz5N5ienIbfPzEbYuA,Jordan Matter,24,Entertainment,"['jordan matter', 'challenge', 'ben azelart', ...",PT25M23S,hd,False,5400839,91938,0,0,16055
3,0psrAMafXVg,AMP OPENS A PAWN SHOP,Get AMP streetwear https://amp.shop ‚ö°Ô∏è‚ö°Ô∏è\n\nSp...,2024-06-22T19:37:39Z,UCJbYdyufHR-cxOuY96KIoqA,AMP,24,Entertainment,"['AMP PAWN SHOP', 'AMP PAWN STARS', 'AMP RUNS ...",PT34M50S,hd,False,1184967,55751,0,0,2226
4,KR4i1Q-gBZA,This One Mistake Won The Celtics An NBA Champi...,Don‚Äôt miss out on all the action this week at ...,2024-06-23T02:11:32Z,UC3L9XPe0_FGfRG-CMGtBvFg,JxmyHighroller,17,Sports,"['LeBron James', 'House of Highlights', 'ESPN'...",PT16M39S,hd,False,554495,33522,0,0,1505


In [6]:
# Reading the csv file for GB(United Kingdom) region
trending_videos_GB = pd.read_csv('trending_videos_GB.csv')
trending_videos_GB.head()

Unnamed: 0,video_id,title,description,published_at,channel_id,channel_title,category_id,category_name,tags,duration,definition,caption,view_count,like_count,dislike_count,favorite_count,comment_count
0,ojUQLuSqV4s,Race Highlights | 2024 Spanish Grand Prix,"Relive all the best moments from Barcelona, in...",2024-06-23T15:22:49Z,UCB_qr75-ydFVKSF9Dmo6izg,FORMULA 1,17,Sports,"['F1', 'Formula One', 'Formula 1', 'Sports', '...",PT8M1S,hd,False,1533783,49608,0,0,2496
1,U0Mq3mJdwh4,MEGA BOXES ARE BACK!!!,This episode of Brawl Talk brings back the mos...,2024-06-22T15:00:18Z,UCooVYzDxdwTtGYAkcPmOgOw,Brawl Stars,20,Gaming,"['brawl stars', 'supercell game', 'megabox', '...",PT8M53S,hd,True,23946146,1282943,0,0,163795
2,q8mC7u028Mw,I TURNED MY HOUSE INTO A WATERPARK,Are you Team Salish or Team Jordan? VOTE FOR U...,2024-06-22T14:00:08Z,UCKaCalz5N5ienIbfPzEbYuA,Jordan Matter,24,Entertainment,"['jordan matter', 'challenge', 'ben azelart', ...",PT25M23S,hd,False,5400839,91938,0,0,16055
3,5QAwod2MMxU,$500 Cheap Car Race VS Mat Armstrong!,Get 'Blue Monday (2020 Digital Master)' by New...,2024-06-22T18:00:10Z,UC9-3c4LzdzT_HvW3Xuti9wg,Calfreezy,24,Entertainment,"['Soccer', 'Football', 'Calfreezy', 'Rebel', '...",PT35M35S,hd,False,241742,12241,0,0,365
4,CE5yG3PnoaY,SUPER EIGHT | England score eight tries to def...,Relive the action as England score eight tries...,2024-06-22T09:01:53Z,UCmi7CahP3G3YySOAFOfSnkw,England Rugby,17,Sports,"['England', 'Rugby', 'England Rugby', 'RFUTV']",PT7M35S,hd,False,245745,1459,0,0,335


In [7]:
# Reading the csv file for IN(India) region
trending_videos_IN = pd.read_csv('trending_videos_IN.csv')
trending_videos_IN.head()

Unnamed: 0,video_id,title,description,published_at,channel_id,channel_title,category_id,category_name,tags,duration,definition,caption,view_count,like_count,dislike_count,favorite_count,comment_count
0,nJDclWEjGPA,#TheGOATBdayShots | Thalapathy Vijay | Venkat ...,A special GOATbdayshots from the film The Grea...,2024-06-21T18:31:00Z,UC9WXzTgk10ncJX1eOxHElCg,AGS Entertainment,1,Film & Animation,"['Thalapathy Vijay', 'Thalapathy Vijay birthda...",PT51S,hd,False,8220425,502781,0,0,11972
1,4P_k0rqmyX8,Chinna Chinna Kangal (Lyrical) | The Greatest ...,Get ready to melt in this soul stirring melody...,2024-06-22T12:02:01Z,UCq-Fj5jknLsUf-MWSy4_brA,T-Series,10,Music,"['hindi songs 2024', 'hindi songs new', 'bolly...",PT4M42S,hd,False,5410599,548069,0,0,19135
2,SFiZRQZII1g,Living 24 Hours In Space Capsule : Challenge ü§Ø,SUBSCRIBE: https://youtube.com/MRINDIANHACKER?...,2024-06-22T10:28:03Z,UCSiDGb0MnHFGjs4E2WKvShw,MR. INDIAN HACKER,28,Science & Technology,[],PT27M29S,hd,False,2865475,295688,0,0,9366
3,a5XAO2vu8lY,The GOAT | Second Single Promo | Thalapathy V...,Get ready to melt in this soul stirring melody...,2024-06-21T13:00:43Z,UCAEv0ANkT221wXsTnxFnBsQ,T-Series Tamil,10,Music,"['Tamil Songs', 'Latest Tamil Songs', 'Tamil L...",PT29S,hd,False,1401019,105530,0,0,1968
4,lUrndDV5Djs,Fight On Equal Ration | Bigg Boss OTT 3 | Now ...,Vegetarians versus non-vegetarians in the hous...,2024-06-22T17:15:50Z,UC8To9CFsZzvPafxMLzS08iA,JioCinema,24,Entertainment,[],PT1M1S,hd,False,1026239,14056,0,0,1153


In [8]:
# Reading the csv file for US(United States) region
trending_videos_US = pd.read_csv('trending_videos_US.csv')
trending_videos_US.head()

Unnamed: 0,video_id,title,description,published_at,channel_id,channel_title,category_id,category_name,tags,duration,definition,caption,view_count,like_count,dislike_count,favorite_count,comment_count
0,U0Mq3mJdwh4,MEGA BOXES ARE BACK!!!,This episode of Brawl Talk brings back the mos...,2024-06-22T15:00:18Z,UCooVYzDxdwTtGYAkcPmOgOw,Brawl Stars,20,Gaming,"['brawl stars', 'supercell game', 'megabox', '...",PT8M53S,hd,True,23946146,1282943,0,0,163795
1,qReN9SFd35o,SHA'CARRI TO PARIS: Richardson SCORCHES 100m T...,"Sha'Carri Richardson, the world's fastest woma...",2024-06-23T03:38:48Z,UCqZQlzSHbVJrwrn5XvzrzcA,NBC Sports,17,Sports,"['olympics', 'nbc sports', 'track and field', ...",PT6M56S,hd,False,733142,19202,0,0,1958
2,0psrAMafXVg,AMP OPENS A PAWN SHOP,Get AMP streetwear https://amp.shop ‚ö°Ô∏è‚ö°Ô∏è\n\nSp...,2024-06-22T19:37:39Z,UCJbYdyufHR-cxOuY96KIoqA,AMP,24,Entertainment,"['AMP PAWN SHOP', 'AMP PAWN STARS', 'AMP RUNS ...",PT34M50S,hd,False,1184967,55751,0,0,2226
3,q8mC7u028Mw,I TURNED MY HOUSE INTO A WATERPARK,Are you Team Salish or Team Jordan? VOTE FOR U...,2024-06-22T14:00:08Z,UCKaCalz5N5ienIbfPzEbYuA,Jordan Matter,24,Entertainment,"['jordan matter', 'challenge', 'ben azelart', ...",PT25M23S,hd,False,5400839,91938,0,0,16055
4,hNRWpWEd_q4,"I Made $114,127,975,963 by Exploiting the Publ...",JOIN MY STREAMS! ‚ñ∫ https://www.twitch.tv/letsg...,2024-06-22T15:00:37Z,UCto7D1L-MiRoOziCXK9uT5Q,Let's Game It Out,20,Gaming,"[""let's game it out"", 'lets game it out', 'let...",PT29M31S,hd,False,1189918,61547,0,0,3377


In [9]:
# Fuction to check the missing values in the dataset
def missing_values(df):
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    missing_percentage = (missing / df.shape[0]) * 100
    return pd.DataFrame({'Missing Values': missing, 'Percentage': missing_percentage})

In [10]:
# Function to check the data types of the columns
def data_types(df):
    return df.dtypes

In [11]:
# Checking the missing values and data types of the columns for AU(Australia) region
print("Missing Values AU(Australia) Region")
missing_values(trending_videos_AU), data_types(trending_videos_AU)

Missing Values AU(Australia) Region


(             Missing Values  Percentage
 description               1         0.5,
 video_id          object
 title             object
 description       object
 published_at      object
 channel_id        object
 channel_title     object
 category_id        int64
 category_name     object
 tags              object
 duration          object
 definition        object
 caption             bool
 view_count         int64
 like_count         int64
 dislike_count      int64
 favorite_count     int64
 comment_count      int64
 dtype: object)

In [12]:
# Checking the missing values and data types of the columns for CA(Canada) region
print("Missing Values CA(Canada) Region")
missing_values(trending_videos_CA), data_types(trending_videos_CA)

Missing Values CA(Canada) Region


(             Missing Values  Percentage
 description               1         0.5,
 video_id          object
 title             object
 description       object
 published_at      object
 channel_id        object
 channel_title     object
 category_id        int64
 category_name     object
 tags              object
 duration          object
 definition        object
 caption             bool
 view_count         int64
 like_count         int64
 dislike_count      int64
 favorite_count     int64
 comment_count      int64
 dtype: object)

In [13]:
# Checking the missing values and data types of the columns for GB(United Kingdom) region
print("Missing Values GB(United Kingdom) Region")
missing_values(trending_videos_GB), data_types(trending_videos_GB)

Missing Values GB(United Kingdom) Region


(             Missing Values  Percentage
 description               1         0.5,
 video_id          object
 title             object
 description       object
 published_at      object
 channel_id        object
 channel_title     object
 category_id        int64
 category_name     object
 tags              object
 duration          object
 definition        object
 caption             bool
 view_count         int64
 like_count         int64
 dislike_count      int64
 favorite_count     int64
 comment_count      int64
 dtype: object)

In [14]:
# Checking the missing values and data types of the columns for IN(India) region
print("Missing Values IN(India) Region")
missing_values(trending_videos_IN), data_types(trending_videos_IN)

Missing Values IN(India) Region


(             Missing Values  Percentage
 description               4    2.898551,
 video_id          object
 title             object
 description       object
 published_at      object
 channel_id        object
 channel_title     object
 category_id        int64
 category_name     object
 tags              object
 duration          object
 definition        object
 caption             bool
 view_count         int64
 like_count         int64
 dislike_count      int64
 favorite_count     int64
 comment_count      int64
 dtype: object)

In [15]:
# Checking the missing values and data types of the columns for US(United States) region
print("Missing Values US(United States) Region")
missing_values(trending_videos_US), data_types(trending_videos_US)

Missing Values US(United States) Region


(             Missing Values  Percentage
 description               1         0.5,
 video_id          object
 title             object
 description       object
 published_at      object
 channel_id        object
 channel_title     object
 category_id        int64
 category_name     object
 tags              object
 duration          object
 definition        object
 caption             bool
 view_count         int64
 like_count         int64
 dislike_count      int64
 favorite_count     int64
 comment_count      int64
 dtype: object)

In [16]:
# Function to fillna values in the dataset
def fill_na(df):
    df.fillna({'description': 'Description Blank'}, inplace=True)
    return df

In [17]:
# Filling the missing values in the description column with Description Blank
trending_videos_AU = fill_na(trending_videos_AU)
trending_videos_CA = fill_na(trending_videos_CA)
trending_videos_GB = fill_na(trending_videos_GB)
trending_videos_IN = fill_na(trending_videos_IN)
trending_videos_US = fill_na(trending_videos_US)

In [18]:
# Function to convert published_at column to datetime
def convert_published_at(df):
    df['published_at'] = pd.to_datetime(df['published_at'])
    return df

In [19]:
# Converting published_at column to datetime
trending_videos_AU = convert_published_at(trending_videos_AU)
trending_videos_CA = convert_published_at(trending_videos_CA)
trending_videos_GB = convert_published_at(trending_videos_GB)
trending_videos_IN = convert_published_at(trending_videos_IN)
trending_videos_US = convert_published_at(trending_videos_US)

In [20]:
# Function to convert tags column from str to list
def convert_tags(df):
    df['tags'] = df['tags'].apply(lambda x: eval(x) if isinstance(x, str) else x)
    return df

In [21]:
# Converting tags column from str to list
trending_videos_AU = convert_tags(trending_videos_AU)
trending_videos_CA = convert_tags(trending_videos_CA)
trending_videos_GB = convert_tags(trending_videos_GB)
trending_videos_IN = convert_tags(trending_videos_IN)
trending_videos_US = convert_tags(trending_videos_US)

In [22]:
# function to convert isodate to date and chaning its type to int
def convert_date(df):
    df['duration'] = df['duration'].apply(lambda x: isodate.parse_duration(x).total_seconds())
    df['duration'] = df['duration'].astype(int)
    return df

In [23]:
# Converting isodate to date
trending_videos_AU = convert_date(trending_videos_AU)
trending_videos_CA = convert_date(trending_videos_CA)
trending_videos_GB = convert_date(trending_videos_GB)
trending_videos_IN = convert_date(trending_videos_IN)
trending_videos_US = convert_date(trending_videos_US)

In [24]:
# Function to get the max duration of the videos
def get_max_duration(df):
    return df['duration'].max()

# Getting the max duration of the videos for AU(Australia), CA(Canada), GB(United Kingdom), IN(India), US(United States) regions
max_duration_AU = get_max_duration(trending_videos_AU)
max_duration_CA = get_max_duration(trending_videos_CA)
max_duration_GB = get_max_duration(trending_videos_GB)
max_duration_IN = get_max_duration(trending_videos_IN)
max_duration_US = get_max_duration(trending_videos_US)
max_duration_AU, max_duration_CA, max_duration_GB, max_duration_IN, max_duration_US

(np.int64(4417),
 np.int64(4417),
 np.int64(4417),
 np.int64(5289),
 np.int64(4417))

In [None]:
import pandas as pd

def add_duration_range_column(df):
    bins = [0, 300, 600, 900, 1200, 1500, 1800, 2100, 2400, 2700, 3000, 3300, 3600, 3900, 4200, 4500, 4800, 5100, 5400, 5700, 6000]
    labels = ['0-5 min', '5-10 min', '10-15 min', '15-20 min', '20-25 min', '25-30 min', '30-35 min', '35-40 min', '40-45 min', '45-50 min', '50-55 min', '55-60 min', '60-65 min', '65-70 min', '70-75 min', '75-80 min', '80-85 min', '85-90 min', '90-95 min', '95-100 min']
    df['duration_range'] = pd.cut(df['duration_seconds'], bins=bins, labels=labels)
    return df

# Example usage:
# trending_videos = add_duration_range_column(trending_videos)


In [26]:
trending_videos['duration_range'] = pd.cut(trending_videos['duration_seconds'], bins=[0, 300, 600, 900, 1200, 1500, 1800, 2100, 2400, 2700, 3000, 3300, 3600, 3900, 4200, 4500, 4800, 5100, 5400, 5700, 6000], labels=['0-5 min', '5-10 min', '10-15 min', '15-20 min',  '20-25 min','25-30 min', '30-35 min', '35-40 min', '40-45 min', '45-50 min', '50-55 min', '55-60 min', '60-65 min', '65-70 min', '70-75 min', '75-80 min', '80-85 min', '85-90 min', '90-95 min', '95-100 min'])

KeyError: 'duration_seconds'