In [None]:
# Importing Libraries
import ast
import csv
import os

import isodate
import pandas as pd
from googleapiclient.discovery import build
print("Libraries Imported Successfully")

In [None]:
# YouTube Data API v3 key.
# Read from the YOUTUBE_API_KEY environment variable so the secret is never
# stored in the notebook; set it before starting the kernel.
# NOTE: a key that was previously committed in this cell should be treated as
# leaked and rotated in the Google Cloud console.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '')
if API_KEY:
    print("API Key Set Successfully")
else:
    # Do not claim success when nothing was configured; API calls made with an
    # empty key will be rejected.
    print("YOUTUBE_API_KEY is not set - export it before running the cells below")

In [None]:
# Fetch the trending videos of the given regions (by default AU, CA, GB, IN, US)
# and save them in one CSV file per region.

# Column order of every per-region CSV. Kept as a constant so the header is
# well defined even when a region returns no videos at all.
_VIDEO_FIELDNAMES = (
    'video_id',
    'title',
    'description',
    'published_at',
    'channel_id',
    'channel_title',
    'category_id',
    'category_name',
    'tags',
    'duration',
    'view_count',
    'like_count',
    'comment_count',
)


def _extract_video_details(item, category_map):
    """Flatten one `videos().list` result item into a row keyed by `_VIDEO_FIELDNAMES`."""
    snippet = item['snippet']
    statistics = item['statistics']
    return {
        'video_id': item['id'],
        'title': snippet['title'],
        'description': snippet['description'],
        'published_at': snippet['publishedAt'],
        'channel_id': snippet['channelId'],
        'channel_title': snippet['channelTitle'],
        'category_id': snippet['categoryId'],
        # Category ids missing from the region's category list are labelled 'Unknown'.
        'category_name': category_map.get(snippet['categoryId'], 'Unknown'),
        'tags': snippet.get('tags', []),
        'duration': item['contentDetails']['duration'],
        # Counters absent from the statistics payload default to 0.
        'view_count': statistics.get('viewCount', 0),
        'like_count': statistics.get('likeCount', 0),
        'comment_count': statistics.get('commentCount', 0),
    }


def get_trending_videos(api_key, region_codes=['AU', 'CA', 'GB', 'IN', 'US'], max_results=200):
    """Fetch the "mostPopular" chart for each region and write one CSV per region.

    For every region code the category titles are looked up first, then the
    chart is paged through 50 videos per request until `max_results` rows have
    been collected or the API reports no further page. The rows are written to
    ``trending_videos_<REGION>.csv`` (relative path) with a header row.

    Args:
        api_key: YouTube Data API v3 developer key passed to `build`.
        region_codes: Iterable of region codes to query. The default list is
            never mutated by this function.
        max_results: Upper bound on the number of videos kept per region.

    Returns:
        dict mapping each region code to its list of row dicts (keys are the
        CSV column names). The 'tags' value is a list; `csv.DictWriter` writes
        it to the file as its string representation.
    """
    # Initialize an empty dictionary to store videos by region
    videos_by_region = {region: [] for region in region_codes}

    # Youtube API connection build object
    youtube = build('youtube', 'v3', developerKey=api_key)

    for youtube_region_code in region_codes:
        # Map category id -> category title for this region
        categories_response = youtube.videoCategories().list(
            part='snippet', regionCode=youtube_region_code
        ).execute()
        category_map = {
            item['id']: item['snippet']['title']
            for item in categories_response.get('items', [])
        }

        region_videos = videos_by_region[youtube_region_code]

        # First page of the most popular videos
        request = youtube.videos().list(
            part='snippet,contentDetails,statistics',
            chart='mostPopular',
            regionCode=youtube_region_code,
            maxResults=50
        )

        # Page through the results until the cap is reached or pages run out
        while request is not None and len(region_videos) < max_results:
            response = request.execute()
            for item in response.get('items', []):
                # Enforce the cap per item so a page cannot overshoot max_results.
                if len(region_videos) >= max_results:
                    break
                region_videos.append(_extract_video_details(item, category_map))

            # list_next returns None when there is no further page
            request = youtube.videos().list_next(request, response)

        # Write the data to a CSV file (header only when nothing was returned)
        filename = f"trending_videos_{youtube_region_code}.csv"
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=_VIDEO_FIELDNAMES)
            writer.writeheader()
            writer.writerows(region_videos)

    # Returning the videos by region
    return videos_by_region

# Fetch the trending videos for the default regions (this also writes the per-region CSV files)
trending_videos = get_trending_videos(api_key=API_KEY)
print("Trending Videos Fetched Successfully")

In [None]:
# Load the AU (Australia) trending-videos CSV and preview its first five rows
trending_videos_AU = pd.read_csv(filepath_or_buffer='trending_videos_AU.csv')
trending_videos_AU.head(n=5)

In [None]:
# Load the CA (Canada) trending-videos CSV and preview its first five rows
trending_videos_CA = pd.read_csv(filepath_or_buffer='trending_videos_CA.csv')
trending_videos_CA.head(n=5)

In [None]:
# Load the GB (United Kingdom) trending-videos CSV and preview its first five rows
trending_videos_GB = pd.read_csv(filepath_or_buffer='trending_videos_GB.csv')
trending_videos_GB.head(n=5)

In [None]:
# Load the IN (India) trending-videos CSV and preview its first five rows
trending_videos_IN = pd.read_csv(filepath_or_buffer='trending_videos_IN.csv')
trending_videos_IN.head(n=5)

In [None]:
# Load the US (United States) trending-videos CSV and preview its first five rows
trending_videos_US = pd.read_csv(filepath_or_buffer='trending_videos_US.csv')
trending_videos_US.head(n=5)

In [None]:
# Function to check the missing values in the dataset
def missing_values(df):
    """Summarise the columns of `df` that contain at least one null.

    Returns a DataFrame indexed by column name with two columns:
    'Missing Values' (null count) and 'Percentage' (null count as a
    percentage of the number of rows). Columns without nulls are omitted.
    """
    null_counts = df.isna().sum()
    # Keep only the columns that actually have something missing
    null_counts = null_counts.loc[null_counts.gt(0)]
    null_share = null_counts.div(len(df)).mul(100)
    return pd.DataFrame({'Missing Values': null_counts, 'Percentage': null_share})

In [None]:
# Function to check the data types of the columns
def data_types(df):
    """Return the dtype of every column of `df` as a Series indexed by column name."""
    column_dtypes = df.dtypes
    return column_dtypes

In [None]:
# Missing-value summary and column dtypes for the AU (Australia) region
print("Missing Values AU(Australia) Region")
(
    missing_values(df=trending_videos_AU),
    data_types(df=trending_videos_AU),
)

In [None]:
# Missing-value summary and column dtypes for the CA (Canada) region
print("Missing Values CA(Canada) Region")
(
    missing_values(df=trending_videos_CA),
    data_types(df=trending_videos_CA),
)

In [None]:
# Missing-value summary and column dtypes for the GB (United Kingdom) region
print("Missing Values GB(United Kingdom) Region")
(
    missing_values(df=trending_videos_GB),
    data_types(df=trending_videos_GB),
)

In [None]:
# Missing-value summary and column dtypes for the IN (India) region
print("Missing Values IN(India) Region")
(
    missing_values(df=trending_videos_IN),
    data_types(df=trending_videos_IN),
)

In [None]:
# Missing-value summary and column dtypes for the US (United States) region
print("Missing Values US(United States) Region")
(
    missing_values(df=trending_videos_US),
    data_types(df=trending_videos_US),
)

In [None]:
# Function to fill missing values in the dataset
def fill_na(df):
    """Replace nulls in the 'description' column with 'Description Blank'.

    The frame is modified in place and the same object is returned.
    """
    placeholder_by_column = {'description': 'Description Blank'}
    df.fillna(value=placeholder_by_column, inplace=True)
    return df

In [None]:
# Fill missing descriptions with 'Description Blank' in every regional dataset
(
    trending_videos_AU,
    trending_videos_CA,
    trending_videos_GB,
    trending_videos_IN,
    trending_videos_US,
) = map(
    fill_na,
    (
        trending_videos_AU,
        trending_videos_CA,
        trending_videos_GB,
        trending_videos_IN,
        trending_videos_US,
    ),
)
print("Missing Values Filled Successfully")

In [None]:
# Function to convert published_at column to datetime
def convert_published_at(df):
    """Parse the 'published_at' column with `pd.to_datetime`.

    The column is overwritten on `df` itself and the same frame is returned.
    """
    parsed_timestamps = pd.to_datetime(arg=df['published_at'])
    df['published_at'] = parsed_timestamps
    return df

In [None]:
# Parse published_at into datetimes for every regional dataset
(
    trending_videos_AU,
    trending_videos_CA,
    trending_videos_GB,
    trending_videos_IN,
    trending_videos_US,
) = map(
    convert_published_at,
    (
        trending_videos_AU,
        trending_videos_CA,
        trending_videos_GB,
        trending_videos_IN,
        trending_videos_US,
    ),
)
print("Published At Column Converted to Datetime Successfully")

In [None]:
# Function to convert tags column from str to list
def convert_tags(df):
    """Turn the string form of each 'tags' cell back into a Python object.

    String cells are parsed with `ast.literal_eval`, which only accepts Python
    literals. The previous `eval` would execute any expression found in the
    CSV, and tag text ultimately comes from video uploaders. Non-string cells
    (for example NaN) are left untouched.

    The column is overwritten on `df` itself and the same frame is returned.

    Raises:
        ValueError, SyntaxError: if a string cell is not a valid Python literal.
    """
    def _parse_cell(cell):
        # Only strings need parsing; anything else is passed through unchanged.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return cell

    df['tags'] = df['tags'].apply(_parse_cell)
    return df

In [None]:
# Parse the tags column back into lists for every regional dataset
(
    trending_videos_AU,
    trending_videos_CA,
    trending_videos_GB,
    trending_videos_IN,
    trending_videos_US,
) = map(
    convert_tags,
    (
        trending_videos_AU,
        trending_videos_CA,
        trending_videos_GB,
        trending_videos_IN,
        trending_videos_US,
    ),
)
print("Tags Column Converted to List Successfully")

In [None]:
# Function to convert the ISO 8601 duration strings to seconds and change the column type to int
def convert_date(df):
    """Replace the 'duration' column with the duration in whole seconds.

    Each cell is parsed by `isodate.parse_duration`, converted with
    `.total_seconds()`, and the resulting column is cast to int.
    The frame is modified in place and the same object is returned.
    """
    def _to_seconds(raw_duration):
        return isodate.parse_duration(raw_duration).total_seconds()

    seconds = df['duration'].apply(_to_seconds)
    df['duration'] = seconds
    # Cast after the assignment, mirroring the two-step conversion
    df['duration'] = df['duration'].astype(int)
    return df

In [None]:
# Convert the duration column to whole seconds for every regional dataset
(
    trending_videos_AU,
    trending_videos_CA,
    trending_videos_GB,
    trending_videos_IN,
    trending_videos_US,
) = map(
    convert_date,
    (
        trending_videos_AU,
        trending_videos_CA,
        trending_videos_GB,
        trending_videos_IN,
        trending_videos_US,
    ),
)
print("Date Column Converted Successfully")

In [None]:
# Function to get the max duration of the videos
def get_max_duration(df):
    """Return the largest value found in the 'duration' column of `df`."""
    longest = df['duration'].max()
    return longest

# Longest video duration per region: AU(Australia), CA(Canada), GB(United Kingdom), IN(India), US(United States)
(
    max_duration_AU,
    max_duration_CA,
    max_duration_GB,
    max_duration_IN,
    max_duration_US,
) = map(
    get_max_duration,
    (
        trending_videos_AU,
        trending_videos_CA,
        trending_videos_GB,
        trending_videos_IN,
        trending_videos_US,
    ),
)
print("Max AU Region", max_duration_AU)
print("Max CA Region", max_duration_CA)
print("Max GB Region", max_duration_GB)
print("Max IN Region", max_duration_IN)
print("Max US Region", max_duration_US)

In [None]:
# Function to add duration range column
def add_duration_range_column(df, max_minutes=100, step_minutes=5):
    """Add a categorical 'duration_range' column bucketing 'duration' (seconds).

    Buckets are `step_minutes` wide, right-inclusive, and labelled like
    '0-5 min', '5-10 min', ... up to `max_minutes`. With the defaults this
    yields the 20 buckets from 0 to 100 minutes.

    A duration of exactly 0 seconds is placed in the first bucket
    (`include_lowest=True`); without it the open left edge would leave such
    rows unlabelled. Durations above the last edge get NaN; pass a larger
    `max_minutes` to cover them.

    The frame is modified in place and the same object is returned.

    Raises:
        ValueError: if `max_minutes` or `step_minutes` is not positive.
    """
    if step_minutes <= 0 or max_minutes <= 0:
        raise ValueError("max_minutes and step_minutes must be positive")

    # Bucket edges in minutes: 0, step, 2*step, ... covering at least max_minutes
    edges_min = list(range(0, max_minutes + step_minutes, step_minutes))
    bins = [minutes * 60 for minutes in edges_min]
    labels = [f'{low}-{high} min' for low, high in zip(edges_min, edges_min[1:])]

    df['duration_range'] = pd.cut(df['duration'], bins=bins, labels=labels, include_lowest=True)
    return df

In [None]:
# Add the duration_range bucket column to every regional dataset
(
    trending_videos_AU,
    trending_videos_CA,
    trending_videos_GB,
    trending_videos_IN,
    trending_videos_US,
) = map(
    add_duration_range_column,
    (
        trending_videos_AU,
        trending_videos_CA,
        trending_videos_GB,
        trending_videos_IN,
        trending_videos_US,
    ),
)
print("Duration Range Column Added Successfully")

In [None]:
# Function to add tag count column
def add_tag_count_column(df):
    """Add a 'tag_count' column holding the length of each 'tags' list.

    Cells that are not lists count as 0. The frame is modified in place
    and the same object is returned.
    """
    def _count_tags(tags):
        if not isinstance(tags, list):
            return 0
        return len(tags)

    df['tag_count'] = df['tags'].apply(_count_tags)
    return df

In [None]:
# Add the tag_count column to every regional dataset
(
    trending_videos_AU,
    trending_videos_CA,
    trending_videos_GB,
    trending_videos_IN,
    trending_videos_US,
) = map(
    add_tag_count_column,
    (
        trending_videos_AU,
        trending_videos_CA,
        trending_videos_GB,
        trending_videos_IN,
        trending_videos_US,
    ),
)
print("Tag Count Column Added Successfully")

In [None]:
# Function to add published hour column
def add_published_hour_column(df):
    """Add a 'published_hour' column with the hour component of 'published_at'.

    The hour is taken from the datetime values as stored in the column.
    The frame is modified in place and the same object is returned.
    """
    hour_of_day = df['published_at'].dt.hour
    df['published_hour'] = hour_of_day
    return df

In [None]:
# Add the published_hour column to every regional dataset
(
    trending_videos_AU,
    trending_videos_CA,
    trending_videos_GB,
    trending_videos_IN,
    trending_videos_US,
) = map(
    add_published_hour_column,
    (
        trending_videos_AU,
        trending_videos_CA,
        trending_videos_GB,
        trending_videos_IN,
        trending_videos_US,
    ),
)
print("Published Hour Column Added Successfully")

In [None]:
# Preview the first five rows of the processed AU (Australia) dataset
trending_videos_AU.head(n=5)

In [None]:
# Column dtypes of the processed AU (Australia) dataset
data_types(df=trending_videos_AU)

In [None]:
# Preview the first five rows of the processed CA (Canada) dataset
trending_videos_CA.head(n=5)

In [None]:
# Column dtypes of the processed CA (Canada) dataset
data_types(df=trending_videos_CA)

In [None]:
# Preview the first five rows of the processed GB (United Kingdom) dataset
trending_videos_GB.head(n=5)

In [None]:
# Column dtypes of the processed GB (United Kingdom) dataset
data_types(df=trending_videos_GB)

In [None]:
# Preview the first five rows of the processed IN (India) dataset
trending_videos_IN.head(n=5)

In [None]:
# Column dtypes of the processed IN (India) dataset
data_types(df=trending_videos_IN)

In [None]:
# Preview the first five rows of the processed US (United States) dataset
trending_videos_US.head(n=5)

In [None]:
# Column dtypes of the processed US (United States) dataset
data_types(df=trending_videos_US)

In [None]:
# Status message: every preprocessing cell above has run for all five regions
print("Preprocessing Completed Successfully")

In [None]:
# Function to update the csv file with the preprocessed data
def update_csv_file(df, region_code):
    """Write `df` to ``trending_videos_<region_code>.csv`` without the index.

    An existing file of that name is overwritten. A confirmation line is
    printed afterwards; nothing is returned.
    """
    filename = f"trending_videos_{region_code}.csv"
    df.to_csv(path_or_buf=filename, index=False)
    print(f"CSV file updated for {region_code} region")

In [None]:
# Write each processed regional dataset back to its CSV file
update_csv_file(df=trending_videos_AU, region_code='AU')
update_csv_file(df=trending_videos_CA, region_code='CA')
update_csv_file(df=trending_videos_GB, region_code='GB')
update_csv_file(df=trending_videos_IN, region_code='IN')
update_csv_file(df=trending_videos_US, region_code='US')