# A. Cài đặt thư viện

In [55]:
import kagglehub
import pandas as pd
import requests
import os
from isodate import parse_duration

# B. Thu thập và xử lý dữ liệu

## 1. Thu thập dữ liệu

### 1.1. Thu thập dữ liệu từ Kaggle

In [56]:
def get_trending_videos():
    """
    Download the trending YouTube videos dataset from Kaggle and load it.

    Uses `kagglehub` to download the dataset
    "asaniczka/trending-youtube-videos-113-countries" and reads the file
    `trending_yt_videos_113_countries.csv` from the download directory.

    Returns:
        pandas.DataFrame: The contents of the dataset's CSV file.
    """

    dataset_dir = kagglehub.dataset_download("asaniczka/trending-youtube-videos-113-countries")
    # Build the path with os.path.join instead of gluing on a hard-coded "/",
    # so it is correct whatever separator the download directory uses.
    csv_path = os.path.join(dataset_dir, 'trending_yt_videos_113_countries.csv')
    return pd.read_csv(csv_path)

### 1.2. Thu thập dữ liệu từ YouTube API

In [57]:
def get_categories(api_key: str, region_code: str = 'VN', timeout: float = 30):
    """
    Fetch the YouTube video categories for a region.

    Args:
        api_key (str): The API key to access the YouTube Data API.
        region_code (str): Value sent as the `regionCode` query parameter.
            Defaults to 'VN', the region that used to be hard-coded.
        timeout (float): Seconds to wait for the HTTP response before giving
            up, so a stalled connection cannot block the caller forever.

    Returns:
        dict: Maps each category ID to its category title. Empty when the
            response body has no "items" key.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
        KeyError: If an item lacks the "id" or "snippet"/"title" keys.
    """

    params = {
        'part': 'snippet',
        # NOTE(review): 'chart' is carried over unchanged from the original
        # request; confirm that the videoCategories endpoint accepts it.
        'chart': 'mostPopular',
        'regionCode': region_code,
        'key': api_key
    }
    response = requests.get(
        url='https://www.googleapis.com/youtube/v3/videoCategories',
        params=params,
        timeout=timeout,
    )
    data_json = response.json()
    return {
        item["id"]: item["snippet"]["title"]
        for item in data_json.get("items", [])
    }

In [58]:
def get_dates(time:str):
    """
    Turn a duration string into the text form of its parsed value.

    Args:
        time (str): Duration text handed to `parse_duration`.

    Returns:
        str: `str()` applied to the object returned by `parse_duration`.
    """

    return str(parse_duration(time))

In [59]:

# Category tables already fetched, keyed by API key, so the category list is
# requested once per key instead of once per video.
_CATEGORIES_BY_KEY = {}


def get_video_info(api_key: str, video_id: str):
    """
    Fetch category, duration and publication time of one YouTube video.

    Args:
        api_key (str): The API key to authenticate with the YouTube Data API.
        video_id (str): The ID of the YouTube video to fetch information for.

    Returns:
        list: `[video_id, category, duration, published_at]`. The last three
            entries are None when the response contains no items for the
            video; `category` is also None when the video's category ID is
            not in the category table.

    Raises:
        requests.exceptions.RequestException: If there is an issue with the HTTP request.
        KeyError: If the expected keys are not found in the API response.

    Note:
        Relies on `get_categories` for the category table and on `get_dates`
        to convert the video duration. The category table is remembered per
        API key, so repeated calls do not re-request it.
    """

    categories = _CATEGORIES_BY_KEY.get(api_key)
    if not categories:
        categories = get_categories(api_key)
        if categories:
            # Remember only non-empty tables so a failed lookup is retried on
            # the next call instead of being cached.
            _CATEGORIES_BY_KEY[api_key] = categories

    category = None
    duration = None
    published_at = None
    params = {
        # Only the parts that are read below are requested.
        'part': 'snippet,contentDetails',
        'id': video_id,
        'key': api_key
    }
    response = requests.get(
        url='https://www.googleapis.com/youtube/v3/videos',
        params=params,
        timeout=30,  # do not hang forever on a stalled connection
    )
    data_json = response.json()
    for item in data_json.get("items", []):
        category = categories.get(item["snippet"]["categoryId"])
        duration = get_dates(item['contentDetails']['duration'])
        published_at = item['snippet']['publishedAt']
    return [video_id, category, duration, published_at]

In [60]:
def get_video_info_list(api_key: str, list_video_id: pd.Series):
    """
    Look up every video ID with `get_video_info` and tabulate the results.

    Args:
        api_key (str): The API key passed through to `get_video_info`.
        list_video_id (pd.Series): The video IDs to look up, one call per ID.

    Returns:
        pd.DataFrame: One row per looked-up ID, in input order, with columns:
            - 'video_id': The ID of the video.
            - 'category': The category of the video.
            - 'duration': The duration of the video.
            - 'published_at': The publication date of the video.
    """

    column_names = ['video_id', 'category', 'duration', 'published_at']
    # Each result is a 4-item list laid out in the same order as column_names.
    rows = [get_video_info(api_key, vid) for vid in list_video_id]
    columns = {
        name: [row[position] for row in rows]
        for position, name in enumerate(column_names)
    }
    return pd.DataFrame(columns)

## 2. Xử lý dữ liệu

### Lấy dữ liệu từ Kaggle

In [61]:
# Re-use the cached Vietnam subset when it exists; otherwise download the full
# Kaggle dataset, keep only the rows whose country is 'VN', and cache them.
if os.path.exists('Data/VN_trending_videos.csv'):
    trending_videos = pd.read_csv('Data/VN_trending_videos.csv')
else:
    # to_csv does not create missing folders, so make sure the cache folder
    # exists before starting the download.
    os.makedirs('Data', exist_ok=True)
    trending_videos = get_trending_videos()
    trending_videos = trending_videos.loc[trending_videos['country'] == 'VN']
    trending_videos.to_csv('Data/VN_trending_videos.csv', index=False)

### Lấy dữ liệu từ YouTube API

In [62]:
# Read the YouTube Data API key from the YOUTUBE_API_KEY environment variable
# instead of hard-coding the credential in the notebook. The key is only used
# when Data/api_data.csv has to be rebuilt.
# NOTE(review): the key that was previously committed on this line should be
# treated as leaked and revoked.
api_key = os.environ.get('YOUTUBE_API_KEY')

In [64]:
# Re-use the cached API results when they exist; otherwise query the API once
# per distinct video ID and cache the table.
if os.path.exists('Data/api_data.csv'):
    api_data = pd.read_csv('Data/api_data.csv')
else:
    trending_videos_drop_duplicates = trending_videos.drop_duplicates(subset='video_id')
    api_data = get_video_info_list(api_key, trending_videos_drop_duplicates['video_id'])
    # get_video_info leaves duration as None when the API returns no items.
    # If that happened for every video, the request itself is failing, so do
    # not cache a table that later runs would silently reuse.
    if not api_data.empty and api_data['duration'].isna().all():
        raise RuntimeError(
            'YouTube API returned no data for any video; check the API key '
            'and quota. Data/api_data.csv was not written.'
        )
    api_data.to_csv('Data/api_data.csv', index=False)

### Kết hợp dữ liệu

In [65]:
# Attach the API columns to each trending row; rows whose video_id has no
# match in api_data are dropped by the inner join.
trending_videos = trending_videos.merge(api_data, how='inner', on='video_id')

### 2.1. Loại bỏ dữ liệu không liên quan đến dự án

#### 2.1.1. Loại bỏ các cột không liên quan

In [66]:
# Remove, in place, the columns that are not used by the rest of the project.
trending_videos.drop(
    labels=[
        'daily_rank',
        'daily_movement',
        'weekly_movement',
        'country',
        'description',
        'thumbnail_url',
        'channel_id',
        'kind',
        'publish_date',
    ],
    axis='columns',
    inplace=True,
)

#### 2.1.2. Lấy vùng dữ liệu từ tháng 11-2023 đến 11-2024

In [67]:
start_date = '2023-11-01'
# November has 30 days. The previous value '2024-11-31' is not a real calendar
# date and only worked because the column was compared as text.
end_date = '2024-11-30'

# Compare parsed dates rather than strings. normalize() drops any time-of-day
# part so every snapshot taken on end_date is still included.
snapshot_days = pd.to_datetime(trending_videos['snapshot_date']).dt.normalize()
# between() is inclusive on both ends by default.
trending_videos = trending_videos[snapshot_days.between(start_date, end_date)]

#### 2.1.3. Loại bỏ các video không còn tồn tại hoặc bị ẩn đi

In [68]:
# Rows with no duration are videos for which the API lookup returned nothing.
# Take an explicit copy so the column assignments in the following cells work
# on an independent frame instead of a filtered slice (this avoids pandas'
# SettingWithCopyWarning).
trending_videos = trending_videos.dropna(subset=['duration']).copy()

### 2.2. Chuẩn hóa dữ liệu

In [69]:
# Snapshot dates become real datetimes.
trending_videos['snapshot_date'] = pd.to_datetime(trending_videos['snapshot_date'])

# Publication times are parsed, shifted to Vietnam local time and stored back
# as plain 'YYYY-MM-DD HH:MM:SS' text.
# NOTE(review): tz_convert needs timezone-aware values, so this relies on the
# published_at strings carrying a UTC offset - confirm against the API data.
published_parsed = pd.to_datetime(trending_videos['published_at'])
published_local = published_parsed.dt.tz_convert('Asia/Ho_Chi_Minh')
trending_videos['published_at'] = published_local.dt.strftime('%Y-%m-%d %H:%M:%S')

### 2.3. Bổ sung thông tin dữ liệu

In [70]:

# Earliest snapshot date of each video, repeated on every row of that video.
first_snapshot = trending_videos.groupby('video_id')['snapshot_date'].transform('min')
one_day = pd.Timedelta(days=1)
# Time elapsed since the first snapshot, counting the first day itself as one
# day.
trending_videos['trending_time'] = (trending_videos['snapshot_date'] + one_day) - first_snapshot

### 2.4. Tối ưu hóa thời gian thu thập

In [71]:
# Persist the processed table (without the index column) so later runs can
# start from this file.
trending_videos.to_csv(path_or_buf='Data/trending_videos.csv', index=False)

In [72]:
# For every video keep only the row of its most recent snapshot, then save
# that one-row-per-video table.
latest_row_labels = trending_videos.groupby('video_id')['snapshot_date'].idxmax()
trending_videos_unique = trending_videos.loc[latest_row_labels]
trending_videos_unique.to_csv(path_or_buf='Data/trending_videos_unique.csv', index=False)