### 1. Cài đặt thư viện

In [1]:
# %pip install kagglehub isodate pandas requests

In [2]:
import kagglehub
import pandas as pd
import requests
import os
from isodate import parse_duration

### 2. Hàm lấy dữ liệu từ Kaggle

In [3]:
def get_trending_videos():
    """Download the Kaggle trending-videos dataset and load its CSV.

    Returns:
        pandas.DataFrame: Contents of ``trending_yt_videos_113_countries.csv``
        from the ``asaniczka/trending-youtube-videos-113-countries`` dataset.
    """
    dataset_dir = kagglehub.dataset_download("asaniczka/trending-youtube-videos-113-countries")
    # Join path components instead of concatenating strings so the separator
    # is handled by the standard library.
    csv_path = os.path.join(dataset_dir, 'trending_yt_videos_113_countries.csv')
    return pd.read_csv(csv_path)

### 3. Hàm lấy dữ liệu từ YouTube API

In [4]:
def get_categories(api_key:str, region_code:str='VN', timeout:float=30):
    """Fetch the YouTube video categories for a region.

    Args:
        api_key: YouTube Data API key sent as the ``key`` query parameter.
        region_code: Value sent as ``regionCode``; defaults to ``'VN'``.
        timeout: Seconds to wait for the HTTP response, so a stalled
            connection raises instead of blocking the caller indefinitely.

    Returns:
        dict: Maps each category id to its title. Empty when the JSON
        response has no ``items`` key.
    """
    params = {
        'part': 'snippet',
        # NOTE(review): 'chart' does not appear to be a documented parameter
        # of videoCategories.list; kept as originally sent - confirm whether
        # it can be dropped.
        'chart': 'mostPopular',
        'regionCode': region_code,
        'key': api_key
    }
    response = requests.get(
        url='https://www.googleapis.com/youtube/v3/videoCategories',
        params=params,
        timeout=timeout,
    )
    payload = response.json()
    return {
        item["id"]: item["snippet"]["title"]
        for item in payload.get("items", [])
    }

In [5]:
def get_dates(time:str):
    """Parse a duration string and return its ``str()`` representation.

    Args:
        time: Duration text passed to ``isodate.parse_duration``.

    Returns:
        str: ``str()`` of the parsed duration object.
    """
    return str(parse_duration(time))

In [6]:
# Category tables already downloaded, keyed by API key, so that looking up many
# videos does not repeat the same videoCategories request for each one.
_CATEGORIES_BY_API_KEY = {}


def get_video_info(api_key:str, video_id:str, categories:dict=None, timeout:float=30):
    """Look up category, duration and publish time for one video.

    Args:
        api_key: YouTube Data API key.
        video_id: Id of the video to query.
        categories: Optional category id -> title mapping. When omitted, the
            mapping is obtained from ``get_categories`` and reused on later
            calls with the same ``api_key``.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        list: ``[video_id, category, duration, published_at]``. The last three
        entries stay ``None`` when the response contains no items.
    """
    if categories is None:
        categories = _CATEGORIES_BY_API_KEY.get(api_key)
        if not categories:
            categories = get_categories(api_key)
            # Only remember non-empty tables so a failed fetch is retried.
            if categories:
                _CATEGORIES_BY_API_KEY[api_key] = categories

    category = None
    duration = None
    published_at = None
    params = {
        'part': ['snippet','contentDetails','status'],
        'id':video_id,
        'key': api_key
    }
    response = requests.get(
        url='https://www.googleapis.com/youtube/v3/videos',
        params=params,
        timeout=timeout,
    )
    data_json = response.json()
    # If several items come back, the values of the last one are kept.
    for item in data_json.get("items", []):
        category = categories.get(item["snippet"]["categoryId"])
        duration = get_dates(item['contentDetails']['duration'])
        published_at = item['snippet']['publishedAt']
    return [video_id,category,duration,published_at]

In [7]:
def get_video_info_list(api_key:str, list_video_id:pd.Series):
    """Collect ``get_video_info`` results for many videos into a DataFrame.

    Args:
        api_key: YouTube Data API key forwarded to ``get_video_info``.
        list_video_id: Iterable of video ids; one lookup is made per element.

    Returns:
        pandas.DataFrame: One row per input id with the columns
        ``video_id``, ``category``, ``duration`` and ``published_at``.
    """
    column_names = ('video_id', 'category', 'duration', 'published_at')
    # Each record is the 4-element list returned by get_video_info, in the
    # same order as column_names.
    records = [get_video_info(api_key, vid) for vid in list_video_id]
    return pd.DataFrame({
        name: [record[position] for record in records]
        for position, name in enumerate(column_names)
    })

### 4. Lọc dữ liệu

#### a. Lấy dữ liệu từ Kaggle

In [8]:
# Reuse the cached Vietnam-only extract when it exists; otherwise download the
# full dataset, keep the rows whose country is 'VN' and cache them.
if os.path.exists('Data/VN_trending_videos.csv'):
    trending_videos = pd.read_csv('Data/VN_trending_videos.csv')
else:
    trending_videos = get_trending_videos()
    trending_videos = trending_videos.loc[trending_videos['country'] == 'VN']
    # to_csv does not create missing directories, so make sure 'Data' exists.
    os.makedirs('Data', exist_ok=True)
    trending_videos.to_csv('Data/VN_trending_videos.csv',index=False)

#### b. Lấy vùng dữ liệu từ tháng 11-2023 đến 10-2024

In [9]:
# Window of snapshot dates to keep; both endpoints are included.
start_date = '2023-11-01'
end_date = '2024-10-31'

# snapshot_date is compared against the bounds as-is (it is only converted to
# datetime in a later cell).
trending_videos = trending_videos[
    trending_videos['snapshot_date'].between(start_date, end_date)
]

#### c. Lấy dữ liệu từ YouTube API

In [10]:
# Read the YouTube Data API key from the environment so the secret is never
# stored in the notebook. The key that used to be hard-coded here should be
# treated as leaked and revoked.
api_key = os.environ.get('YOUTUBE_API_KEY')
if not api_key:
    print('YOUTUBE_API_KEY is not set; YouTube API requests will not return data.')

In [11]:
# Category id -> title mapping returned by get_categories for this API key.
# NOTE(review): this variable is not referenced by the later visible cells
# (get_video_info obtains its own mapping) - confirm whether it is still needed.
categories = get_categories(api_key)

In [12]:
# Reuse cached API results when present; otherwise query the API once per
# distinct video id and cache the outcome.
if os.path.exists('Data/api_data.csv'):
    api_data = pd.read_csv('Data/api_data.csv')
else:
    # Create the output directory before the per-video lookups so the
    # collected data cannot be lost to a failed write afterwards.
    os.makedirs('Data', exist_ok=True)
    trending_videos_drop_duplicates = trending_videos.drop_duplicates(subset='video_id')
    api_data = get_video_info_list(api_key, trending_videos_drop_duplicates['video_id'])
    api_data.to_csv('Data/api_data.csv', index=False)

#### d. Xoá những cột không cần thiết

In [13]:
# Remove the columns that are not used afterwards. Each label is listed once,
# and the result is reassigned rather than mutating the filtered frame in place.
trending_videos = trending_videos.drop(columns=[
    'daily_rank',
    'daily_movement',
    'weekly_movement',
    'thumbnail_url',
    'kind',
    'publish_date',
    'channel_id',
    'country',
    'description',
])

#### e. Ghép data từ 2 bảng

In [14]:
# Inner join on video_id: only rows with a match in api_data are kept, and the
# api_data columns are attached to them.
trending_videos = trending_videos.merge(api_data, how='inner', on='video_id')

#### f. Xoá những video bị xoá

In [15]:
# Drop rows whose 'duration' is missing. get_video_info leaves duration as None
# when the API response contains no items for the video id.
trending_videos = trending_videos.dropna(subset=['duration'])

#### g. Thêm những cột cần thiết

In [16]:
# Parse the snapshot dates so date arithmetic is possible below.
trending_videos['snapshot_date'] = pd.to_datetime(trending_videos['snapshot_date'])

# Convert the publish timestamps to Asia/Ho_Chi_Minh time and store them back
# as 'YYYY-MM-DD HH:MM:SS' text.
published_local = pd.to_datetime(trending_videos['published_at']).dt.tz_convert('Asia/Ho_Chi_Minh')
trending_videos['published_at'] = published_local.dt.strftime('%Y-%m-%d %H:%M:%S')

# trending_time = (snapshot date + 1 day) - earliest snapshot date of the same
# video within this frame.
first_snapshot = trending_videos.groupby('video_id')['snapshot_date'].transform('min')
trending_videos['trending_time'] = (
    trending_videos['snapshot_date'] + pd.Timedelta(days=1)
) - first_snapshot

#### h. Xuất ra csv

In [17]:
# Write the full cleaned table (one row per video per snapshot) without the
# index column.
trending_videos.to_csv('Data/trending_videos.csv', index=False)

In [18]:
# For every video keep only the row with the latest snapshot_date, then export
# that one-row-per-video table without the index column.
latest_row_labels = trending_videos.groupby('video_id')['snapshot_date'].idxmax()
trending_videos_unique = trending_videos.loc[latest_row_labels]
trending_videos_unique.to_csv('Data/trending_videos_unique.csv', index=False)