# **Data Collection**

YouTube Extraction

In [None]:
#!pip install pandas openpyxl yt-dlp pydub
#!pip show yt-dlp 

In [None]:
import pandas as pd # to handle dataframes
import yt_dlp # to make a connection with youtube data and download audio and captions
import os # to handle file paths

In [None]:
# Given an excel file path with a list of music catalog based on youtube links
# For each link, download the captions (if available) and save them
def extract_lyrics_from_youtube_captions(excel_path: str, captions_output_folder_path: str, ffmpeg_path: str = None, lang: str = 'en'):
    """Download the YouTube captions of every link listed in an excel catalog.

    Each row is read through its 'link' column and, when the column exists,
    its 'id' column (otherwise ``track_<row index>`` is used as identifier).
    Captions are requested from yt-dlp in .vtt format, both uploaded and
    automatically generated ones.

    Args:
        excel_path: Path of the excel file holding the catalog.
        captions_output_folder_path: Folder receiving the caption files;
            it is created when it does not exist.
        ffmpeg_path: Optional ffmpeg location forwarded to yt-dlp.
        lang: Subtitle language requested from yt-dlp; it is also the
            language tag expected in the downloaded file name.

    Returns:
        None. When the excel file is not found, a message is printed and the
        function returns before processing any row.

    NOTE(review): the 'captions_file' / 'captions_flag' values are only set
    on the in-memory dataframe, which is neither returned nor written back
    to disk here - confirm whether that is intended.
    """

    # make sure the captions folder exists
    os.makedirs(captions_output_folder_path, exist_ok=True)

    # read the excel file into a pandas dataframe
    try:
        df = pd.read_excel(excel_path, dtype={'title': 'object', 'channel': 'object'})
        print(f"File {excel_path} loaded successfully.")
    except FileNotFoundError:
        print(f"Error: The file at {excel_path} was not found.")
        return

    # check if the captions_file column exists, if not create it
    if 'captions_file' not in df.columns:
        df['captions_file'] = None

    # iterate through each row in the dataframe
    for index, row in df.iterrows():
        link = row.get('link')
        music_id = row.get('id', f"track_{index}")

        # skip rows with missing links
        if pd.isna(link):
            print(f"Skipping row {index} due to missing link.")
            continue

        # download captions using yt-dlp
        try:
            print(f"Processing captions for music id {music_id}.")

            # set up yt-dlp options for captions download (no audio/video)
            caption_metadata = {
                'skip_download': True,
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': [lang],
                'subtitlesformat': 'vtt',
                'outtmpl': os.path.join(captions_output_folder_path, f'{music_id}.%(ext)s'),
                'ffmpeg_location': ffmpeg_path or None,
            }

            # download captions
            with yt_dlp.YoutubeDL(caption_metadata) as data:
                data.extract_info(link, download=True)

            # the subtitle file carries the requested language tag in its name,
            # so the check must follow `lang` instead of assuming 'en'
            vtt_file = os.path.join(captions_output_folder_path, f'{music_id}.{lang}.vtt')
            if os.path.exists(vtt_file):
                df.at[index, 'captions_file'] = vtt_file
                print(f"Captions for music id {music_id} downloaded successfully at {vtt_file}.")
            else:
                print(f"Failed to download captions for music id {music_id}.")

        # if something goes wrong, log the error and keep going with the next row
        except Exception as e:
            error_message = f"ERROR: {type(e).__name__} - {e}"
            print(f"Failed to process link. Reason:{error_message}")

            # the flag column is only created once a first failure happens
            if 'captions_flag' not in df.columns:
                df['captions_flag'] = None
            df.at[index, 'captions_flag'] = error_message

In [None]:
# Read an excel music catalog whose rows carry youtube links
# For each link, collect the video metadata and store the audio as a .wav file
def process_music_catalog_from_excel(excel_path: str, audio_output_folder_path: str,
                                     ffmpeg_path: str, lyrics_output_folder_path: str):
    """Fetch metadata and .wav audio for each catalog link, then extract captions.

    Args:
        excel_path: Path of the excel file holding the catalog ('link' column,
            optional 'id' column).
        audio_output_folder_path: Folder receiving the .wav files; created
            when it does not exist.
        ffmpeg_path: ffmpeg location handed to yt-dlp for the audio conversion.
        lyrics_output_folder_path: Folder passed on to
            extract_lyrics_from_youtube_captions for the caption files.

    Returns:
        None. When the excel file is not found, a message is printed and the
        function returns without downloading anything or extracting captions.
    """

    # create the audio destination folder when it is not there yet
    audio_folder_exists = os.path.exists(audio_output_folder_path)
    if not audio_folder_exists:
        os.makedirs(audio_output_folder_path)

    # load the catalog into a pandas dataframe
    try:
        catalog = pd.read_excel(excel_path, dtype={'title': object, 'channel': object})
    except FileNotFoundError:
        print(f"Error: The file at {excel_path} was not found.")
        return
    else:
        print(f"File {excel_path} loaded successfully.")

    # dataframe column -> key looked up in the yt-dlp info dictionary
    metadata_fields = (
        ('title', 'title'),
        ('channel', 'uploader'),
        ('youtube_views', 'view_count'),
    )

    # walk the catalog row by row
    for row_idx, entry in catalog.iterrows():
        url = entry.get('link')
        track_id = entry.get('id', f"track_{row_idx}")

        # rows without a link cannot be processed
        if pd.isna(url):
            print(f"Skipping row {row_idx} due to missing link.")
            continue

        try:
            print(f"Processing music of id: {track_id}.")

            # first pass: query the video information only (nothing is downloaded)
            with yt_dlp.YoutubeDL({'quiet': True}) as probe:
                details = probe.extract_info(url, download=False)

                for column, key in metadata_fields:
                    catalog.at[row_idx, column] = details.get(key)
                catalog.at[row_idx, 'youtube_likes'] = details.get('like_count', 'N/A')

            # second pass: best available audio, converted to .wav through ffmpeg
            wav_postprocessor = {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }
            audio_config = {
                'format': 'bestaudio/best',
                'outtmpl': os.path.join(audio_output_folder_path, f"{track_id}.%(ext)s"),
                'postprocessors': [wav_postprocessor],
                'ffmpeg_location': ffmpeg_path,
                'quiet': True,
            }

            with yt_dlp.YoutubeDL(audio_config) as downloader:
                downloader.download([url])
            print(f"Downloaded and processed music of id: {track_id} successfully.")

        # any failure is reported and stored on the row, then the loop goes on
        except Exception as exc:
            failure = f"ERROR: {type(exc).__name__} - {exc}"
            print(f"Failed to process link. Reason:{failure}")

            if 'audio_flag' not in catalog.columns:
                catalog['audio_flag'] = None
            catalog.at[row_idx, 'audio_flag'] = failure

    # once the audio pass is over, extract the lyrics from the youtube captions
    extract_lyrics_from_youtube_captions(excel_path, lyrics_output_folder_path, ffmpeg_path, lang="en")

In [None]:
# run the full pipeline (audio and lyrics extraction) on the catalog of
# children's songs from 2019 to 2024, which is used as the test and
# control group for the main experiment
process_music_catalog_from_excel(
    excel_path="../../sample_data/music_collection.xlsx",
    ffmpeg_path=r"C:\ffmpeg\bin",
    audio_output_folder_path="../../sample_data/tracks",
    lyrics_output_folder_path="../../sample_data/captions",
)

In [None]:
# process the catalog of children's songs (released before 2019)
# to use as a training group for the SVM model classifier
# it includes only lyrics extraction
extract_lyrics_from_youtube_captions(
    excel_path="../../sample_data/svm_classifier/music_training_collection.xlsx",
    # the output folder parameter of this function is captions_output_folder_path;
    # passing lyrics_output_folder_path raises a TypeError (unexpected keyword)
    captions_output_folder_path="../../sample_data/svm_classifier/captions",
    ffmpeg_path=r"C:\ffmpeg\bin"
)