In [1]:
import pathlib
import os
import pandas as pd
import requests

def download_video_from_link(video_url, save_path, timeout=30, verify=True):
    """
    Download a video from a URL and save it to a file.

    The response body is streamed to a temporary ``<name>.part`` file next to
    the target and only renamed to ``save_path`` once the whole transfer has
    succeeded, so a failed or interrupted download never leaves a truncated
    video under the final name.

    Args:
        video_url (str): URL of the video to download.
        save_path (str or pathlib.Path): Full path of the file the video is
            written to (not a directory). Its parent directory must exist.
        timeout (float or tuple): Seconds to wait for the server before giving
            up; forwarded to ``requests.get``.
        verify (bool or str): TLS verification setting forwarded to
            ``requests.get``. Pass the path of a CA bundle for hosts whose
            certificate chain is not in the default trust store.

    Returns:
        bool: True if the video was saved, False if the request failed.
        Request errors are printed rather than raised; file-system errors
        still propagate.
    """
    target = pathlib.Path(save_path)
    partial = target.with_name(target.name + '.part')

    try:
        # The context manager releases the connection even if streaming fails.
        with requests.get(video_url, stream=True, timeout=timeout, verify=verify) as response:
            response.raise_for_status()  # Raise an error for bad responses

            with open(partial, 'wb') as video_file:
                # Written in 8 KiB chunks so the whole video is never held in memory.
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        video_file.write(chunk)

        # Publish the file under its final name only after a complete download.
        os.replace(partial, target)
        print(f"Video successfully downloaded to {save_path}")
        return True

    except requests.exceptions.RequestException as e:
        print(f"Error downloading video from {video_url}: {e}")
        return False

    finally:
        # No-op after a successful rename; removes the leftover partial file otherwise.
        if partial.exists():
            partial.unlink()

# 2. Download all videos for one word from all data sources
def download_videos_for_word(word, data_sources, base_path='C:/Users/91974/Desktop/Healthbrazil'):
    """
    Download all videos for a specific word across all data sources.

    For every data source, ``<base_path>/<data_source>/metadata.csv`` is read
    and its ``label`` column is compared with ``word`` (ignoring case and
    surrounding whitespace). The ``video_url`` of every matching row is
    downloaded to
    ``<base_path>/<data_source>/videos/review/<word>/<data_source>_<word>_<n>.mp4``,
    where ``n`` is the 1-based position of the row among the matches.

    A data source is skipped, with a printed message, when its metadata file
    is missing, lacks the ``label`` or ``video_url`` column, does not contain
    the word, or has no URL for it.

    Args:
        word (str): The word for which to download the videos.
        data_sources (dict): Dictionary of data sources; only the values
            (the source folder names) are used.
        base_path (str): The base directory where the data sources are located.
    """
    # Normalize the word to handle spaces and case sensitivity
    word_normalized = word.strip().lower()
    base_dir = pathlib.Path(base_path)

    for data_source in data_sources.values():
        print(f"Processing word: {word} from data source: {data_source}...")

        source_dir = base_dir / data_source
        metadata_csv_path = source_dir / 'metadata.csv'

        if not metadata_csv_path.exists():
            print(f"Metadata file for {data_source} not found at {metadata_csv_path}. Skipping.")
            continue

        metadata_csv = pd.read_csv(metadata_csv_path)

        # A malformed metadata file should skip this source, not abort the whole run.
        missing_columns = {'label', 'video_url'} - set(metadata_csv.columns)
        if missing_columns:
            print(f"Metadata file for {data_source} is missing required column(s) "
                  f"{sorted(missing_columns)}. Skipping.")
            continue

        # Normalize the labels the same way as the word before comparing
        labels_normalized = metadata_csv['label'].str.strip().str.lower()
        word_metadata = metadata_csv[labels_normalized == word_normalized]

        if word_metadata.empty:
            print(f"Word '{word}' not found in {data_source} metadata.")
            continue

        # Rows can match the word and still have an empty video_url cell.
        video_links = word_metadata['video_url']
        if not video_links.notna().any():
            print(f"No video links found for word '{word}' in {data_source} data source")
            continue

        # Set save path for review videos (per data source), creating it if needed
        review_videos_path = source_dir / 'videos' / 'review' / word
        review_videos_path.mkdir(parents=True, exist_ok=True)

        # Download the videos; numbering follows the matching rows so file
        # names stay stable even when some rows have no URL.
        for video_number, video_url in enumerate(video_links, start=1):
            if pd.isna(video_url):
                print(f"Skipping video {video_number} for word '{word}': no URL in {data_source} metadata.")
                continue

            video_name = f"{data_source}_{word}_{video_number}.mp4"  # Named after source and word
            save_path = review_videos_path / video_name
            print(f"Downloading video {video_number} from {video_url}...")
            download_video_from_link(video_url, save_path)


def download_videos_for_words(words_to_download, data_sources, base_path='C:/Users/91974/Desktop/Healthbrazil'):
    """
    Run the per-word download for each entry of a word list, in order.

    Args:
        words_to_download (list): Words whose videos should be downloaded.
        data_sources (dict): Dictionary of data sources, passed through unchanged.
        base_path (str): Base directory containing the data sources, passed
            through unchanged.
    """
    for word in words_to_download:
        # The banner is padded with blank lines to separate each word's log section.
        banner = f"\nStarting download for word: {word}...\n"
        print(banner)
        download_videos_for_word(
            word=word,
            data_sources=data_sources,
            base_path=base_path,
        )


def main():
    """Download the videos for a fixed word list from every configured data source.

    Uses the default ``base_path`` of ``download_videos_for_words``.
    """
    # Words to fetch; matching against the metadata ignores case and outer whitespace.
    # NOTE(review): 'MÃ¡ximo' looks like a mis-decoded 'Máximo' — confirm against
    # the labels in the metadata files before changing it.
    words_to_download = [
        'VACINA',
        'Antecipar',
        'Prevenção',
        'Hospital',
        'SINTOMA',
        'Medicamento',
        'MÃ¡ximo',
        'Medicina',
    ]

    # Available data sources; the values are the source names handed to the downloader.
    data_sources = {
        1: 'INES',
        2: 'V-Librasil',
        3: 'SignBank',
    }

    download_videos_for_words(words_to_download, data_sources)


# Run the download only when executed as a script, not on import.
if __name__ == "__main__":
    main()



Starting download for word: VACINA...

Processing word: VACINA from data source: INES...
Downloading video 1 from https://www.ines.gov.br/dicionario-de-libras/public/media/palavras/videos/vacinaSm_Prog001.mp4...
Video successfully downloaded to C:\Users\91974\Desktop\Healthbrazil\INES\videos\review\VACINA\INES_VACINA_1.mp4
Processing word: VACINA from data source: V-Librasil...
Word 'VACINA' not found in V-Librasil metadata.
Processing word: VACINA from data source: SignBank...
Downloading video 1 from https://videos.nals.cce.ufsc.br/SignBank/V%C3%ADdeos/VACINA.mp4...
Error downloading video from https://videos.nals.cce.ufsc.br/SignBank/V%C3%ADdeos/VACINA.mp4: HTTPSConnectionPool(host='videos.nals.cce.ufsc.br', port=443): Max retries exceeded with url: /SignBank/V%C3%ADdeos/VACINA.mp4 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))

Starting download for word: Antecipa