# Fetching Datasets from a Hugging Face Organization
This notebook demonstrates how to retrieve datasets published by a specific organization on Hugging Face.

# Function to Retrieve Organization Datasets
This cell defines a function `get_org_datasets` that queries the Hugging Face Hub API and returns the IDs of the datasets published by a given organization.

In [1]:
import requests

def get_org_datasets(org_name):
    """
    Retrieves the datasets published by a specific organization on Hugging Face.

    Sends a GET request to ``https://huggingface.co/api/datasets`` filtered by
    ``author`` and collects the ``"id"`` field of every entry in the JSON
    response. If a response advertises a ``next`` URL in its ``Link`` header,
    that page is fetched as well, until no further page is announced.

    Args:
        org_name (str): Name of the organization (e.g., 'oscur')

    Returns:
        list: A list of dataset IDs belonging to the organization

    Raises:
        requests.HTTPError: If any response has a status code other than 200.
        requests.RequestException: On connection failures or when the request
            times out.
    """
    url = "https://huggingface.co/api/datasets"
    # Let requests build the query string so that special characters in
    # org_name are percent-encoded instead of corrupting the URL.
    params = {"author": org_name}

    dataset_ids = []
    while url:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            # HTTPError is a subclass of Exception, so existing
            # `except Exception` handlers keep working.
            raise requests.HTTPError(
                f"Failed to fetch datasets for {org_name}: {response.status_code}",
                response=response,
            )

        dataset_ids.extend(dataset["id"] for dataset in response.json())

        # `links` is empty when there is no Link header, which ends the loop.
        url = response.links.get("next", {}).get("url")
        # A "next" URL already contains its own query string.
        params = None

    return dataset_ids


# Example Usage of `get_org_datasets`
This cell demonstrates how to use the `get_org_datasets` function to fetch datasets for a specific organization. Here, we use the organization name "oscur" as an example.

In [None]:
# List every dataset ID returned for the example organization.
org_name = "oscur"
datasets = get_org_datasets(org_name)

header = f"Datasets published by '{org_name}':"
print(header)
for dataset_id in datasets:
    print(f"- {dataset_id}")

Datasets published by 'oscur':
- oscur/automated-traffic-volume-counts-sample
- oscur/automated-traffic-volume-counts
- oscur/test_pluto
- oscur/NYC_vehicle_collisions_issue
- oscur/NYC_vehicle_collisions
- oscur/pluto
- oscur/NYC_311
- oscur/taxisvis1M
- oscur/NYC_raised_crosswalk
- oscur/NYC_speed_humps
- oscur/test-profiler1
- oscur/test2-profiler3
- oscur/test2-profiler4


# Downloading Files from Hugging Face Datasets
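This cell defines functions to download a specific file from each dataset published by an organization. It also redefines `get_org_datasets`, which replaces the version defined in the first code cell once this cell is run.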

In [None]:
import os
import requests

def get_org_datasets(org_name):
    """
    Fetch dataset IDs for a Hugging Face organization.

    NOTE: this has the same name as the function defined in the first code
    cell; running this cell replaces that earlier definition.

    Args:
        org_name (str): Name of the organization (e.g., 'oscur')

    Returns:
        list: The ``"id"`` field of every dataset entry returned by the API,
            including entries from follow-up pages announced through a
            ``next`` URL in the response's ``Link`` header.

    Raises:
        requests.HTTPError: If a response carries a 4xx/5xx status code.
        requests.RequestException: On connection failures or when the request
            times out.
    """
    url = "https://huggingface.co/api/datasets"
    # Passing the filter via `params` makes requests percent-encode org_name.
    params = {"author": org_name}

    dataset_ids = []
    while url:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        dataset_ids.extend(dataset["id"] for dataset in response.json())

        # `links` is empty when there is no Link header, which ends the loop.
        url = response.links.get("next", {}).get("url")
        # A "next" URL already contains its own query string.
        params = None

    return dataset_ids

def download_file_from_dataset(dataset_id, file_name, save_dir="downloads"):
    """
    Download a specific file from a Hugging Face dataset repo if it exists.

    The file is requested from the ``main`` revision of the dataset repo. On
    success it is written to ``<save_dir>/<owner>__<name>/<file_name>`` (the
    ``/`` in the dataset ID is replaced by ``__``). Nothing is created on disk
    when the download fails. The outcome is reported with ``print``.

    Args:
        dataset_id (str): Full dataset ID (e.g., 'oscur/nyc-crashes')
        file_name (str): File to download (e.g., 'profiling_metadata.json')
        save_dir (str): Local folder to save downloaded files

    Returns:
        None

    Raises:
        requests.RequestException: On connection failures or when the request
            times out.
        OSError: If the target directory or file cannot be written.
    """
    # Construct URL to raw file
    url = f"https://huggingface.co/datasets/{dataset_id}/resolve/main/{file_name}"

    # Attempt download; the timeout prevents an indefinite hang on a stalled
    # connection.
    response = requests.get(url, timeout=30)

    if response.status_code == 404:
        print(f"❌ File not found in {dataset_id}: {file_name}")
        return
    if response.status_code != 200:
        # Other failures (e.g. authentication or server errors) are not a
        # missing file, so report the actual status instead.
        print(
            f"❌ Failed to download {file_name} from {dataset_id}: "
            f"HTTP {response.status_code}"
        )
        return

    # Prepare local save path only once there is content to store, so failed
    # lookups do not leave empty directories behind.
    dataset_subdir = os.path.join(save_dir, dataset_id.replace("/", "__"))
    file_path = os.path.join(dataset_subdir, file_name)
    # file_name may contain sub-folders (e.g. 'data/info.json'), so create the
    # file's own parent directory rather than just dataset_subdir.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, "wb") as f:
        f.write(response.content)
    print(f"✅ Downloaded {file_name} from {dataset_id}")

def download_profiling_files(org_name, file_name="profiling_metadata.json", save_dir="downloads"):
    """
    Download a specific file from all datasets in an org.

    Looks up the organization's dataset IDs with ``get_org_datasets`` and
    calls ``download_file_from_dataset`` once per dataset.

    Args:
        org_name (str): Name of the organization (e.g., 'oscur')
        file_name (str): File to request from every dataset repo
        save_dir (str): Local folder passed on to
            ``download_file_from_dataset`` for each dataset

    Returns:
        None
    """
    for dataset_id in get_org_datasets(org_name):
        download_file_from_dataset(dataset_id, file_name, save_dir=save_dir)



# Example Usage of File Download
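This cell calls `download_profiling_files` for the "oscur" organization, which looks for `profiling_metadata.json` (the default file name) in each of its datasets. The output marks every dataset with ✅ or ❌ depending on whether the file was found.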

In [4]:

# Request the default profiling file from every dataset of the example org.
download_profiling_files(org_name="oscur")

❌ File not found in oscur/automated-traffic-volume-counts-sample: profiling_metadata.json
❌ File not found in oscur/automated-traffic-volume-counts: profiling_metadata.json
❌ File not found in oscur/test_pluto: profiling_metadata.json
❌ File not found in oscur/NYC_vehicle_collisions_issue: profiling_metadata.json
✅ Downloaded profiling_metadata.json from oscur/NYC_vehicle_collisions
✅ Downloaded profiling_metadata.json from oscur/pluto
✅ Downloaded profiling_metadata.json from oscur/NYC_311
✅ Downloaded profiling_metadata.json from oscur/taxisvis1M
✅ Downloaded profiling_metadata.json from oscur/NYC_raised_crosswalk
✅ Downloaded profiling_metadata.json from oscur/NYC_speed_humps
❌ File not found in oscur/test-profiler1: profiling_metadata.json
✅ Downloaded profiling_metadata.json from oscur/test2-profiler3
❌ File not found in oscur/test2-profiler4: profiling_metadata.json
