### Import libraries and links

from huggingface_hub import HfApi
import pandas as pd
# Hub API client shared by the cells below (dataset listing and per-dataset info lookups).
hfapi = HfApi()

In [None]:
# Load the data-provider table from the CSV file next to this notebook.
data_providers_path = '2024-11-05_data_providers.csv'
data_providers_df = pd.read_csv(filepath_or_buffer=data_providers_path)

In [None]:
# Display the provider-name column (all rows) as the cell output.
data_providers_df.loc[:, 'data_provider']

### Hugging Face

In [None]:
# Ask the Hub for every dataset matching the "geospatial" filter and keep
# only the repository ids; the ids drive the metadata lookups further down.
datasets = hfapi.list_datasets(filter="geospatial")
dataset_ids = []
for entry in datasets:
    dataset_ids.append(entry.id)

# Echo the ids, one per line (nothing is printed when the list is empty).
if dataset_ids:
    print(*dataset_ids, sep="\n")

In [None]:
def _first_attr(obj, *names):
    """Return the first non-None attribute of *obj* among *names*, else None.

    Missing attributes are treated like None instead of raising, so one absent
    optional field does not discard the whole dataset record.
    """
    for attr in names:
        value = getattr(obj, attr, None)
        if value is not None:
            return value
    return None


def _build_metadata(dataset):
    """Flatten one Hub dataset-info object into a catalogue record.

    Args:
        dataset: object returned by ``hfapi.dataset_info``; only ``id`` is
            required, every other attribute is read defensively.

    Returns:
        dict with one entry per catalogue column. Values the Hub object does
        not supply are "N/A" (or False for the boolean label flags).
    """
    # NOTE(review): newer huggingface_hub releases are understood to spell
    # these attributes `card_data` / `last_modified`, with the camelCase names
    # kept as older aliases -- confirm against the installed version.
    card = _first_attr(dataset, 'cardData', 'card_data')
    last_modified = _first_attr(dataset, 'lastModified', 'last_modified')

    def card_value(key, default="N/A"):
        # The card is consulted only when it is truthy; otherwise the
        # default is used as-is.
        return card.get(key, default) if card else default

    description = getattr(dataset, 'description', None) or "N/A"
    tags = getattr(dataset, 'tags', None)

    return {
        'dataset_name': dataset.id,
        # Collapse newlines/tabs so the description stays on one CSV line.
        'dataset_description': description.replace('\n', ' ').replace('\t', ' ').strip(),
        'dataset_description_link': f"https://huggingface.co/datasets/{dataset.id}",
        'dataset_download_link': f"https://huggingface.co/datasets/{dataset.id}/resolve/main/data",  # General download path
        'doi': "N/A",  # Placeholder, as Hugging Face does not provide DOI directly
        'provider': 'Hugging Face',
        'authors': card_value('authors'),
        # NOTE(review): this is the last-modified timestamp, not a
        # publication date -- confirm that is what the column should hold.
        'date_publish': last_modified,
        'version': card_value('version'),
        'sensor': "N/A",  # Not available in Hugging Face metadata
        'temporal_resolution': "N/A",
        'spatial_extent': "N/A",
        'temporal_extent': "N/A",
        # Data-type flags are hard-coded for every Hub dataset.
        'raster': False,
        'vector': False,
        'table': True,
        'task': ", ".join(tags) if tags else "N/A",
        'number_of_samples': card_value('number_of_samples'),
        'label_by_machine': card_value('label_by_machine', False),
        'label_by_human': card_value('label_by_human', False),
    }


# One record per dataset id; failures are reported and skipped so a single
# bad dataset does not abort the whole harvest.
metadata_list = []
for dataset_id in dataset_ids:
    try:
        dataset = hfapi.dataset_info(dataset_id)
        metadata_list.append(_build_metadata(dataset))
    except Exception as e:  # deliberate best-effort boundary per dataset
        print(f"Error fetching metadata for dataset {dataset_id}: {e}")


In [None]:
# Turn the collected per-dataset records into a table, one row per record.
df = pd.DataFrame(data=metadata_list)
# Preview the first rows as the cell output (5 is the pandas default).
df.head(n=5)

In [None]:
# Write the table to CSV; index=False keeps the row index out of the file.
output_path = 'dataset.csv'
df.to_csv(path_or_buf=output_path, index=False)