In [1]:
from huggingface_hub import list_datasets
import pandas as pd
from tqdm import tqdm

In [2]:
def collect_arabic_datasets_with_tasks():
    """Collect Hugging Face datasets tagged as Arabic that declare a usable task.

    Lists every Hub dataset matching the ``language:ar`` filter (full metadata,
    sorted by downloads) and keeps those whose card metadata declares
    ``task_ids`` or ``task_categories``, unless the only declared category is
    ``"other"``. Progress and summary counts are printed along the way.

    Returns:
        pandas.DataFrame: one row per retained dataset with the columns
        ``"Dataset ID"``, ``"Language"`` (always ``"ar"``) and
        ``"Task Categories"`` (a list, or ``None`` when only ``task_ids`` are
        declared). The three columns are present even when nothing matches.
    """
    print(" Collecting Arabic datasets from Hugging Face...")

    # list_datasets is consumed lazily; materialise it so len() can be used.
    datasets = list(list_datasets(
        filter="language:ar",
        full=True,
        sort="downloads"
    ))

    total_count = len(datasets)
    print(f" Total Arabic datasets found: {total_count}")

    filtered_data = []
    no_task_count = 0

    for ds in tqdm(datasets, desc="Filtering datasets"):
        # Card metadata is read from `card_data` when the info object has it
        # (the attribute name used by recent huggingface_hub releases, as far
        # as the public docs show) and from the legacy `cardData` otherwise.
        card = getattr(ds, "card_data", None) or getattr(ds, "cardData", None) or {}
        task_ids = card.get("task_ids")
        task_categories = card.get("task_categories")

        # Card YAML may hold a single category as a bare string; wrap it so the
        # 'other' check below and the exported column always deal with lists.
        if isinstance(task_categories, str):
            task_categories = [task_categories]

        # Exclude if no task OR only task category is 'other'
        has_valid_task = (
            bool(task_ids or task_categories)
            and task_categories != ["other"]
        )

        if has_valid_task:
            filtered_data.append({
                "Dataset ID": ds.id,
                "Language": "ar",
                "Task Categories": task_categories
            })
        else:
            no_task_count += 1

    with_task_count = len(filtered_data)

    print(f" Arabic datasets with valid tasks: {with_task_count}")
    print(f" Arabic datasets without tasks: {no_task_count}")

    # Name the columns explicitly so an empty result still has the same schema.
    df = pd.DataFrame(
        filtered_data,
        columns=["Dataset ID", "Language", "Task Categories"],
    )
    return df

In [3]:
# Run the collection end to end and keep the resulting table for later cells.
df_arabic = collect_arabic_datasets_with_tasks()
# Preview a single row as a quick check of the column layout.
df_arabic.head(n=1)

 Collecting Arabic datasets from Hugging Face...
 Total Arabic datasets found: 1219


Filtering datasets: 100%|███████████████████████████████████████████████████████| 1219/1219 [00:00<00:00, 420638.14it/s]

 Arabic datasets with valid tasks: 878
 Arabic datasets without tasks: 341





Unnamed: 0,Dataset ID,Language,Task Categories
0,allenai/c4,ar,"[text-generation, fill-mask]"
