#### Libraries import, logging and HF API setup

In [65]:
from huggingface_hub import HfApi
import pandas as pd
import json
import logging
import arxiv as ax
from arxiv import Client, Search

logging.basicConfig(level=logging.DEBUG)

api = HfApi()

In [None]:
try:
    models_generator = api.list_models(full=True, cardData=True, fetch_config=True)
except Exception as e:
    logging.error(f"Error fetching models from Hugging Face API: {e}")
    models_generator = iter([])

data_list = []

model_iter = iter(models_generator)
while True:
    try:
        model = next(model_iter)
    except StopIteration:
        break
    except Exception as e:
        logging.error(f"Skipping model due to load error: {e}")
        continue
    try:
        arxiv_ids = [x for x in model.tags if x.startswith("arxiv")] # Extract arXiv IDs
        arxiv_ids_str = ", ".join(arxiv_ids) if arxiv_ids else None # Convert to a list of strings in case of multiple arXiv IDs

        base_model = model.card_data.get("base_model", None) if model.card_data else None #Extract base model
        if isinstance(base_model, list):
            base_model = ", ".join(base_model)
        elif not isinstance(base_model, str):
            base_model = None

        language = model.card_data.get("language", None) if model.card_data else None # Extract language and convert to a string if it's a list
        if isinstance(language, list):
            language = ", ".join(language)
        elif not isinstance(language, str):
            language = None

        # Extract evaluation results
        eval_list = []
        if model.card_data and model.card_data.eval_results:
            # Check if model_name exists in card_data
            if not model.card_data.model_name:
                logging.debug(f"Model {model.id} has eval_results but no model_name, skipping evaluation results.")
            else:
                for result in model.card_data.eval_results:
                    eval_list.append({
                        "task_type": result.task_type,
                        "dataset_name": result.dataset_name,
                        "metric_type": result.metric_type,
                        "metric_value": result.metric_value
                    })

        data_list.append({
            'id': model.id,
            'author': model.author,
            'created_at': model.created_at,
            'downloads': model.downloads,
            'pipeline_tag': model.pipeline_tag,
            'arxiv_ids': arxiv_ids_str,
            'base_model': base_model,
            'language': language,
            'evaluation_metrics': json.dumps(eval_list) if eval_list else None
        })
        
    except Exception as e:
        logging.error(f"Error processing model {model.id}: {e}")

logging.info(f"Processed {len(data_list)} models") # Processed 1404500 models
df_final = pd.DataFrame(data_list)

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not

Unnamed: 0,id,author,created_at,downloads,pipeline_tag,arxiv_ids,base_model,language
0,tabularisai/multilingual-sentiment-analysis,tabularisai,2024-12-07 17:56:18+00:00,69454,text-classification,,distilbert/distilbert-base-multilingual-cased,"en, zh, es, hi, ar, bn, pt, ru, ja, de, ms, te..."
1,ProsusAI/finbert,ProsusAI,2022-03-02 23:29:04+00:00,1559705,text-classification,arxiv:1908.10063,,en
2,BAAI/bge-reranker-v2-m3,BAAI,2024-03-15 13:32:18+00:00,963605,text-classification,"arxiv:2312.15503, arxiv:2402.03216",,multilingual
3,cardiffnlp/twitter-roberta-base-sentiment-latest,cardiffnlp,2022-03-15 01:21:58+00:00,2195066,text-classification,arxiv:2202.03829,,en
4,facebook/bart-large-mnli,facebook,2022-03-02 23:29:05+00:00,2823122,zero-shot-classification,"arxiv:1910.13461, arxiv:1909.00161",,
...,...,...,...,...,...,...,...,...
4995,Jeevesh8/bert-base-uncased_mnli_ft_69,Jeevesh8,2022-04-19 00:06:59+00:00,4,text-classification,,,
4996,Jeevesh8/bert-base-uncased_mnli_ft_65,Jeevesh8,2022-04-19 00:07:56+00:00,4,text-classification,,,
4997,ChrisZeng/electra-large-discriminator-nli-efl-...,ChrisZeng,2022-04-19 00:29:30+00:00,27,text-classification,,,
4998,anshr/distilbert_reward_model_01,anshr,2022-04-19 00:42:43+00:00,163,text-classification,,,


#### Data pre-processing

In [None]:
# Drop duplicates
if not df_final.empty:
    df_final = df_final.drop_duplicates(subset=['evaluation_metrics'], keep=False) # Drop duplicates based on evaluation metrics
    df_final["id_second_part"] = df_final["id"].apply(lambda x: x.split("/")[1] if "/" in x else x) 
    df_final = df_final.drop_duplicates(subset=["id_second_part"], keep="first") # Drop model duplicates
    df_final = df_final.drop(columns=["id_second_part"])
    df_final = df_final[df_final["downloads"] != 0]

df_final.to_csv("./output/full-dataset.csv", index=False)

In [None]:
dataframe = pd.read_csv("./output/full-dataset.csv") # 1404500 -> 25254
print("Before filtering:", dataframe.shape)
dataframe = dataframe[dataframe["downloads"] != 0] # 25254 -> 18741
#dataframe = dataframe[dataframe["downloads"] < 50] # Filter models with less than 50 downloads - Probably to apply. 
print("After filtering:", dataframe.shape)

dataframe.to_csv("./output/full-dataset.csv", index=False)

Before filtering: (25254, 9)
After filtering: (18741, 9)


#### Retrieving authors, title and summary for each paper and put them into a dataframe

In [None]:
print(f"Number of arvix ids {dataframe[dataframe["arxiv_ids"].notnull()].shape}")

paper_list = dataframe["arxiv_ids"].dropna().tolist() # Retrieve arXiv IDs
print(len(paper_list)) # 3125
split_data = [entry.strip() for item in paper_list for entry in item.split(',')] # Split and create a list of arXiv IDs
split_data = list(set(split_data)) # Remove duplicates
print(split_data)
print(len(split_data))

Number of arvix ids (3125, 9)
3125
['arxiv:2002.00293', 'arxiv:2202.03555', 'arxiv:2405.05374', 'arxiv:2104.08678', 'arxiv:2004.10964', 'arxiv:2402.16107', 'arxiv:2310.16609', 'arxiv:2311.09613', 'arxiv:2310.04921', 'arxiv:2405.00675', 'arxiv:2204.08387', 'arxiv:2403.03206', 'arxiv:2012.03411', 'arxiv:2106.13731', 'arxiv:2204.10757', 'arxiv:2401.02415', 'arxiv:2106.13687', 'arxiv:2203.00585', 'arxiv:2310.00752', 'arxiv:2101.11718', 'arxiv:2203.09509', 'arxiv:2311.11691', 'arxiv:2311.07052', 'arxiv:1603.08983', 'arxiv:2205.13147', 'arxiv:1907.12412', 'arxiv:1908.07490', 'arxiv:2306.01708', 'arxiv:2412.03187', 'arxiv:2405.07703', 'arxiv:1911.11641', 'arxiv:0000.00000', 'arxiv:2410.02525', 'arxiv:2305.10853', 'arxiv:2308.07124', 'arxiv:2311.13534', 'arxiv:2312.12450', 'arxiv:2304.12244', 'arxiv:2408.07990', 'arxiv:2310.16049', 'arxiv:2110.08207', 'arxiv:2405.19495', 'arxiv:2411.15734', 'arxiv:2402.14830', 'arxiv:2308.11878', 'arxiv:2312.06795', 'arxiv:2310.19923', 'arxiv:2401.10491', 'arx

In [None]:
paper_info_list = []
i = 0
for paper in split_data:
    try:
        print(f"Getting info for paper {paper}")
        paper_id = paper.split(":")[1].strip()
        print(f"Paper ID: {paper_id}")
        paper_info = api.paper_info(paper_id)
        if isinstance(paper_info.authors, list):
            paper_authors = ", ".join(paper_info.authors)
        else:
            continue
    except Exception as e:
        print(f"Error getting paper info for {paper}: {e}")
        continue

    
    paper_info_list.append([paper_info.id, paper_authors, paper_info.title, paper_info.summary])
    
paper_info_df = pd.DataFrame(paper_info_list, columns=["id", "authors", "title", "summary"])
paper_info_df.to_csv("./output/paper_info.csv", index=False)    