In [10]:
from pathlib import Path
from nb_utils import unzip, DATA_DIR, RESULTS_DIR
import pandas as pd
import os

In [11]:
# Unpack the archived PR metadata, load it into a frame and delete the large extracted CSV.
# The CSV is read from the same DATA_DIR location it is unzipped into and removed from,
# rather than from a hard-coded relative path that depends on the working directory.
csv_path = DATA_DIR / 'sfconvertbot_pr_metadata.csv'
unzip(DATA_DIR / 'sfconvertbot_pr_metadata.csv.zip', DATA_DIR)
df = pd.read_csv(csv_path)
os.remove(csv_path)

# Add a datetime `date` column: keep only the text before the first 'T' / space of `time`
# (the calendar-date part of values such as '2024-08-31T23:41:59.000Z') and parse it.
df['date'] = pd.to_datetime(df['time'].str.split('T').str[0].str.split(' ').str[0])

# Keep only rows dated on or before Aug 31 2024 (inclusive); this rebinds `df` to a new frame.
df = df[df['date'] <= pd.Timestamp(2024, 8, 31)]

# Series collecting dataset stats: index = metric name, value = metric value.
# An explicit dtype avoids the empty-Series default-dtype deprecation warning of older pandas.
stats = pd.Series(dtype=object)
stats.loc['Number of collected PRs'] = len(df)

# Remove rows with a missing value in any column
df = df.dropna()
stats.loc['Number of PRs after removing missing values'] = len(df)

# Remove rows whose discussion metadata does not look like a JSON object
df = df[df['discussion_metadata'].str.startswith('{')]
stats.loc['Number of PRs after removing invalid JSONs'] = len(df)

# Exclude PRs made on spaces/safetensors/convert. The pattern is a literal substring, so
# regex matching is switched off. `.copy()` makes the result an independent frame because
# later cells add columns to `df`.
df = df[~df['pr_url'].str.contains('/spaces/safetensors/convert/discussions', regex=False)].copy()
stats.loc['Number of PRs after filtering by URL'] = len(df)

# Summary stats over the remaining PRs. `time` is compared as a string here, so min/max
# are lexicographic — NOTE(review): correct only if every value shares one timestamp format.
stats.loc['Number of repositories'] = df['model_id'].nunique()
stats.loc['Average number of PRs per repository'] = df['model_id'].value_counts().mean()
stats.loc['First commit time'] = df['time'].min()
stats.loc['Last commit time'] = df['time'].max()

stats

Number of collected PRs                                         43650.0
Number of PRs after removing missing values                     43598.0
Number of PRs after removing invalid JSONs                      43598.0
Number of PRs after filtering by URL                            43596.0
Number of repositories                                          35094.0
Average number of PRs per repository                           1.242264
First commit time                              2022-11-07T13:00:55.000Z
Last commit time                               2024-08-31T23:41:59.000Z
dtype: object

In [12]:
from tqdm import tqdm
import json

def label_status(row: pd.Series) -> str:
    """Return the status label of a PR row, flagging merge conflicts.

    Args:
        row: Mapping-like row providing a 'status' entry and a 'conflicts' entry.

    Returns:
        ``row['status']`` unchanged when ``row['conflicts']`` is falsy (e.g. an empty
        string); otherwise the status followed by " (has conflicts)".
    """
    status = row['status']
    return f"{status} (has conflicts)" if row['conflicts'] else status

# Derive per-PR columns from the stored discussion / header JSON:
#   was_status_changed_by_hf_staff - the first status change not made by SFconvertbot came
#                                    from an author flagged `isHf`
#   status_changes                 - ';'-joined statuses of all status-change events
#   conflicts                      - ';'-joined header `filesWithConflicts` (empty if none)
#   status                         - header status, suffixed by label_status() on conflicts
#   time_to_first_response         - first non-bot status change minus PR creation time
for index, row in tqdm(df.iterrows(), total=len(df)):
    discussion = json.loads(row['discussion_metadata'])['discussion']
    created_at = pd.to_datetime(discussion['createdAt'])

    status_changes = []
    first_status_change_date = None
    changed_by_hf_staff = False
    for event in discussion['events']:
        if event['type'] != 'status-change':
            continue
        status_changes.append(event['data']['status'])

        # A missing author means the author has been deleted; use a placeholder
        # (without mutating the parsed event) so the checks below still work.
        event_author = event.get('author', {'name': 'UNKNOWN (likely a deleted account)', 'isHf': False})

        # Only the first status change made by someone other than the bot counts as the response
        if first_status_change_date is None and event_author['name'].lower() != 'sfconvertbot':
            first_status_change_date = pd.to_datetime(event['createdAt'])
            changed_by_hf_staff = bool(event_author['isHf'])

    df.at[index, 'was_status_changed_by_hf_staff'] = changed_by_hf_staff
    df.at[index, 'status_changes'] = ";".join(status_changes)

    # Files with conflicts and the current status come from the header metadata
    header_discussion = json.loads(row['header_metadata'])['discussion']
    conflicts = ";".join(header_discussion.get('filesWithConflicts', []))
    df.at[index, 'conflicts'] = conflicts
    # Label the status from just the two fields label_status reads, avoiding a full row lookup
    df.at[index, 'status'] = label_status(
        pd.Series({'status': header_discussion['status'], 'conflicts': conflicts})
    )

    # Time to first response (None when nobody but the bot changed the status)
    df.at[index, 'time_to_first_response'] = (
        first_status_change_date - created_at if first_status_change_date is not None else None
    )

# Add a year column once for the whole frame; it does not depend on the row being
# processed, so it is computed after the loop instead of on every iteration.
df['year'] = df['date'].dt.year

stats

100%|██████████| 43596/43596 [00:40<00:00, 1089.75it/s]


Number of collected PRs                                         43650.0
Number of PRs after removing missing values                     43598.0
Number of PRs after removing invalid JSONs                      43598.0
Number of PRs after filtering by URL                            43596.0
Number of repositories                                          35094.0
Average number of PRs per repository                           1.242264
First commit time                              2022-11-07T13:00:55.000Z
Last commit time                               2024-08-31T23:41:59.000Z
dtype: object

In [13]:
# Display the PR frame (pandas truncates the rendering of long frames)
df

Unnamed: 0,pr_url,discussion_metadata,header_metadata,model_id,time,date,was_status_changed_by_hf_staff,status_changes,conflicts,status,time_to_first_response,year
2146,https://huggingface.co/Emmaruyi/CompetitionSty...,"{""apiBaseUrl"":""/api/models/Emmaruyi/Competitio...","{""discussion"":{""_id"":""66d3aa474b87a685cc25d69a...",Emmaruyi/CompetitionStyle-ArchitectureGenerator,2024-08-31T23:41:59.000Z,2024-08-31,False,merged,,merged,0 days 00:02:15,2024
2147,https://huggingface.co/Delta-Vector/Holland-4B...,"{""apiBaseUrl"":""/api/models/Delta-Vector/Hollan...","{""discussion"":{""_id"":""66d33f63b4396d43c3bab8a5...",Delta-Vector/Holland-4B-V1,2024-08-31T16:05:55.000Z,2024-08-31,False,merged,,merged,0 days 00:02:20,2024
2148,https://huggingface.co/vespa-engine/col-minilm...,"{""apiBaseUrl"":""/api/models/vespa-engine/col-mi...","{""discussion"":{""_id"":""66d1c3704c596f785214538c...",vespa-engine/col-minilm,2024-08-30T13:04:48.000Z,2024-08-30,False,,,open,NaT,2024
2149,https://huggingface.co/Abhaykoul/Abhayjr/discu...,"{""apiBaseUrl"":""/api/models/Abhaykoul/Abhayjr"",...","{""discussion"":{""_id"":""66d03d85378dfa1ea4a6565b...",Abhaykoul/Abhayjr,2024-08-29T09:21:09.000Z,2024-08-29,False,merged,,merged,0 days 00:00:30,2024
2150,https://huggingface.co/robotics-diffusion-tran...,"{""apiBaseUrl"":""/api/models/robotics-diffusion-...","{""discussion"":{""_id"":""66d01d22d2480a2555be07d4...",robotics-diffusion-transformer/rdt-1b,2024-08-29T07:02:58.000Z,2024-08-29,False,,,open,NaT,2024
...,...,...,...,...,...,...,...,...,...,...,...,...
46315,https://huggingface.co/pchatz/palobert-base-gr...,"{""apiBaseUrl"":""/api/models/pchatz/palobert-bas...","{""discussion"":{""_id"":""642c682aac7e48a03a0999c2...",pchatz/palobert-base-greek-social-media,2023-04-04T18:10:50.000Z,2023-04-04,False,,,open,NaT,2023
46316,https://huggingface.co/Merry/pythia-160m-dedup...,"{""apiBaseUrl"":""/api/models/Crataco/Pythia-160M...","{""discussion"":{""_id"":""642c71f9c811cd1de5bacd0d...",Merry/pythia-160m-deduped-aid,2023-04-04T18:52:41.000Z,2023-04-04,False,merged,,merged,0 days 02:57:19,2023
46321,https://huggingface.co/babylm/t5-base-strict/d...,"{""apiBaseUrl"":""/api/models/babylm/t5-base-stri...","{""discussion"":{""_id"":""642cbf013bf67e40142189c6...",babylm/t5-base-strict,2023-04-05T00:21:21.000Z,2023-04-05,False,,,open,NaT,2023
46322,https://huggingface.co/zhaozh/autotrain-pubmed...,"{""apiBaseUrl"":""/api/models/sindri101/autotrain...","{""discussion"":{""_id"":""642d0c4f9efab84585efbea6...",zhaozh/autotrain-pubmed-medft-tiny-46854115566,2023-04-05T05:51:11.000Z,2023-04-05,False,,,open,NaT,2023


In [15]:
from nb_utils import read_repositories_evolution

# Model files per repository, with the elapsed days between the commit and safetensors' release date
repos = read_repositories_evolution()
repos['year'] = repos['date'].dt.year

# Latest commit of every repository in every year: after sorting by date, the last row
# of each (repo_url, year) pair identifies that commit.
df_last_hash = (
    repos
    .sort_values('date')
    .drop_duplicates(['repo_url', 'year'], keep='last')
    .loc[:, ['repo_url', 'year', 'commit_hash', 'date']]
)

# Keep only the rows of `repos` that belong to the last commit of their repository for that year
repos = pd.merge(
    repos,
    df_last_hash,
    how='inner',
    on=['repo_url', 'year', 'date', 'commit_hash'],
)
("Min date", repos['date'].min(), "Max date", repos['date'].max())

('Min date',
 Timestamp('2019-09-23 15:48:20'),
 'Max date',
 Timestamp('2024-08-26 04:05:14'))

In [16]:
# Display the per-file rows kept for the last commit of each repository and year
repos

Unnamed: 0,repo_url,commit_hash,model_file_path,serialization_format,message,author,date,is_in_commit,change_status,elapsed_days,year
0,rinna/japanese-gpt-neox-small,f33d44540298e143079f34c35b21eebe4f3dce89,model.safetensors,safetensors,Adding `safetensors` variant of this model (#2...,Tianyu Zhao,2023-08-04 06:46:32,True,+,315,2023
1,rinna/japanese-gpt-neox-small,f33d44540298e143079f34c35b21eebe4f3dce89,pytorch_model.bin,torch.save,Adding `safetensors` variant of this model (#2...,Tianyu Zhao,2023-08-04 06:46:32,False,,315,2023
2,rinna/japanese-gpt-neox-small,f33d44540298e143079f34c35b21eebe4f3dce89,spiece.model,ONNX,Adding `safetensors` variant of this model (#2...,Tianyu Zhao,2023-08-04 06:46:32,False,,315,2023
3,rinna/japanese-gpt-neox-small,f3cdc9a0868ac3d6dbd8f7431f719dc07c5114e9,pytorch_model.bin,torch.save,init commit,Tianyu Zhao,2022-08-31 02:01:42,True,+,-23,2022
4,rinna/japanese-gpt-neox-small,f3cdc9a0868ac3d6dbd8f7431f719dc07c5114e9,spiece.model,ONNX,init commit,Tianyu Zhao,2022-08-31 02:01:42,True,+,-23,2022
...,...,...,...,...,...,...,...,...,...,...,...
7545,albert/albert-base-v1,66b3f3c279cf3a1df6d8df1cffa60d4d026ea396,with-prefix-tf_model.h5,h5/hdf5,Adding `safetensors` variant of this model (#1...,Sylvain Gugger,2023-04-06 09:42:57,False,,195,2023
7546,albert/albert-base-v1,7b2207b0995b9618ed866befd087bee3673f022d,pytorch_model.bin,torch.save,Update tf_model.h5,system,2020-06-22 11:29:35,False,,-823,2020
7547,albert/albert-base-v1,7b2207b0995b9618ed866befd087bee3673f022d,spiece.model,ONNX,Update tf_model.h5,system,2020-06-22 11:29:35,False,,-823,2020
7548,albert/albert-base-v1,7b2207b0995b9618ed866befd087bee3673f022d,tf_model.h5,h5/hdf5,Update tf_model.h5,system,2020-06-22 11:29:35,True,+,-823,2020


In [17]:
# Keep only the model files whose serialization format is torch.save
is_torch_save = repos['serialization_format'].eq('torch.save')
repos_torch_save = repos[is_torch_save]
repos_torch_save

Unnamed: 0,repo_url,commit_hash,model_file_path,serialization_format,message,author,date,is_in_commit,change_status,elapsed_days,year
1,rinna/japanese-gpt-neox-small,f33d44540298e143079f34c35b21eebe4f3dce89,pytorch_model.bin,torch.save,Adding `safetensors` variant of this model (#2...,Tianyu Zhao,2023-08-04 06:46:32,False,,315,2023
3,rinna/japanese-gpt-neox-small,f3cdc9a0868ac3d6dbd8f7431f719dc07c5114e9,pytorch_model.bin,torch.save,init commit,Tianyu Zhao,2022-08-31 02:01:42,True,+,-23,2022
5,Intel/roberta-base-squad2-int8-static-inc,79caea4b54eec88d1a75f641ecce4446aa8fd2a9,pytorch_model.bin,torch.save,upload int8 model.,"Lv, Kaokao",2022-08-30 05:25:59,True,+,-24,2022
6,Intel/roberta-base-squad2-int8-static-inc,79caea4b54eec88d1a75f641ecce4446aa8fd2a9,training_args.bin,torch.save,upload int8 model.,"Lv, Kaokao",2022-08-30 05:25:59,True,+,-24,2022
7,saphvis/ngpx2022,d5011c5daf903087aa8d424c878cc82640611d1d,ngpx2022.pt,torch.save,ngpx2022 400000 upload,ocular degenerate,2022-08-29 21:26:46,True,+,-25,2022
...,...,...,...,...,...,...,...,...,...,...,...
7531,albert/albert-base-v2,c3fcc1d2675a05929ea687d4ffbb537d1aa9c57a,pytorch_model.bin,torch.save,Adding `safetensors` variant of this model (#2...,Sylvain Gugger,2023-04-06 09:39:39,False,,195,2023
7535,albert/albert-base-v2,44ec114441af838d5133509b2bf7ed774c6b93c1,pytorch_model.bin,torch.save,Update tf_model.h5,system,2020-06-22 11:32:37,False,,-823,2020
7539,albert/albert-base-v2,ada8cde1d4486a2de1a5a1a242d43251074bf415,pytorch_model.bin,torch.save,Update pytorch_model.bin,system,2019-11-06 11:24:17,True,+,-1052,2019
7542,albert/albert-base-v1,66b3f3c279cf3a1df6d8df1cffa60d4d026ea396,pytorch_model.bin,torch.save,Adding `safetensors` variant of this model (#1...,Sylvain Gugger,2023-04-06 09:42:57,False,,195,2023


In [19]:
# Keep only PRs whose status is exactly 'open'; a status carrying any suffix
# (such as ' (has conflicts)') does not match this equality test.
is_open = df['status'].eq('open')
df_open_status = df[is_open]
df_open_status

Unnamed: 0,pr_url,discussion_metadata,header_metadata,model_id,time,date,was_status_changed_by_hf_staff,status_changes,conflicts,status,time_to_first_response,year
2148,https://huggingface.co/vespa-engine/col-minilm...,"{""apiBaseUrl"":""/api/models/vespa-engine/col-mi...","{""discussion"":{""_id"":""66d1c3704c596f785214538c...",vespa-engine/col-minilm,2024-08-30T13:04:48.000Z,2024-08-30,False,,,open,NaT,2024
2150,https://huggingface.co/robotics-diffusion-tran...,"{""apiBaseUrl"":""/api/models/robotics-diffusion-...","{""discussion"":{""_id"":""66d01d22d2480a2555be07d4...",robotics-diffusion-transformer/rdt-1b,2024-08-29T07:02:58.000Z,2024-08-29,False,,,open,NaT,2024
2151,https://huggingface.co/Kwai-Kolors/Kolors-IP-A...,"{""apiBaseUrl"":""/api/models/Kwai-Kolors/Kolors-...","{""discussion"":{""_id"":""66cfd824aab5a6fbf824503c...",Kwai-Kolors/Kolors-IP-Adapter-Plus,2024-08-29T02:08:36.000Z,2024-08-29,False,,,open,NaT,2024
2153,https://huggingface.co/cerspense/zeroscope_v2_...,"{""apiBaseUrl"":""/api/models/cerspense/zeroscope...","{""discussion"":{""_id"":""66cef3496116c7f53e8327c0...",cerspense/zeroscope_v2_576w,2024-08-28T09:52:09.000Z,2024-08-28,False,,,open,NaT,2024
2154,https://huggingface.co/cvssp/audioldm-l-full/d...,"{""apiBaseUrl"":""/api/models/cvssp/audioldm-l-fu...","{""discussion"":{""_id"":""66ced702103ac47986f07ad4...",cvssp/audioldm-l-full,2024-08-28T07:51:30.000Z,2024-08-28,False,,,open,NaT,2024
...,...,...,...,...,...,...,...,...,...,...,...,...
46309,https://huggingface.co/babylm/roberta-base-str...,"{""apiBaseUrl"":""/api/models/babylm/roberta-base...","{""discussion"":{""_id"":""642c25fafe98081a6fdd65f3...",babylm/roberta-base-strict,2023-04-04T13:28:26.000Z,2023-04-04,False,,,open,NaT,2023
46315,https://huggingface.co/pchatz/palobert-base-gr...,"{""apiBaseUrl"":""/api/models/pchatz/palobert-bas...","{""discussion"":{""_id"":""642c682aac7e48a03a0999c2...",pchatz/palobert-base-greek-social-media,2023-04-04T18:10:50.000Z,2023-04-04,False,,,open,NaT,2023
46321,https://huggingface.co/babylm/t5-base-strict/d...,"{""apiBaseUrl"":""/api/models/babylm/t5-base-stri...","{""discussion"":{""_id"":""642cbf013bf67e40142189c6...",babylm/t5-base-strict,2023-04-05T00:21:21.000Z,2023-04-05,False,,,open,NaT,2023
46322,https://huggingface.co/zhaozh/autotrain-pubmed...,"{""apiBaseUrl"":""/api/models/sindri101/autotrain...","{""discussion"":{""_id"":""642d0c4f9efab84585efbea6...",zhaozh/autotrain-pubmed-medft-tiny-46854115566,2023-04-05T05:51:11.000Z,2023-04-05,False,,,open,NaT,2023


In [20]:
# Pair every open PR with the torch.save file rows of the same repository
# (PR `model_id` matched against `repo_url`); rows without a partner are dropped.
matching_instances = df_open_status.merge(
    repos_torch_save,
    how='inner',
    left_on='model_id',
    right_on='repo_url',
)

# Each result row is one (PR, file row) pair, so this is the number of pairs
matching_count = matching_instances.shape[0]

# Report the count
print(f"Number of matching instances: {matching_count}")

Number of matching instances: 300


In [21]:
# Drop rows that are exact duplicates across all columns
matching_instances_no_dups = matching_instances.drop_duplicates()

# Number of distinct (PR, file row) pairs left
matching_count_no_dups = matching_instances_no_dups.shape[0]

# Report the count
print(f"Number of unique matching instances: {matching_count_no_dups}")

Number of unique matching instances: 300
