# Selecting model repositories to include in our study

1. Load the data
2. Filter the data
3. Save the data

## 1 - Load the repositories' metadata

In [1]:
import pandas as pd

# Path of the uncompressed metadata dump. The variable keeps the plain `.json`
# name because the save step at the end of the notebook derives its output
# file name from it.
input_file = "../../data/huggingface_sort_by_createdAt_top996939.json"
# Read the JSON straight from the zip archive instead of shelling out to
# `unzip` / `rm`: nothing is extracted to disk, there is no interactive
# "replace?" prompt when a stale extracted copy is lying around, and the cell
# no longer depends on external shell commands being installed.
# pandas requires the archive to contain exactly one data file.
df = pd.read_json(input_file + ".zip", compression="zip")
df["created_at"].min()

Archive:  ../../data/huggingface_sort_by_createdAt_top996939.json.zip
  inflating: ../../data/huggingface_sort_by_createdAt_top996939.json  


Timestamp('2022-03-02 23:29:04+0000', tz='UTC')

In [2]:
# Caveat from the Hugging Face Hub API: every repository created before the
# Hub began storing creation dates is assigned the same placeholder value,
# 2022-03-02T23:29:04.000Z. The minimum `created_at` shown below is therefore
# expected to be that placeholder rather than a real creation date.
df["created_at"].min()

Timestamp('2022-03-02 23:29:04+0000', tz='UTC')

## 2 - Filter the repositories based on the following criteria
- models created after 2022 (i.e., in 2023 or later); AND
- models last modified in 2024; AND
- models that are not gated; AND
- models with at least one model file.

In [3]:
from analyticaml import MODEL_FILE_EXTENSIONS

# Parse both timestamp columns as timezone-aware UTC datetimes so the `.dt`
# accessors used below are available.
df["last_modified"] = pd.to_datetime(df["last_modified"], utc=True)
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)
# Cast `gated` to bool BEFORE filtering. Boolean indexing returns a new frame,
# so a cast applied to `df` after `df_filtered` has been built never reaches
# `df_filtered`.
df["gated"] = df["gated"].astype(bool)


def has_model_file(siblings):
    """Return True if any file entry in `siblings` has a model-file extension.

    `siblings` is expected to be the list of per-file dicts of one repository;
    each dict is looked up by its "extension" key and compared against
    MODEL_FILE_EXTENSIONS. Non-list values (e.g. a missing `siblings` field)
    count as "no model file" instead of raising a TypeError.
    """
    if not isinstance(siblings, list):
        return False
    # generator instead of a list: stops at the first matching file
    return any(file["extension"] in MODEL_FILE_EXTENSIONS for file in siblings)


# Selection criteria, combined into a single row mask:
#   - last modified in 2024
#   - created after 2022 (created_at year > 2022)
#   - not gated
selection_mask = (
    (df["last_modified"].dt.year == 2024)
    & (df["created_at"].dt.year > 2022)
    & ~df["gated"]
)
df_filtered = df[selection_mask]
# Keep only repositories with at least one model file. The explicit bool cast
# keeps the mask usable even when the selection above is empty.
df_filtered = df_filtered[df_filtered["siblings"].apply(has_model_file).astype(bool)]
df_filtered


Unnamed: 0,id,author,sha,last_modified,created_at,private,gated,disabled,downloads,likes,...,transformers_info,siblings,spaces,safetensors,lastModified,cardData,transformersInfo,_id,inference,modelId
1,dogssss/Qwen-Qwen1.5-0.5B-1727306842,dogssss,3e28ce8bbbad707344dbdd919fe18b73d3572e99,2024-09-25 23:27:26+00:00,2024-09-25 23:27:23+00:00,False,False,,0,0,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-09-25 23:27:26+00:00,,,66f49c5bc0dddfd36c29e4d1,pipeline-not-detected,dogssss/Qwen-Qwen1.5-0.5B-1727306842
3,SALUTEASD/Qwen-Qwen1.5-1.8B-1727306835,SALUTEASD,7d829a8433154e6b0d7779dd7a114573dd1739e6,2024-09-25 23:27:21+00:00,2024-09-25 23:27:15+00:00,False,False,,0,0,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-09-25 23:27:21+00:00,,,66f49c53d79aed46c0fcd04c,pipeline-not-detected,SALUTEASD/Qwen-Qwen1.5-1.8B-1727306835
4,jerseyjerry/google-gemma-2b-it-1727306802,jerseyjerry,f67eec5232be68230fa1782f05ba417f0e95c4bc,2024-09-25 23:26:53+00:00,2024-09-25 23:26:42+00:00,False,False,,0,0,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-09-25 23:26:53+00:00,,,66f49c328908ed9cc978aea9,pipeline-not-detected,jerseyjerry/google-gemma-2b-it-1727306802
6,janetsnakehole/samus,janetsnakehole,3b7fb2394ac0a5f07429d883adecc7a78c4a6955,2024-09-25 23:26:54+00:00,2024-09-25 23:26:17+00:00,False,False,,0,0,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-09-25 23:26:54+00:00,,,66f49c19d8487f70b018186c,library-not-detected,janetsnakehole/samus
17,dogssss/Qwen-Qwen1.5-1.8B-1727306569,dogssss,ce5f346b3a324fe3781dacc17ad479235b4536e4,2024-09-25 23:22:54+00:00,2024-09-25 23:22:50+00:00,False,False,,0,0,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-09-25 23:22:54+00:00,,,66f49b4ac10c79a643356d1f,pipeline-not-detected,dogssss/Qwen-Qwen1.5-1.8B-1727306569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894573,microsoft/git-large-textvqa,microsoft,416f91974ee1fb3e797144c9fc93787740787161,2024-04-09 07:18:18+00:00,2023-01-02 11:18:10+00:00,False,False,,78,4,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-04-09 07:18:18+00:00,,,63b2bd72d6c6529ede73eed4,explicit-opt-out,microsoft/git-large-textvqa
894584,microsoft/git-base-msrvtt-qa,microsoft,63507860e967eb797703eb473d51166db176c63d,2024-04-04 07:37:26+00:00,2023-01-02 10:55:17+00:00,False,False,,102,1,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-04-04 07:37:26+00:00,,,63b2b8150dddc8f717efa0d4,explicit-opt-out,microsoft/git-base-msrvtt-qa
894612,nairaxo/bantulm,nairaxo,b0bfaa418237a00a6ad5783f21aeed752e3f751d,2024-09-13 07:44:53+00:00,2023-01-02 09:16:17+00:00,False,False,,38,0,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-09-13 07:44:53+00:00,,,63b2a0e10dddc8f717ee822c,not-popular-enough,nairaxo/bantulm
894738,stjiris/bert-large-portuguese-cased-legal-tsdae,stjiris,53b369fea6028508626ec2e1e88a24d15d713453,2024-04-17 08:59:42+00:00,2023-01-01 20:16:22+00:00,False,False,,15,2,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-04-17 08:59:42+00:00,,,63b1ea16000cd823284050e8,pipeline-library-pair-not-supported,stjiris/bert-large-portuguese-cased-legal-tsdae


In [ ]:
# Number of selected repositories that received at least one like.
has_likes = df_filtered["likes"] > 0
int(has_likes.sum())

## 3 - Save the data

In [None]:
# NOTE: the save step is currently disabled (both statements are commented
# out). When enabled, it writes the selection to the input path with
# ".json" replaced by "_selected.json".
# ensure gated column is boolean to avoid runtime errors
# df_filtered["gated"] = df_filtered["gated"].astype(bool)
# df_filtered.to_json(input_file.replace(".json", "_selected.json"))
