# Selecting model repositories to include on our study

1. Load the data
2. Filter the data
3. Save the data

## 1 - Load the repositories' metadata

In [28]:
import pandas as pd

# uncompress zip file
!unzip ../../data/huggingface_sort_by_createdAt_top996939.json.zip -d ../../data/
# load the data
input_file = "../../data/huggingface_sort_by_createdAt_top996939.json"
df = pd.read_json(input_file)
# delete the unzipped file
!rm ../../data/huggingface_sort_by_createdAt_top996939.json
df["created_at"].min()

Archive:  ../../data/huggingface_sort_by_createdAt_top996939.json.zip
  inflating: ../../data/huggingface_sort_by_createdAt_top996939.json  


Timestamp('2022-03-02 23:29:04+0000', tz='UTC')

In [36]:
# HF API has a unique situation: 
# It's important to note that there is a unique value, 2022-03-02T23:29:04.000Z assigned to all repositories that were created before we began storing creation dates.
df["created_at"].min()

## 2 - Filter the repositories based on the following criteria
- models created before 2022; AND
- models last modified in 2024; AND
- models with at least one model file.

In [38]:
from analyticaml import MODEL_FILE_EXTENSIONS

df['last_modified'] = pd.to_datetime(df['last_modified'], utc=True)
df['created_at'] = pd.to_datetime(df['created_at'], utc=True)


# find models created_at before Sep-2022 and last_modified in 2024
df_filtered = df[(df["created_at"].dt.year == 2022) & (df["created_at"].dt.month < 9) & (df["last_modified"].dt.year == 2024)]
df_filtered = df_filtered[(df_filtered["created_at"].dt.year <= 2022)]

# find models with at least one model file (extension in MODEL_FILE_EXTENSIONS)
df_filtered = df_filtered[
    df_filtered["siblings"].apply(lambda x: any([file["extension"] in MODEL_FILE_EXTENSIONS for file in x]))]
# 
df_filtered


Unnamed: 0,id,author,sha,last_modified,created_at,private,gated,disabled,downloads,likes,...,transformers_info,siblings,spaces,safetensors,lastModified,cardData,transformersInfo,_id,inference,modelId
932668,rinna/japanese-gpt-neox-small,rinna,6fe409ad8b0d2c99df9741f85d202535e995614b,2024-07-20 07:53:40+00:00,2022-08-31 05:58:25+00:00,False,False,,409,12,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-07-20 07:53:40+00:00,,,630ef8813f24a835ade6a213,explicit-opt-out,rinna/japanese-gpt-neox-small
932830,Intel/roberta-base-squad2-int8-static-inc,Intel,ced23b6a027eaea8b938f38ad34529415907fc3b,2024-03-21 12:57:36+00:00,2022-08-30 08:21:15+00:00,False,False,,0,1,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-03-21 12:57:36+00:00,,,630dc87bc0eca3037afc9361,pipeline-not-detected,Intel/roberta-base-squad2-int8-static-inc
932867,saphvis/ngpx2022,saphvis,d14f4d3aea1bdcc86a2dee670fe33c9685013495,2024-09-01 16:35:52+00:00,2022-08-30 01:23:35+00:00,False,False,,0,0,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-09-01 16:35:52+00:00,,,630d669781ef9b1772b5ad15,library-not-detected,saphvis/ngpx2022
932884,recogna-nlp/ptt5-base-summ-cstnews,recogna-nlp,948171d6b274c3ba02a7131077def64e804c1369,2024-01-02 19:50:08+00:00,2022-08-29 20:44:51+00:00,False,False,,21,8,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-01-02 19:50:08+00:00,,,630d25437dacb93b3357af4f,not-popular-enough,recogna-nlp/ptt5-base-summ-cstnews
932886,recogna-nlp/ptt5-base-summ-temario,recogna-nlp,659dc692e80022da91418270a24e422565cd6c20,2024-01-02 19:49:57+00:00,2022-08-29 20:34:35+00:00,False,False,,106,1,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-01-02 19:49:57+00:00,,,630d22db8e3ff0c7232f1453,not-popular-enough,recogna-nlp/ptt5-base-summ-temario
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996934,albert/albert-xlarge-v1,albert,ed6f87d14403b3c459a458fa6aa9dc5c51c517c1,2024-02-19 11:01:28+00:00,2022-03-02 23:29:04+00:00,False,False,,798,4,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-02-19 11:01:28+00:00,,,621ffdc036468d709f17432c,not-popular-enough,albert/albert-xlarge-v1
996935,albert/albert-large-v2,albert,dfed3a5ef4499fb3351c4ebbcf487375d1e942c8,2024-02-19 10:58:48+00:00,2022-03-02 23:29:04+00:00,False,False,,11271,16,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-02-19 10:58:48+00:00,,,621ffdc036468d709f17432b,not-popular-enough,albert/albert-large-v2
996936,albert/albert-large-v1,albert,94fd741fb5d6cb5bc578fc154837016c583bafef,2024-02-19 10:58:26+00:00,2022-03-02 23:29:04+00:00,False,False,,1522,3,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-02-19 10:58:26+00:00,,,621ffdc036468d709f17432a,not-popular-enough,albert/albert-large-v1
996937,albert/albert-base-v2,albert,8e2f239c5f8a2c0f253781ca60135db913e5c80c,2024-02-19 10:58:14+00:00,2022-03-02 23:29:04+00:00,False,False,,1890109,104,...,,"[{'rfilename': '.gitattributes', 'size': None,...",,,2024-02-19 10:58:14+00:00,,,621ffdc036468d709f174329,cold,albert/albert-base-v2


## 3 - Save the data

In [33]:
# ensure gated column is boolean to avoid runtime errors
df_filtered["gated"] = df_filtered["gated"].astype(bool)
df_filtered.to_json(input_file.replace(".json", "_selected.json"))
