# Selecting model repositories to include in our study

1. Load the data
2. Filter the data
3. Save the data

## 1 - Load the repositories' metadata

In [79]:
import pandas as pd
# Optional: uncompress the zip archive first.
# NOTE(review): the archive named below differs from the file loaded further
# down -- confirm which archive actually contains `input_file`.
# !unzip ../../data/huggingface_sort_by_downloads_top1000.json.zip -d ../../data/

# Read the repositories' metadata (JSON) into a DataFrame.
input_file = "../../data/huggingface_sort_by_createdAt_top996939.json"
df = pd.read_json(path_or_buf=input_file)

# Show the earliest creation timestamp present in the loaded data.
df.loc[:, "created_at"].min()

In [99]:
# Earliest `created_at` value across all loaded repositories.
df.loc[:, "created_at"].min()

Timestamp('2022-03-02 23:29:04+0000', tz='UTC')

## 2 - Filter the repositories based on the following criteria
- models created in 2022 or earlier; AND
- models last modified in 2024; AND
- models with at least one model file.

In [98]:
from analyticaml import MODEL_FILE_EXTENSIONS

# Parse both timestamp columns as timezone-aware UTC datetimes so the
# `.dt.year` comparisons below operate on datetimes rather than raw values.
df['last_modified'] = pd.to_datetime(df['last_modified'], utc=True)
df['created_at'] = pd.to_datetime(df['created_at'], utc=True)

# Keep models created in 2022 or earlier AND last modified in 2024.
created_by_2022 = df["created_at"].dt.year <= 2022
modified_in_2024 = df["last_modified"].dt.year == 2024
df_filtered = df[created_by_2022 & modified_in_2024]


def _has_model_file(siblings):
    """Return True if any entry of `siblings` has an extension in MODEL_FILE_EXTENSIONS.

    Values that are not a list/tuple (e.g. a missing value) and entries that
    are not dicts or lack an "extension" key count as "no model file"
    instead of raising.
    """
    if not isinstance(siblings, (list, tuple)):
        return False
    return any(
        isinstance(file, dict) and file.get("extension") in MODEL_FILE_EXTENSIONS
        for file in siblings
    )


# Keep only repositories with at least one model file. The explicit bool cast
# keeps the mask boolean even when no rows survive the date filter above.
has_model_file = df_filtered["siblings"].apply(_has_model_file).astype(bool)
df_filtered = df_filtered[has_model_file]

# Oldest created_at date in the full (unfiltered) data.
df["created_at"].min()


Timestamp('2022-03-02 23:29:04+0000', tz='UTC')

## 3 - Save the data

In [92]:
# Casting "gated" to bool is left disabled.
# NOTE(review): the recorded output of this cell shows KeyError: 'gated',
# presumably from a run where the line below was still active -- confirm.
# df_filtered["gated"] = df_filtered["gated"].astype(bool)

# Write the selection next to the input file, with a "_selected" suffix.
output_file = input_file.replace(".json", "_selected.json")
df_filtered.to_json(output_file)

# Display the saved selection.
df_filtered

KeyError: 'gated'