In [1]:
import requests
import webbrowser
import pandas as pd
import os
from tqdm import tqdm
import zipfile
import glob
import subprocess  # Added for GitHub CLI

def check_readme_issues(model_data):
    """
    Check for issues with the README file.
    You can implement specific logic here.
    """
    readme = model_data.get("readme")
    
    if not readme:
        return "README missing"
    
    if not isinstance(readme, str):
        return "README is not a text file"
    
    # Add more checks as needed
    
    return "No issues"  # If no issues found

def check_zip_issues(zip_path):
    """
    Check for issues with the zip file.
    You can implement specific logic here.
    """
    issues = []
    
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_file:
            file_list = zip_file.namelist()
            
            # Check for issues
            if any('__pycache__'in file for file in file_list):
                issues.append("__pycache__ folder")
            for bad_folder in ('arm64/', 'i386/', 'x86_64/'):
                if any(bad_folder in file for file in file_list):
                    issues.append(f"Bad folder: {bad_folder}")
            for invalid_type in ('.so', '.dll', '.o', '.pyc'):
                if any(file.endswith(invalid_type) for file in file_list):
                    issues.append(f"Invalid file format: {invalid_type}")
            if any(file.startswith('__MACOSX/') for file in file_list):
                issues.append("__MACOSX folder")
            top_level_folders=set([file.split('/')[0] for file in file_list])
            if len(top_level_folders) > 1:
                issues.append("Multiple top-level folders")
            if any('DS_Store' in file for file in file_list):
                issues.append('DS_store')


    except Exception as e:
        issues.append(f"Error scanning zip file: {str(e)}")

    if not issues:
        return "No issues"
    
    return ", ".join(issues)



In [2]:
current=pd.read_csv("current_readme_issues.csv",index_col=0)
print(f"{current['README Issues'].count()} of {len(current)} models filled")
current

1 of 1828 models filled


Unnamed: 0,Model ID,README Issues
0,279,n
1,2487,
2,2488,
3,2730,
4,2733,
...,...,...
1823,2014817,
1824,267599,
1825,267144,
1826,2014825,


In [3]:
for idx, row in current.iterrows():
    if pd.isna(row['README Issues']):
        webbrowser.open(f"modeldb.science/{int(row['Model ID'])}")
        inp=input('Problem: ')
        if not inp=='q':
            current.loc[idx,'README Issues']=inp
        else:
            break

current.to_csv('current_readme_issues.csv')
print(f"{current['README Issues'].count()} of {len(current)} models filled")
current

Problem: q
1 of 1828 models filled


Unnamed: 0,Model ID,README Issues
0,279,n
1,2487,
2,2488,
3,2730,
4,2733,
...,...,...
1823,2014817,
1824,267599,
1825,267144,
1826,2014825,


url=''

In [2]:
# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
data = response.json()

# Create a directory to store downloaded zip files
os.makedirs("zip_files", exist_ok=True)

In [4]:
# get all metadata
api_url = "https://modeldb.science/api/v1/models"
data_dict={}
for model_id in tqdm(data):
    # Fetch metadata for the model
    model_data = requests.get(f"{api_url}/{model_id}").json()
    data_dict[model_id]=model_data

  0%|          | 0/1828 [00:00<?, ?it/s]100%|██████████| 1828/1828 [04:05<00:00,  7.46it/s]


In [None]:
# Get all zip files (not necessary if they are already downloaded)

for model_id in tqdm(data):
    zip_url=f"https://modeldb.science/download/{model_id}"
    zip_filename = f"zip_files/{model_id}.zip"
    with open(zip_filename, 'wb') as zip_file:
        zip_response = requests.get(zip_url)
        zip_file.write(zip_response.content)

In [5]:
#Iterate over all data and save to dataframe

# Create an empty list to store the data
data_list = []

# Initialize count variables
total_models = 0
github_models = 0
zip_issues_models = 0

# Iterate through model IDs
for model_id in tqdm(data):
    # Fetch metadata for the model
    model_data=data_dict[model_id]
    # Extract additional metadata
    modeling_application = ", ".join(item['object_name'] for item in model_data.get('modeling_application', {}).get('value', []))
    model_paper = ", ".join(item['object_name'] for item in model_data.get('model_paper', {}).get('value', []))
    implemented_by = ", ".join(item['object_name'] for item in model_data.get('implemented_by', {}).get('value', []))
    public_submitter_email = model_data.get('public_submitter_email', {}).get('value', '')
    simPFid = model_data.get('simPFid', {}).get('value', 0)
    has_modelview = model_data.get('has_modelview', {}).get('value', False)

    # Extract missing metadata
    model_type = ", ".join(item['object_name'] for item in model_data.get('model_type', {}).get('value', []))
    cell_types = ", ".join(item['object_name'] for item in model_data.get('neurons', {}).get('value', []))
    currents = ", ".join(item['object_name'] for item in model_data.get('currents', {}).get('value', []))
    model_concept = ", ".join(item['object_name'] for item in model_data.get('model_concept', {}).get('value', []))
    
    # Check if GitHub repository URL exists
    github_repo_url = f"https://github.com/modeldbrepository/{model_id}"
    github_repo_exists = requests.head(github_repo_url).status_code == 200
    
    # Check README issues
    readme_issues = check_readme_issues(model_data)

    # Check zip issues
    zip_filename = f"zip_files/{model_id}.zip"
    zip_issues = check_zip_issues(zip_filename)
    
    # Count models with zip issues
    if zip_issues != "No issues":
        zip_issues_models += 1
    
    # Increment the total count
    total_models += 1
    
    # Count models with GitHub repositories
    if github_repo_exists:
        github_models += 1

    # Append the data to the list
    data_list.append({
        "Model ID": model_id,
        "Model Type": model_type,
        "Cell Types": cell_types,
        "Currents": currents,
        "Model Concept": model_concept,
        "Modeling Application": modeling_application,
        "Model Paper": model_paper,
        "Implemented By": implemented_by,
        "Public Submitter Email": public_submitter_email,
        "simPFid": simPFid,
        "Has Modelview": has_modelview,
        "GitHub Repository Exists": github_repo_exists,
        "README Issues": readme_issues,
        "Zip Issues": zip_issues,
        "total metadata": len(model_data.items())
    })

# Create a single-row summary DataFrame
summary_data = [
    {
        "Total Models Processed": total_models,
        "Total Models with GitHub Repositories": github_models,
        "Total Models with Zip Issues": zip_issues_models
    }
]

# Add the summary data as the first row
data_list.insert(0, summary_data[0])

# Create a DataFrame from the list of data
df = pd.DataFrame(data_list)

# Export the DataFrame to Excel or CSV
with pd.ExcelWriter("modeldb_data_oct_5.xlsx", engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Model Data', index=False)

# Print the summary
print("Data exported to Excel successfully.")


100%|██████████| 1828/1828 [15:16<00:00,  2.00it/s]


Data exported to Excel successfully.


Zipfile 

In [9]:
import requests
from bs4 import BeautifulSoup

# Define the ModelDB URL to scrape
modeldb_url = "https://modeldb.science/"

# Send an HTTP GET request to the ModelDB URL
response = requests.get(modeldb_url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all the model links on the page
    model_links = soup.find_all("a", href=True)

    # Create a list to store model IDs and zip file URLs
    model_data_list = []

    # Iterate through the model links
    for link in model_links:
        href = link.get("href")
        if href.startswith("/model/"):
            model_id = href.split("/")[-1]
            zip_url = f"https://modeldb.science/model/{model_id}/files/{model_id}.zip"
            model_data_list.append({"Model ID": model_id, "Zip URL": zip_url})

    # Now, you have a list of model IDs and zip file URLs
    # You can modify your existing script to use this data for downloading and checking zip files

else:
    print("Failed to retrieve ModelDB page. Status code:", response.status_code)


In [7]:
# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
data = response.json()

# Create an empty list to store the data
data_list = []

# Create a directory to store downloaded zip files
os.makedirs("zip_files", exist_ok=True)

model_id=data[5]
model_id

2796

In [38]:
zip_issues=[]
for model_id in data:
    zip_path=f"zip_files/{model_id}.zip"
    zip_issues.append(check_zip_issues(zip_path))

model_df=pd.read_excel("modeldb_data_1.xlsx")
model_df['new_zip_issues']=['']+zip_issues
model_df

Unnamed: 0,Total Models Processed,Total Models with GitHub Repositories,Total Models with Zip Issues,Model ID,Model Type,Cell Types,Currents,Model Concept,Modeling Application,Model Paper,Implemented By,Public Submitter Email,simPFid,Has Modelview,GitHub Repository Exists,README Issues,Zip Issues,new_zip_issues
0,1828.0,1493.0,0.0,,,,,,,,,,,,,,,
1,,,,279.0,Neuron or other electrically excitable cell,Thalamus geniculate nucleus/lateral principal ...,"I Na,t, I T low threshold, I K","Dendritic Action Potentials, Bursting, Ion Cha...",NEURON,"Destexhe A, Neubig M, Ulrich D, Huguenard J (1...","Destexhe, Alain [Destexhe at iaf.cnrs-gif.fr]",,275.0,1.0,1.0,README missing,No zip file available,No issues
2,,,,2487.0,Neuron or other electrically excitable cell,Olfactory bulb main mitral GLU cell,"I Na,t, I L high threshold, I A, I K, I K,leak...","Parameter Fitting, Simplified Models, Olfaction",NEURON,"Davison AP, Feng J, Brown D (2000)","Davison, Andrew [Andrew.Davison at iaf.cnrs-gi...",,304.0,1.0,1.0,README missing,No zip file available,"__MACOSX folder, Multiple top-level folders"
3,,,,2488.0,Neuron or other electrically excitable cell,"Neocortex L5/6 pyramidal GLU cell, Neocortex L...","I Na,t, I K, I M, I K,Ca, I Sodium, I Calcium,...","Activity Patterns, Active Dendrites, Influence...",NEURON,"Mainen ZF, Sejnowski TJ (1996)","Mainen, Zach [Mainen at cshl.edu]",,315.0,1.0,1.0,README missing,No zip file available,No issues
4,,,,2730.0,Realistic Network,"Olfactory bulb main mitral GLU cell, Olfactory...","I Na,t, I L high threshold, I A, I K, I K,leak...","Oscillations, Synchronization, Spatio-temporal...",NEURON,"Davison AP, Feng J, Brown D (2003)","Davison, Andrew [Andrew.Davison at iaf.cnrs-gi...",,302.0,1.0,1.0,README missing,No zip file available,"__MACOSX folder, Multiple top-level folders"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1824,,,,2014817.0,Neuron or other electrically excitable cell,Cardiac ventricular cell,,Action Potentials,XPPAUT,"Bueno-Orovio A, Cherry EM, Fenton FH. (2008)",,071320@tool.caaumed.org.tw,0.0,0.0,0.0,README missing,No zip file available,No issues
1825,,,,267599.0,Neuron or other electrically excitable cell,Hippocampus CA1 pyramidal GLU cell,I TRPM4,,NEURON,"Combe CL, Upchurch CM, Canavier CC, Gasparini ...","Canavier, CC, Upchurch, Carol M",cupchu@lsuhsc.edu,0.0,0.0,1.0,README missing,No zip file available,Invalid file format: .dll
1826,,,,267144.0,,Abstract integrate-and-fire adaptive exponenti...,,,MATLAB,"Maes A, Barahona M, Clopath C (2023)","Maes, Amadeus [amadeus.maes at gmail.com]",amadeus.maes@gmail.com,0.0,0.0,1.0,README missing,No zip file available,No issues
1827,,,,2014825.0,,,,"Learning, Reinforcement Learning, Synaptic Pla...",Python,"Blackwell K, Doya K (2023)",,kim-blackwell@uiowa.edu,0.0,0.0,1.0,README missing,No zip file available,No issues


In [None]:
for model_id in tqdm(data):
    zip_url=f"https://modeldb.science/download/{model_id}"
    zip_filename = f"zip_files/{model_id}.zip"
    with open(zip_filename, 'wb') as zip_file:
        zip_response = requests.get(zip_url)
        zip_file.write(zip_response.content)


In [11]:
new_data= pd.read_excel("modeldb_data_with_zip.xlsx")
new_data

Unnamed: 0,Total Models Processed,Total Models with GitHub Repositories,Total Models with Zip Issues,Model ID,Model Type,Cell Types,Currents,Model Concept,Modeling Application,Model Paper,Implemented By,Public Submitter Email,simPFid,Has Modelview,GitHub Repository Exists,README Issues,Zip Issues,new_zip_issues
0,1828.0,1493.0,0.0,,,,,,,,,,,,,,,
1,,,,279.0,Neuron or other electrically excitable cell,Thalamus geniculate nucleus/lateral principal ...,"I Na,t, I T low threshold, I K","Dendritic Action Potentials, Bursting, Ion Cha...",NEURON,"Destexhe A, Neubig M, Ulrich D, Huguenard J (1...","Destexhe, Alain [Destexhe at iaf.cnrs-gif.fr]",,275.0,1.0,1.0,README missing,No zip file available,No issues
2,,,,2487.0,Neuron or other electrically excitable cell,Olfactory bulb main mitral GLU cell,"I Na,t, I L high threshold, I A, I K, I K,leak...","Parameter Fitting, Simplified Models, Olfaction",NEURON,"Davison AP, Feng J, Brown D (2000)","Davison, Andrew [Andrew.Davison at iaf.cnrs-gi...",,304.0,1.0,1.0,README missing,No zip file available,"__MACOSX folder found, Multiple top-level fold..."
3,,,,2488.0,Neuron or other electrically excitable cell,"Neocortex L5/6 pyramidal GLU cell, Neocortex L...","I Na,t, I K, I M, I K,Ca, I Sodium, I Calcium,...","Activity Patterns, Active Dendrites, Influence...",NEURON,"Mainen ZF, Sejnowski TJ (1996)","Mainen, Zach [Mainen at cshl.edu]",,315.0,1.0,1.0,README missing,No zip file available,No issues
4,,,,2730.0,Realistic Network,"Olfactory bulb main mitral GLU cell, Olfactory...","I Na,t, I L high threshold, I A, I K, I K,leak...","Oscillations, Synchronization, Spatio-temporal...",NEURON,"Davison AP, Feng J, Brown D (2003)","Davison, Andrew [Andrew.Davison at iaf.cnrs-gi...",,302.0,1.0,1.0,README missing,No zip file available,"__MACOSX folder found, Multiple top-level fold..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1824,,,,2014817.0,Neuron or other electrically excitable cell,Cardiac ventricular cell,,Action Potentials,XPPAUT,"Bueno-Orovio A, Cherry EM, Fenton FH. (2008)",,071320@tool.caaumed.org.tw,0.0,0.0,0.0,README missing,No zip file available,No issues
1825,,,,267599.0,Neuron or other electrically excitable cell,Hippocampus CA1 pyramidal GLU cell,I TRPM4,,NEURON,"Combe CL, Upchurch CM, Canavier CC, Gasparini ...","Canavier, CC, Upchurch, Carol M",cupchu@lsuhsc.edu,0.0,0.0,1.0,README missing,No zip file available,Invalid file format found
1826,,,,267144.0,,Abstract integrate-and-fire adaptive exponenti...,,,MATLAB,"Maes A, Barahona M, Clopath C (2023)","Maes, Amadeus [amadeus.maes at gmail.com]",amadeus.maes@gmail.com,0.0,0.0,1.0,README missing,No zip file available,No issues
1827,,,,2014825.0,,,,"Learning, Reinforcement Learning, Synaptic Pla...",Python,"Blackwell K, Doya K (2023)",,kim-blackwell@uiowa.edu,0.0,0.0,1.0,README missing,No zip file available,No issues


In [35]:

with zipfile.ZipFile(zip_path, 'r') as zip_file:
    file_list = zip_file.namelist()

In [36]:
file_list

['267618/',
 '267618/nerve/',
 '267618/nerve/AFibreBuilder.hoc',
 '267618/nerve/sciaticNerveCoords.txt',
 '267618/nerve/fasciclesInfo.txt',
 '267618/nerve/sciaticNerveBuilder.hoc',
 '267618/nerve/CFibreBuilder.hoc',
 '267618/nerve/fibreCoordsGen.m',
 '267618/nerve/simpleFascicle.hoc',
 '267618/nerve/simpleFibre.hoc',
 '267618/example_sciaticNerve.py',
 '267618/example_simple.py',
 '267618/Makefile',
 '267618/setrx.hoc',
 '267618/COMSOL2NEURON_auto_conv.py',
 '267618/NEURON2COMSOL_auto_conv.py',
 '267618/scripts/',
 '267618/scripts/Fig3_CFibreModelValidation.hoc',
 '267618/scripts/Fig5_TIME_arrangement.py',
 '267618/scripts/stimStrat.py',
 '267618/scripts/Fig7_rampKFS.m',
 '267618/scripts/Fig8_PE.m',
 '267618/scripts/Fig8_PE_hex.py',
 '267618/scripts/attachStim.hoc',
 '267618/scripts/Fig6_KFS_AFibre.py',
 '267618/scripts/Fig6_KFS.m',
 '267618/scripts/Fig8_PE_mono.py',
 '267618/scripts/Fig4_TIME_diam.py',
 '267618/scripts/Fig3_AFibreModelValidation.hoc',
 '267618/scripts/Fig7_rampKFS_tao

In [42]:
with pd.ExcelWriter("modeldb_data_with_zip.xlsx", engine='xlsxwriter') as writer:
    model_df.to_excel(writer, sheet_name='Model Data', index=False)

## Extra: 

In [22]:
import requests
import pandas as pd
import openpyxl
import webbrowser
import os

# Check if the Excel file exists or create it if it doesn't
excel_file_path = "modeldb_data_responses.xlsx"
if not os.path.isfile(excel_file_path):
    # Create an empty DataFrame and save it as an Excel file
    df = pd.DataFrame(columns=['Model ID', 'Response'])
    df.to_excel(excel_file_path, index=False)

# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Function to open a webpage and record user response
def process_model(model_id, last_processed_model_id):
    # Check if this model has already been processed
    if model_id <= last_processed_model_id:
        return last_processed_model_id

    # Open the ModelDB webpage for the current model
    model_page_url = f"https://modeldb.science/{model_id}"

    try:
        response = requests.get(model_page_url)
        response.raise_for_status()  # Raise an error if the page is not found
        webbrowser.open(model_page_url)
    except requests.exceptions.RequestException as e:
        print(f"Error opening the page for Model ID {model_id}: {str(e)}")
        return last_processed_model_id

    # Prompt the user for input ('n' or 'y' for issues, 'q' to quit)
    response = input(f"Model ID {model_id}: Press 'n' if no issues, 'y' if there are issues, 'q' to quit: ")

    if response.lower() == 'q':
        return -1  # User wants to quit
    elif response.lower() in ['n', 'y']:
        # Append the response to the Excel file
        excel_data = pd.DataFrame({'Model ID': [model_id], 'Response': [response]})
        with pd.ExcelWriter(excel_file_path, mode='a', engine='openpyxl') as writer:
            writer.book = openpyxl.load_workbook(excel_file_path)
            excel_data.to_excel(writer, sheet_name='Responses', index=False)
        return model_id

    return last_processed_model_id

# Load the Excel file with model data (if it exists)
df = pd.read_excel(excel_file_path)
last_processed_model_id = df['Model ID'].max() if not df.empty else -1

# Make an API request to get the list of model IDs
response = requests.get(api_url)
data = response.json()

# Process each model
for model_id in data:
    last_processed_model_id = process_model(model_id, last_processed_model_id)
    
    if last_processed_model_id == -1:
        print("User chose to quit.")
        break

print("Processing complete. You can continue from the last processed model ID.")


Model ID 279: Press 'n' if no issues, 'y' if there are issues, 'q' to quit: n
Model ID 2487: Press 'n' if no issues, 'y' if there are issues, 'q' to quit: n


ValueError: Sheet 'Responses' already exists and if_sheet_exists is set to 'error'.