In [2]:
!pip install requests pandas



In [4]:
# NOTE(review): the install log below shows this resolves to the PyPI package
# "gh" 0.0.4, which pulls in GitPython. That does not look like the official
# GitHub CLI -- confirm this is the intended tool before relying on it.
!pip install gh #GitHub

Collecting gh
  Downloading gh-0.0.4.tar.gz (2.2 kB)
Collecting gitpython
  Downloading GitPython-3.1.37-py3-none-any.whl (190 kB)
[K     |████████████████████████████████| 190 kB 4.5 MB/s eta 0:00:01
[?25hCollecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 8.2 MB/s  eta 0:00:01
[?25hCollecting smmap<6,>=3.0.1
  Downloading smmap-5.0.1-py3-none-any.whl (24 kB)
Building wheels for collected packages: gh
  Building wheel for gh (setup.py) ... [?25ldone
[?25h  Created wheel for gh: filename=gh-0.0.4-py3-none-any.whl size=2494 sha256=1e5588e922b4d4cf5cf8969e118d0dd98658fe06e9ea0dad22aabd73fe752bb8
  Stored in directory: /Users/polina/Library/Caches/pip/wheels/58/1e/e0/a4b6fdaca0298a2c05954830cb87d5bc152dc442881f90ee81
Successfully built gh
Installing collected packages: smmap, gitdb, gitpython, gh
Successfully installed gh-0.0.4 gitdb-4.0.10 gitpython-3.1.37 smmap-5.0.1


In [5]:
# Print the version reported by the `gh` command visible to the notebook shell.
!gh --version

gh version: v0.0.4


In [18]:
# ALL METADATA (count, no Zipfile, and no README)
import requests
import pandas as pd
import os
import zipfile
import subprocess  # Added for GitHub CLI

def check_readme_issues(model_data):
    """
    Classify the state of a model's README entry.

    Args:
        model_data: Mapping of model metadata; only its "readme" key is read.

    Returns:
        "README missing" when the entry is absent or falsy,
        "README is not a text file" when it is present but not a str,
        "No issues" otherwise.
    """
    readme_text = model_data.get("readme")

    # Happy path first: a non-empty string is the only acceptable README.
    if readme_text and isinstance(readme_text, str):
        return "No issues"

    # Anything truthy that reaches this point has the wrong type; anything
    # falsy (None, "", 0, empty container) counts as missing.
    return "README is not a text file" if readme_text else "README missing"

def check_zip_issues(zip_path):
    """
    Scan a zip archive for packaging problems.

    Args:
        zip_path: Path of the zip file to inspect.

    Returns:
        "No issues" when nothing is flagged; otherwise the flagged problems
        joined with ", ". When the archive cannot be opened or read, the
        result contains "Error scanning zip file: <reason>" instead of
        raising.
    """
    issues = []

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_file:
            file_list = zip_file.namelist()

        # Member names are '/'-separated paths. Split each one into its
        # components (dropping the empty piece a directory entry's trailing
        # slash produces) so folders can be matched at any depth.
        split_names = [[part for part in name.split('/') if part] for name in file_list]

        # A member is never literally named "__pycache__"; the folder shows up
        # as one component of a longer path such as "model/__pycache__/x.pyc".
        if any('__pycache__' in parts for parts in split_names):
            issues.append("__pycache__ folder found")
        if any(name.endswith(('.so', '.dll', '.o', 'x86_64', 'arm64')) for name in file_list):
            issues.append("Invalid file format found")
        if any('__MACOSX' in parts for parts in split_names):
            issues.append("__MACOSX folder found")

        # Collect the distinct first-level directories. A name contributes a
        # folder when it has something below its first component or is itself
        # a directory entry; plain files at the archive root are not folders.
        top_level_folders = {
            parts[0]
            for name, parts in zip(file_list, split_names)
            if parts and (len(parts) > 1 or name.endswith('/'))
        }
        # __MACOSX is already reported on its own line above.
        top_level_folders.discard('__MACOSX')
        if len(top_level_folders) > 1:
            issues.append("Multiple top-level folders found")

    except Exception as e:
        # Deliberately broad: one unreadable archive should be reported in the
        # results, not abort the scan of every other model.
        issues.append(f"Error scanning zip file: {str(e)}")

    if not issues:
        return "No issues"

    return ", ".join(issues)

# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs.
# requests waits indefinitely unless a timeout is given, so bound the wait, and
# stop on an HTTP error instead of parsing an error payload as the model list.
response = requests.get(api_url, timeout=30)
response.raise_for_status()
data = response.json()

# Create an empty list to store the per-model records
data_list = []

# Create a directory to store downloaded zip files
os.makedirs("zip_files", exist_ok=True)

# Initialize count variables
total_models = 0
github_models = 0
zip_issues_models = 0

# Seconds to wait on any single HTTP request in the loop below; requests
# waits indefinitely when no timeout is given.
REQUEST_TIMEOUT = 30


def _join_object_names(model_data, field):
    """Join the 'object_name' of each entry in model_data[field]['value'] with ", ".

    A missing field, or a field without 'value', yields an empty string.
    """
    return ", ".join(item['object_name'] for item in model_data.get(field, {}).get('value', []))


# Iterate through model IDs
for model_id in data:
    # Fetch metadata for the model
    model_data = requests.get(f"{api_url}/{model_id}", timeout=REQUEST_TIMEOUT).json()

    # Extract additional metadata
    modeling_application = _join_object_names(model_data, 'modeling_application')
    model_paper = _join_object_names(model_data, 'model_paper')
    implemented_by = _join_object_names(model_data, 'implemented_by')
    public_submitter_email = model_data.get('public_submitter_email', {}).get('value', '')
    simPFid = model_data.get('simPFid', {}).get('value', 0)
    has_modelview = model_data.get('has_modelview', {}).get('value', False)

    # Extract missing metadata
    model_type = _join_object_names(model_data, 'model_type')
    cell_types = _join_object_names(model_data, 'neurons')
    currents = _join_object_names(model_data, 'currents')
    model_concept = _join_object_names(model_data, 'model_concept')

    # Check if GitHub repository URL exists. requests.head does not follow
    # redirects by default, so without allow_redirects a redirected repository
    # URL would answer 3xx and be counted as missing.
    github_repo_url = f"https://github.com/modeldbrepository/{model_id}"
    github_repo_exists = requests.head(
        github_repo_url, allow_redirects=True, timeout=REQUEST_TIMEOUT
    ).status_code == 200

    # Check README issues
    readme_issues = check_readme_issues(model_data)

    # Download the zip file
    zip_url = model_data.get("zip_url")
    if zip_url:
        zip_filename = f"zip_files/{model_id}.zip"
        # Download first and only then touch the disk, so a failed request
        # does not leave an empty file or an error page saved as a .zip.
        zip_response = requests.get(zip_url, timeout=REQUEST_TIMEOUT)
        if zip_response.ok:
            with open(zip_filename, 'wb') as zip_file:
                zip_file.write(zip_response.content)
            # Check zip issues
            zip_issues = check_zip_issues(zip_filename)
        else:
            zip_issues = f"Zip download failed (HTTP {zip_response.status_code})"

        # Count models with zip issues (a failed download counts as one)
        if zip_issues != "No issues":
            zip_issues_models += 1
    else:
        zip_issues = "No zip file available"

    # Increment the total count
    total_models += 1

    # Count models with GitHub repositories
    if github_repo_exists:
        github_models += 1

    # Append the data to the list
    data_list.append({
        "Model ID": model_id,
        "Model Type": model_type,
        "Cell Types": cell_types,
        "Currents": currents,
        "Model Concept": model_concept,
        "Modeling Application": modeling_application,
        "Model Paper": model_paper,
        "Implemented By": implemented_by,
        "Public Submitter Email": public_submitter_email,
        "simPFid": simPFid,
        "Has Modelview": has_modelview,
        "GitHub Repository Exists": github_repo_exists,
        "README Issues": readme_issues,
        "Zip Issues": zip_issues
    })

# Roll the run-wide counters into one record. summary_data remains a
# one-element list holding that record.
summary_record = {
    "Total Models Processed": total_models,
    "Total Models with GitHub Repositories": github_models,
    "Total Models with Zip Issues": zip_issues_models,
}
summary_data = [summary_record]

# Prepend the counters so they become the first row of the sheet. Their keys
# differ from the per-model keys, so they occupy their own leading columns.
data_list[:0] = summary_data

# Build one table from the summary row plus every per-model record
df = pd.DataFrame(data_list)

# Write the table to an Excel workbook using the xlsxwriter engine
df.to_excel("modeldb_data_1.xlsx", sheet_name='Model Data', index=False, engine='xlsxwriter')

# Confirm that the export finished
print("Data exported to Excel successfully.")


Data exported to Excel successfully.


Zip files: scrape model IDs from the ModelDB home page and build zip download URLs

In [3]:
import requests
from bs4 import BeautifulSoup

# Define the ModelDB URL to scrape
modeldb_url = "https://modeldb.science/"

# Model IDs and zip file URLs found on the page. Created before the request so
# the name exists (as an empty list) even when the page cannot be retrieved.
model_data_list = []

# Send an HTTP GET request to the ModelDB URL. requests waits indefinitely
# unless a timeout is given, so bound the wait.
response = requests.get(modeldb_url, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all the links on the page that carry an href
    model_links = soup.find_all("a", href=True)

    # IDs already recorded, so a model linked several times is listed once
    seen_model_ids = set()

    # Iterate through the links, keeping only those that point at a model
    for link in model_links:
        href = link.get("href")
        if not href.startswith("/model/"):
            continue

        # Take the last path segment after the "/model/" prefix. Stripping
        # slashes first keeps a trailing "/" from producing an empty ID.
        model_id = href[len("/model/"):].strip("/").split("/")[-1]
        if not model_id or model_id in seen_model_ids:
            continue
        seen_model_ids.add(model_id)

        zip_url = f"https://modeldb.science/model/{model_id}/files/{model_id}.zip"
        model_data_list.append({"Model ID": model_id, "Zip URL": zip_url})

    # model_data_list now holds the model IDs and zip file URLs; it can be fed
    # to the download-and-check logic of the metadata script.

else:
    print("Failed to retrieve ModelDB page. Status code:", response.status_code)
