In [1]:
import requests
import webbrowser
import pandas as pd
import os
from tqdm import tqdm
import zipfile
import glob
import subprocess  # Added for GitHub CLI

def check_readme_issues(model_data):
    """
    Check for issues with the README file.
    You can implement specific logic here.
    """
    readme = model_data.get("readme")

    if not readme:
        return "README missing"

    if not isinstance(readme, str):
        return "README is not a text file"

    # Add more checks as needed

    return "No issues"  # If no issues found

def check_zip_issues(zip_path):
    """
    Check for issues with the zip file.
    You can implement specific logic here.
    """
    issues = []

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_file:
            file_list = zip_file.namelist()

            # Check for issues
            if any('__pycache__'in file for file in file_list):
                issues.append("__pycache__ folder")
            for bad_folder in ('arm64/', 'i386/', 'x86_64/'):
                if any(bad_folder in file for file in file_list):
                    issues.append(f"Bad folder: {bad_folder}")
            for invalid_type in ('.so', '.dll', '.o', '.pyc'):
                if any(file.endswith(invalid_type) for file in file_list):
                    issues.append(f"Invalid file format: {invalid_type}")
            if any(file.startswith('__MACOSX/') for file in file_list):
                issues.append("__MACOSX folder")
            top_level_folders=set([file.split('/')[0] for file in file_list])
            if len(top_level_folders) > 1:
                issues.append("Multiple top-level folders")
            if any('DS_Store' in file for file in file_list):
                issues.append('DS_store')


    except Exception as e:
        issues.append(f"Error scanning zip file: {str(e)}")

    if not issues:
        return "No issues"

    return ", ".join(issues)



In [2]:
# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
data = response.json()

# Create a directory to store downloaded zip files
os.makedirs("zip_files", exist_ok=True)

In [None]:
# get all metadata
api_url = "https://modeldb.science/api/v1/models"
data_dict={}
for model_id in tqdm(data):
    # Fetch metadata for the model
    model_data = requests.get(f"{api_url}/{model_id}").json()
    data_dict[model_id]=model_data

  0%|          | 0/1828 [00:00<?, ?it/s]100%|██████████| 1828/1828 [04:05<00:00,  7.46it/s]


In [6]:
# Get all zip files (not necessary if they are already downloaded)

for model_id in tqdm(data):
    zip_url=f"https://modeldb.science/download/{model_id}"
    zip_filename = f"zip_files/{model_id}.zip"
    with open(zip_filename, 'wb') as zip_file:
        zip_response = requests.get(zip_url)
        zip_file.write(zip_response.content)

  9%|▉         | 172/1833 [01:31<14:42,  1.88it/s] 


KeyboardInterrupt: 

In [None]:
#Iterate over all data and save to dataframe

# Create an empty list to store the data
data_list = []

# Initialize count variables
total_models = 0
github_models = 0
zip_issues_models = 0

# Iterate through model IDs
for model_id in tqdm(data):
    # Fetch metadata for the model
    model_data=data_dict[model_id]
    # Extract additional metadata
    modeling_application = ", ".join(item['object_name'] for item in model_data.get('modeling_application', {}).get('value', []))
    model_paper = ", ".join(item['object_name'] for item in model_data.get('model_paper', {}).get('value', []))
    implemented_by = ", ".join(item['object_name'] for item in model_data.get('implemented_by', {}).get('value', []))
    public_submitter_email = model_data.get('public_submitter_email', {}).get('value', '')
    simPFid = model_data.get('simPFid', {}).get('value', 0)
    has_modelview = model_data.get('has_modelview', {}).get('value', False)

    # Extract missing metadata
    model_type = ", ".join(item['object_name'] for item in model_data.get('model_type', {}).get('value', []))
    cell_types = ", ".join(item['object_name'] for item in model_data.get('neurons', {}).get('value', []))
    currents = ", ".join(item['object_name'] for item in model_data.get('currents', {}).get('value', []))
    model_concept = ", ".join(item['object_name'] for item in model_data.get('model_concept', {}).get('value', []))

    # Check if GitHub repository URL exists
    github_repo_url = f"https://github.com/modeldbrepository/{model_id}"
    github_repo_exists = requests.head(github_repo_url).status_code == 200

    # Check README issues
    readme_issues = check_readme_issues(model_data)

    # Check zip issues
    zip_filename = f"zip_files/{model_id}.zip"
    zip_issues = check_zip_issues(zip_filename)

    # Count models with zip issues
    if zip_issues != "No issues":
        zip_issues_models += 1

    # Increment the total count
    total_models += 1

    # Count models with GitHub repositories
    if github_repo_exists:
        github_models += 1

    # Append the data to the list
    data_list.append({
        "Model ID": model_id,
        "Model Type": model_type,
        "Cell Types": cell_types,
        "Currents": currents,
        "Model Concept": model_concept,
        "Modeling Application": modeling_application,
        "Model Paper": model_paper,
        "Implemented By": implemented_by,
        "Public Submitter Email": public_submitter_email,
        "simPFid": simPFid,
        "Has Modelview": has_modelview,
        "GitHub Repository Exists": github_repo_exists,
        "README Issues": readme_issues,
        "Zip Issues": zip_issues,
        "total metadata": len(model_data.items())
    })

# Create a single-row summary DataFrame
summary_data = [
    {
        "Total Models Processed": total_models,
        "Total Models with GitHub Repositories": github_models,
        "Total Models with Zip Issues": zip_issues_models
    }
]

# Add the summary data as the first row
data_list.insert(0, summary_data[0])

# Create a DataFrame from the list of data
df = pd.DataFrame(data_list)

# Export the DataFrame to Excel or CSV
with pd.ExcelWriter("modeldb_data_oct_5.xlsx", engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Model Data', index=False)

# Print the summary
print("Data exported to Excel successfully.")


100%|██████████| 1828/1828 [15:16<00:00,  2.00it/s]


Data exported to Excel successfully.


Zipfile

In [None]:
import requests
from bs4 import BeautifulSoup

# Define the ModelDB URL to scrape
modeldb_url = "https://modeldb.science/"

# Send an HTTP GET request to the ModelDB URL
response = requests.get(modeldb_url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all the model links on the page
    model_links = soup.find_all("a", href=True)

    # Create a list to store model IDs and zip file URLs
    model_data_list = []

    # Iterate through the model links
    for link in model_links:
        href = link.get("href")
        if href.startswith("/model/"):
            model_id = href.split("/")[-1]
            zip_url = f"https://modeldb.science/model/{model_id}/files/{model_id}.zip"
            model_data_list.append({"Model ID": model_id, "Zip URL": zip_url})

    # Now, you have a list of model IDs and zip file URLs
    # You can modify your existing script to use this data for downloading and checking zip files

else:
    print("Failed to retrieve ModelDB page. Status code:", response.status_code)


In [8]:
# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
data = response.json()

# Create an empty list to store the data
data_list = []

# Create a directory to store downloaded zip files
os.makedirs("zip_files", exist_ok=True)

model_id=data[5]
model_id

2796

In [9]:
zip_issues=[]
for model_id in data:
    zip_path=f"zip_files/{model_id}.zip"
    zip_issues.append(check_zip_issues(zip_path))

model_df=pd.read_excel("modeldb_data_1.xlsx")
model_df['new_zip_issues']=['']+zip_issues
model_df

NameError: name 'pd' is not defined

In [10]:
for model_id in tqdm(data):
    zip_url=f"https://modeldb.science/download/{model_id}"
    zip_filename = f"zip_files/{model_id}.zip"
    with open(zip_filename, 'wb') as zip_file:
        zip_response = requests.get(zip_url)
        zip_file.write(zip_response.content)


 93%|█████████▎| 1709/1833 [13:33<00:59,  2.10it/s]


ConnectionError: HTTPSConnectionPool(host='modeldb.science', port=443): Max retries exceeded with url: /download/267056 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000137019B13D0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [None]:
new_data= pd.read_excel("modeldb_data_with_zip.xlsx")
new_data

Unnamed: 0,Total Models Processed,Total Models with GitHub Repositories,Total Models with Zip Issues,Model ID,Model Type,Cell Types,Currents,Model Concept,Modeling Application,Model Paper,Implemented By,Public Submitter Email,simPFid,Has Modelview,GitHub Repository Exists,README Issues,Zip Issues,new_zip_issues
0,1828.0,1493.0,0.0,,,,,,,,,,,,,,,
1,,,,279.0,Neuron or other electrically excitable cell,Thalamus geniculate nucleus/lateral principal ...,"I Na,t, I T low threshold, I K","Dendritic Action Potentials, Bursting, Ion Cha...",NEURON,"Destexhe A, Neubig M, Ulrich D, Huguenard J (1...","Destexhe, Alain [Destexhe at iaf.cnrs-gif.fr]",,275.0,1.0,1.0,README missing,No zip file available,No issues
2,,,,2487.0,Neuron or other electrically excitable cell,Olfactory bulb main mitral GLU cell,"I Na,t, I L high threshold, I A, I K, I K,leak...","Parameter Fitting, Simplified Models, Olfaction",NEURON,"Davison AP, Feng J, Brown D (2000)","Davison, Andrew [Andrew.Davison at iaf.cnrs-gi...",,304.0,1.0,1.0,README missing,No zip file available,"__MACOSX folder found, Multiple top-level fold..."
3,,,,2488.0,Neuron or other electrically excitable cell,"Neocortex L5/6 pyramidal GLU cell, Neocortex L...","I Na,t, I K, I M, I K,Ca, I Sodium, I Calcium,...","Activity Patterns, Active Dendrites, Influence...",NEURON,"Mainen ZF, Sejnowski TJ (1996)","Mainen, Zach [Mainen at cshl.edu]",,315.0,1.0,1.0,README missing,No zip file available,No issues
4,,,,2730.0,Realistic Network,"Olfactory bulb main mitral GLU cell, Olfactory...","I Na,t, I L high threshold, I A, I K, I K,leak...","Oscillations, Synchronization, Spatio-temporal...",NEURON,"Davison AP, Feng J, Brown D (2003)","Davison, Andrew [Andrew.Davison at iaf.cnrs-gi...",,302.0,1.0,1.0,README missing,No zip file available,"__MACOSX folder found, Multiple top-level fold..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1824,,,,2014817.0,Neuron or other electrically excitable cell,Cardiac ventricular cell,,Action Potentials,XPPAUT,"Bueno-Orovio A, Cherry EM, Fenton FH. (2008)",,071320@tool.caaumed.org.tw,0.0,0.0,0.0,README missing,No zip file available,No issues
1825,,,,267599.0,Neuron or other electrically excitable cell,Hippocampus CA1 pyramidal GLU cell,I TRPM4,,NEURON,"Combe CL, Upchurch CM, Canavier CC, Gasparini ...","Canavier, CC, Upchurch, Carol M",cupchu@lsuhsc.edu,0.0,0.0,1.0,README missing,No zip file available,Invalid file format found
1826,,,,267144.0,,Abstract integrate-and-fire adaptive exponenti...,,,MATLAB,"Maes A, Barahona M, Clopath C (2023)","Maes, Amadeus [amadeus.maes at gmail.com]",amadeus.maes@gmail.com,0.0,0.0,1.0,README missing,No zip file available,No issues
1827,,,,2014825.0,,,,"Learning, Reinforcement Learning, Synaptic Pla...",Python,"Blackwell K, Doya K (2023)",,kim-blackwell@uiowa.edu,0.0,0.0,1.0,README missing,No zip file available,No issues


In [None]:

with zipfile.ZipFile(zip_path, 'r') as zip_file:
    file_list = zip_file.namelist()

In [None]:
file_list

['267618/',
 '267618/nerve/',
 '267618/nerve/AFibreBuilder.hoc',
 '267618/nerve/sciaticNerveCoords.txt',
 '267618/nerve/fasciclesInfo.txt',
 '267618/nerve/sciaticNerveBuilder.hoc',
 '267618/nerve/CFibreBuilder.hoc',
 '267618/nerve/fibreCoordsGen.m',
 '267618/nerve/simpleFascicle.hoc',
 '267618/nerve/simpleFibre.hoc',
 '267618/example_sciaticNerve.py',
 '267618/example_simple.py',
 '267618/Makefile',
 '267618/setrx.hoc',
 '267618/COMSOL2NEURON_auto_conv.py',
 '267618/NEURON2COMSOL_auto_conv.py',
 '267618/scripts/',
 '267618/scripts/Fig3_CFibreModelValidation.hoc',
 '267618/scripts/Fig5_TIME_arrangement.py',
 '267618/scripts/stimStrat.py',
 '267618/scripts/Fig7_rampKFS.m',
 '267618/scripts/Fig8_PE.m',
 '267618/scripts/Fig8_PE_hex.py',
 '267618/scripts/attachStim.hoc',
 '267618/scripts/Fig6_KFS_AFibre.py',
 '267618/scripts/Fig6_KFS.m',
 '267618/scripts/Fig8_PE_mono.py',
 '267618/scripts/Fig4_TIME_diam.py',
 '267618/scripts/Fig3_AFibreModelValidation.hoc',
 '267618/scripts/Fig7_rampKFS_tao

In [None]:
with pd.ExcelWriter("modeldb_data_with_zip.xlsx", engine='xlsxwriter') as writer:
    model_df.to_excel(writer, sheet_name='Model Data', index=False)

## Extra - README:

In [None]:
# go thought README's
import os
import requests
import pandas as pd
import webbrowser

# Define your custom CSV file name or path
csv_file_path = "progress_database.csv"  # Replace with your file name or path

# Check if the CSV file exists or create it if it doesn't
if not os.path.isfile(csv_file_path):
    # Create an empty DataFrame and save it as a CSV file
    df = pd.DataFrame(columns=['Model ID', 'Response'])
    df.to_csv(csv_file_path, index=False)

# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Function to open a webpage and record user response
def process_model(model_id, total_models, processed_models):
    # Open the ModelDB webpage for the current model
    model_page_url = f"https://modeldb.science/{model_id}"

    try:
        response = requests.get(model_page_url)
        response.raise_for_status()  # Raise an error if the page is not found
        webbrowser.open(model_page_url)
    except requests.exceptions.RequestException as e:
        print(f"Error opening the page for Model ID {model_id}: {str(e)}")
        return

    # Prompt the user for input ('n' or 'y' for issues, 'q' to quit)
    response = input(f"Model ID {model_id} ({processed_models}/{total_models}): "
                     "Press 'n' if no issues, 'y' if there are issues, 'q' to quit: ")

    if response.lower() == 'q':
        return -1  # User wants to quit
    elif response.lower() in ['n', 'y']:
        # Append the response to the CSV file
        data = pd.DataFrame({'Model ID': [model_id], 'Response': [response]})
        data.to_csv(csv_file_path, mode='a', header=False, index=False)

# Load the CSV file with progress data (if it exists)
df = pd.read_csv(csv_file_path)

# Make an API request to get the list of model IDs
response = requests.get(api_url)
data = response.json()

total_models = len(data)
processed_models = len(df)

# Process each model that hasn't been processed yet
for model_id in data:
    if model_id not in df['Model ID'].values:
        result = process_model(model_id, total_models, processed_models)
        if result == -1:
            print("User chose to quit.")
            break

print("Processing complete. You can continue from where you left off.")


Processing complete. You can continue from where you left off.


In [None]:
import pandas as pd

# Load the existing CSV file
csv_file_path = "progress_database.csv"  # Replace with your file name or path
df = pd.read_csv(csv_file_path)

# Update 'n' to 'NO ISSUE' and 'y' to 'ISSUE' (including models 118759, 127321, and 136296)
# Also, consider blanks as 'ISSUE'
df['Response'] = df['Response'].replace({'n': 'NO ISSUE', 'y': 'ISSUE', '': 'ISSUE', '118759': 'ISSUE', '127321': 'ISSUE', '136296': 'ISSUE'})

# Count the number of issues
issue_count = df['Response'].value_counts()

# Save the modified DataFrame back to the CSV file
df.to_csv(csv_file_path, index=False)

print("CSV file has been updated.")
print(f"Issues Count:\n{issue_count}")

CSV file has been updated.
Issues Count:
NO ISSUE    1668
ISSUE        160
Name: Response, dtype: int64


In [None]:
import pandas as pd

# Define the path to the CSV file
csv_file_path = "progress_database.csv"  # Update with your CSV file path

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Filter entries with "ISSUE" in the "Response" column
issues_df = df[df['Response'].str.lower() == 'issue']

# Check if there are any issues to review
if issues_df.empty:
    print("No entries with 'ISSUE' found.")
else:
    # Get the last processed index, if available
    last_processed_index = 0
    if 'Last Processed Index' in issues_df.columns:
        last_processed_index = issues_df['Last Processed Index'].max()

    # Iterate through the issues and allow adding comments
    for index, row in issues_df.iterrows():
        if index <= last_processed_index:
            continue  # Skip already processed entries

        # Check if the "Comments" column is already populated
        if not pd.isna(row['Comments']):
            print(f"Model ID {row['Model ID']} already processed. Skipping...")
            continue

        model_id = row['Model ID']
        response = row['Response']
        model_link = f"https://modeldb.science/{model_id}"  # Add the link here

        print(f"Model ID {model_id}: {response}")
        print(f"Model Link: {model_link}")

        comment = input(f"Add a comment for Model ID {model_id} (or press 'q' to quit): ")

        if comment.lower() == 'q':
            print("Quitting...")
            break  # Exit the loop and script if 'q' is entered

        # Update the "Comments" column
        issues_df.loc[index, 'Comments'] = comment

        # Update the "Last Processed Index" column to track progress
        issues_df.loc[index, 'Last Processed Index'] = index

        # Save the updated DataFrame back to the CSV file after processing each entry
        issues_df.to_csv(csv_file_path, index=False)

        print(f"Model ID {model_id} processed.")

    print("All available entries processed.")

# Remove the "Last Processed Index" column before saving the final result
issues_df.drop(columns=['Last Processed Index'], inplace=True)
issues_df.to_csv(csv_file_path, index=False)

print("Comments saved to the CSV file.")


Model ID 3682 already processed. Skipping...
Model ID 3808 already processed. Skipping...
Model ID 7400 already processed. Skipping...
Model ID 7485 already processed. Skipping...
Model ID 8210 already processed. Skipping...
Model ID 19698 already processed. Skipping...
Model ID 20212 already processed. Skipping...
Model ID 22203 already processed. Skipping...
Model ID 36861 already processed. Skipping...
Model ID 36869 already processed. Skipping...
Model ID 36956 already processed. Skipping...
Model ID 37103 already processed. Skipping...
Model ID 37129 already processed. Skipping...
Model ID 39949 already processed. Skipping...
Model ID 50207 already processed. Skipping...
Model ID 50219 already processed. Skipping...
Model ID 50392 already processed. Skipping...
Model ID 50656 already processed. Skipping...
Model ID 51196 already processed. Skipping...
Model ID 53425 already processed. Skipping...
Model ID 53572 already processed. Skipping...
Model ID 53894 already processed. Skipp

Model ID 152028 processed.
Model ID 152113: ISSUE
Model Link: https://modeldb.science/152113
Add a comment for Model ID 152113 (or press 'q' to quit): New link for XPPAUT - https://sites.math.duke.edu/ode-book/xppaut/installation.html
Model ID 152113 processed.
Model ID 152625: ISSUE
Model Link: https://modeldb.science/152625
Add a comment for Model ID 152625 (or press 'q' to quit): New paper link - https://bmcmededuc.biomedcentral.com/articles/10.1186/1472-6920-13-70 - hoc code - link new one not found 
Model ID 152625 processed.
Model ID 153633: ISSUE
Model Link: https://modeldb.science/153633
Add a comment for Model ID 153633 (or press 'q' to quit): HTML link for paper - not found - new one - https://pubmed.ncbi.nlm.nih.gov/10490941/
Model ID 153633 processed.
Model ID 153635: ISSUE
Model Link: https://modeldb.science/153635
Add a comment for Model ID 153635 (or press 'q' to quit): HTML link for paper - not found - new one - https://pubmed.ncbi.nlm.nih.gov/16014787/
Model ID 153635 

Model ID 236306 processed.
Model ID 238347: ISSUE
Model Link: https://modeldb.science/238347
Add a comment for Model ID 238347 (or press 'q' to quit): Model data zip - page not found 
Model ID 238347 processed.
Model ID 238920: ISSUE
Model Link: https://modeldb.science/238920
Add a comment for Model ID 238920 (or press 'q' to quit): No issues 
Model ID 238920 processed.
Model ID 241796: ISSUE
Model Link: https://modeldb.science/241796
Add a comment for Model ID 241796 (or press 'q' to quit): No issues 
Model ID 241796 processed.
Model ID 241826: ISSUE
Model Link: https://modeldb.science/241826
Add a comment for Model ID 241826 (or press 'q' to quit): New supplementary link - https://royalsocietypublishing.org/doi/suppl/10.1098/rsif.2017.0425
Model ID 241826 processed.
Model ID 243212: ISSUE
Model Link: https://modeldb.science/243212
Add a comment for Model ID 243212 (or press 'q' to quit): Supplementary could be downloaded from here the link on the page is not working - https://www.ncb

# Zip Files 

In [24]:
import pandas as pd

# Load the Excel file into a DataFrame
file_path = "C:\\Users\\polin\\Downloads\\modeldb_data_2.xlsx"
df = pd.read_excel(file_path)

# Extract the "Zip Issues" column and count unique values
zip_issues_column = df["Zip Issues"]
unique_zip_issues = zip_issues_column.value_counts()

# Display the unique values and their frequencies
print("Unique Zip Issues and Frequencies:")
for issue, frequency in unique_zip_issues.items():
    print(f"Issue: {issue}, Frequency: {frequency}")


Unique Zip Issues and Frequencies:
Issue: No issues, Frequency: 1472
Issue: __MACOSX folder, Multiple top-level folders, Frequency: 183
Issue: __MACOSX folder, Multiple top-level folders, DS_store, Frequency: 41
Issue: DS_store, Frequency: 40
Issue: Invalid file format: .pyc, Frequency: 18
Issue: Invalid file format: .dll, Frequency: 15
Issue: Invalid file format: .o, Frequency: 8
Issue: Multiple top-level folders, Frequency: 8
Issue: __pycache__ folder, Invalid file format: .pyc, Frequency: 4
Issue: Invalid file format: .dll, Invalid file format: .o, Frequency: 4
Issue: Invalid file format: .dll, Invalid file format: .o, __MACOSX folder, Multiple top-level folders, Frequency: 3
Issue: Invalid file format: .dll, __MACOSX folder, Multiple top-level folders, Frequency: 3
Issue: Bad folder: x86_64/, Invalid file format: .so, Invalid file format: .o, Frequency: 3
Issue: Invalid file format: .o, __MACOSX folder, Multiple top-level folders, Frequency: 2
Issue: Invalid file format: .pyc, __MA

In [2]:
import requests
import os
import zipfile
import shutil
from tqdm import tqdm

def check_zip_issues(zip_path):
    """
    Check for issues with the zip file.
    You can implement specific logic here.
    """
    issues = []

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_file:
            file_list = zip_file.namelist()

            # Check for issues
            if any(file.startswith('__MACOSX/') for file in file_list):
                issues.append("__MACOSX folder")
            top_level_folders = set([file.split('/')[0] for file in file_list])
            if len(top_level_folders) > 1:
                issues.append("Multiple top-level folders")

            # Check for issues related to file formats
            invalid_file_formats = ['.pyc', '.dll', '.o', '.so']
            for file in file_list:
                file_extension = os.path.splitext(file)[-1]
                if file_extension in invalid_file_formats:
                    issues.append(f"Invalid file format: {file_extension}")
                if file.endswith('.DS_Store'):
                    issues.append('DS_Store')
                    
                # And bad folders
                bad_folders=['__MACOSX', '__pycache__','x86_64']
                for fold in bad_folders:
                    if fold in file.split('/'):
                        issues.append(f"Bad Folder: {fold}")        
        
            

    except Exception as e:
        issues.append(f"Error scanning zip file: {str(e)}")

    if not issues:
        return "No issues"

    return ", ".join(list(set(issues)))

In [27]:
# Get zip files
# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
data = response.json()

# Create a directory to store downloaded zip files
os.makedirs("zip_files", exist_ok=True)

# Download ZIP files if they are not already downloaded
for model_id in tqdm(data, desc="Downloading ZIP files"):
    zip_path = f"zip_files/{model_id}.zip"
    if not os.path.exists(zip_path):
        zip_url = f"https://modeldb.science/download/{model_id}"
        with open(zip_path, 'wb') as zip_file:
            zip_response = requests.get(zip_url, stream=True)
            zip_file.write(zip_response.content)


Downloading ZIP files: 100%|██████████| 1839/1839 [00:36<00:00, 49.73it/s]   


In [7]:
# Create a directory to store cleaned ZIP files
os.makedirs("cleaned_zip_files", exist_ok=True)

# Iterate over ZIP files and clean them
for model_id in tqdm(data, desc="Cleaning ZIP files"):
    zip_path = f"zip_files/{model_id}.zip"
    zip_issues = check_zip_issues(zip_path)

    if zip_issues == "No issues":
        #shutil.copy(zip_path, f"cleaned_zip_files/{model_id}.zip")
        pass
    else:
        # Create a temporary directory to extract the ZIP file
        temp_dir = f"temp_zip_extract_{model_id}"
        os.makedirs(temp_dir, exist_ok=True)

        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_file:
                zip_file.extractall(temp_dir)

            # Delete "__MACOSX" folders and their contents
            macosx_dir = os.path.join(temp_dir, "__MACOSX")
            if os.path.exists(macosx_dir):
                shutil.rmtree(macosx_dir)
            
            # Delete files with undesired extensions
            bad_files = []
            for ext in ('.pyc', '.dll', '.o', '.so'):
               bad_files.extend(glob.iglob(f"{temp_dir}/**/*{ext}", recursive=True))
            bad_files.extend(glob.iglob(f"{temp_dir}/**/.DS_Store", recursive=True))
            bad_folders=[]
            for bad_fold in ('__MACOSX', '__pycache__', 'x86_64'):
                bad_folders.extend(glob.iglob(f"{temp_dir}/**/{bad_fold}", recursive=True))

            for bad_file in bad_files:
                os.remove(bad_file)
            for bad_folder in bad_folders:
                shutil.rmtree(bad_folder)
                        
            # Combine multiple top level folders into one top level folder
            file_list=os.listdir(temp_dir)
            top_level_folders = set([file.split('/')[0] for file in file_list])
            if len(top_level_folders) > 1:
                new_top=f"{temp_dir}/All"
                os.makedirs(new_top, exist_ok=True)
                for top_folder in top_level_folders:
                    shutil.move(f"{temp_dir}/{top_folder}", new_top)
            
            # Create a new cleaned ZIP file
            cleaned_zip_path = f"cleaned_zip_files/{model_id}.zip"
            with zipfile.ZipFile(cleaned_zip_path, 'w', zipfile.ZIP_DEFLATED) as new_zip:
                for root, _, files in os.walk(temp_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        arcname = os.path.relpath(file_path, temp_dir)
                        new_zip.write(file_path, arcname)

        except Exception as e:
            print(f"Error processing ZIP file {model_id}: {str(e)}")
        finally:
            # Clean up temporary directory
            shutil.rmtree(temp_dir)

print("ZIP files cleaned and saved in 'cleaned_zip_files' directory.")
# A few zip files do not copy for various reasons and will throw errors, those have been manually copied and fixed
# Two zip files that came from the API were empty, those have been manually downloaded and fixed as well.

Cleaning ZIP files:  53%|█████▎    | 967/1839 [03:05<04:08,  3.51it/s]

Error processing ZIP file 168950: [Errno 2] No such file or directory: 'temp_zip_extract_168950\\RoessertEtAl2015\\publish\\closedloop\\figures\\fig52l_randomnet_ifun2b_0tau1v50_1tau1v1_2tau1v50_1wv0.1_2wv0.003_2varw0.1_2probw0.5_0N1000_1N100_ihsigma0.1_noinv_lentrain10_alpha0.0001_fitlasso_amod0.1_cnoise_vec0w_m.pdf'


Cleaning ZIP files:  80%|████████  | 1473/1839 [05:51<02:15,  2.70it/s]

Error processing ZIP file 256140: [Errno 13] Permission denied: 'temp_zip_extract_256140\\LuqueEtAl2019'


Cleaning ZIP files: 100%|██████████| 1839/1839 [10:02<00:00,  3.05it/s]

Error processing ZIP file 2014996: [Errno 2] No such file or directory: 'zip_files/2014996.zip'
Error processing ZIP file 2015413: [Errno 2] No such file or directory: 'zip_files/2015413.zip'
Error processing ZIP file 2015412: [Errno 2] No such file or directory: 'zip_files/2015412.zip'
Error processing ZIP file 2015421: [Errno 2] No such file or directory: 'zip_files/2015421.zip'
Error processing ZIP file 2014998: [Errno 2] No such file or directory: 'zip_files/2014998.zip'
Error processing ZIP file 2014833: [Errno 2] No such file or directory: 'zip_files/2014833.zip'
ZIP files cleaned and saved in 'cleaned_zip_files' directory.





In [134]:
# Confirm that all zip issues are fixed

zip_issues=[]
for model_id in tqdm(data):
    zip_path=f"cleaned_zip_files/{model_id}.zip"
    zip_issues=check_zip_issues(zip_path)
    if zip_issues != "No issues":
        print(model_id)
        print(zip_issues)

100%|██████████| 1833/1833 [00:01<00:00, 1469.54it/s]


In [22]:
from git import Repo
import requests
import os
import zipfile
import shutil
from tqdm import tqdm

api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
data = response.json()

os.makedirs("new_repos", exist_ok=True)

token='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'
for model_id in tqdm(data, desc="Uploading models to github"):
    zip_path = f'cleaned_zip_files/{model_id}.zip'
    if model_id<=267391:
        continue
    if not os.path.exists(zip_path):
        continue
    # Clone the repository
    repo_url = f"https://{token}@github.com/ModelDBRepository/{model_id}.git"
    local_repo_path = f'new_repos/{model_id}'
    try:
        repo = Repo.clone_from(repo_url, local_repo_path)
    except Exception as e:
            print(f"Error processing ID {model_id}: {str(e)}")
            continue


    # Remove all existing files in the Git directory
    for item in os.listdir(local_repo_path):
        item_path = os.path.join(local_repo_path, item)
        if item != '.git':
            if os.path.isfile(item_path):
                os.remove(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)

    # Copy new files into the Git directory
    temp_dir = f"temp_zip_extract_{model_id}"
    os.makedirs(temp_dir, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_file:
                    zip_file.extractall(temp_dir)

    # Find the source folder with unknown name
    source_folder_path = None
    for item in os.listdir(temp_dir):
        item_path = os.path.join(temp_dir, item)
        if os.path.isdir(item_path):
            source_folder_path = item_path
            break


    shutil.copytree(source_folder_path, local_repo_path, dirs_exist_ok=True)


    shutil.rmtree(temp_dir)     
    # Get the list of all tracked files in the repository
    tracked_files = set(repo.git.ls_files().split('\n'))

    # Iterate through the tracked files and check if they exist in the current directory
    for file in tracked_files:
        file_path = os.path.join(local_repo_path, file)

        # Check if the file exists in the current directory
        if not os.path.exists(file_path):
            # If the file doesn't exist locally, remove it from the repository
            repo.index.remove([file], working_tree=True)

    # # Commit changes
    repo.index.add('*')
    repo.index.commit('Fixed issues')

    # # Push changes to the remote repository
    origin = repo.remotes.origin
    origin.push()


Uploading models to github:  96%|█████████▌| 1767/1839 [00:00<00:00, 4460.22it/s]

Error processing ID 267395: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267395.git new_repos/267395
  stderr: 'Cloning into 'new_repos/267395'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267395.git/' not found
'
Error processing ID 267510: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267510.git new_repos/267510
  stderr: 'Cloning into 'new_repos/267510'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267510.git/' not found
'
Error processing ID 267531: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267531.git new_repos/267531
  stderr: 'Cloning into 'new_repos/267531'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267531.git/' not found
'
Error processing ID 267552: 

Uploading models to github:  97%|█████████▋| 1787/1839 [00:47<00:02, 25.49it/s]  

Error processing ID 267611: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267611.git new_repos/267611
  stderr: 'Cloning into 'new_repos/267611'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267611.git/' not found
'
Error processing ID 267621: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267621.git new_repos/267621
  stderr: 'Cloning into 'new_repos/267621'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267621.git/' not found
'
Error processing ID 267669: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267669.git new_repos/267669
  stderr: 'Cloning into 'new_repos/267669'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267669.git/' not found
'
Error processing ID 267680: 

Uploading models to github:  98%|█████████▊| 1802/1839 [01:35<00:04,  8.70it/s]

Error processing ID 267695: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267695.git new_repos/267695
  stderr: 'Cloning into 'new_repos/267695'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267695.git/' not found
'
Error processing ID 267696: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267696.git new_repos/267696
  stderr: 'Cloning into 'new_repos/267696'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267696.git/' not found
'
Error processing ID 267589: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267589.git new_repos/267589
  stderr: 'Cloning into 'new_repos/267589'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267589.git/' not found
'
Error processing ID 267596: 

Uploading models to github: 100%|██████████| 1839/1839 [01:58<00:00, 15.56it/s]

Error processing ID 267768: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****@github.com/ModelDBRepository/267768.git new_repos/267768
  stderr: 'Cloning into 'new_repos/267768'...
remote: Repository not found.
fatal: repository 'https://github.com/ModelDBRepository/267768.git/' not found
'





Notes:
This week:
Uploaded cleaned zip files to github. These zip files were obtained from the modeldb api, I automatically removed bad files and folders such as macosx and x86_64 and combined top level folders. Then, I cloned the github repositories, copied the contents of the cleaned zip folders and removed the files/folders that were no longer there, and committed and pushed to github.
Fixed some of the readme issues?

Next week:
Not all of the cleaned zip files have github repositories and it looks like some of the github repositories that do exist are not completely updated.
Do we want to make a github page for the models that don't have them and then refresh all of the github repositories with the files from the modeldb page?
New python package for git can be used to add tags so we can do that next week.

In [6]:
import requests
from tqdm import tqdm

# Define the ModelDB API endpoint
modeldb_api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of ModelDB entries
response = requests.get(modeldb_api_url)
modeldb_entries = response.json()

# Define a list to store ModelDB entries that do not match with GitHub repositories
entries_not_matching = []

# Create a tqdm progress bar
with tqdm(total=len(modeldb_entries), desc="Checking GitHub Repositories") as pbar:
    # Iterate through ModelDB entries
    for model_id in modeldb_entries:
        # Construct the GitHub repository URL based on your naming convention
        github_repo_url = f"https://github.com/ModelDBRepository/{model_id}"

        # You can use a library like PyGitHub to check if the GitHub repository exists
        # For simplicity, we will use a simple HTTP request here
        github_response = requests.head(github_repo_url)

        # Check if the GitHub repository exists (HTTP status code 200)
        if github_response.status_code != 200:
            # If it doesn't exist, add the ModelDB entry to the list
            entries_not_matching.append(model_id)

        # Update the progress bar
        pbar.update(1)

# Print the list of ModelDB entries that do not match with GitHub repositories
print("ModelDB entries without matching GitHub repositories:")
for model_id in entries_not_matching:
    print(model_id)


Checking GitHub Repositories: 100%|██████████| 1842/1842 [17:17<00:00,  1.78it/s]

ModelDB entries without matching GitHub repositories:
7485
18871
50219
50392
50656
53457
53572
55273
55748
55756
55859
64167
64170
64242
64266
76883
82385
82392
82394
83512
83514
83516
83517
84627
84641
84649
87450
87751
87760
91893
91898
91899
93315
112348
112914
112922
113426
114365
114643
115968
116053
116084
116312
116313
117351
117810
118020
118195
118524
118759
118797
118799
118894
119153
119214
120115
120227
120320
120521
121628
125649
125676
125855
126052
126096
126371
126389
126392
126489
126598
126636
127192
127305
127321
127351
127967
127996
138421
138631
138634
139457
140246
140964
141835
142062
143446
143602
144387
144518
145882
147185
147487
147740
147938
149737
150635
151549
152625
153351
153452
154192
154769
154871
154927
154955
155157
156162
156470
169278
180372
182373
183017
183251
185875
188543
188552
190610
195626
195659
195731
195856
206227
206256
206356
218015
223648
223890
224843
224998
225428
225552
225583
226432
227114
227677
227978
228373
229580
230046
230578





In [10]:
import requests
from tqdm import tqdm

# Define the ModelDB API endpoint
modeldb_api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of ModelDB entries
response = requests.get(modeldb_api_url)
modeldb_entries = response.json()

# Define a list to store ModelDB entries that do not match with GitHub repositories
entries_not_matching = []

# Create a tqdm progress bar
with tqdm(total=len(modeldb_entries), desc="Checking GitHub Repositories") as pbar:
    # Iterate through ModelDB entries
    for model_id in modeldb_entries:
        # Construct the GitHub repository URL based on your naming convention
        github_repo_url = f"https://github.com/ModelDBRepository/{model_id}"

        # You can use a library like PyGitHub to check if the GitHub repository exists
        # For simplicity, we will use a simple HTTP request here
        github_response = requests.head(github_repo_url)

        # Check if the GitHub repository exists (HTTP status code 200)
        if github_response.status_code != 200:
            # If it doesn't exist, add the ModelDB entry to the list
            entries_not_matching.append(model_id)
            
        # Update the progress bar
        pbar.update(1)

# Print the total count of ModelDB entries that do not match with GitHub repositories
print(f"Total ModelDB entries without matching GitHub repositories: {len(entries_not_matching)}")
    
# Print the list of ModelDB entries that do not have a GitHub repository
print("ModelDB entries without GitHub repositories:")
for model_id in entries_not_matching:
    print(model_id)

Checking GitHub Repositories: 100%|██████████| 1842/1842 [16:41<00:00,  1.84it/s]

Total ModelDB entries without matching GitHub repositories: 342
ModelDB entries without GitHub repositories:
7485
18871
50219
50392
50656
53457
53572
55273
55748
55756
55859
64167
64170
64242
64266
76883
82385
82392
82394
83512
83514
83516
83517
84627
84641
84649
87450
87751
87760
91893
91898
91899
93315
112348
112914
112922
113426
114365
114643
115968
116053
116084
116312
116313
117351
117810
118020
118195
118524
118759
118797
118799
118894
119153
119214
120115
120227
120320
120521
121628
125649
125676
125855
126052
126096
126371
126389
126392
126489
126598
126636
127192
127305
127321
127351
127967
127996
138421
138631
138634
139457
140246
140964
141835
142062
143446
143602
144387
144518
145882
147185
147487
147740
147938
149737
150635
151549
152625
153351
153452
154192
154769
154871
154927
154955
155157
156162
156470
169278
180372
182373
183017
183251
185875
188543
188552
190610
195626
195659
195731
195856
206227
206256
206356
218015
223648
223890
224843
224998
225428
225552
225583
2




In [6]:
import requests
import os
import zipfile
import shutil
from git import Repo
from tqdm import tqdm

def get_file_sizes(directory):
    file_sizes = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            path = os.path.join(root, file)
            size = os.path.getsize(path)
            file_sizes[path.replace(directory, '')] = size
    return file_sizes

def compare_files(zip_file_sizes, git_file_sizes):
    for file, size in zip_file_sizes.items():
        if file not in git_file_sizes or size != git_file_sizes[file]:
            return False
    for file in git_file_sizes:
        if file not in zip_file_sizes:
            return False
    return True

# Define the ModelDB API endpoint
modeldb_api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of ModelDB entries
response = requests.get(modeldb_api_url)
modeldb_entries = response.json()

non_matching_models = []

with tqdm(total=len(modeldb_entries), desc="Processing Models") as pbar:
    for model_id in modeldb_entries:
        zip_path = f"zip_files/{model_id}.zip"

        # Check if ZIP file exists
        if not os.path.exists(zip_path):
            print(f"ZIP file for model {model_id} does not exist.")
            pbar.update(1)
            continue

        # Extract ZIP file
        temp_zip_dir = f"temp_zip_{model_id}"
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_zip_dir)

        # List files and sizes from ZIP
        zip_file_sizes = get_file_sizes(temp_zip_dir)

        # Clone GitHub repo
        git_repo_dir = f"git_repos/{model_id}"
        repo_url = f"https://github.com/ModelDBRepository/{model_id}.git"
        try:
            repo = Repo.clone_from(repo_url, git_repo_dir)
        except Exception as e:
            print(f"Error cloning GitHub repo for model {model_id}: {str(e)}")
            shutil.rmtree(temp_zip_dir)
            pbar.update(1)
            continue

        git_file_sizes = get_file_sizes(git_repo_dir)

        # Compare files and collect non-matching models
        if not compare_files(zip_file_sizes, git_file_sizes):
            non_matching_models.append(model_id)

        # Cleanup
        shutil.rmtree(temp_zip_dir)
        shutil.rmtree(git_repo_dir)

        # Update progress bar
        pbar.update(1)

# Print non-matching models
print("Models with non-matching ZIP files and GitHub repositories:")
for model in non_matching_models:
    print(model)


Processing Models: 100%|█████████████████| 1843/1843 [00:00<00:00, 12203.47it/s]

ZIP file for model 279 does not exist.
ZIP file for model 2487 does not exist.
ZIP file for model 2488 does not exist.
ZIP file for model 2730 does not exist.
ZIP file for model 2733 does not exist.
ZIP file for model 2796 does not exist.
ZIP file for model 2798 does not exist.
ZIP file for model 2937 does not exist.
ZIP file for model 3167 does not exist.
ZIP file for model 3263 does not exist.
ZIP file for model 3264 does not exist.
ZIP file for model 3289 does not exist.
ZIP file for model 3332 does not exist.
ZIP file for model 3342 does not exist.
ZIP file for model 3343 does not exist.
ZIP file for model 3344 does not exist.
ZIP file for model 3434 does not exist.
ZIP file for model 3454 does not exist.
ZIP file for model 3457 does not exist.
ZIP file for model 3483 does not exist.
ZIP file for model 3488 does not exist.
ZIP file for model 3491 does not exist.
ZIP file for model 3493 does not exist.
ZIP file for model 3507 does not exist.
ZIP file for model 3509 does not exist.
Z




# Publications 

In [3]:
import requests

# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
model_ids = response.json()

# Process a few models for inspection
for model_id in model_ids[:5]:  # Adjust the number for more or fewer models
    # Fetch metadata for the model
    model_data = requests.get(f"{api_url}/{model_id}").json()

    # Print the model ID and its metadata for inspection
    print(f"Model ID: {model_id}")
    print("Metadata:")
    print(model_data)
    print("\n")


Model ID: 279
Metadata:
{'id': 279, 'name': 'Low Threshold Calcium Currents in TC cells (Destexhe et al 1998)', 'created': '2001-01-01T00:00:00', 'ver_number': 24, 'ver_date': '2015-01-02T22:01:45', 'class_id': 19, 'notes': {'value': "In Destexhe, Neubig, Ulrich, and Huguenard (1998) experiments and models examine low threshold calcium current's (IT, or T-current) distribution in thalamocortical (TC) cells.  Multicompartmental modeling supports the hypothesis that IT currents have a density at least several fold higher in the dendrites than the soma.  The IT current contributes significantly to rebound bursts and is thought to have important network behavior consequences.  See the paper for details. See also http://cns.iaf.cnrs-gif.fr   Correspondance may be addressed to Alain Destexhe: Destexhe@iaf.cnrs-gif.fr", 'attr_id': 24}, 'neurons': {'value': [{'object_id': 262, 'object_name': 'Thalamus geniculate nucleus/lateral principal GLU cell'}], 'attr_id': 25}, 'currents': {'value': [{'ob

Model ID: 2730
Metadata:
{'id': 2730, 'name': 'Olfactory Bulb Network (Davison et al 2003)', 'created': '2001-05-16T10:13:42', 'ver_number': 50, 'ver_date': '2022-05-27T18:15:00.95', 'class_id': 19, 'notes': {'value': 'A biologically-detailed model of the mammalian olfactory bulb, incorporating \r\nthe mitral and granule cells and the dendrodendritic synapses between them. \r\nThe results of simulation experiments with electrical stimulation agree \r\nclosely in most details with published experimental data. The model predicts \r\nthat the time course of dendrodendritic inhibition is dependent on the \r\nnetwork connectivity as well as on the intrinsic parameters of the synapses. \r\nIn response to simulated odor stimulation, strongly activated mitral cells \r\ntend to suppress neighboring cells, the mitral cells readily synchronize \r\ntheir firing, and increasing the stimulus intensity increases the degree of \r\nsynchronization. For more details, see the reference below.', 'attr_id'

In [15]:
import requests
import pandas as pd
from tqdm import tqdm
import re

# Function to check for paper title and identify DOI/PMID
def check_paper_details(paper_data):
    # Initialize default values
    paper_title = 'No Title'
    paper_attached = 'No'
    identifier_info = 'None'

    # Define regex patterns for DOI and PMID
    doi_pattern = re.compile(r'10.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
    pmid_pattern = re.compile(r'PMID:\s*\d+')

    # Check if there is a paper title
    if paper_data:
        paper_title = paper_data[0].get('object_name', 'No Title Provided')
        paper_attached = 'Yes'  # If there is a title, assume paper is attached
        
        # Search for DOI or PMID within the title
        if doi_pattern.search(paper_title):
            identifier_info = 'DOI Present'
        elif pmid_pattern.search(paper_title):
            identifier_info = 'PMID Present'
        elif 'http' in paper_title.lower():
            identifier_info = 'URL Present'
        else:
            identifier_info = 'Title Only'

    return paper_title, paper_attached, identifier_info

# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
model_ids = response.json()

# Prepare a list to store the results
results = []

# Process each model with a progress bar
for model_id in tqdm(model_ids, desc="Processing Model Papers"):
    # Fetch metadata for the model
    model_data = requests.get(f"{api_url}/{model_id}").json()

    # Extract paper information
    model_paper = model_data.get('model_paper', {}).get('value', [])

    # Check for paper title and if a paper is attached
    paper_title, paper_attached, identifier_info = check_paper_details(model_paper)

    # Append results
    results.append({
        "Model ID": model_id,
        "Paper Title": paper_title,
        "Paper Attached": paper_attached,
        "Identifier Information": identifier_info
    })

# Create a DataFrame from the results
df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
output_file = "modeldb_paper_details.csv"
df.to_csv(output_file, index=False)

print(f"Data exported to CSV successfully. File location: {output_file}")


Processing Model Papers: 100%|██████████| 1838/1838 [03:23<00:00,  9.05it/s]


Data exported to CSV successfully. File location: modeldb_paper_details.csv


In [7]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [8]:
import requests
from bs4 import BeautifulSoup

def check_doi_pmid(modeldb_id):
    url = f"https://modeldb.science/{modeldb_id}?tab=7"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Look for DOI or PMID in the webpage
    # This part may require adjustments based on the webpage's structure
    has_doi = soup.find(string="DOI")
    has_pmid = soup.find(string="PMID")

    if has_doi or has_pmid:
        print(f"ModelDB entry {modeldb_id} has associated DOI or PMID.")
    else:
        print(f"ModelDB entry {modeldb_id} does NOT have associated DOI or PMID.")

# Example usage
modeldb_id = "245071"  # Replace with the ModelDB ID you want to check
check_doi_pmid(modeldb_id)


ModelDB entry 245071 does NOT have associated DOI or PMID.


In [11]:
import requests
import pandas as pd
from tqdm import tqdm
import re

# Function to check for paper title and identify DOI/PMID
def check_paper_details(paper_data):
    # Initialize default values
    paper_title = 'No Title'
    paper_attached = 'No'
    identifier_info = 'None'

    # Define regex patterns for DOI and PMID
    doi_pattern = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
    pmid_pattern = re.compile(r'PMID:\s*\d+')

    # Check if there is a paper title
    if paper_data:
        paper_title = paper_data[0].get('object_name', 'No Title Provided')
        paper_attached = 'Yes'  # If there is a title, assume paper is attached
        
        # Search for DOI or PMID within the title
        if doi_pattern.search(paper_title):
            identifier_info = 'DOI Present'
        elif pmid_pattern.search(paper_title):
            identifier_info = 'PMID Present'
        elif 'http' in paper_title.lower():
            identifier_info = 'URL Present'
        else:
            identifier_info = 'Title Only'

    return paper_title, paper_attached, identifier_info

# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
model_ids = response.json()

# Prepare a list to store the results
results = []

# Process each model with a progress bar
for model_id in tqdm(model_ids, desc="Processing Model Papers"):
    # Fetch metadata for the model
    model_data = requests.get(f"{api_url}/{model_id}").json()

    # Extract paper information
    model_paper = model_data.get('model_paper', {}).get('value', [])

    # Check for paper title and if a paper is attached
    paper_title, paper_attached, identifier_info = check_paper_details(model_paper)

    # Append results
    results.append({
        "Model ID": model_id,
        "Paper Title": paper_title,
        "Paper Attached": paper_attached,
        "Identifier Information": identifier_info
    })

# Create a DataFrame from the results
df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
output_file = "modeldb_paper_details.csv"
df.to_csv(output_file, index=False)

print(f"Data exported to CSV successfully. File location: {output_file}")


Processing Model Papers: 100%|██████████| 1842/1842 [08:14<00:00,  3.73it/s]

Data exported to CSV successfully. File location: modeldb_paper_details.csv





In [14]:
import requests
import pandas as pd
from tqdm import tqdm
import re

# Function to check for paper title and identify DOI/PMID
def check_paper_details(paper_data):
    # Initialize default values
    paper_title = 'No Title'
    paper_attached = 'No'
    identifier_info = 'None'

    # Define regex patterns for DOI and PMID
    doi_pattern = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
    pmid_pattern = re.compile(r'PMID:\s*\d+')

    # Check if there is a paper title
    if paper_data:
        paper_title = paper_data[0].get('object_name', 'No Title Provided')
        paper_attached = 'Yes'  # If there is a title, assume paper is attached
        
        # Search for DOI or PMID within the title
        if doi_pattern.search(paper_title):
            identifier_info = 'DOI Present'
        elif pmid_pattern.search(paper_title):
            identifier_info = 'PMID Present'
        elif 'http' in paper_title.lower():
            identifier_info = 'URL Present'
        else:
            identifier_info = 'Title Only'

    return paper_title, paper_attached, identifier_info

# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
model_ids = response.json()

# Prepare a list to store the results
results = []

# Process each model with a progress bar
for model_id in tqdm(model_ids, desc="Processing Model Papers"):
    # Fetch metadata for the model
    model_data = requests.get(f"{api_url}/{model_id}").json()

    # Extract paper information
    model_paper = model_data.get('model_paper', {}).get('value', [])

    # Check for paper title and if a paper is attached
    paper_title, paper_attached, identifier_info = check_paper_details(model_paper)

    # Append results
    results.append({
        "Model ID": model_id,
        "Paper Title": paper_title,
        "Paper Attached": paper_attached,
        "Identifier Information": identifier_info
    })

# Create a DataFrame from the results
df = pd.DataFrame(results)

# Save the original DataFrame to a CSV file
original_output_file = "modeldb_paper_details_1.csv"
df.to_csv(original_output_file, index=False)

# Create a DataFrame containing entries with missing information
missing_info_df = df[(df["Paper Title"] == 'No Title') | (df["Paper Attached"] == 'No') | (df["Identifier Information"] == 'None')]

# Save the DataFrame with missing information to a CSV file
missing_info_output_file = "modeldb_missing_info.csv"
missing_info_df.to_csv(missing_info_output_file, index=False)

print(f"Data exported to CSV successfully.")
print(f"Original data saved to {original_output_file}")
print(f"Missing information data saved to {missing_info_output_file}")

Processing Model Papers: 100%|██████████| 1842/1842 [05:47<00:00,  5.30it/s]

Data exported to CSV successfully.
Original data saved to modeldb_paper_details_1.csv
Missing information data saved to modeldb_missing_info.csv





In [17]:
import requests
import pandas as pd
from tqdm import tqdm
import re

# Function to check for paper title and identify DOI/PMID
def check_paper_details(paper_data):
    # Initialize default values
    paper_title = 'No Title'
    paper_attached = 'No'
    identifier_info = 'None'

    # Define regex patterns for DOI and PMID
    doi_pattern = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE)
    pmid_pattern = re.compile(r'PMID:\s*\d+')

    # Check if there is a paper title
    if paper_data:
        paper_title = paper_data[0].get('object_name', 'No Title Provided')
        
        # Check if the title is empty or only contains whitespace
        if paper_title.strip():
            paper_attached = 'Yes'  # If there is a non-empty title, assume paper is attached
            
            # Search for DOI or PMID within the title
            if doi_pattern.search(paper_title):
                identifier_info = 'DOI Present'
            elif pmid_pattern.search(paper_title):
                identifier_info = 'PMID Present'
            elif 'http' in paper_title.lower():
                identifier_info = 'URL Present'
            else:
                identifier_info = 'Title Only'

    return paper_title, paper_attached, identifier_info

# Define the ModelDB API endpoint
api_url = "https://modeldb.science/api/v1/models"

# Make an API request to get the list of model IDs
response = requests.get(api_url)
model_ids = response.json()

# Prepare a list to store the results
results = []

# Process each model with a progress bar
for model_id in tqdm(model_ids, desc="Processing Model Papers"):
    # Fetch metadata for the model
    model_data = requests.get(f"{api_url}/{model_id}").json()

    # Extract paper information
    model_paper = model_data.get('model_paper', {}).get('value', [])

    # Check for paper title and if a paper is attached
    paper_title, paper_attached, identifier_info = check_paper_details(model_paper)

    # Append results
    results.append({
        "Model ID": model_id,
        "Paper Title": paper_title,
        "Paper Attached": paper_attached,
        "Identifier Information": identifier_info
    })

# Create a DataFrame from the results
df = pd.DataFrame(results)

# Save the original DataFrame to a CSV file
original_output_file = "modeldb_paper_details_2.csv"
df.to_csv(original_output_file, index=False)

# Create a DataFrame containing entries with missing information
missing_info_df = df[(df["Paper Title"] == 'No Title') | (df["Paper Attached"] == 'No') | (df["Identifier Information"] == 'None')]

# Save the DataFrame with missing information to a CSV file
missing_info_output_file = "modeldb_missing_info_2.csv"
missing_info_df.to_csv(missing_info_output_file, index=False)

print(f"Data exported to CSV successfully.")
print(f"Original data saved to {original_output_file}")
print(f"Missing information data saved to {missing_info_output_file}")


Processing Model Papers: 100%|██████████| 1842/1842 [05:23<00:00,  5.70it/s]

Data exported to CSV successfully.
Original data saved to modeldb_paper_details_2.csv
Missing information data saved to modeldb_missing_info_2.csv





In [18]:
import requests
from bs4 import BeautifulSoup

def check_doi_pmid(modeldb_id):
    url = f"https://modeldb.science/{modeldb_id}?tab=7"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Look for DOI or PMID in the webpage
    doi_element = soup.find("a", string="DOI")
    pmid_element = soup.find("a", string="PubMed")

    if doi_element:
        doi = doi_element.get("href")
        print(f"ModelDB entry {modeldb_id} has associated DOI: {doi}")
    elif pmid_element:
        pmid = pmid_element.get("href")
        print(f"ModelDB entry {modeldb_id} has associated PMID: {pmid}")
    else:
        print(f"ModelDB entry {modeldb_id} does NOT have associated DOI or PMID.")

# Example usage
modeldb_id = "245071"  # Replace with the ModelDB ID you want to check
check_doi_pmid(modeldb_id)


ModelDB entry 245071 does NOT have associated DOI or PMID.


In [19]:
import requests
from bs4 import BeautifulSoup

def check_doi_pmid(modeldb_id):
    url = f"https://modeldb.science/{modeldb_id}?tab=7"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Look for DOI or PMID in the webpage
    doi_element = soup.find("a", string="DOI")
    pmid_element = soup.find("a", string="PubMed")

    if doi_element:
        doi = doi_element.get("href")
        return modeldb_id, f"DOI: {doi}"
    elif pmid_element:
        pmid = pmid_element.get("href")
        return modeldb_id, f"PMID: {pmid}"
    else:
        return modeldb_id, "No DOI or PMID"

# Fetch the list of ModelDB IDs from the API
api_url = "https://modeldb.science/api/v1/models"
response = requests.get(api_url)
if response.status_code != 200:
    print(f"Failed to retrieve the list of ModelDB IDs from {api_url}")
    exit()

modeldb_ids_to_check = response.json()

# Example usage to check all ModelDB entries
for modeldb_id in modeldb_ids_to_check:
    result = check_doi_pmid(modeldb_id)
    print(f"ModelDB entry {result[0]} - {result[1]}")


ModelDB entry 279 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=9570789
ModelDB entry 2487 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=10715559
ModelDB entry 2488 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=8684467
ModelDB entry 2730 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=12736241
ModelDB entry 2733 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=7688798
ModelDB entry 2796 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=10481998
ModelDB entry 2798 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=11158631
ModelDB entry 2937 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=8913580
ModelDB entry 3167 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=11731535
ModelDB entry 3263 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=modeldb&term=7608762
ModelDB entry 3264 - PMID: https://www.ncbi.nlm.nih.gov/pubmed?holding=model

KeyboardInterrupt: 

In [21]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

def check_doi_pmid(modeldb_id):
    url = f"https://modeldb.science/{modeldb_id}?tab=7"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Look for DOI or PMID in the webpage
    doi_element = soup.find("a", string="DOI")
    pmid_element = soup.find("a", string="PubMed")

    if doi_element:
        doi = doi_element.get("href")
        return modeldb_id, f"DOI: {doi}"
    elif pmid_element:
        pmid = pmid_element.get("href")
        return modeldb_id, f"PMID: {pmid}"
    else:
        return None

# Fetch the list of ModelDB IDs from the API
api_url = "https://modeldb.science/api/v1/models"
response = requests.get(api_url)
if response.status_code != 200:
    print(f"Failed to retrieve the list of ModelDB IDs from {api_url}")
    exit()

modeldb_ids_to_check = response.json()

# Initialize a list to store the results
results = []

# Check all ModelDB entries for DOI or PMID and store the results
with tqdm(total=len(modeldb_ids_to_check), desc="Checking ModelDB Entries") as pbar:
    for modeldb_id in modeldb_ids_to_check:
        result = check_doi_pmid(modeldb_id)
        if result:
            results.append(result)
        pbar.update(1)

# Create a DataFrame from the results
df = pd.DataFrame(results, columns=["ModelDB ID", "Identifier"])

# Save the results to a CSV file
output_file = "modeldb_doi_pmid_results.csv"
df.to_csv(output_file, index=False)

# Print entries with no DOI or PMID and count
no_identifier_entries = df[df["Identifier"].isna()]
print("Entries with no DOI or PMID:")
print(no_identifier_entries)

print(f"Data exported to CSV file: {output_file}")
print(f"Number of entries with no DOI or PMID: {len(no_identifier_entries)}")


Checking ModelDB Entries: 100%|██████████| 1842/1842 [06:26<00:00,  4.76it/s]


Entries with no DOI or PMID:
Empty DataFrame
Columns: [ModelDB ID, Identifier]
Index: []
Data exported to CSV file: modeldb_doi_pmid_results.csv
Number of entries with no DOI or PMID: 0


In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

def check_doi_pmid(modeldb_id):
    url = f"https://modeldb.science/{modeldb_id}?tab=7"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Look for DOI or PMID in the webpage
    doi_element = soup.find("a", string="DOI")
    pmid_element = soup.find("a", string="PubMed")

    if doi_element:
        doi = doi_element.get("href")
        return modeldb_id, f"DOI: {doi}"
    elif pmid_element:
        pmid = pmid_element.get("href")
        return modeldb_id, f"PMID: {pmid}"
    else:
        return None

# Fetch the list of ModelDB IDs from the API
api_url = "https://modeldb.science/api/v1/models"
response = requests.get(api_url)
if response.status_code != 200:
    print(f"Failed to retrieve the list of ModelDB IDs from {api_url}")
    exit()

modeldb_ids_to_check = response.json()

# Initialize a list to store the results
results = []
skipped_entries = []

# Check all ModelDB entries for DOI or PMID and store the results
with tqdm(total=len(modeldb_ids_to_check), desc="Checking ModelDB Entries") as pbar:
    for modeldb_id in modeldb_ids_to_check:
        result = check_doi_pmid(modeldb_id)
        if result:
            results.append(result)
        else:
            skipped_entries.append(modeldb_id)
        pbar.update(1)

# Create a DataFrame from the results
df = pd.DataFrame(results, columns=["ModelDB ID", "Identifier"])

# Save the results to a CSV file
output_file = "modeldb_doi_pmid_results_1.csv"
df.to_csv(output_file, index=False)

# Print entries with no DOI or PMID and count
no_identifier_entries = df[df["Identifier"].isna()]
print("Entries with no DOI or PMID:")
print(no_identifier_entries)

# Print skipped entries and their count
print("Skipped entries (missing information):")
print(skipped_entries)
print(f"Number of skipped entries: {len(skipped_entries)}")


Checking ModelDB Entries: 100%|██████████| 1842/1842 [06:01<00:00,  5.10it/s]

Entries with no DOI or PMID:
Empty DataFrame
Columns: [ModelDB ID, Identifier]
Index: []
Skipped entries (missing information):
[36861, 36869, 128068, 136097, 181035, 182129, 185334, 190565, 224843, 230329, 230929, 231105, 244688, 245071, 245415, 246546, 247179, 247968, 249405, 254217, 259620, 260178, 260967, 261435, 261460, 261489, 261864, 261881, 262046, 262356, 263053, 263703, 263719, 264591, 266526, 266551, 266577, 266718, 266726, 266770, 266802, 266807, 266823, 266848, 266849, 266863, 266868, 266880, 266881, 266910, 266925, 266928, 266929, 267009, 267018, 267026, 267035, 267047, 267050, 267066, 267128, 267139, 267174, 267183, 267184, 267221, 267222, 267280, 267297, 267306, 267307, 267324, 267334, 267338, 267339, 267357, 267363, 267510, 267512, 267552, 267563, 267586, 267587, 267594, 267595, 267610, 267611, 267617, 267666, 267680, 267682, 267686, 267589, 267596, 267614, 267735, 267738, 260015, 266797, 2014814, 267591, 267620, 267144, 2014825, 267618, 267768, 2015414, 2014996, 20154




In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

def get_model_details(modeldb_id):
    url = f"https://modeldb.science/{modeldb_id}?tab=7"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract additional details for the skipped entry
    model_name_element = soup.find("h1", class_="blue")
    if model_name_element:
        model_name = model_name_element.text.strip()
    else:
        model_name = "N/A"

    return {
        "ModelDB ID": modeldb_id,
        "Model Name": model_name
    }

# Fetch the list of ModelDB IDs from the API
api_url = "https://modeldb.science/api/v1/models"
response = requests.get(api_url)
if response.status_code != 200:
    print(f"Failed to retrieve the list of ModelDB IDs from {api_url}")
    exit()

modeldb_ids_to_check = response.json()

# Initialize a list to store the skipped entries
skipped_entries = []

# Initialize a list to store additional details for skipped entries
skipped_entries_details = []

# Check all ModelDB entries for DOI or PMID and store the skipped entries
with tqdm(total=len(modeldb_ids_to_check), desc="Checking ModelDB Entries") as pbar:
    for modeldb_id in modeldb_ids_to_check:
        result = check_doi_pmid(modeldb_id)
        if not result:
            skipped_entries.append(modeldb_id)
            # Retrieve additional details for the skipped entry
            details = get_model_details(modeldb_id)
            if details:
                skipped_entries_details.append(details)
        pbar.update(1)

# Print entries with no DOI or PMID and count
print("Entries with no DOI or PMID:")
print(no_identifier_entries)

# Print skipped entries and their count
print("Skipped entries (missing information):")
print(skipped_entries)
print(f"Number of skipped entries: {len(skipped_entries)}")

# Print additional details for skipped entries
print("Additional details for skipped entries:")
for entry in skipped_entries_details:
    print(f"ModelDB ID: {entry['ModelDB ID']}, Model Name: {entry['Model Name']}")


Checking ModelDB Entries: 100%|██████████| 1842/1842 [07:15<00:00,  4.23it/s]

Entries with no DOI or PMID:
Empty DataFrame
Columns: [ModelDB ID, Identifier]
Index: []
Skipped entries (missing information):
[36861, 36869, 128068, 136097, 181035, 182129, 185334, 190565, 224843, 230329, 230929, 231105, 244688, 245071, 245415, 246546, 247179, 247968, 249405, 254217, 259620, 260178, 260967, 261435, 261460, 261489, 261864, 261881, 262046, 262356, 263053, 263703, 263719, 264591, 266526, 266551, 266577, 266718, 266726, 266770, 266802, 266807, 266823, 266848, 266849, 266863, 266868, 266880, 266881, 266910, 266925, 266928, 266929, 267009, 267018, 267026, 267035, 267047, 267050, 267066, 267128, 267139, 267174, 267183, 267184, 267221, 267222, 267280, 267297, 267306, 267307, 267324, 267334, 267338, 267339, 267357, 267363, 267510, 267512, 267552, 267563, 267586, 267587, 267594, 267595, 267610, 267611, 267617, 267666, 267680, 267682, 267686, 267589, 267596, 267614, 267735, 267738, 260015, 266797, 2014814, 267591, 267620, 267144, 2014825, 267618, 267768, 2015414, 2014996, 20154




In [1]:
import csv

data = [
    36861, 36869, 128068, 136097, 181035, 182129, 185334, 190565, 224843, 230329, 230929, 231105, 244688, 245071, 245415,
    246546, 247179, 247968, 249405, 254217, 259620, 260178, 260967, 261435, 261460, 261489, 261864, 261881, 262046,
    262356, 263053, 263703, 263719, 264591, 266526, 266551, 266577, 266718, 266726, 266770, 266802, 266807, 266823,
    266848, 266849, 266863, 266868, 266880, 266881, 266910, 266925, 266928, 266929, 267009, 267018, 267026, 267035,
    267047, 267050, 267066, 267128, 267139, 267174, 267183, 267184, 267221, 267222, 267280, 267297, 267306, 267307,
    267324, 267334, 267338, 267339, 267357, 267363, 267510, 267512, 267552, 267563, 267586, 267587, 267594, 267595,
    267610, 267611, 267617, 267666, 267680, 267682, 267686, 267589, 267596, 267614, 267735, 267738, 260015, 266797,
    2014814, 267591, 267620, 267144, 2014825, 267618, 267768, 2015414, 2014996, 2015413, 2015412, 2015421, 2014998,
    2014833, 267613, 2014816
]

# Define the CSV file name
csv_file_name = "no_pmid.csv"

# Write the data to the CSV file
with open(csv_file_name, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Values"])  # Write a header row
    csv_writer.writerows(map(lambda x: [x], data))

print(f"Values have been saved to {csv_file_name}")


Values have been saved to no_pmid.csv
