# PubMed GitHub repository Almanack checks analysis

This notebook focuses on gathering and analyzing Almanack sustainability check data for PubMed-linked GitHub repositories and is an extension of the work done in [gather-software-information-entropy](./gather-software-information-entropy.ipynb).

PubMed article repositories are extracted from a curated list of GitHub links.  

The Almanack package is used to process each repository in parallel batches, with results written to Parquet files.  
These batch outputs can be combined into a full dataset for further analysis of sustainability metrics.

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd
import json
from typing import Optional
from pathlib import Path
from almanack import process_repo_for_almanack

def _sanitize_for_parquet(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans a DataFrame so all columns are parquet-safe.

    - Expands dict columns into multiple fields.
    - Converts lists into JSON strings.
    - Casts generic object types into strings.

    Args:
        df: Input DataFrame with raw metrics.

    Returns:
        df: Sanitized DataFrame safe for parquet storage.
    """
    for col in df.columns:
        # Check if column contains dicts
        if df[col].apply(lambda x: isinstance(x, dict)).any():
            nested = pd.json_normalize(df[col])
            nested.columns = [f"{col}_{c}" for c in nested.columns]
            df = df.drop(columns=[col]).join(nested)
        # Check if column contains lists
        elif df[col].apply(lambda x: isinstance(x, list)).any():
            df[col] = df[col].apply(lambda x: json.dumps(x) if isinstance(x, list) else x)
        # Fallback for generic objects
        elif df[col].dtype == "object":
            df[col] = df[col].astype(str)
    return df



def repository_almanack_analysis(batch_size: int = 500, max_workers: int = 16, limit: Optional[int] = None,):
    """
    Processes PubMed GitHub repositories in batches, flattens their Almanack metrics,
    and writes the results to parquet files.

    Repository URLs are read from the ``github_link`` column of
    ``gather-pubmed-repos/pubmed_github_links.parquet`` (nulls and duplicates
    removed). Each batch is written to
    ``repository_almanack_results_batch_<n>.parquet`` in the current working
    directory. A repository whose processing raises is recorded as a row with
    an ``almanack_error`` message rather than aborting the batch.

    Args:
        batch_size (int): Number of repositories to process per batch (must be >= 1).
        max_workers (int): Maximum number of parallel workers.
        limit (Optional[int]): Restrict total number of repositories processed.
            ``None`` means no restriction; ``0`` processes nothing.

    Returns:
        df: DataFrame of results from the last processed batch, or ``None``
            when there were no repositories to process.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, ``limit`` is negative,
            or the input file has no ``github_link`` column.
        FileNotFoundError: If the input parquet file does not exist.
    """
    # Fail fast on arguments that would otherwise error late (batch_size == 0)
    # or silently do nothing (batch_size < 0).
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative or None, got {limit}")

    parquet_file = "gather-pubmed-repos/pubmed_github_links.parquet"

    if not Path(parquet_file).exists():
        raise FileNotFoundError(f"Parquet file not found: {parquet_file}")

    links_df = pd.read_parquet(parquet_file)

    if "github_link" not in links_df.columns:
        raise ValueError(f"'github_link' column not found. Available: {list(links_df.columns)}")

    repo_urls = links_df["github_link"].drop_duplicates().dropna().tolist()

    # Compare against None so that limit=0 means "process nothing" rather than
    # being mistaken for "no limit".
    if limit is not None:
        repo_urls = repo_urls[:limit]

    total_repos = len(repo_urls)

    repo_count = 0
    last_batch_results = None

    for batch_number, start in enumerate(range(0, total_repos, batch_size), start=1):
        batch_urls = repo_urls[start:start + batch_size]
        batch_results = []

        # A fresh pool per batch: leaving the ``with`` block waits for every
        # submitted job, so a batch is complete before its file is written.
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(process_repo_for_almanack, repo_url): repo_url
                for repo_url in batch_urls
            }

            for future in as_completed(futures):
                repo_count += 1
                repo_url = futures[future]
                try:
                    result_dict = future.result()
                except Exception as e:
                    # Keep a placeholder row so failed repositories stay
                    # visible in the output instead of disappearing.
                    result_dict = {
                        "Repository URL": repo_url,
                        "almanack_error": str(e),
                        "checks_total": None,
                        "checks_passed": None,
                        "checks_pct": None,
                    }
                # Only print the incrementer
                print(f"[{repo_count}/{total_repos}]")
                batch_results.append(result_dict)

        df_batch = _sanitize_for_parquet(pd.DataFrame(batch_results))

        batch_filename = f"repository_almanack_results_batch_{batch_number}.parquet"
        df_batch.to_parquet(batch_filename, compression="zstd", index=False)

        last_batch_results = df_batch

    return last_batch_results


def test_pilot_run(num_repos: int = 10, batch_size: int = 5):
    """
    Runs a small pilot of the Almanack analysis as a smoke test.

    Args:
        num_repos (int): Upper bound on repositories to process (passed as ``limit``).
        batch_size (int): Number of repositories per batch.

    Returns:
        df: Whatever ``repository_almanack_analysis`` returns for the pilot run.

    Raises:
        Exception: Any failure from the analysis is reported on stdout and re-raised.
    """
    # The pilot always uses 4 workers; only the size knobs are configurable.
    pilot_settings = {
        "batch_size": batch_size,
        "max_workers": 4,
        "limit": num_repos,
    }
    try:
        return repository_almanack_analysis(**pilot_settings)
    except Exception as exc:
        print(f"Pilot test failed: {exc}")
        raise


# Pilot run: analyse at most 10 repositories in batches of 5 and keep the
# returned frame in ``pilot_df``. The captured output of this cell shows the
# guard passing when the cell is executed in the notebook.
if __name__ == "__main__":
    pilot_df = test_pilot_run(num_repos=10, batch_size=5)


No coverage.py data found in the repository.


[1/10]
[2/10]
[3/10]
[4/10]
[5/10]
[6/10]
[7/10]


No coverage.py data found in the repository.


[8/10]
[9/10]
[10/10]


In [15]:
import pandas as pd
import warnings
## FOR TESTING PURPOSES ONLY
# Load the first batch file (the name matches the pattern the analysis writes) to inspect it.
df = pd.read_parquet("repository_almanack_results_batch_1.parquet")

# Suppressing noisy warnings: ignore UserWarning raised from urllib3 modules.
warnings.filterwarnings("ignore", category=UserWarning, module="urllib3")

# Lift the column limit and widen the display so no column of the preview is elided.
with pd.option_context("display.max_columns", None, "display.width", 2000):
    print(df.head(10))


                              Repository URL                                          repo-path  repo-commits  repo-file-count        repo-commit-time-range  repo-days-of-development  repo-commits-per-day      almanack-table-datetime            almanack-version repo-primary-language repo-primary-license repo-doi repo-doi-publication-date  repo-includes-readme  repo-includes-contributing  repo-includes-code-of-conduct  repo-includes-license  repo-is-citable  repo-default-branch-not-master  repo-includes-common-docs  repo-unique-contributors  repo-unique-contributors-past-year  repo-unique-contributors-past-182-days  repo-tags-count  repo-tags-count-past-year  repo-tags-count-past-182-days  repo-stargazers-count repo-uses-issues  repo-issues-open-count repo-pull-requests-enabled  repo-forks-count  repo-subscribers-count  repo-packages-ecosystems-count  repo-packages-versions-count  repo-social-media-platforms-count repo-doi-valid-format repo-doi-https-resolvable repo-doi-cited-by-count r