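## Almanack analysis of PubMed GitHub repositories

This notebook reads the repository links in `gather-pubmed-repos/pubmed_github_links.parquet`, runs `process_repo_for_almanack` on each unique `github_link` in parallel batches, and writes each batch to `repository_almanack_results_batch_<n>.parquet`. The later cells preview the first batch file and install the project into the active interpreter with `pip install -e .`.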

In [7]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd
import json
from typing import Optional
from pathlib import Path
from almanack import process_repo_for_almanack


def _sanitize_for_parquet(df: pd.DataFrame) -> pd.DataFrame:
    """Sanitize DataFrame for parquet compatibility"""
    def _encode_if_needed(v):
        if isinstance(v, (dict, list)):
            return json.dumps(v) if v else None
        return v

    result_df = df.copy()

    for col in result_df.columns:
        if result_df[col].dtype == "object":
            col_nonnull = result_df[col].dropna()
            sample_vals = col_nonnull.iloc[:3] if len(col_nonnull) > 0 else []
            needs_encoding = any(isinstance(val, (dict, list)) for val in sample_vals)
            if needs_encoding:
                result_df[col] = result_df[col].map(_encode_if_needed)

    numeric_cols = [
        "checks_total", "checks_passed", "checks_pct",
        "almanack-score", "almanack-score-numerator", "almanack-score-denominator",
    ]
    for col in numeric_cols:
        if col in result_df.columns:
            result_df[col] = pd.to_numeric(result_df[col], errors="coerce")

    return result_df


def repository_almanack_analysis(batch_size: int = 500, max_workers: int = 16, limit: Optional[int] = None,):
    """
    Analyzes PubMed GitHub repositories and writes flattened Almanack metrics (checks included) to parquet.

    Unique, non-null values of the ``github_link`` column in
    ``gather-pubmed-repos/pubmed_github_links.parquet`` are processed in
    batches. Each repository is submitted to ``process_repo_for_almanack``
    in a process pool, and each batch is written to
    ``repository_almanack_results_batch_<n>.parquet`` in the current
    working directory (``n`` starts at 1).

    Args:
        batch_size: Number of repositories per batch / output file. Must be
            positive.
        max_workers: Number of worker processes used for each batch.
        limit: If given, only the first ``limit`` repository URLs are
            processed. ``0`` processes nothing; ``None`` processes all.

    Returns:
        The sanitized DataFrame of the LAST batch only (earlier batches are
        available from their parquet files), or ``None`` when there were no
        repositories to process.

    Raises:
        ValueError: If ``batch_size`` is not positive, ``limit`` is negative,
            or the input file has no ``github_link`` column.
        FileNotFoundError: If the input parquet file does not exist.
    """
    # Validate arguments before any I/O so misconfiguration fails fast.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be non-negative, got {limit}")

    parquet_file = "gather-pubmed-repos/pubmed_github_links.parquet"

    if not Path(parquet_file).exists():
        raise FileNotFoundError(f"Parquet file not found: {parquet_file}")

    df = pd.read_parquet(parquet_file)

    if "github_link" not in df.columns:
        raise ValueError(f"'github_link' column not found. Available: {list(df.columns)}")

    repo_urls = df["github_link"].drop_duplicates().dropna().tolist()

    # `is not None` rather than truthiness: limit=0 must mean "process nothing",
    # not "no limit".
    if limit is not None:
        repo_urls = repo_urls[:limit]

    total_repos = len(repo_urls)

    repo_count = 0
    last_batch_results = None

    batch_starts = range(0, total_repos, batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch_urls = repo_urls[start:start + batch_size]
        batch_results = []

        # A fresh pool is created for every batch and torn down when the
        # batch's futures have all completed.
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(process_repo_for_almanack, repo_url): repo_url
                for repo_url in batch_urls
            }

            for future in as_completed(futures):
                repo_count += 1
                repo_url = futures[future]
                try:
                    # presumably a flat dict of metrics per repository;
                    # verify against process_repo_for_almanack
                    record = future.result()
                except Exception as e:
                    # Keep a placeholder row so a failing repository is
                    # recorded in the output instead of aborting the batch.
                    record = {
                        "Repository URL": repo_url,
                        "almanack_error": str(e),
                        "checks_total": None,
                        "checks_passed": None,
                        "checks_pct": None,
                    }
                # Only print the incrementer
                print(f"[{repo_count}/{total_repos}]")
                batch_results.append(record)

        df_batch = pd.DataFrame(batch_results)
        df_batch = _sanitize_for_parquet(df_batch)

        batch_filename = f"repository_almanack_results_batch_{batch_number}.parquet"
        df_batch.to_parquet(batch_filename, compression="zstd", index=False)

        last_batch_results = df_batch

    return last_batch_results


def test_pilot_run(num_repos: int = 10, batch_size: int = 5):
    """Run the Almanack analysis on a small sample as a smoke test.

    Calls ``repository_almanack_analysis`` with four workers, limited to
    ``num_repos`` repositories split into batches of ``batch_size``.

    Returns:
        Whatever ``repository_almanack_analysis`` returns for the run.

    Raises:
        Exception: Any error from the analysis is reported on stdout and
            then re-raised unchanged.
    """
    pilot_settings = {
        "batch_size": batch_size,
        "max_workers": 4,
        "limit": num_repos,
    }
    try:
        return repository_almanack_analysis(**pilot_settings)
    except Exception as err:
        # Surface the failure in the output, but let the caller see the
        # original exception and traceback.
        print(f"Pilot test failed: {err}")
        raise


# Entry point: when this code runs as the main module, analyze a 10-repository
# pilot in batches of 5 and keep the returned DataFrame in `pilot_df`.
if __name__ == "__main__":
    pilot_df = test_pilot_run(num_repos=10, batch_size=5)


ModuleNotFoundError: No module named 'almanack'

In [None]:
import pandas as pd
import warnings
## FOR TESTING PURPOSES ONLY
# Load the first batch file written by repository_almanack_analysis.
df = pd.read_parquet("repository_almanack_results_batch_1.parquet")

# Suppressing noisy warnings
warnings.filterwarnings("ignore", category=UserWarning, module="urllib3")

# Show every column on one wide line. The preview is printed explicitly:
# a bare `df.head(10)` expression inside the `with` block is evaluated and
# discarded, so nothing would be shown.
with pd.option_context("display.max_columns", None, "display.width", 2000):
    print(df.head(10))


                              Repository URL                                          repo-path  repo-commits  repo-file-count    repo-commit-time-range  repo-days-of-development  repo-commits-per-day      almanack-table-datetime            almanack-version repo-primary-language repo-primary-license repo-doi repo-doi-publication-date  repo-includes-readme  repo-includes-contributing  repo-includes-code-of-conduct  repo-includes-license  repo-is-citable  repo-default-branch-not-master  repo-includes-common-docs  repo-unique-contributors  repo-unique-contributors-past-year  repo-unique-contributors-past-182-days  repo-tags-count  repo-tags-count-past-year  repo-tags-count-past-182-days  repo-stargazers-count repo-uses-issues  repo-issues-open-count repo-pull-requests-enabled  repo-forks-count  repo-subscribers-count repo-packages-ecosystems  repo-packages-ecosystems-count  repo-packages-versions-count repo-social-media-platforms  repo-social-media-platforms-count repo-doi-valid-format re

In [5]:
import subprocess
import sys

# Show which interpreter this session is running on; the install below is
# performed with that same interpreter.
print(sys.executable)

# Install the project in the current directory in editable mode.
# check_call raises CalledProcessError if pip exits with a non-zero status.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "."])


/Users/williamdavidson/Desktop/almanack-2/almanack/.venv312/bin/python


[0m[31mERROR: Exception:
Traceback (most recent call last):
  File "/Users/williamdavidson/Desktop/almanack-2/almanack/.venv312/lib/python3.12/site-packages/pip/_internal/cli/base_command.py", line 107, in _run_wrapper
    status = _inner_run()
             ^^^^^^^^^^^^
  File "/Users/williamdavidson/Desktop/almanack-2/almanack/.venv312/lib/python3.12/site-packages/pip/_internal/cli/base_command.py", line 98, in _inner_run
    return self.run(options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/williamdavidson/Desktop/almanack-2/almanack/.venv312/lib/python3.12/site-packages/pip/_internal/cli/req_command.py", line 71, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/williamdavidson/Desktop/almanack-2/almanack/.venv312/lib/python3.12/site-packages/pip/_internal/commands/install.py", line 339, in run
    session = self.get_default_session(options)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/williamdavidso

CalledProcessError: Command '['/Users/williamdavidson/Desktop/almanack-2/almanack/.venv312/bin/python', '-m', 'pip', 'install', '-e', '.']' returned non-zero exit status 2.