In [1]:
import requests
import re
from bs4 import BeautifulSoup
import json
import html2text

In [2]:
# Scrape the pdoc-generated API pages of sigstore-python and keep each page as
# markdown, keyed by the page name without its extension
# (e.g. "errors.html" -> "errors").
base_url = "https://sigstore.github.io/sigstore-python/sigstore/"
subpaths = ["errors.html", "oidc.html", "sign.html", "transparency.html", "verify.html"]
data = {}

for subpath in subpaths:
    url = f"{base_url}{subpath}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Report the page instead of silently leaving it out of the JSON file.
        print(f"Failed to fetch {url} (HTTP {response.status_code})")
        continue

    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove elements that contribute no text to the markdown conversion.
    for tag in soup.find_all(['img', 'svg', 'style']):
        tag.decompose()

    markdown_content = html2text.html2text(str(soup))

    key = subpath.split('.')[0]
    data[key] = markdown_content

# Persist the raw markdown so later cells can reload it from disk.
with open('../../data/raw/markdown_data.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

print("Markdown content extracted and stored in markdown_data.json")

Markdown content extracted and stored in markdown_data.json


In [3]:
# Preview only the first two converted pages; the full set is large.
count = 0  # stays 0 when `data` is empty
for count, (key, value) in enumerate(data.items(), start=1):
    print(f"Markdown for '{key}':")
    print(value)
    if count == 2:
        break

Markdown for 'errors':
[ sigstore](../sigstore.html)

## API Documentation

  * Error
    * diagnostics
    * print_and_exit
  * NetworkError
    * diagnostics
  * TUFError
    * TUFError
    * message
    * diagnostics
  * MetadataError
    * diagnostics
  * RootError
    * diagnostics

[ built with pdoc ](https://pdoc.dev "pdoc: Python API documentation
generator")

#  [sigstore](./../sigstore.html).errors

Exceptions.

View Source

    
    
      1# Copyright 2023 The Sigstore Authors
      2#
      3# Licensed under the Apache License, Version 2.0 (the "License");
      4# you may not use this file except in compliance with the License.
      5# You may obtain a copy of the License at
      6#
      7#      http://www.apache.org/licenses/LICENSE-2.0
      8#
      9# Unless required by applicable law or agreed to in writing, software
     10# distributed under the License is distributed on an "AS IS" BASIS,
     11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or i

In [4]:
def download_file(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged for
            connection problems, including a timeout expiring.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None

# Download the Python sources that correspond to the scraped documentation
# pages and combine both into one nested JSON file.
base_url = "https://raw.githubusercontent.com/sigstore/sigstore-python/main/sigstore/"
files = {
    "errors": "errors.py",
    "oidc": "oidc.py",
    "sign": "sign.py",
    "transparency": "transparency.py",
    "verify_models": "verify/models.py",
    "verify_policy": "verify/policy.py",
    "verify_verifier": "verify/verifier.py"
}

data = {}

for key, file_path in files.items():
    # base_url already ends with "/"; strip it before joining so the URL does
    # not contain a double slash ("sigstore//errors.py").
    file_url = f"{base_url.rstrip('/')}/{file_path}"
    file_content = download_file(file_url)
    if file_content:
        print(f"Downloaded content for {file_path} in {key}")
        data[key] = {
            "markdown": "",  # Filled in below from markdown_data.json
            "code": [{file_path: file_content}]  # List to store code content
        }
    else:
        print(f"Failed to download content for {file_path} in {key}")

with open('../../data/raw/markdown_data.json', 'r') as json_file:
    existing_data = json.load(json_file)

# Attach the scraped markdown to the entry with the same key.
# NOTE(review): the scraped pages are keyed "errors", "oidc", "sign",
# "transparency" and "verify"; no entry here is keyed "verify", so the
# verify_* entries keep an empty "markdown" field - confirm this is intended.
for key, value in existing_data.items():
    if key in data:
        # Remove every line whose first non-whitespace character is a digit;
        # presumably this strips the line-numbered source listing embedded in
        # the pages. Because \s also matches newlines, blank lines directly
        # before such a line are removed as well.
        data[key]["markdown"] = re.sub(r'^\s*\d.*\n?', '', value, flags=re.MULTILINE)

with open('../../data/raw/nested_data.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

print("Nested JSON data created and stored in nested_data.json")

Downloaded content for errors.py in errors
Downloaded content for oidc.py in oidc
Downloaded content for sign.py in sign
Downloaded content for transparency.py in transparency
Downloaded content for verify/models.py in verify_models
Downloaded content for verify/policy.py in verify_policy
Downloaded content for verify/verifier.py in verify_verifier
Nested JSON data created and stored in nested_data.json
