In [1]:
from collections import defaultdict
from tree_sitter import Language, Parser
from pprint import pprint
import json

In [2]:
Language.build_library(
    # Store the library in the `build` directory
    "build/my-languages.so",
    # Include one or more languages
    ["tree-sitter-python"],
)

False

In [3]:
PY_LANGUAGE = Language("build/my-languages.so", "python")

In [4]:
parser = Parser()
parser.set_language(PY_LANGUAGE)

In [5]:
# load nested data
dataset_path = "../../data/raw/nested_data.json"
with open(dataset_path, 'r') as f:
		data = json.load(f)

In [6]:
data.keys()

dict_keys(['errors', 'oidc', 'sign', 'transparency', 'verify_models', 'verify_policy', 'verify_verifier'])

In [7]:
data["errors"].keys()

dict_keys(['markdown', 'code'])

In [8]:
files = {
    "errors": "errors.py",
    "oidc": "oidc.py",
    "sign": "sign.py",
    "transparency": "transparency.py",
    "verify_models": "verify/models.py",
    "verify_policy": "verify/policy.py",
    "verify_verifier": "verify/verifier.py"
}

### Tree

In [9]:
def extract_functions_classes_imports(node):
    if node.type == 'import_statement':
        import_text = node.text.strip() if hasattr(node, 'text') else ""
        return {'imports': [import_text]}

    elif node.type == 'function_definition':
        function_text = node.text.strip() if hasattr(node, 'text') else ""
        return {'functions': [function_text]}

    elif node.type == 'class_definition':
        class_text = node.text.strip() if hasattr(node, 'text') else ""
        class_body = []
        for child in node.children:
            if child.type == 'function_definition':
                method_text = child.text.strip() if hasattr(child, 'text') else ""
                class_body.append(method_text)

        return {'classes': {class_text: class_body}}

    # Check if it's a documentation string for the entire module
    elif node.type == 'expression_statement' and node.child_count > 0 and node.children[0].type == 'string':
        doc_string = node.children[0].text.strip() if hasattr(node.children[0], 'text') else ""
        return {'documentation': [doc_string]}

    # If it's not an import, function, or class, and not a documentation string, consider it as 'other'
    else:
        other_text = node.text.strip() if hasattr(node, 'text') else ""
        return {'other': [other_text]}


In [10]:
def categorize_code(root_node):
    result = {'imports': [], 'functions': [], 'classes': defaultdict(list), 'documentation': [], 'other': []}
    for node in root_node.children:
        category_result = extract_functions_classes_imports(node)
        for category, items in category_result.items():
            if category == 'classes':
                for class_name, class_body in items.items():
                    result[category][class_name].extend(class_body)
            else:
                result[category].extend(items)

    return result

In [11]:
for k in data.keys():
    code_chunk = data[k]["code"][0][files[k]]
    tree = parser.parse(bytes(code_chunk, "utf8"))
    root_node = tree.root_node
    k_categorized_code = categorize_code(root_node)
    data[k]['code_chunks'] = k_categorized_code

In [12]:
data.keys()

dict_keys(['errors', 'oidc', 'sign', 'transparency', 'verify_models', 'verify_policy', 'verify_verifier'])

In [13]:
data["errors"].keys()

dict_keys(['markdown', 'code', 'code_chunks'])

In [14]:
data["errors"]["code_chunks"]

{'imports': [b'import sys'],
 'functions': [],
 'classes': defaultdict(list,
             {b'class Error(Exception):\n    """Base sigstore exception type. Defines helpers for diagnostics."""\n\n    def diagnostics(self) -> str:\n        """Returns human-friendly error information."""\n\n        return """An issue occurred."""\n\n    def print_and_exit(self, raise_error: bool = False) -> None:\n        """Prints all relevant error information to stderr and exits."""\n\n        remind_verbose = (\n            "Raising original exception:"\n            if raise_error\n            else "For detailed error information, run sigstore with the `--verbose` flag."\n        )\n\n        print(f"{self.diagnostics()}\\n{remind_verbose}", file=sys.stderr)\n\n        if raise_error:\n            # don\'t want "during handling another exception"\n            self.__suppress_context__ = True\n            raise self\n\n        sys.exit(1)': [],
              b'class NetworkError(Error):\n    """Raised w

In [15]:
data["oidc"]["code_chunks"]

{'imports': [b'import logging',
  b'import sys',
  b'import time',
  b'import urllib.parse',
  b'import webbrowser',
  b'import id',
  b'import jwt',
  b'import requests'],
 'functions': [b'def detect_credential() -> Optional[str]:\n    """Calls `id.detect_credential`, but wraps exceptions with our own exception type."""\n    try:\n        return cast(Optional[str], id.detect_credential(_DEFAULT_AUDIENCE))\n    except id.IdentityError as exc:\n        IdentityError.raise_from_id(exc)'],
 'classes': defaultdict(list,
             {b'class _OpenIDConfiguration(BaseModel):\n    """\n    Represents a (subset) of the fields provided by an OpenID Connect provider\'s\n    `.well-known/openid-configuration` response, as defined by OpenID Connect Discovery.\n\n    See: <https://openid.net/specs/openid-connect-discovery-1_0.html>\n    """\n\n    authorization_endpoint: StrictStr\n    token_endpoint: StrictStr': [],
              b'class ExpiredIdentity(Exception):\n    """An error raised when an