In [1]:
from collections import defaultdict
from tree_sitter import Language, Parser
from pprint import pprint
import json

In [2]:
Language.build_library(
    # Store the library in the `build` directory
    "build/my-languages.so",
    # Include one or more languages
    ["tree-sitter-python"],
)

False

In [3]:
PY_LANGUAGE = Language("build/my-languages.so", "python")

In [4]:
parser = Parser()
parser.set_language(PY_LANGUAGE)

In [5]:
# load nested data
dataset_path = "../../data/raw/nested_data.json"
with open(dataset_path, 'r') as f:
		data = json.load(f)

In [6]:
code_snippet = data["oidc"]["code"][0]['oidc.py']

In [7]:
tree = parser.parse(bytes(code_snippet, "utf8"))
root_node = tree.root_node

In [8]:
def extract_functions_classes_imports(node):
    if node.type == 'import_statement':
        import_text = node.text.strip() if hasattr(node, 'text') else ""
        return {'imports': [import_text]}

    elif node.type == 'function_definition':
        function_text = node.text.strip() if hasattr(node, 'text') else ""
        return {'functions': [function_text]}

    elif node.type == 'class_definition':
        class_text = node.text.strip() if hasattr(node, 'text') else ""
        class_body = []
        for child in node.children:
            if child.type == 'function_definition':
                method_text = child.text.strip() if hasattr(child, 'text') else ""
                class_body.append(method_text)

        return {'classes': {class_text: class_body}}

    # Check if it's a documentation string for the entire module
    elif node.type == 'expression_statement' and node.child_count > 0 and node.children[0].type == 'string':
        doc_string = node.children[0].text.strip() if hasattr(node.children[0], 'text') else ""
        return {'documentation': [doc_string]}

    # If it's not an import, function, or class, and not a documentation string, consider it as 'other'
    else:
        other_text = node.text.strip() if hasattr(node, 'text') else ""
        return {'other': [other_text]}


In [9]:
def categorize_code(root_node):
    result = {'imports': [], 'functions': [], 'classes': defaultdict(list), 'documentation': [], 'other': []}
    for node in root_node.children:
        category_result = extract_functions_classes_imports(node)
        for category, items in category_result.items():
            if category == 'classes':
                for class_name, class_body in items.items():
                    result[category][class_name].extend(class_body)
            else:
                result[category].extend(items)

    return result

In [10]:
result = categorize_code(root_node)

In [11]:
result.keys()

dict_keys(['imports', 'functions', 'classes', 'documentation', 'other'])

In [12]:
print("Imports:", result['imports'])

Imports: [b'import logging', b'import sys', b'import time', b'import urllib.parse', b'import webbrowser', b'import id', b'import jwt', b'import requests']


In [13]:
print("Functions:", result['functions'])

Functions: [b'def detect_credential() -> Optional[str]:\n    """Calls `id.detect_credential`, but wraps exceptions with our own exception type."""\n    try:\n        return cast(Optional[str], id.detect_credential(_DEFAULT_AUDIENCE))\n    except id.IdentityError as exc:\n        IdentityError.raise_from_id(exc)']


In [14]:
print("Classes:")
for class_name, class_body in result['classes'].items():
    print(f"  {class_name}: {class_body}")

Classes:
  b'class _OpenIDConfiguration(BaseModel):\n    """\n    Represents a (subset) of the fields provided by an OpenID Connect provider\'s\n    `.well-known/openid-configuration` response, as defined by OpenID Connect Discovery.\n\n    See: <https://openid.net/specs/openid-connect-discovery-1_0.html>\n    """\n\n    authorization_endpoint: StrictStr\n    token_endpoint: StrictStr': []
  b'class ExpiredIdentity(Exception):\n    """An error raised when an identity token is expired."""': []
  b'class IdentityToken:\n    """\n    An OIDC "identity", corresponding to an underlying OIDC token with\n    a sensible subject, issuer, and audience for Sigstore purposes.\n    """\n\n    def __init__(self, raw_token: str) -> None:\n        """\n        Create a new `IdentityToken` from the given OIDC token.\n        """\n\n        self._raw_token = raw_token\n\n        # NOTE: The lack of verification here is intentional, and is part of\n        # Sigstore\'s verification model: clients like s

In [15]:
print("Documentation:", result['documentation'])

Documentation: [b'"""\nAPI for retrieving OIDC tokens.\n"""']


In [16]:
print("Other:", result['other'])

Other: [b'# Copyright 2022 The Sigstore Authors', b'#', b'# Licensed under the Apache License, Version 2.0 (the "License");', b'# you may not use this file except in compliance with the License.', b'# You may obtain a copy of the License at', b'#', b'#      http://www.apache.org/licenses/LICENSE-2.0', b'#', b'# Unless required by applicable law or agreed to in writing, software', b'# distributed under the License is distributed on an "AS IS" BASIS,', b'# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.', b'# See the License for the specific language governing permissions and', b'# limitations under the License.', b'from __future__ import annotations', b'from datetime import datetime, timezone', b'from typing import NoReturn, Optional, cast', b'from pydantic import BaseModel, StrictStr', b'from sigstore.errors import Error, NetworkError', b'DEFAULT_OAUTH_ISSUER_URL = "https://oauth2.sigstore.dev/auth"', b'STAGING_OAUTH_ISSUER_URL = "https://oauth2.sigstage.dev/au