In [1]:
from llmcoder.analyze.SignatureAnalyzer import SignatureAnalyzer

In [2]:
analyzer = SignatureAnalyzer()

In [3]:
code = """import numpy as np
from pandas import DataFrame
import matplotlib

np.random.randn(1000, 2)
df = DataFrame()
np.array(1, 2, 3)

matplotlib.pyplot.plot(np.random.randn(1000, 2))"""

In [4]:
analyzer.get_signature_and_doc("get_signature_test.py", "randn")

import_aliases={'np': 'numpy', 'matplotlib': 'matplotlib'}
direct_imports={'DataFrame': 'pandas'}
function_calls=[('np', 'random.randn')]
module_alias='np' func_name='random.randn'


[{'name': 'random.randn',
  'signature': None,
  'doc': 'randn(d0, d1, ..., dn)\n\nReturn a sample (or samples) from the "standard normal" distribution.\n\n.. note::\n    This is a convenience function for users porting code from Matlab,\n    and wraps `standard_normal`. That function takes a\n    tuple to specify the size of the output, which is consistent with\n    other NumPy functions like `numpy.zeros` and `numpy.ones`.\n\n.. note::\n    New code should use the\n    `~numpy.random.Generator.standard_normal`\n    method of a `~numpy.random.Generator` instance instead;\n    please see the :ref:`random-quick-start`.\n\nIf positive int_like arguments are provided, `randn` generates an array\nof shape ``(d0, d1, ..., dn)``, filled\nwith random floats sampled from a univariate "normal" (Gaussian)\ndistribution of mean 0 and variance 1. A single float randomly sampled\nfrom the distribution is returned if no argument is provided.\n\nParameters\n----------\nd0, d1, ..., dn : int, optional

In [103]:
import ast
import inspect
import os
import re
# import builtins
import tempfile
from collections import namedtuple
from typing import Generator


Import = namedtuple("Import", ["module", "name", "alias"])

def get_imports(path: str, query: str | list[str] | None = None) -> Generator:
    if isinstance(query, str):
        query = [query]
    elif isinstance(query, list) and not query:
        print("Empty query specified.")
        return

    with open(path) as fh:
        root = ast.parse(fh.read(), path)

    for node in ast.walk(root):
        if isinstance(node, ast.Import):
            for alias in node.names:
                module = alias.name
                module_name = alias.asname if alias.asname else alias.name
                if query:
                    for q in query:
                        if q.startswith(module_name + ".") or q == module_name:
                            yield Import([module], q.split('.')[-1], module_name)
                else:
                    yield Import([module], None, module_name)
        elif isinstance(node, ast.ImportFrom):
            if node.module is None:
                continue
            for alias in node.names:
                module = node.module.split('.')
                name = alias.name
                asname = alias.asname if alias.asname else name
                if query:
                    if name in query or asname in query:
                        yield Import(module, name, asname)
                else:
                    yield Import(module, name, asname)

In [104]:
for i in get_imports("get_signature_test.py"):
    print(i)

Import(module=['numpy'], name=None, alias='np')
Import(module=['pandas'], name='DataFrame', alias='DataFrame')
Import(module=['matplotlib'], name=None, alias='matplotlib')


In [105]:
def get_signature_and_doc(path: str, query: str | list[str] | None) -> list[dict]:
        """
        Get the signature and documentation of a function or class.

        Parameters
        ----------
        path : str
            Path to the Python file. Can be temporary.
        query : str | list[str] | None
            The query string to search for. E.g. a function name or a class name.

        Returns
        -------
        list[dict]
            A list of dictionaries containing the signature and documentation of every match to the query.
        """
        signature_and_doc = []


        for imp in get_imports(path, query):
            try:
                module_path = '.'.join(imp.module) if imp.module else imp.name[0]

                if imp.name:  # Specific class/function is imported
                    obj = __import__(module_path, fromlist=[imp.name[-1]])
                    attr = getattr(obj, imp.name[-1], None)
                else:  # Entire module is imported
                    obj = __import__(module_path)
                    attr = obj

                # Check if the attr is callable or has a signature
                if callable(attr):
                    sig = inspect.signature(attr)
                    doc = inspect.getdoc(attr)
                    signature_and_doc.append({
                        "name": imp.alias if imp.alias else imp.name[-1] if imp.name else module_path,
                        "signature": str(sig),
                        "doc": doc
                    })
                else:
                    print(f"No callable attribute found for {module_path}")

            except (ImportError, AttributeError, TypeError) as e:
                print(f"Error handling {module_path}: {e}")
                continue

        return signature_and_doc

In [106]:
for i in get_signature_and_doc("get_signature_test.py", None):
    print(i)

No callable attribute found for numpy
No callable attribute found for pandas
No callable attribute found for matplotlib


In [107]:
def find_function_calls(code: str):
    """
    Parse the Python code and find all function or class calls.
    """
    root = ast.parse(code)
    function_calls = []

    for node in ast.walk(root):
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
            # This is a function or class call, like es.ElasticsearchStore()
            module_alias = node.func.value.id
            func_name = node.func.attr
            function_calls.append((module_alias, func_name))

    return function_calls

In [108]:
find_function_calls("es.ElasticsearchStore()")

[('es', 'ElasticsearchStore')]

In [109]:
import inspect
import ast

# Assume get_imports function is defined elsewhere

def find_function_calls(code: str, query: str | list[str] | None):
    root = ast.parse(code)
    function_calls = []

    if isinstance(query, str):
        query = [query]

    for node in ast.walk(root):
        if isinstance(node, ast.Call):
            if isinstance(node.func, ast.Attribute):
                # Handle nested attributes
                attribute_chain = []
                current = node.func
                while isinstance(current, ast.Attribute):
                    attribute_chain.append(current.attr)
                    current = current.value
                if isinstance(current, ast.Name):
                    attribute_chain.append(current.id)
                attribute_chain.reverse()
                module_alias = attribute_chain[0]
                func_name = '.'.join(attribute_chain[1:])
            elif isinstance(node.func, ast.Name):
                # Direct function call
                module_alias = None
                func_name = node.func.id
            else:
                continue

            if not query or func_name in query or '.'.join(attribute_chain) in query:
                function_calls.append((module_alias, func_name))

    return function_calls

def get_signature_and_doc(path: str, query: str | list[str] | None = None) -> list[dict]:
    signature_and_doc = []
    import_aliases = {}

    with open(path) as file:
        code = file.read()

    # Get all imports
    for imp in get_imports(path):
        full_module = '.'.join(imp.module) if imp.module else imp.name[0]
        alias = imp.alias if imp.alias else imp.name[-1] if imp.name else full_module
        import_aliases[alias] = full_module

    # Find all function calls that match the query
    function_calls = find_function_calls(code, query)

    print(f"{function_calls=}")

    for module_alias, func_name in function_calls:
        try:
            if module_alias and module_alias in import_aliases:
                module_path = import_aliases[module_alias]
                parts = func_name.split('.')
                module = __import__(module_path, fromlist=[parts[0]])
                attr = module
                for part in parts:
                    attr = getattr(attr, part, None)
            else:
                attr = globals().get(func_name, None)

            if attr and callable(attr):
                try:
                    sig = inspect.signature(attr)
                    doc = inspect.getdoc(attr)
                    signature_and_doc.append({
                        "name": func_name,
                        "signature": str(sig),
                        "doc": doc
                    })
                except ValueError:
                    signature_and_doc.append({
                        "name": func_name,
                        "signature": "Not available for built-in functions",
                        "doc": inspect.getdoc(attr)
                    })
            else:
                print(f"No callable attribute {func_name} found")

        except (ImportError, AttributeError) as e:
            print(f"Error importing {func_name}: {e}")

    return signature_and_doc


In [110]:
def get_signature_and_doc(path: str, query: str | list[str] | None = None) -> list[dict]:
    signature_and_doc = []
    import_aliases = {}
    direct_imports = {}  # Store direct imports

    with open(path) as file:
        code = file.read()

    # Get all imports
    for imp in get_imports(path):
        full_module = '.'.join(imp.module) if imp.module else imp.name[0]
        if imp.module and imp.name:  # Correctly identify direct imports
            direct_imports[imp.name] = full_module
        else:
            alias = imp.alias if imp.alias else imp.name[-1] if imp.name else full_module
            import_aliases[alias] = full_module

    print(f"{import_aliases=}")
    print(f"{direct_imports=}")

    # Find all function calls that match the query
    function_calls = find_function_calls(code, query)

    print(f"{function_calls=}")

    # Match the function calls to the imports
    matched_function_calls = []
    for module_alias, func_name in function_calls:
        if module_alias and module_alias in import_aliases:
            matched_function_calls.append((module_alias, func_name))
        elif func_name in direct_imports:
            matched_function_calls.append((direct_imports[func_name], func_name))
        else:
            print(f"No import found for {func_name}")

    for module_alias, func_name in matched_function_calls:
        print(f"{module_alias=} {func_name=}")
        try:
            if module_alias and module_alias in import_aliases:
                module_path = import_aliases[module_alias]
                parts = func_name.split('.')
                module = __import__(module_path, fromlist=[parts[0]])
                attr = module
                for part in parts:
                    attr = getattr(attr, part, None)
            elif func_name in direct_imports:  # Handle direct imports
                module_name = direct_imports[func_name]
                module = __import__(module_name, fromlist=[func_name])
                attr = getattr(module, func_name, None)
            else:
                attr = None

            if attr and callable(attr):
                try:
                    sig = inspect.signature(attr)
                    doc = inspect.getdoc(attr)
                    signature_and_doc.append({
                        "name": func_name,
                        "signature": str(sig),
                        "doc": doc
                    })
                except ValueError:
                    signature_and_doc.append({
                        "name": func_name,
                        "signature": None,
                        "doc": inspect.getdoc(attr)
                    })
            else:
                print(f"No callable attribute {func_name} found")

        except (ImportError, AttributeError) as e:
            print(f"Error importing {func_name}: {e}")

    return signature_and_doc


In [111]:
get_signature_and_doc("get_signature_test.py")

import_aliases={'np': 'numpy', 'matplotlib': 'matplotlib'}
direct_imports={'DataFrame': 'pandas'}
function_calls=[('np', 'random.randn'), (None, 'DataFrame'), ('np', 'array'), ('matplotlib', 'pyplot.plot'), ('np', 'random.randn')]
module_alias='np' func_name='random.randn'
module_alias='pandas' func_name='DataFrame'
module_alias='np' func_name='array'
module_alias='matplotlib' func_name='pyplot.plot'
module_alias='np' func_name='random.randn'


[{'name': 'random.randn',
  'signature': None,
  'doc': 'randn(d0, d1, ..., dn)\n\nReturn a sample (or samples) from the "standard normal" distribution.\n\n.. note::\n    This is a convenience function for users porting code from Matlab,\n    and wraps `standard_normal`. That function takes a\n    tuple to specify the size of the output, which is consistent with\n    other NumPy functions like `numpy.zeros` and `numpy.ones`.\n\n.. note::\n    New code should use the\n    `~numpy.random.Generator.standard_normal`\n    method of a `~numpy.random.Generator` instance instead;\n    please see the :ref:`random-quick-start`.\n\nIf positive int_like arguments are provided, `randn` generates an array\nof shape ``(d0, d1, ..., dn)``, filled\nwith random floats sampled from a univariate "normal" (Gaussian)\ndistribution of mean 0 and variance 1. A single float randomly sampled\nfrom the distribution is returned if no argument is provided.\n\nParameters\n----------\nd0, d1, ..., dn : int, optional

In [7]:
import ast
import inspect

In [8]:
import ast
from collections import namedtuple

Import = namedtuple("Import", ["module", "name", "alias"])

def get_imports(path):
    with open(path) as fh:       
        root = ast.parse(fh.read(), path)

    for node in ast.walk(root):
        if isinstance(node, ast.Import):
            module = []
        elif isinstance(node, ast.ImportFrom): 
            module = node.module.split('.')
        else:
            continue

        for n in node.names:
            yield Import(module, n.name.split('.'), n.asname)

In [9]:
for i in get_imports("get_signature_test.py"):
    print(i)

Import(module=['langchain', 'vectorstores', 'elasticsearch'], name=['ElasticsearchStore'], alias=None)
Import(module=[], name=['numpy'], alias='np')
Import(module=[], name=['json'], alias=None)
Import(module=['pandas'], name=['DataFrame'], alias=None)
Import(module=[], name=['matplotlib', 'pyplot'], alias='plt')


In [11]:
import inspect

for imp in get_imports("get_signature_test.py"):
    if imp.module:
        module = '.'.join(imp.module)
    else:
        module = imp.name[0]
    name = imp.alias if imp.alias else imp.name[0]
    try:
        obj = __import__(module, fromlist=[name])
        obj = getattr(obj, name)
        sig = inspect.signature(obj)
        doc = inspect.getdoc(obj)
        print(f"Signature of {name}: {sig}")
        print(f"Documentation of {name}: {doc[:1000]}")
    except (ImportError, AttributeError):
        print(f"Cannot get signature and documentation of {name}")

Signature of ElasticsearchStore: (index_name: str, *, embedding: Optional[langchain_core.embeddings.Embeddings] = None, es_connection: Optional[ForwardRef('Elasticsearch')] = None, es_url: Optional[str] = None, es_cloud_id: Optional[str] = None, es_user: Optional[str] = None, es_api_key: Optional[str] = None, es_password: Optional[str] = None, vector_query_field: str = 'vector', query_field: str = 'text', distance_strategy: Optional[Literal[<DistanceStrategy.COSINE: 'COSINE'>, <DistanceStrategy.DOT_PRODUCT: 'DOT_PRODUCT'>, <DistanceStrategy.EUCLIDEAN_DISTANCE: 'EUCLIDEAN_DISTANCE'>]] = None, strategy: langchain.vectorstores.elasticsearch.BaseRetrievalStrategy = <langchain.vectorstores.elasticsearch.ApproxRetrievalStrategy object at 0x7f03dbd0e790>)
Documentation of ElasticsearchStore: `Elasticsearch` vector store.

Example:
    .. code-block:: python

        from langchain.vectorstores import ElasticsearchStore
        from langchain.embeddings.openai import OpenAIEmbeddings

        

In [10]:
import ast

def extract_import_usages(code):
    # Parse the code into an AST
    tree = ast.parse(code)

    # Mapping of aliases to module names
    import_map = {}

    # Function to handle import and import from statements
    def handle_import(node):
        if isinstance(node, ast.Import):
            for name in node.names:
                import_map[name.asname or name.name] = name.name
        elif isinstance(node, ast.ImportFrom):
            module = node.module
            for name in node.names:
                full_name = f"{module}.{name.name}"
                import_map[name.asname or name.name] = full_name

    # Walk the AST and populate the import map
    for node in ast.walk(tree):
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            handle_import(node)

    # Function to extract function or class usages
    def extract_usages(node):
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
            module_name = node.func.value.id
            if module_name in import_map:
                func_name = node.func.attr
                return f"{import_map[module_name]}.{func_name}"
        elif isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
            name = node.func.id
            if name in import_map:
                return import_map[name]
        return None

    # Extract and return all usages
    usages = [extract_usages(node) for node in ast.walk(tree)]
    return [usage for usage in usages if usage is not None]

In [11]:
import builtins

def extract_builtin_usages(code):
    # Parse the code into an AST
    tree = ast.parse(code)

    # List of all built-in function names
    builtin_functions = [func for func in dir(builtins) if callable(getattr(builtins, func))]

    # Function to extract built-in function usages
    def extract_usages(node):
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
            if node.func.id in builtin_functions:
                return node.func.id
        return None

    # Extract and return all built-in function usages
    usages = [extract_usages(node) for node in ast.walk(tree)]
    return ["builtins." + usage for usage in usages if usage is not None]


In [12]:
usages = set(extract_import_usages(code))
print(usages)

set()


In [13]:
builtin_usages = set(extract_builtin_usages(code))
print(builtin_usages)

set()


In [80]:
all_usages = usages.union(builtin_usages)

In [81]:
import inspect
import importlib

def get_signature(usage):
    module_name, _, attr_name = usage.rpartition('.')
    try:
        module = importlib.import_module(module_name)
        attr = getattr(module, attr_name)
        if callable(attr):
            try:
                return inspect.signature(attr)
            except ValueError:
                # For built-in functions, return the docstring as a fallback
                return attr.__doc__
        else:
            return f"{attr_name} is not callable"
    except ImportError:
        return f"Module {module_name} not found"
    except AttributeError:
        return f"{attr_name} not found in {module_name}"

In [82]:
for usage in all_usages:
    print(f"{usage}: {get_signature(usage)}")

llmcoder.utils.get_data_dir: (*args: str, create: bool = False) -> str
pandas.DataFrame: (data=None, index: 'Axes | None' = None, columns: 'Axes | None' = None, dtype: 'Dtype | None' = None, copy: 'bool | None' = None) -> 'None'
builtins.print: (*args, sep=' ', end='\n', file=None, flush=False)
numpy.array: array(object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0,
          like=None)

    Create an array.

    Parameters
    ----------
    object : array_like
        An array, any object exposing the array interface, an object whose
        ``__array__`` method returns an array, or any (nested) sequence.
        If object is a scalar, a 0-dimensional array containing object is
        returned.
    dtype : data-type, optional
        The desired data-type for the array. If not given, NumPy will try to use
        a default ``dtype`` that can represent the values (by applying promotion
        rules when necessary.)
    copy : bool, optional
        If true (default), th

In [83]:
def extract_function_usages_with_args(code):
    tree = ast.parse(code)
    function_usages = []

    for node in ast.walk(tree):
        if isinstance(node, ast.Call):
            function_name = ''
            if isinstance(node.func, ast.Attribute):
                function_name = node.func.attr
            elif isinstance(node.func, ast.Name):
                function_name = node.func.id
            
            args = [type(arg).__name__ for arg in node.args]  # Extract argument types
            kwargs = [type(value).__name__ for key, value in zip(node.keywords, node.keywords)]
            function_usages.append((function_name, args, kwargs))

    return function_usages

In [84]:
usages_with_args = extract_function_usages_with_args(code)
print(usages_with_args)

[('print', ['Call'], []), ('DataFrame', ['Dict'], []), ('DataFrame', ['Constant'], []), ('print', ['Call'], []), ('array', ['List'], []), ('get_data_dir', ['Constant'], [])]


In [85]:
def match_arg_types(usage_args, sig_params):
    """
    Match the types of arguments in usage against the expected types in the signature.
    For simplification, we'll just compare the count and presence of 'args' and 'kwargs'.
    """
    positional_args = [param for param in sig_params.values() if param.kind in [param.POSITIONAL_OR_KEYWORD, param.POSITIONAL_ONLY]]
    keyword_args = [param for param in sig_params.values() if param.kind == param.KEYWORD_ONLY]
    
    # Check positional arguments
    if len(usage_args) > len(positional_args):
        return False  # Too many arguments

    # For simplicity, this version won't delve into deeper type matching
    # It assumes that if the count is right, the types are matched.
    # In a more advanced implementation, you could attempt to match the types more accurately.

    # Check keyword arguments - this implementation assumes correct usage of keyword arguments
    # A more advanced implementation would check the actual keyword names and types

    return True

In [86]:
def usage_matches_signature(usage, signature):
    func_name, args, kwargs = usage
    sig_params = signature.parameters

    # Count the number of required positional arguments
    required_args_count = sum(1 for param in sig_params.values() if param.default is param.empty and param.kind == param.POSITIONAL_OR_KEYWORD)

    # Check if the number of arguments matches
    if len(args) < required_args_count:
        return False

    # Check for excess arguments if the function does not accept variadic arguments
    if not any(param.kind == param.VAR_POSITIONAL for param in sig_params.values()):
        if len(args) > len(sig_params):
            return False
        
    if not match_arg_types(args, sig_params):
        return False

    # Further type checks can be added here, based on the types in 'args' and the expected types from 'signature'
    # However, this requires a more complex implementation and type inference system

    return True

In [87]:
all_usages

{'builtins.print',
 'llmcoder.utils.get_data_dir',
 'numpy.array',
 'pandas.DataFrame'}

In [88]:
usages_with_args

[('print', ['Call'], []),
 ('DataFrame', ['Dict'], []),
 ('DataFrame', ['Constant'], []),
 ('print', ['Call'], []),
 ('array', ['List'], []),
 ('get_data_dir', ['Constant'], [])]

In [89]:
def create_usage_signature_list(unique_usages, usages_with_args, get_signature_func):
    # Map short function names to their fully qualified names
    name_mapping = {name.split('.')[-1]: name for name in unique_usages}

    usage_signature_list = []

    for usage in usages_with_args:
        func_name, args, kwargs = usage
        qualified_name = name_mapping.get(func_name, func_name)

        signature = get_signature_func(qualified_name)
        usage_signature_list.append({
            "usage": usage,
            "signature": signature
        })

    return usage_signature_list

In [90]:
usage_signature_list = create_usage_signature_list(all_usages, usages_with_args, get_signature)
for usage_signature in usage_signature_list:
    print(usage_signature)

{'usage': ('print', ['Call'], []), 'signature': <Signature (*args, sep=' ', end='\n', file=None, flush=False)>}
{'usage': ('DataFrame', ['Dict'], []), 'signature': <Signature (data=None, index: 'Axes | None' = None, columns: 'Axes | None' = None, dtype: 'Dtype | None' = None, copy: 'bool | None' = None) -> 'None'>}
{'usage': ('DataFrame', ['Constant'], []), 'signature': <Signature (data=None, index: 'Axes | None' = None, columns: 'Axes | None' = None, dtype: 'Dtype | None' = None, copy: 'bool | None' = None) -> 'None'>}
{'usage': ('print', ['Call'], []), 'signature': <Signature (*args, sep=' ', end='\n', file=None, flush=False)>}
{'usage': ('get_data_dir', ['Constant'], []), 'signature': <Signature (*args: str, create: bool = False) -> str>}


In [91]:
# For each usage, check if the arguments match the signature
for usage_signature in usage_signature_list:
    usage = usage_signature["usage"]
    signature = usage_signature["signature"]
    if type(signature) == str:
        print(f"Signature for usage {usage} could not be found")
        continue
    if usage_matches_signature(usage, signature):
        print(f"Usage {usage} matches signature")
    else:
        print(f"Usage {usage} does not match signature")

Usage ('print', ['Call'], []) does not match signature
Usage ('DataFrame', ['Dict'], []) matches signature
Usage ('DataFrame', ['Constant'], []) matches signature
Usage ('print', ['Call'], []) does not match signature
Signature for usage ('array', ['List'], []) could not be found
Usage ('get_data_dir', ['Constant'], []) does not match signature
