In [74]:
import ast
import inspect

In [75]:
code = """
import numpy as np
from pandas import DataFrame

from llmcoder.utils import get_data_dir

print(np.array([1,2,3]))
df = DataFrame({'a': [1,2,3], 'b': [4,5,6]})
illegal_df = DataFrame('not_a_dict')

print(get_data_dir(1))
"""

In [76]:
import ast

def extract_import_usages(code):
    # Parse the code into an AST
    tree = ast.parse(code)

    # Mapping of aliases to module names
    import_map = {}

    # Function to handle import and import from statements
    def handle_import(node):
        if isinstance(node, ast.Import):
            for name in node.names:
                import_map[name.asname or name.name] = name.name
        elif isinstance(node, ast.ImportFrom):
            module = node.module
            for name in node.names:
                full_name = f"{module}.{name.name}"
                import_map[name.asname or name.name] = full_name

    # Walk the AST and populate the import map
    for node in ast.walk(tree):
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            handle_import(node)

    # Function to extract function or class usages
    def extract_usages(node):
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
            module_name = node.func.value.id
            if module_name in import_map:
                func_name = node.func.attr
                return f"{import_map[module_name]}.{func_name}"
        elif isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
            name = node.func.id
            if name in import_map:
                return import_map[name]
        return None

    # Extract and return all usages
    usages = [extract_usages(node) for node in ast.walk(tree)]
    return [usage for usage in usages if usage is not None]

In [77]:
import builtins

def extract_builtin_usages(code):
    # Parse the code into an AST
    tree = ast.parse(code)

    # List of all built-in function names
    builtin_functions = [func for func in dir(builtins) if callable(getattr(builtins, func))]

    # Function to extract built-in function usages
    def extract_usages(node):
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
            if node.func.id in builtin_functions:
                return node.func.id
        return None

    # Extract and return all built-in function usages
    usages = [extract_usages(node) for node in ast.walk(tree)]
    return ["builtins." + usage for usage in usages if usage is not None]


In [78]:
usages = set(extract_import_usages(code))
print(usages)

{'llmcoder.utils.get_data_dir', 'pandas.DataFrame', 'numpy.array'}


In [79]:
builtin_usages = set(extract_builtin_usages(code))
print(builtin_usages)

{'builtins.print'}


In [80]:
all_usages = usages.union(builtin_usages)

In [81]:
import inspect
import importlib

def get_signature(usage):
    module_name, _, attr_name = usage.rpartition('.')
    try:
        module = importlib.import_module(module_name)
        attr = getattr(module, attr_name)
        if callable(attr):
            try:
                return inspect.signature(attr)
            except ValueError:
                # For built-in functions, return the docstring as a fallback
                return attr.__doc__
        else:
            return f"{attr_name} is not callable"
    except ImportError:
        return f"Module {module_name} not found"
    except AttributeError:
        return f"{attr_name} not found in {module_name}"

In [82]:
for usage in all_usages:
    print(f"{usage}: {get_signature(usage)}")

llmcoder.utils.get_data_dir: (*args: str, create: bool = False) -> str
pandas.DataFrame: (data=None, index: 'Axes | None' = None, columns: 'Axes | None' = None, dtype: 'Dtype | None' = None, copy: 'bool | None' = None) -> 'None'
builtins.print: (*args, sep=' ', end='\n', file=None, flush=False)
numpy.array: array(object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0,
          like=None)

    Create an array.

    Parameters
    ----------
    object : array_like
        An array, any object exposing the array interface, an object whose
        ``__array__`` method returns an array, or any (nested) sequence.
        If object is a scalar, a 0-dimensional array containing object is
        returned.
    dtype : data-type, optional
        The desired data-type for the array. If not given, NumPy will try to use
        a default ``dtype`` that can represent the values (by applying promotion
        rules when necessary.)
    copy : bool, optional
        If true (default), th

In [83]:
def extract_function_usages_with_args(code):
    tree = ast.parse(code)
    function_usages = []

    for node in ast.walk(tree):
        if isinstance(node, ast.Call):
            function_name = ''
            if isinstance(node.func, ast.Attribute):
                function_name = node.func.attr
            elif isinstance(node.func, ast.Name):
                function_name = node.func.id
            
            args = [type(arg).__name__ for arg in node.args]  # Extract argument types
            kwargs = [type(value).__name__ for key, value in zip(node.keywords, node.keywords)]
            function_usages.append((function_name, args, kwargs))

    return function_usages

In [84]:
usages_with_args = extract_function_usages_with_args(code)
print(usages_with_args)

[('print', ['Call'], []), ('DataFrame', ['Dict'], []), ('DataFrame', ['Constant'], []), ('print', ['Call'], []), ('array', ['List'], []), ('get_data_dir', ['Constant'], [])]


In [85]:
def match_arg_types(usage_args, sig_params):
    """
    Match the types of arguments in usage against the expected types in the signature.
    For simplification, we'll just compare the count and presence of 'args' and 'kwargs'.
    """
    positional_args = [param for param in sig_params.values() if param.kind in [param.POSITIONAL_OR_KEYWORD, param.POSITIONAL_ONLY]]
    keyword_args = [param for param in sig_params.values() if param.kind == param.KEYWORD_ONLY]
    
    # Check positional arguments
    if len(usage_args) > len(positional_args):
        return False  # Too many arguments

    # For simplicity, this version won't delve into deeper type matching
    # It assumes that if the count is right, the types are matched.
    # In a more advanced implementation, you could attempt to match the types more accurately.

    # Check keyword arguments - this implementation assumes correct usage of keyword arguments
    # A more advanced implementation would check the actual keyword names and types

    return True

In [86]:
def usage_matches_signature(usage, signature):
    func_name, args, kwargs = usage
    sig_params = signature.parameters

    # Count the number of required positional arguments
    required_args_count = sum(1 for param in sig_params.values() if param.default is param.empty and param.kind == param.POSITIONAL_OR_KEYWORD)

    # Check if the number of arguments matches
    if len(args) < required_args_count:
        return False

    # Check for excess arguments if the function does not accept variadic arguments
    if not any(param.kind == param.VAR_POSITIONAL for param in sig_params.values()):
        if len(args) > len(sig_params):
            return False
        
    if not match_arg_types(args, sig_params):
        return False

    # Further type checks can be added here, based on the types in 'args' and the expected types from 'signature'
    # However, this requires a more complex implementation and type inference system

    return True

In [87]:
all_usages

{'builtins.print',
 'llmcoder.utils.get_data_dir',
 'numpy.array',
 'pandas.DataFrame'}

In [88]:
usages_with_args

[('print', ['Call'], []),
 ('DataFrame', ['Dict'], []),
 ('DataFrame', ['Constant'], []),
 ('print', ['Call'], []),
 ('array', ['List'], []),
 ('get_data_dir', ['Constant'], [])]

In [89]:
def create_usage_signature_list(unique_usages, usages_with_args, get_signature_func):
    # Map short function names to their fully qualified names
    name_mapping = {name.split('.')[-1]: name for name in unique_usages}

    usage_signature_list = []

    for usage in usages_with_args:
        func_name, args, kwargs = usage
        qualified_name = name_mapping.get(func_name, func_name)

        signature = get_signature_func(qualified_name)
        usage_signature_list.append({
            "usage": usage,
            "signature": signature
        })

    return usage_signature_list

In [90]:
usage_signature_list = create_usage_signature_list(all_usages, usages_with_args, get_signature)
for usage_signature in usage_signature_list:
    print(usage_signature)

{'usage': ('print', ['Call'], []), 'signature': <Signature (*args, sep=' ', end='\n', file=None, flush=False)>}
{'usage': ('DataFrame', ['Dict'], []), 'signature': <Signature (data=None, index: 'Axes | None' = None, columns: 'Axes | None' = None, dtype: 'Dtype | None' = None, copy: 'bool | None' = None) -> 'None'>}
{'usage': ('DataFrame', ['Constant'], []), 'signature': <Signature (data=None, index: 'Axes | None' = None, columns: 'Axes | None' = None, dtype: 'Dtype | None' = None, copy: 'bool | None' = None) -> 'None'>}
{'usage': ('print', ['Call'], []), 'signature': <Signature (*args, sep=' ', end='\n', file=None, flush=False)>}
{'usage': ('get_data_dir', ['Constant'], []), 'signature': <Signature (*args: str, create: bool = False) -> str>}


In [91]:
# For each usage, check if the arguments match the signature
for usage_signature in usage_signature_list:
    usage = usage_signature["usage"]
    signature = usage_signature["signature"]
    if type(signature) == str:
        print(f"Signature for usage {usage} could not be found")
        continue
    if usage_matches_signature(usage, signature):
        print(f"Usage {usage} matches signature")
    else:
        print(f"Usage {usage} does not match signature")

Usage ('print', ['Call'], []) does not match signature
Usage ('DataFrame', ['Dict'], []) matches signature
Usage ('DataFrame', ['Constant'], []) matches signature
Usage ('print', ['Call'], []) does not match signature
Signature for usage ('array', ['List'], []) could not be found
Usage ('get_data_dir', ['Constant'], []) does not match signature
