In [1]:
import ast
import re
import pandas as pd
import os

In [2]:
class MethodAnalyzer(ast.NodeVisitor):
    """Collect per-function size metrics from a parsed Python module.

    After ``visit(tree)`` the ``methods`` list holds one dict per ``def`` or
    ``async def`` found in the tree (methods and nested functions included,
    because every visit continues into the node's children). Each dict has
    the keys ``name``, ``docstring_length``, ``code_length``,
    ``examples_length`` and ``start_line``; all lengths are line counts.
    """

    # A line consisting only of "Example"/"Examples" (any case) opens the section.
    _EXAMPLES_HEADER = re.compile(r'(?i)^Examples?\s*$', re.MULTILINE)
    # numpydoc underline directly below the "Examples" header.
    _LEADING_UNDERLINE = re.compile(r'\A[-=]{3,}[ \t]*(?:\n|\Z)')
    # "Returns:"-style header that ends the examples section.
    _COLON_HEADER = re.compile(r'^[A-Z][a-z]*\s*:', re.MULTILINE)
    # "See Also" + "--------" style (underlined) header that ends the section.
    _UNDERLINED_HEADER = re.compile(r'^[A-Z][A-Za-z ]*\n[-=]{3,}[ \t]*$', re.MULTILINE)

    def __init__(self):
        self.methods = []

    def visit_FunctionDef(self, node):
        """Record the metrics of one function, then descend into its body."""
        method_info = {
            "name": node.name,
            "docstring_length": self._get_docstring_length(node),
            "code_length": self._get_code_length(node),
            "examples_length": self._get_examples_length(node),
            "start_line": node.lineno
        }
        self.methods.append(method_info)
        self.generic_visit(node)

    # Coroutines carry the same attributes, so they are measured the same way
    # instead of being silently skipped.
    visit_AsyncFunctionDef = visit_FunctionDef

    def _get_docstring_length(self, node):
        """Return the number of lines in the cleaned docstring (0 if none)."""
        docstring = ast.get_docstring(node)
        if docstring:
            return len(docstring.splitlines())
        return 0

    def _get_examples_length(self, node):
        """Return the number of lines in the docstring's Examples section.

        The section starts after a line containing only ``Example`` or
        ``Examples`` and runs up to the next section header (either
        ``Word:`` or a title underlined with dashes/equals signs) or to the
        end of the docstring. The underline below the ``Examples`` header
        itself is markup and is not counted. Returns 0 when there is no
        docstring or no Examples section.
        """
        docstring = ast.get_docstring(node)
        if not docstring:
            return 0

        header = self._EXAMPLES_HEADER.search(docstring)
        if header is None:
            return 0

        section = docstring[header.end():].strip()
        # Drop the numpydoc underline so it is not counted as an example line.
        section = self._LEADING_UNDERLINE.sub('', section, count=1).strip()

        # The section ends at whichever kind of header comes first.
        cut_points = [
            match.start()
            for match in (
                self._COLON_HEADER.search(section),
                self._UNDERLINED_HEADER.search(section),
            )
            if match is not None
        ]
        if cut_points:
            section = section[:min(cut_points)].strip()

        return len(section.splitlines())

    def _get_code_length(self, node):
        """Return the number of source lines from ``node.lineno`` to the end of the body.

        The docstring lines are part of that span and are therefore included.
        """
        start_line = node.lineno
        # end_lineno is the true last line of the definition; the start line
        # of the last statement would undercount multi-line statements.
        end_line = getattr(node, "end_lineno", None)
        if end_line is None:
            # Nodes without end positions: fall back to the deepest line
            # number reachable inside the function.
            end_line = max(
                getattr(child, "lineno", start_line) for child in ast.walk(node)
            )
        return end_line - start_line + 1

def analyze_python_file(file_path, output_csv_path):
    """Analyze one Python file and write its public-function metrics to a CSV.

    Parameters
    ----------
    file_path : str
        Path of the Python source file to parse.
    output_csv_path : str
        Path of the CSV file to write. One row per function whose name does
        not start with an underscore; a file without such functions yields a
        CSV containing only the header row.

    Raises
    ------
    SyntaxError, UnicodeDecodeError
        If the file cannot be decoded as UTF-8 or parsed.
    """
    # Read as UTF-8 (the default Python source encoding) rather than the
    # platform locale, matching analyze_directory_recursively.
    with open(file_path, "r", encoding="utf-8") as file:
        tree = ast.parse(file.read(), filename=file_path)

    analyzer = MethodAnalyzer()
    analyzer.visit(tree)

    # Filter out functions that start with an underscore (_)
    public_methods = [
        method for method in analyzer.methods
        if not method["name"].startswith("_")
    ]

    # Explicit columns keep the CSV header stable even when nothing was
    # collected (an empty frame would otherwise have no 'name' column).
    columns = ["name", "docstring_length", "code_length", "examples_length", "start_line"]
    df = pd.DataFrame(public_methods, columns=columns)

    df.to_csv(output_csv_path, index=False)


In [3]:
def analyze_directory_recursively(directory_path, output_csv_path, min_code_length=10):
    """Analyze every ``.py`` file under a directory and export the metrics to CSV.

    Directories named ``tests`` (case-insensitive) are not descended into.
    Files that cannot be decoded as UTF-8 or parsed are reported on stdout
    and skipped.

    Parameters
    ----------
    directory_path : str
        Root directory to scan. A leading ``~`` is expanded to the user's
        home directory.
    output_csv_path : str
        Path of the CSV file to write. Rows are sorted by ascending
        ``docstring_length``; the ``file`` column holds each source path
        relative to ``directory_path``.
    min_code_length : int, optional
        Functions with fewer lines than this are dropped (default 10).

    Raises
    ------
    NotADirectoryError
        If ``directory_path`` does not point to an existing directory.
    """
    # os.walk neither expands "~" nor complains about a missing directory; it
    # would just yield nothing, so validate the root up front.
    directory_path = os.path.expanduser(directory_path)
    if not os.path.isdir(directory_path):
        raise NotADirectoryError(f"Not a directory: {directory_path}")

    all_methods = []

    # Use os.walk to recursively visit each subdirectory
    for root, dirs, files in os.walk(directory_path):

        # Skip any "tests" directory (in-place so os.walk honours the pruning)
        dirs[:] = [d for d in dirs if d.lower() != "tests"]

        for filename in files:
            if not filename.endswith(".py"):
                continue

            file_path = os.path.join(root, filename)
            relative_file_path = os.path.relpath(file_path, start=directory_path)

            try:
                with open(file_path, "r", encoding='utf-8') as file:
                    tree = ast.parse(file.read(), filename=file_path)
            except (UnicodeDecodeError, SyntaxError, ValueError):
                # ValueError: older Python versions raise it for source text
                # containing null bytes.
                print(f"Skipped file due to decoding or syntax error: {file_path}")
                continue

            analyzer = MethodAnalyzer()
            analyzer.visit(tree)

            # Add the file path to each method's information
            for method in analyzer.methods:
                method["file"] = relative_file_path

            all_methods.extend(analyzer.methods)

    # Keep public, non-test functions that are long enough to be interesting.
    selected_methods = [
        method for method in all_methods
        if not method["name"].startswith("_")
        and not method["name"].startswith("test_")
        and method["code_length"] >= min_code_length
    ]

    # Explicit columns so an empty selection still produces a valid frame
    # (and a CSV with a header) instead of failing on a missing column.
    columns = [
        "name", "docstring_length", "code_length",
        "examples_length", "start_line", "file",
    ]
    df = pd.DataFrame(selected_methods, columns=columns)

    # Sort by docstring_length (ascending order)
    df = df.sort_values(by='docstring_length', ascending=True)

    # Export the sorted DataFrame to a CSV file
    df.to_csv(output_csv_path, index=False)

In [4]:
# Usage example
# Provide the path to the root directory containing Python files and
# subdirectories, and the desired CSV output path.
# os.path.join builds the relative path with the current platform's separator,
# so the same cell runs on Windows, Linux and macOS.
directory_path = os.path.join(os.pardir, os.pardir, "numpy")
# Home-directory example ("~" has to be expanded explicitly):
# directory_path = os.path.expanduser("~/numpy")
csv_output_path = "docstring_lengths.csv"
analyze_directory_recursively(directory_path, csv_output_path)

  'args', {'val':'arch=', 'match': 'arg.*[0-9]|arch=.*', 'mfilter': '([0-9])|arch=(\w+)', 'mjoin': '*'},
