# File Structure Functionality Demonstration

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from yellhorn_mcp.token_counter import TokenCounter
from yellhorn_mcp.formatters import get_codebase_snapshot, get_codebase_context
from pathlib import Path

In [None]:
# Repository to analyse: the parent of the notebook's current working directory.
# Point this at another checkout to inspect a different repository.
repo_path = Path.cwd().parent

In [None]:
# Get file paths from the codebase snapshot; the call returns a (paths, contents) pair.
# NOTE(review): just_paths=True presumably skips loading file contents — confirm
# against yellhorn_mcp.formatters.get_codebase_snapshot.
file_paths, file_contents = await get_codebase_snapshot(repo_path, just_paths=True)

In [None]:
# Preview the first ten entries of file_paths
file_paths[:10]

### Token Counter for Context

In [None]:
# Create the TokenCounter used further down to count tokens in the generated context
tc = TokenCounter()

In [None]:
# Build the codebase context for repo_path in "full" mode, passing
# token_limit=256000 and model="gpt-4o" to get_codebase_context.

codebase_tree = await get_codebase_context(repo_path, "full", token_limit=256000, model="gpt-4o")

# `codebase_tree` is what would be injected into a prompt; preview its first 100 characters
print(codebase_tree[:100])

In [None]:
# Inspect the tail of the generated context (last 2000 characters)
print(codebase_tree[-2000:])

In [None]:
# Count the tokens in the generated context, using "gpt-4o" as the model name
token_count = tc.count_tokens(codebase_tree, "gpt-4o")
print(token_count)

## Curate Context: Process Chunks

In [None]:
# Collect every ancestor directory of every path in file_paths.
# For "a/b/c.py" this yields "a" and "a/b" (the file name itself is excluded).
all_dirs = {
    '/'.join(segments[:depth])
    for segments in (p.split('/') for p in file_paths)
    for depth in range(1, len(segments))
}
# A path with a leading "/" would contribute an empty prefix; drop it
all_dirs.discard('')

# Files that live directly at the root are represented by the pseudo-directory '.'
if not all('/' in p for p in file_paths):
    all_dirs.add('.')

# Deterministic ordering so chunk contents are stable between runs
sorted_dirs = list(all_dirs)
sorted_dirs.sort()

# Maximum number of directories placed in a single chunk
chunk_size = 3000

# Number of chunks required (ceiling of len(sorted_dirs) / chunk_size)
total_chunks = -(-len(sorted_dirs) // chunk_size)

# Slice the sorted directories into consecutive chunks of at most chunk_size
dir_chunks = [
    sorted_dirs[start:start + chunk_size]
    for start in range(0, len(sorted_dirs), chunk_size)
]



In [None]:
# Track important directories
all_important_dirs = set()

# Helper function to process a single chunk
async def process_chunk(chunk_idx: int, dir_chunk: list[str], user_task=None):
    """Build and print the directory-selection prompt for one chunk of directories.

    Each directory in ``dir_chunk`` is listed on its own line, followed by up
    to five of its direct child files (basename only, tab-indented). The
    resulting tree is embedded in the prompt, which is printed, not returned.

    Reads the notebook-level ``file_paths`` and ``total_chunks`` variables.

    Args:
        chunk_idx: Zero-based index of the chunk; shown one-based in the prompt.
        dir_chunk: Directory paths to include; ``'.'`` denotes the repository root.
        user_task: Task description placed inside ``<user_task>``. When omitted,
            a notebook-level ``user_task`` variable is used if one exists,
            otherwise an empty string.
    """
    if user_task is None:
        # The prompt used to read a global `user_task` that no cell defines,
        # which raised NameError. Keep honouring such a global when present,
        # but fall back to an empty task so the demo cell still runs.
        user_task = globals().get("user_task", "")

    lines = []
    for dir_path in dir_chunk:
        # Choose a nicer label for the root directory
        dir_label = 'top_directory' if dir_path == '.' else dir_path
        lines.append(dir_label)

        # Gather up to 5 direct children files of this directory
        if dir_path == '.':
            dir_files = [f for f in file_paths if '/' not in f]
        else:
            prefix = dir_path.rstrip('/') + '/'
            # Direct children only: nothing after the prefix may contain '/'
            dir_files = [
                f for f in file_paths
                if f.startswith(prefix) and '/' not in f[len(prefix):]
            ]
        samples = dir_files[:5]

        # Append each sample file under the directory, indented with a tab
        lines.extend(f"\t{os.path.basename(f)}" for f in samples)

    # Final single representation
    directory_tree = "\n".join(lines)

    # Construct the prompt for this chunk
    prompt = f"""You are an expert software developer tasked with analyzing a codebase structure to identify important directories for AI context.

<user_task>
{user_task}
</user_task>

Your goal is to identify the most important directories that should be included when an AI assistant analyzes this codebase for the user's task.

Below is a list of directories from the codebase (chunk {chunk_idx + 1} of {total_chunks}):

<directories>
{directory_tree}
</directories>

Analyze these directories and identify the ones that:
1. Contain core application code relevant to the user's task
2. Likely contain important business logic
3. Would be essential for understanding the codebase architecture
4. Are needed to implement the requested task

Ignore directories that:
1. Contain only build artifacts or generated code
2. Store dependencies or vendor code
3. Contain temporary or cache files
4. Probably aren't relevant to the user's specific task

Return your analysis as a list of important directories, one per line, in this format:

```context
dir1
dir2
dir3
```

Don't include explanations for your choices, just return the list in the specified format.
"""
    print(prompt)

In [None]:
# Print the prompt generated for the first directory chunk
await process_chunk(0, dir_chunks[0])

## LSP Prompt Inspection

In [None]:
# Build the codebase context in "lsp" mode and keep it in codebase_info.
# NOTE(review): per the original note this formats a tree plus LSP file contents — confirm
# against get_codebase_context.
codebase_info = await get_codebase_context(repo_path, "lsp", token_limit=256000, model="gpt-4o")


## Inspect File Structure Filtering

In [None]:
# Build the codebase context in "file_structure" mode (not "lsp");
# this overwrites codebase_info from the previous section.
codebase_info = await get_codebase_context(repo_path, "file_structure", token_limit=256000, model="gpt-4o")


In [None]:
# Show the context currently held in codebase_info
print(codebase_info)

# Inspect the File Filtering

In [None]:
# Developer-specific checkout used while writing this notebook. On any other
# machine that directory does not exist, so fall back to the parent of the
# current working directory (the same default as at the top of the notebook).
repo_path = Path("/Users/sravanj/project_work/yellhorn-mcp")
if not repo_path.is_dir():
    repo_path = Path.cwd().parent

In [None]:
from yellhorn_mcp.server import run_git_command
# import fnmatch
from fnmatch import fnmatch
from pathlib import Path

In [None]:
async def get_codebase_snapshot(repo_path: Path, _mode: str = "full", log_function = print) -> list[str]:
    """Return the repository's file paths after applying yellhorn pattern files.

    Files are listed with ``git ls-files -c -o --exclude-standard`` (tracked
    plus untracked files, honouring .gitignore) and then filtered with
    ``fnmatch`` patterns read from the repository root:

    * ``.yellhorncontext`` takes priority when present. Lines starting with
      ``!`` are whitelist patterns (the ``!`` is stripped); all other
      non-blank lines are ignore patterns.
    * ``.yellhornignore`` is used only when ``.yellhorncontext`` is absent;
      every non-blank line is an ignore pattern.

    A path matching any whitelist pattern is always kept; otherwise it is
    dropped when it matches any ignore pattern.

    Args:
        repo_path: Root of the git repository.
        _mode: Unused; kept for signature compatibility.
        log_function: Callable that receives the list of ignore patterns
            for inspection (defaults to ``print``).

    Returns:
        The filtered list of file paths, in the order git reported them.
    """
    # Import locally under a distinct name: the notebook binds the bare name
    # `fnmatch` to the function in one cell and to the module in another, so
    # `fnmatch.fnmatch(...)` here would depend on cell execution order.
    from fnmatch import fnmatch as matches_pattern

    # Get list of all tracked and untracked files (respects .gitignore by default)
    files_output = await run_git_command(repo_path, ["ls-files", "-c", "-o", "--exclude-standard"])
    file_paths = [f for f in files_output.split("\n") if f]

    # Priority order: .yellhorncontext overrides .yellhornignore
    yellhorncontext_path = repo_path / ".yellhorncontext"
    yellhornignore_path = repo_path / ".yellhornignore"

    ignore_patterns: list[str] = []
    whitelist_patterns: list[str] = []

    # First try .yellhorncontext, then fall back to .yellhornignore
    if yellhorncontext_path.is_file():
        with open(yellhorncontext_path, "r", encoding="utf-8") as f:
            for line in f:
                pattern = line.strip()
                if not pattern:
                    continue
                if pattern.startswith("!"):
                    whitelist_patterns.append(pattern[1:])
                else:
                    ignore_patterns.append(pattern)
    elif yellhornignore_path.is_file():
        with open(yellhornignore_path, "r", encoding="utf-8") as f:
            ignore_patterns = [pattern for pattern in (line.strip() for line in f) if pattern]

    # Surface the active ignore patterns through the caller-supplied logger
    log_function(ignore_patterns)

    # Nothing to filter against: return git's listing unchanged
    if not (ignore_patterns or whitelist_patterns):
        return file_paths

    def is_ignored(file_path: str) -> bool:
        # Whitelist patterns take precedence over ignore patterns
        if any(matches_pattern(file_path, pattern) for pattern in whitelist_patterns):
            return False
        return any(matches_pattern(file_path, pattern) for pattern in ignore_patterns)

    return [f for f in file_paths if not is_ignored(f)]

In [None]:
# Run the notebook-local get_codebase_snapshot (it shadows the imported one)
# and show the paths that survive filtering. This rebinds file_paths.
file_paths = await get_codebase_snapshot(repo_path)
print(file_paths)

In [None]:
# An earlier cell ran `from fnmatch import fnmatch`, which binds the name to
# the function; import the module here so `fnmatch.fnmatch` resolves.
import fnmatch

# Probe: does a directory-style pattern (trailing "/") match a nested file
# once it is rewritten as "<pattern>/*"?
path = 'hello/.github/workflows/publish.yml'
pat = '*.github/'
fnmatch.fnmatch(path, pat.rstrip("/") + "/*")

In [None]:
import fnmatch

whitelist_patterns = ["python/**/*.py", "*.py", "*.ipynb", "*.md"]
ignore_patterns = [".gitignore", ".yellhornignore", "hello/*"]


# Experimental variant of the filter: each pattern is tried both as written
# and as a directory prefix ("<pattern>/*").
def is_ignored(file_path: str) -> bool:
    """Return True when ``file_path`` should be excluded.

    A path that matches any entry of ``whitelist_patterns`` is never ignored.
    Otherwise it is ignored when it matches any entry of ``ignore_patterns``.
    Every pattern is tested twice: verbatim, and with a trailing "/" removed
    and "/*" appended.
    """

    def matches_any(patterns) -> bool:
        for candidate in patterns:
            as_directory = candidate.rstrip("/") + "/*"
            if fnmatch.fnmatch(file_path, candidate):
                return True
            if fnmatch.fnmatch(file_path, as_directory):
                return True
        return False

    # Whitelist wins over the ignore list
    if matches_any(whitelist_patterns):
        return False
    return matches_any(ignore_patterns)

In [None]:
# Probe: fnmatch compares the pattern against the whole path string, so a bare
# file name pattern does not match a nested path (this evaluates to False).
fnmatch.fnmatch("hello/hello/poetry.lock", "poetry.lock")

In [None]:
# Evaluates to True: the path matches no whitelist pattern, and the ignore pattern "hello/*" matches it
is_ignored("hello/hello/hello.js")