In [2]:
# repo_cloner.py
from git import Repo
import os

def clone_github_repo(repo_url, target_dir="/home/sid/demo-sum"):
    """
    Clone a GitHub repository locally
    """
    repo_name = repo_url.split('/')[-1].replace('.git', '')
    repo_path = os.path.join(target_dir, repo_name)

    if os.path.exists(repo_path):
        print(f"✅ Repo exists: {repo_path}")
        return repo_path

    print(f"📥 Cloning {repo_url}...")
    os.makedirs(target_dir, exist_ok=True)
    Repo.clone_from(repo_url, repo_path)
    print(f"✅ Cloned to: {repo_path}")
    return repo_path

# Test
repo_path = clone_github_repo("https://github.com/S-j-15/image_processing")


✅ Repo exists: /home/sid/demo-sum/image_processing


In [None]:
!rm -rf /home/sid/demo-sum/Kernel-process-Tree

In [3]:
# tree_builder.py
import os
from pathlib import Path

class CodeNode:
    """Represents a node in the code tree (folder, file, or function)"""

    # Define code vs non-code extensions
    CODE_EXTENSIONS = {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs', '.ts', '.jsx', '.tsx', '.rb', '.php', '.swift', '.kt', '.cs'}
    NON_CODE_EXTENSIONS = {'.md', '.txt', '.json', '.yaml', '.yml', '.toml', '.xml', '.csv', '.sql'}

    def __init__(self, title, node_id, node_type="file", path=None, start_line=None, end_line=None):
        self.title = title
        self.node_id = node_id
        self.type = node_type  # "folder", "file_py", "file_md", "function", "class"
        self.path = path
        self.start_line = start_line
        self.end_line = end_line
        self.text = None  # Only for non-code files
        self.summary = ""
        self.parent = None
        self.nodes = []  # Children nodes

    def set_type_from_extension(self):
        """Set type based on file extension"""
        if self.type == "file" and self.path:
            ext = Path(self.path).suffix.lower()

            if ext:
                # Convert .py to file_py, .md to file_md, etc.
                self.type = f"file{ext.replace('.', '_')}"
            else:
                self.type = "file"

    def should_store_text(self):
        """Determine if text should be stored in tree (only non-code files)"""
        if self.type.startswith("file_"):
            ext = self.type.replace("file_", ".")
            return ext in self.NON_CODE_EXTENSIONS
        return False

    def to_dict(self):
        """Convert to dictionary"""
        result = {
            "title": self.title,
            "node_id": self.node_id,
            "type": self.type,
            "path": self.path
        }

        # Add line numbers for functions/classes
        if self.start_line is not None:
            result["start_line"] = self.start_line
            result["end_line"] = self.end_line

        # Only include text for non-code files
        if self.text and self.should_store_text():
            result["text"] = self.text

        if self.summary:
            result["summary"] = self.summary

        if self.nodes:
            result["nodes"] = [child.to_dict() for child in self.nodes]

        return result


class TreeBuilder:
    """Builds directory tree with proper counter state"""

    def __init__(self, ignore_patterns=None):
        self.node_counter = 0  # Instance variable for counter
        self.ignore_patterns = ignore_patterns or [
            '.git', '__pycache__', 'node_modules', '.venv',
            'venv', 'build', 'dist', '.pytest_cache', '.DS_Store', '.bmp'
        ]

    def get_node_id(self):
        """Generate next node ID"""
        self.node_counter += 1
        return str(self.node_counter).zfill(4)

    def should_ignore(self, path):
        """Check if path should be ignored"""
        return any(pattern in path for pattern in self.ignore_patterns)

    def traverse_directory(self, dir_path):
        """Recursively traverse directory structure"""
        items = sorted(os.listdir(dir_path))
        nodes = []

        for item in items:
            item_path = os.path.join(dir_path, item)

            if self.should_ignore(item_path):
                continue

            if os.path.isdir(item_path):
                # Create folder node
                folder_node = CodeNode(
                    title=item,
                    node_id=self.get_node_id(),
                    node_type="folder",
                    path=item_path
                )
                # Recurse into subdirectories
                folder_node.nodes = self.traverse_directory(item_path)
                nodes.append(folder_node)

            elif os.path.isfile(item_path):
                # Create file node
                file_node = CodeNode(
                    title=item,
                    node_id=self.get_node_id(),
                    node_type="file",  # Will be set to file_py, file_md, etc.
                    path=item_path
                )
                # Set type based on extension
                file_node.set_type_from_extension()
                nodes.append(file_node)

        return nodes

    def build(self, repo_path):
        """Build complete directory tree"""
        # Reset counter for new tree
        self.node_counter = 0

        # Build root node
        root = CodeNode(
            title=os.path.basename(repo_path),
            node_id="0000",
            node_type="repository",
            path=repo_path
        )

        # Build children
        root.nodes = self.traverse_directory(repo_path)

        return root


def build_directory_tree(repo_path, ignore_patterns=None):
    """
    Convenience function to build hierarchical tree from repository directory structure
    Similar to PageIndex's process_not_toc() function
    """
    builder = TreeBuilder(ignore_patterns)
    return builder.build(repo_path)


# Test usage
if __name__ == "__main__":
    import json

    tree = build_directory_tree("/home/sid/demo-sum")
    print(json.dumps(tree.to_dict(), indent=2))


{
  "title": "demo-sum",
  "node_id": "0000",
  "type": "repository",
  "path": "/home/sid/demo-sum",
  "nodes": [
    {
      "title": "image_processing",
      "node_id": "0001",
      "type": "folder",
      "path": "/home/sid/demo-sum/image_processing",
      "nodes": [
        {
          "title": "README.md",
          "node_id": "0002",
          "type": "file_md",
          "path": "/home/sid/demo-sum/image_processing/README.md"
        },
        {
          "title": "image_processing",
          "node_id": "0003",
          "type": "folder",
          "path": "/home/sid/demo-sum/image_processing/image_processing",
          "nodes": [
            {
              "title": "ascii.txt",
              "node_id": "0004",
              "type": "file_txt",
              "path": "/home/sid/demo-sum/image_processing/image_processing/ascii.txt"
            },
            {
              "title": "ascii_2char_render.txt",
              "node_id": "0005",
              "type": "file_txt"

In [3]:
!pip install tree_sitter tree_sitter_javascript tree_sitter_python openai asyncio gitpython

# Install tree-sitter and language bindings
!pip install tree-sitter

# Install language bindings (install what you need)
!pip install tree-sitter-python
!pip install tree-sitter-javascript
!pip install tree-sitter-typescript
!pip install tree-sitter-cpp
!pip install tree-sitter-c
!pip install tree-sitter-java
!pip install tree-sitter-go
!pip install tree-sitter-rust
!pip install tree-sitter-ruby
!pip install tree-sitter-php




In [4]:
#code_parser.py
from tree_sitter import Language, Parser
from pathlib import Path

# Try importing all languages, handling different API versions
AVAILABLE_PARSERS = {}

# Python
try:
    import tree_sitter_python as tspython
    AVAILABLE_PARSERS['.py'] = ('python', tspython.language)
except ImportError:
    pass

# JavaScript
try:
    import tree_sitter_javascript as tsjavascript
    AVAILABLE_PARSERS['.js'] = ('javascript', tsjavascript.language)
    AVAILABLE_PARSERS['.jsx'] = ('javascript', tsjavascript.language)
except ImportError:
    pass

# TypeScript
try:
    import tree_sitter_typescript as tstypescript
    AVAILABLE_PARSERS['.ts'] = ('typescript', tstypescript.language_typescript)
    AVAILABLE_PARSERS['.tsx'] = ('typescript', tstypescript.language_tsx)
except ImportError:
    pass

# C++
try:
    import tree_sitter_cpp as tscpp
    AVAILABLE_PARSERS['.cpp'] = ('cpp', tscpp.language)
    AVAILABLE_PARSERS['.cc'] = ('cpp', tscpp.language)
    AVAILABLE_PARSERS['.cxx'] = ('cpp', tscpp.language)
    AVAILABLE_PARSERS['.hpp'] = ('cpp', tscpp.language)
except ImportError:
    pass

# C
try:
    import tree_sitter_c as tsc
    AVAILABLE_PARSERS['.c'] = ('c', tsc.language)
    AVAILABLE_PARSERS['.h'] = ('c', tsc.language)
except ImportError:
    pass

# Java
try:
    import tree_sitter_java as tsjava
    AVAILABLE_PARSERS['.java'] = ('java', tsjava.language)
except ImportError:
    pass

# Go
try:
    import tree_sitter_go as tsgo
    AVAILABLE_PARSERS['.go'] = ('go', tsgo.language)
except ImportError:
    pass

# Rust
try:
    import tree_sitter_rust as tsrust
    AVAILABLE_PARSERS['.rs'] = ('rust', tsrust.language)
except ImportError:
    pass

# Ruby
try:
    import tree_sitter_ruby as tsruby
    AVAILABLE_PARSERS['.rb'] = ('ruby', tsruby.language)
except ImportError:
    pass

# PHP - Skip if it has API issues
# try:
#     import tree_sitter_php as tsphp
#     AVAILABLE_PARSERS['.php'] = ('php', tsphp.language)
# except (ImportError, AttributeError):
#     pass


class CodeParser:
    """
    Multi-language code parser supporting 10+ programming languages
    Extracts functions, classes, methods from code files
    """

    # Language-specific node type mappings
    LANGUAGE_CONSTRUCTS = {
        'python': {
            'function': 'function_definition',
            'class': 'class_definition',
            'method': 'function_definition'
        },
        'javascript': {
            'function': ['function_declaration', 'function', 'arrow_function', 'method_definition'],
            'class': 'class_declaration',
            'method': 'method_definition'
        },
        'typescript': {
            'function': ['function_declaration', 'arrow_function', 'method_definition'],
            'class': 'class_declaration',
            'method': 'method_definition'
        },
        'cpp': {
            'function': 'function_definition',
            'class': 'class_specifier',
            'method': 'function_definition'
        },
        'c': {
            'function': 'function_definition',
            'struct': 'struct_specifier'
        },
        'java': {
            'function': 'method_declaration',
            'class': 'class_declaration',
            'method': 'method_declaration'
        },
        'go': {
            'function': 'function_declaration',
            'method': 'method_declaration',
            'struct': 'type_declaration'
        },
        'rust': {
            'function': 'function_item',
            'struct': 'struct_item',
            'impl': 'impl_item'
        },
        'ruby': {
            'function': 'method',
            'class': 'class',
            'module': 'module'
        }
    }

    def __init__(self):
        self.parsers = {}
        self.node_counter = 0

        print(f"🔧 Initializing parsers for {len(AVAILABLE_PARSERS)} extensions...")

        # Initialize all available parsers
        for ext, (lang_name, lang_func) in AVAILABLE_PARSERS.items():
            try:
                parser_info = self._create_parser(lang_func, lang_name)
                if parser_info:
                    self.parsers[ext] = parser_info
                    print(f"   ✅ {ext:6s} → {lang_name}")
            except Exception as e:
                print(f"   ⚠️  {ext:6s} → Failed: {e}")

        print(f"\n✅ CodeParser ready with {len(self.parsers)} language(s)")

    def _create_parser(self, language_func, lang_name):
        """Create parser for a specific language"""
        try:
            # Call the language function to get the language object
            lang_obj = language_func()
            LANG = Language(lang_obj)
            parser = Parser(LANG)
            return {'parser': parser, 'language': lang_name}
        except Exception as e:
            print(f"      Error creating parser for {lang_name}: {e}")
            return None

    def get_node_id(self, prefix="node"):
        """Generate unique node ID"""
        self.node_counter += 1
        return f"{prefix}_{self.node_counter}"

    def parse_file(self, file_path):
        """
        Extract functions and classes from a code file
        Returns list of CodeNode objects
        """
        ext = Path(file_path).suffix.lower()
        parser_info = self.parsers.get(ext)

        if not parser_info:
            return []

        parser = parser_info['parser']
        language = parser_info['language']

        try:
            with open(file_path, 'rb') as f:
                code = f.read()
        except Exception as e:
            print(f"⚠️ Could not read {file_path}: {e}")
            return []

        tree = parser.parse(code)
        root_node = tree.root_node

        constructs = self._extract_constructs(root_node, file_path, language)

        return constructs

    def _extract_constructs(self, root_node, file_path, language):
        """Extract language-specific constructs"""
        constructs = []
        construct_types = self.LANGUAGE_CONSTRUCTS.get(language, {})

        for child in root_node.children:
            node_type = child.type

            # Check for function definitions
            function_types = construct_types.get('function', [])
            if isinstance(function_types, str):
                function_types = [function_types]

            if node_type in function_types:
                constructs.append(CodeNode(
                    title=self._get_name(child),
                    node_id=self.get_node_id("func"),
                    node_type="function",
                    path=file_path,
                    start_line=child.start_point[0],
                    end_line=child.end_point[0]
                ))

            # Check for class definitions
            class_types = construct_types.get('class', [])
            if isinstance(class_types, str):
                class_types = [class_types]

            if node_type in class_types:
                class_node = CodeNode(
                    title=self._get_name(child),
                    node_id=self.get_node_id("class"),
                    node_type="class",
                    path=file_path,
                    start_line=child.start_point[0],
                    end_line=child.end_point[0]
                )
                # Extract methods
                class_node.nodes = self._extract_methods(child, file_path, language)
                constructs.append(class_node)

            # Check for struct/module/impl (language-specific)
            for construct_key in ['struct', 'module', 'impl']:
                struct_types = construct_types.get(construct_key, [])
                if isinstance(struct_types, str):
                    struct_types = [struct_types]

                if node_type in struct_types:
                    constructs.append(CodeNode(
                        title=self._get_name(child),
                        node_id=self.get_node_id(construct_key),
                        node_type=construct_key,
                        path=file_path,
                        start_line=child.start_point[0],
                        end_line=child.end_point[0]
                    ))

        return constructs

    # def _get_name(self, node):
    #     """Extract name from AST node (works across languages)"""
    #     # Try to find identifier child
    #     for child in node.children:
    #         if child.type in ['identifier', 'name', 'type_identifier']:
    #             return child.text.decode('utf8')

    #     # Fallback: check named children
    #     if hasattr(node, 'child_by_field_name'):
    #         name_node = node.child_by_field_name('name')
    #         if name_node:
    #             return name_node.text.decode('utf8')

    #     return "unknown"

    def _get_name(self, node):
        """Extract name from AST node (works across all languages)"""
        import re

        # Strategy 1: Try field-based name lookup (most reliable across languages)
        if hasattr(node, 'child_by_field_name'):
            # Try 'name' field (Python, JavaScript, TypeScript, Java, Go, Rust, Ruby, C++)
            name_node = node.child_by_field_name('name')
            if name_node:
                return name_node.text.decode('utf8')

            # Try 'declarator' field (C/C++/Java for complex declarations)
            declarator_node = node.child_by_field_name('declarator')
            if declarator_node:
                # Recursively extract identifier from any declarator structure
                def extract_identifier(decl):
                    if decl.type == 'identifier':
                        return decl.text.decode('utf8')

                    # Check nested declarator field
                    if hasattr(decl, 'child_by_field_name'):
                        inner = decl.child_by_field_name('declarator')
                        if inner:
                            return extract_identifier(inner)
                        # Also try 'name' field within declarator
                        inner_name = decl.child_by_field_name('name')
                        if inner_name:
                            return inner_name.text.decode('utf8')

                    # Search children for identifier or nested structures
                    for child in decl.children:
                        if child.type == 'identifier':
                            return child.text.decode('utf8')
                        # Recurse into declarator-like nodes
                        if 'declarator' in child.type or child.type in ['identifier', 'name']:
                            result = extract_identifier(child)
                            if result != "unknown":
                                return result

                    return "unknown"

                result = extract_identifier(declarator_node)
                if result != "unknown":
                    return result

        # Strategy 2: Search direct children for identifier/name nodes
        # Works for most languages when field lookup isn't available
        for child in node.children:
            if child.type in ['identifier', 'name']:
                return child.text.decode('utf8')

        # Strategy 3: Regex extraction from node text as last resort
        # Handles edge cases and unusual structures across all languages
        try:
            node_text = node.text.decode('utf8')
            # Pattern 1: function_name(...) or method_name(...)
            match = re.search(r'\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(', node_text)
            if match:
                # Verify it's not a keyword
                potential_name = match.group(1)
                keywords = {'if', 'for', 'while', 'switch', 'return', 'class', 'def', 'function', 'const', 'let', 'var'}
                if potential_name not in keywords:
                    return potential_name

            # Pattern 2: class ClassName or struct StructName
            match = re.search(r'\b(?:class|struct|interface|module|impl)\s+([a-zA-Z_][a-zA-Z0-9_]*)', node_text)
            if match:
                return match.group(1)

            # Pattern 3: def method_name or function name (Ruby, Python)
            match = re.search(r'\b(?:def|fn)\s+([a-zA-Z_][a-zA-Z0-9_]*)', node_text)
            if match:
                return match.group(1)
        except:
            pass

        return "unknown"

    def _extract_methods(self, class_node, file_path, language):
        """Extract methods from a class"""
        methods = []
        construct_types = self.LANGUAGE_CONSTRUCTS.get(language, {})
        method_types = construct_types.get('method', [])
        if isinstance(method_types, str):
            method_types = [method_types]

        # Recursively search for methods
        def find_methods(node):
            if node.type in method_types:
                methods.append(CodeNode(
                    title=self._get_name(node),
                    node_id=self.get_node_id("method"),
                    node_type="method",
                    path=file_path,
                    start_line=node.start_point[0],
                    end_line=node.end_point[0]
                ))

            for child in node.children:
                find_methods(child)

        find_methods(class_node)
        return methods


def enrich_tree_with_code_structure(tree, parser):
    """Add function/class nodes to file nodes"""
    if tree.type.startswith("file_"):
        ext = Path(tree.path).suffix.lower()

        if ext in parser.parsers:
            print(f"🔍 Parsing {tree.title} ({parser.parsers[ext]['language']})...")
            constructs = parser.parse_file(tree.path)
            tree.nodes = constructs
            if constructs:
                print(f"   Found {len(constructs)} constructs")

    for child in tree.nodes:
        enrich_tree_with_code_structure(child, parser)

    return tree



In [5]:
# ==================== TEST MULTI-LANGUAGE CODE PARSER ====================
import json
from pathlib import Path

print("=" * 70)
print("TESTING MULTI-LANGUAGE CODE PARSER ON /home/sid/demo-sum")
print("=" * 70)

# Initialize the multi-language parser
parser = CodeParser()

print("\n✅ Supported Extensions:")
for ext in sorted(parser.parsers.keys()):
    lang = parser.parsers[ext]['language']
    print(f"   {ext:8s} → {lang}")

# Enrich tree with code structure
print("\n🔍 Parsing code files in repository...\n")
tree = enrich_tree_with_code_structure(tree, parser)

print("\n" + "=" * 70)
print("ENRICHED TREE WITH CODE CONSTRUCTS")
print("=" * 70)

# Display enriched tree
enriched_tree_dict = tree.to_dict()
print(json.dumps(enriched_tree_dict, indent=2))

# Save enriched tree
output_file = "/home/sid/demo-sum/multi_lang_tree.json"
with open(output_file, 'w') as f:
    json.dump(enriched_tree_dict, f, indent=2)
print(f"\n💾 Saved to: {output_file}")

# Detailed statistics
print("\n" + "=" * 70)
print("PARSING STATISTICS")
print("=" * 70)

def collect_stats(node, stats=None):
    """Collect statistics recursively"""
    if stats is None:
        stats = {
            'by_type': {},
            'by_language': {},
            'total_functions': 0,
            'total_classes': 0,
            'total_methods': 0,
            'total_structs': 0,
            'files_parsed': 0,
            'files_skipped': 0
        }

    node_dict = node if isinstance(node, dict) else node.to_dict()
    node_type = node_dict.get('type')

    # Count by type
    stats['by_type'][node_type] = stats['by_type'].get(node_type, 0) + 1

    # Count constructs
    if node_type == 'function':
        stats['total_functions'] += 1
    elif node_type == 'class':
        stats['total_classes'] += 1
    elif node_type == 'method':
        stats['total_methods'] += 1
    elif node_type in ['struct', 'impl']:
        stats['total_structs'] += 1

    # Count files parsed
    if node_type.startswith('file_'):
        ext = '.' + node_type.replace('file_', '')
        if ext in parser.parsers:
            if node_dict.get('nodes'):
                stats['files_parsed'] += 1
                lang = parser.parsers[ext]['language']
                stats['by_language'][lang] = stats['by_language'].get(lang, 0) + 1
            else:
                stats['files_skipped'] += 1

    # Recurse
    for child in node_dict.get('nodes', []):
        collect_stats(child, stats)

    return stats

stats = collect_stats(tree)

print("\n📊 Node Type Distribution:")
for node_type, count in sorted(stats['by_type'].items()):
    print(f"   {node_type:20s}: {count:3d}")

print(f"\n📝 Code Constructs:")
print(f"   Functions:  {stats['total_functions']:3d}")
print(f"   Classes:    {stats['total_classes']:3d}")
print(f"   Methods:    {stats['total_methods']:3d}")
print(f"   Structs:    {stats['total_structs']:3d}")

print(f"\n🔍 Files Processed:")
print(f"   Parsed:     {stats['files_parsed']:3d}")
print(f"   Skipped:    {stats['files_skipped']:3d} (no parser or non-code)")

if stats['by_language']:
    print(f"\n🌐 Languages Detected:")
    for lang, count in sorted(stats['by_language'].items()):
        print(f"   {lang:15s}: {count:3d} files")

# Find and display all parsed files
print("\n" + "=" * 70)
print("DETAILED PARSING RESULTS")
print("=" * 70)

def find_code_files(node, code_files=None):
    """Find all parsed code files"""
    if code_files is None:
        code_files = []

    node_dict = node if isinstance(node, dict) else node.to_dict()
    node_type = node_dict.get('type')

    # Check if this is a code file with parsed constructs
    if node_type.startswith('file_'):
        ext = '.' + node_type.replace('file_', '')
        if ext in parser.parsers and node_dict.get('nodes'):
            code_files.append(node_dict)

    # Recurse
    for child in node_dict.get('nodes', []):
        find_code_files(child, code_files)

    return code_files

code_files = find_code_files(tree)

if code_files:
    for i, code_file in enumerate(code_files, 1):
        title = code_file['title']
        path = code_file['path']
        ext = '.' + code_file['type'].replace('file_', '')
        lang = parser.parsers[ext]['language']
        constructs = code_file.get('nodes', [])

        print(f"\n{i}. {title}")
        print(f"   Language: {lang}")
        print(f"   Path: {path}")
        print(f"   Constructs: {len(constructs)}")

        if constructs:
            for construct in constructs:
                c_type = construct['type']
                c_title = construct['title']
                c_start = construct.get('start_line', '?')
                c_end = construct.get('end_line', '?')

                print(f"      └─ {c_type}: {c_title} (lines {c_start}-{c_end})")

                # Show methods/members if class/struct
                if construct.get('nodes'):
                    for member in construct['nodes']:
                        m_type = member['type']
                        m_title = member['title']
                        m_start = member.get('start_line', '?')
                        m_end = member.get('end_line', '?')
                        print(f"         └─ {m_type}: {m_title} (lines {m_start}-{m_end})")
else:
    print("\n⚠️ No code files with parseable constructs found")
    print("\nYour repository contains:")
    for node_type, count in stats['by_type'].items():
        if node_type.startswith('file_'):
            ext = node_type.replace('file_', '.')
            print(f"   {count} {ext} file(s)")

    print("\n💡 To test the parser, try:")
    print("   1. Clone a Python/JS/Java/C++ repository")
    print("   2. Or create test files in supported languages")

print("\n" + "=" * 70)
print("✅ MULTI-LANGUAGE PARSER TEST COMPLETE")
print("=" * 70)

# Summary
print(f"\n📈 Summary:")
print(f"   Total nodes:      {sum(stats['by_type'].values())}")
print(f"   Code files:       {stats['files_parsed']}")
print(f"   Languages:        {len(stats['by_language'])}")
print(f"   Total constructs: {stats['total_functions'] + stats['total_classes'] + stats['total_methods']}")



TESTING MULTI-LANGUAGE CODE PARSER ON /home/sid/demo-sum
🔧 Initializing parsers for 15 extensions...
   ✅ .py    → python
   ✅ .js    → javascript
   ✅ .jsx   → javascript
   ✅ .ts    → typescript
   ✅ .tsx   → typescript
   ✅ .cpp   → cpp
   ✅ .cc    → cpp
   ✅ .cxx   → cpp
   ✅ .hpp   → cpp
   ✅ .c     → c
   ✅ .h     → c
   ✅ .java  → java
   ✅ .go    → go
   ✅ .rs    → rust
   ✅ .rb    → ruby

✅ CodeParser ready with 15 language(s)

✅ Supported Extensions:
   .c       → c
   .cc      → cpp
   .cpp     → cpp
   .cxx     → cpp
   .go      → go
   .h       → c
   .hpp     → cpp
   .java    → java
   .js      → javascript
   .jsx     → javascript
   .py      → python
   .rb      → ruby
   .rs      → rust
   .ts      → typescript
   .tsx     → typescript

🔍 Parsing code files in repository...

🔍 Parsing bmp_img_processor.c (c)...
   Found 58 constructs
🔍 Parsing bmp_img_processor.h (c)...
🔍 Parsing test.c (c)...
   Found 4 constructs
🔍 Parsing test2.c (c)...
   Found 6 constructs

ENRIC

In [5]:
import os

def extract_code_text(node):
    """
    Extract actual code text for a node
    Similar to PageIndex's page text extraction
    """
    if not node.path or not os.path.exists(node.path):
        return ""

    # Skip directories - only process files
    if os.path.isdir(node.path):
        return ""

    try:
        with open(node.path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()
    except Exception as e:
        print(f"⚠️ Error reading {node.path}: {e}")
        return ""

    # Extract code based on node type
    if node.type in ["function", "class", "method", "struct", "module", "impl"] and node.start_line is not None:
        # Extract specific construct code (function/class/method)
        # Ensure line numbers are within bounds
        start_line = max(0, node.start_line)
        end_line = min(len(lines) - 1, node.end_line) if node.end_line is not None else len(lines) - 1

        if start_line <= end_line:
            code_lines = lines[start_line:end_line + 1]
            return ''.join(code_lines)
        return ""

    elif node.type.startswith("file_"):
        # Return entire file content
        return ''.join(lines)

    return ""


def add_text_to_tree(tree):
    """
    Add text content to all nodes in the tree
    Similar to PageIndex's addnodetext()
    Recursively processes all nodes including nested structures

    IMPORTANT: Ensures 'text' attribute is set on the node object
    """
    # Extract text for current node
    text_content = extract_code_text(tree)
    print(text_content)

    # Ensure text attribute exists and is set
    tree.text = text_content

    # If using a dict-based node, also set it in __dict__
    if hasattr(tree, '__dict__'):
        tree.__dict__['text'] = text_content

    # Recursively process all children
    if hasattr(tree, 'nodes'):
        for child in tree.nodes:
            add_text_to_tree(child)

    return tree


def node_to_dict(node, include_text=True):
    """
    Convert node to dictionary with all fields including text
    Use this if default serialization doesn't include text
    """
    result = {
        'title': node.title,
        'node_id': node.node_id,
        'type': node.type,
        'path': node.path,
    }

    # Add optional fields if they exist
    if hasattr(node, 'start_line') and node.start_line is not None:
        result['start_line'] = node.start_line
    if hasattr(node, 'end_line') and node.end_line is not None:
        result['end_line'] = node.end_line

    # Add text field
    if include_text and hasattr(node, 'text'):
        result['text'] = node.text

    # Recursively convert children
    if hasattr(node, 'nodes') and node.nodes:
        result['nodes'] = [node_to_dict(child, include_text) for child in node.nodes]
    else:
        result['nodes'] = []

    return result


def tree_to_json(tree, filepath, include_text=True):
    """
    Save tree to JSON file with text included
    """
    import json

    tree_dict = node_to_dict(tree, include_text)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(tree_dict, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved tree to {filepath}")


def get_node_summary(node, max_lines=5):
    """
    Get a summary of the node's code (first few lines)
    Useful for previews and debugging
    """
    if not hasattr(node, 'text') or not node.text:
        return ""

    lines = node.text.split('\n')[:max_lines]
    summary = '\n'.join(lines)

    if len(node.text.split('\n')) > max_lines:
        summary += "\n..."

    return summary


def print_tree_with_text(tree, indent=0, show_text=False, max_text_lines=3):
    """
    Print tree structure with optional text preview
    Useful for debugging and visualization
    """
    prefix = "  " * indent

    # Node info
    info = f"{prefix}└─ {tree.type}: {tree.title}"
    if hasattr(tree, 'start_line') and tree.start_line is not None:
        info += f" (lines {tree.start_line}-{tree.end_line})"

    print(info)

    # Show text preview if requested
    if show_text and hasattr(tree, 'text') and tree.text:
        text_preview = get_node_summary(tree, max_text_lines)
        for line in text_preview.split('\n'):
            print(f"{prefix}   | {line}")

    # Recursively print children
    if hasattr(tree, 'nodes'):
        for child in tree.nodes:
            print_tree_with_text(child, indent + 1, show_text, max_text_lines)


In [None]:
tree2=add_text_to_tree(tree)
print_tree_with_text(tree2)



# image_processing
My lazy ass...finally decided to add a readme...lol:
Anyways here is whats up...
I basically tried making openCV from scratch in C...
This only works on BMP file format...
Cus Bitmap is damn Simple haha
ya so i used only standard C libs...
Extracted the DIB and BMP header info...got the shape and size of image...
and also the loc in the bmp file where img data starts...
Extracted the image data into an RGB struct array...
and BAM! U can do any kind of crazy stuff with it...like ascii art:
<img width="832" height="678" alt="image" src="https://github.com/user-attachments/assets/a91fed63-9a76-4b5d-833b-6bbdb1366923" />
lol
# I also implemented stuff like:
# 1> convolution
# 2> min and maxpooling
# 3> edge detection like: sobels, DoG, etc
# 4> blurring filters
# 5> some morphological op like: dilation and erosion
# 6> histogram equilization
# 7> some basic func: like log and gamma function
# 8> also fun stuff like: ascii art and "coloured" ascii art
I am planning to a

In [6]:
def make_parents(root, fix_existing=True, verbose=False):
    """
    Walk the tree rooted at `root` and make each child point to its parent.

    Args:
        root: CodeNode — root of the tree
        fix_existing (bool): if True, overwrite incorrect parent pointers;
                             if False, only set parent when child.parent is None.
        verbose (bool): if True, print fixes as they are applied.

    Returns:
        dict with:
            - fixed_count: number of parent pointers created/changed
            - changed_nodes: list of node_id strings that were changed (trimmed if large)
            - total_nodes: total nodes visited
    """
    def nid(n):
        return f"{getattr(n, 'node_id', id(n))}::{getattr(n, 'title', '<no-title>')}"

    if root is None:
        return {"fixed_count": 0, "changed_nodes": [], "total_nodes": 0}

    root.parent = None  # root should not have a parent
    fixed = 0
    changed = []
    total = 0

    stack = [root]
    while stack:
        parent = stack.pop()
        total += 1
        children = parent.nodes or []
        for child in children:
            # if we should set/overwrite parent
            if child.parent is None:
                child.parent = parent
                fixed += 1
                changed.append(nid(child))
                if verbose:
                    print(f"Set parent of {nid(child)} -> {nid(parent)}")
            elif child.parent is not parent and fix_existing:
                old = nid(child.parent)
                child.parent = parent
                fixed += 1
                changed.append(nid(child))
                if verbose:
                    print(f"Re-pointed parent of {nid(child)}: {old} -> {nid(parent)}")
            # push child to stack to fix its subtree
            stack.append(child)

    # keep changed list short for big trees
    changed_preview = changed if len(changed) <= 100 else changed[:100] + ["..."]

    return {"fixed_count": fixed, "changed_nodes": changed_preview, "total_nodes": total}

In [7]:
make_parents(tree)

{'fixed_count': 79,
 'changed_nodes': ['0001::image_processing',
  '0011::multi_lang_tree.json',
  '0002::README.md',
  '0003::image_processing',
  '0004::ascii.txt',
  '0005::ascii_2char_render.txt',
  '0006::bmp_img_processor.c',
  '0007::bmp_img_processor.h',
  '0008::images',
  '0009::test.c',
  '0010::test2.c',
  'func_63::exp_kernel',
  'func_64::exp_pool',
  'func_65::concatIMGA',
  'func_66::renderAsciiArt_char_render_twice_my',
  'func_67::renderAsciiArt_terminal_my',
  'func_68::main',
  'func_59::logp',
  'func_60::gammafunc',
  'func_61::contraE',
  'func_62::main',
  'func_1::errMsg',
  'func_2::isBMP',
  'func_3::getBMPHeader',
  'func_4::getDIBHeader',
  'func_5::isValidFormat',
  'func_6::getImage',
  'func_7::greyScale',
  'func_8::bwImg',
  'func_9::renderAsciiArt',
  'func_10::renderAsciiArt_space',
  'func_11::renderAsciiArt_char_render_twice',
  'func_12::renderAsciiArt_color',
  'func_13::render_terminal',
  'func_14::freeIMG',
  'func_15::createIMG',
  'func_16::

In [7]:
!pip install ollama



In [8]:
import os
import ollama
import concurrent.futures

MODEL_NAME = "qwen3:1.7b"

def find_function_nodes_in_code_files(tree):
    code_exts = {".c", ".cpp", ".cc", ".py", ".java", ".h"}
    function_nodes = []

    def traverse(node):
        node_type = getattr(node, "type", "")
        node_path = getattr(node, "path", "")
        nodes = getattr(node, "nodes", [])

        if node_type.startswith("file_"):
            _, ext = os.path.splitext(node_path)
            if ext.lower() in code_exts:
                for child in nodes:
                    if getattr(child, "type", "") == "function":
                        function_nodes.append(child)

        for child in nodes:
            traverse(child)

    traverse(tree)
    print(f"🧩 Found {len(function_nodes)} function nodes in all code files.")
    return function_nodes


def summarize_function_node(node):
    try:
        with open(node.path, "r", encoding="utf-8", errors="ignore") as f:
            lines = f.readlines()
        code_segment = "".join(lines[node.start_line: node.end_line + 1])
    except Exception as e:
        return f"[File read error: {e}]"

    if not code_segment.strip():
        return "[Empty function code]"

    prompt = (
        f"Summarize the following C function in 2–3 concise lines "
        f"covering what it does and how:\n\n{code_segment[:1500]}"
    )

    try:
        response = ollama.chat(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a C code summarizer."},
                {"role": "user", "content": prompt}
            ],
            options={"temperature": 0.4}
        )
        summary = response["message"]["content"].strip()
        print(f"✅ Finished {node.title}")
        return summary
    except Exception as e:
        return f"[Ollama error: {e}]"


def summarize_code_functions_parallel(tree, max_workers=4):
    function_nodes = find_function_nodes_in_code_files(tree)

    if not function_nodes:
        print("⚠️ No function nodes detected — check your tree structure.")
        return tree

    print(f"🚀 Starting parallel summarization for {len(function_nodes)} functions...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Schedule the function executions
        futures = {executor.submit(summarize_function_node, node): node for node in function_nodes}

        # As each future completes, assign its result
        for future in concurrent.futures.as_completed(futures):
            node = futures[future]
            try:
               node.summary = future.result()
            except Exception as e:
                node.summary = f"[Thread error: {e}]"
            print(f"🔍 {node.title} — summary saved ✅")

    print("✅ Completed all function summaries in parallel.")
    return tree


In [9]:
summarize_code_functions_parallel(tree)


🧩 Found 68 function nodes in all code files.
🚀 Starting parallel summarization for 68 functions...
✅ Finished errMsg
🔍 errMsg — summary saved ✅
✅ Finished getBMPHeader
🔍 getBMPHeader — summary saved ✅
✅ Finished isBMP
🔍 isBMP — summary saved ✅
✅ Finished getDIBHeader
🔍 getDIBHeader — summary saved ✅
✅ Finished isValidFormat
🔍 isValidFormat — summary saved ✅
✅ Finished getImage
🔍 getImage — summary saved ✅
✅ Finished greyScale
🔍 greyScale — summary saved ✅
✅ Finished bwImg
🔍 bwImg — summary saved ✅
✅ Finished renderAsciiArt
🔍 renderAsciiArt — summary saved ✅
✅ Finished renderAsciiArt_space
🔍 renderAsciiArt_space — summary saved ✅
✅ Finished renderAsciiArt_char_render_twice
🔍 renderAsciiArt_char_render_twice — summary saved ✅
✅ Finished renderAsciiArt_color
🔍 renderAsciiArt_color — summary saved ✅
✅ Finished render_terminal
🔍 render_terminal — summary saved ✅
✅ Finished freeIMG
🔍 freeIMG — summary saved ✅
✅ Finished createIMG
🔍 createIMG — summary saved ✅
✅ Finished edgeDetect_gradient
🔍

<__main__.CodeNode at 0x7f3c466525a0>

In [15]:
def print_tree(node, depth=0):
    print("  " * depth + f"{node.title}: {node.summary}\n")
    for child in getattr(node, "nodes", []):
        print_tree(child, depth + 1)

print_tree(tree)


demo-sum: The `bmp_img_processor.c` validates BMP files, extracts headers, and processes pixel data with error checks. `test.c` and `test2.c` handle image normalization (logarithmic scaling, contrast adjustment) and pixel conversion (convolution, max pooling, RGB-weighted ASCII art).

  image_processing: The `bmp_img_processor.c` validates BMP files, extracts headers, and processes pixel data with error checks. `test.c` and `test2.c` handle image normalization (logarithmic scaling, contrast adjustment) and pixel conversion (convolution/max pooling, RGB-weighted ASCII art).

    README.md: 

    image_processing: The `bmp_img_processor.c` handles BMP file validation, header extraction, and pixel data processing with error checks. `test.c` implements logarithmic/gamma scaling, contrast adjustment, and ASCII art rendering for image normalization. `test2.c` performs convolution/max pooling, image concatenation, and RGB-weighted ASCII rendering for pixel data conversion.

      ascii.txt: 


In [23]:
MODEL_NAME = "qwen3:8b"

def generate_parent_summary(node, max_context=1500):
    """
    Recursively generate summaries for non-leaf nodes,
    using concatenated summaries of immediate children as context.
    """
    # Base case: leaf node, assume summary already generated.
    if not getattr(node, "nodes", []):
        return getattr(node, "summary", "")

    # Recursively summarize all children first
    child_summaries = []
    for child in node.nodes:
        child_summary = generate_parent_summary(child, max_context)
        if child_summary:
            child_summaries.append(f"{child.title}: {child_summary}")

    if not child_summaries:
        return ""

    # Limit context length (roughly)
    context_text = "\n".join(child_summaries)
    context_text = context_text[:max_context]

    prompt = f"Read the child summaries carefully.Generate a summary that captures the key ideas, themes, or results of the section into 2-3 lines .If content is highly technical, preserve key terms or equations, but stay concise.Write in **neutral, factual tone** suitable for embedding and retrieval.:\n{context_text}"

    try:
        response = ollama.chat(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are an expert summarizer helping to build a hierarchical knowledge index.Your task is to summarize the given node's content concisely and factually"},
                {"role": "user", "content": prompt}
            ],
            options={"temperature": 0.4,
                     "top_p": 0.9}
        )
        summary = response["message"]["content"].strip()
    except Exception as e:
        summary = f"[Ollama error generating summary: {e}]"

    # Store summary in current node
    node.summary = summary

    return summary


In [24]:
generate_parent_summary(tree)

'The `bmp_img_processor.c` module manages BMP image processing, including format validation, header parsing, pixel data extraction, and memory management. `test.c` implements pipelines for logarithmic scaling, gamma correction, contrast adjustment, and 3x3 dilation, generating bitmap files and ASCII art. `test2.c` focuses on convolution, max pooling, image concatenation, and ASCII art via RGB-to-character mapping, with explicit memory management and operation chaining.'

In [None]:
def print_tree(node, depth=0):      #with top_p as 0.9
    print("  " * depth + f"{node.title}: {node.summary}\n")
    for child in getattr(node, "nodes", []):
        print_tree(child, depth + 1)

print_tree(tree)

demo-sum: The `bmp_img_processor.c` module manages BMP image processing, including format validation, header parsing, pixel data extraction, and memory management. `test.c` implements pipelines for logarithmic scaling, gamma correction, contrast adjustment, and 3x3 dilation, generating bitmap files and ASCII art. `test2.c` focuses on convolution, max pooling, image concatenation, and ASCII art via RGB-to-character mapping, with explicit memory management and operation chaining.

  image_processing: The `bmp_img_processor.c` module handles BMP image processing tasks including format validation, header parsing, pixel data extraction, and memory management. `test.c` implements image processing pipelines for logarithmic scaling, gamma correction, contrast adjustment, and 3x3 dilation, saving outputs as bitmap files and ASCII art. `test2.c` focuses on convolution, max pooling, image concatenation, and ASCII art generation via RGB-to-character mapping, with explicit memory management and ope

In [None]:
def print_tree(node, depth=0):          #lesser creative no top_p
    print("  " * depth + f"{node.title}: {node.summary}\n")
    for child in getattr(node, "nodes", []):
        print_tree(child, depth + 1)

print_tree(tree)


demo-sum: The `bmp_img_processor.c` module manages BMP file processing tasks such as signature validation, header parsing, and pixel data extraction. `test.c` implements logarithmic scaling, gamma correction, and 3x3 dilation to generate ASCII art from images. `test2.c` employs convolution/max pooling, memory-efficient concatenation, and RGB-to-character conversion via weighted averaging for ASCII output.

  image_processing: The `bmp_img_processor.c` module handles BMP file processing, including signature validation, header parsing, format checks, and pixel data extraction. `test.c` applies logarithmic scaling, gamma correction, and 3x3 dilation to generate ASCII art from images. `test2.c` performs convolution/max pooling, memory-efficient image concatenation, and RGB-to-character conversion via weighted averaging for ASCII output.

    README.md: 

    image_processing: The `bmp_img_processor.c` module implements BMP file processing functions, including signature validation ("BM" che

In [26]:
print(json.dumps(tree.to_dict(), indent=2))

{
  "title": "demo-sum",
  "node_id": "0000",
  "type": "repository",
  "path": "/home/sid/demo-sum",
  "summary": "The `bmp_img_processor.c` module manages BMP image processing, including format validation, header parsing, pixel data extraction, and memory management. `test.c` implements pipelines for logarithmic scaling, gamma correction, contrast adjustment, and 3x3 dilation, generating bitmap files and ASCII art. `test2.c` focuses on convolution, max pooling, image concatenation, and ASCII art via RGB-to-character mapping, with explicit memory management and operation chaining.",
  "nodes": [
    {
      "title": "image_processing",
      "node_id": "0001",
      "type": "folder",
      "path": "/home/sid/demo-sum/image_processing",
      "summary": "The `bmp_img_processor.c` module handles BMP image processing tasks including format validation, header parsing, pixel data extraction, and memory management. `test.c` implements image processing pipelines for logarithmic scaling, gamm