# Initializing the data folder

In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Determine the base directory relative to the file's path or current working directory
current_dir = Path(__file__).resolve().parent if '__file__' in locals() else Path.cwd()

# Adjust the path to point to the supporting_files/data/.env file
supporting_files = os.path.join(current_dir.parents[1], "00-supporting-files")

In [2]:
supporting_files

'/Users/progressedd/personal-projects/glma/00-supporting-files'

In [3]:
linux_dir = os.path.join(supporting_files, "data", "linux-kernel")
ag2_dir = os.path.join(supporting_files, "data", "ag2-framework")

## Getting a tree printout of the repository

In [100]:
from pathlib import Path
import os

def get_directory_tree(directory_path, level=0, prefix='', max_depth=None):
    path = Path(directory_path)
    
    # Early return if we've reached max depth
    if max_depth is not None and level >= max_depth:
        # Count files and folders at this level
        items = list(path.iterdir())
        folders = [item for item in items if item.is_dir()]
        files = [item for item in items if item.is_file()]
        
        # Count file types
        file_types = {}
        for file in files:
            ext = file.suffix.lower() or 'no extension'
            file_type = ext[1:] if ext.startswith('.') else ext
            file_types[file_type] = file_types.get(file_type, 0) + 1
        
        # Build summary string
        summary_parts = []
        if folders:
            summary_parts.append(f"{len(folders)} folder{'s' if len(folders) > 1 else ''}")
        
        for file_type, count in sorted(file_types.items(), key=lambda x: x[1], reverse=True):
            summary_parts.append(f"{count} {file_type}{'s' if count > 1 else ''}")
        
        summary = ", ".join(summary_parts)
        return [f"{prefix}├── ... {len(items)} more items: {summary}"]
    
    result = []
    
    # Add directory name for non-root directories
    if level > 0:
        connector = "└──" if "└" in prefix else "├──"
        result.append(f"{prefix}{connector} {path.name}/")
    
    # Process all files and subdirectories
    items = sorted(path.iterdir(), key=lambda x: (x.is_file(), x.name))
    
    for i, item in enumerate(items):
        is_last = (i == len(items) - 1)
        
        # Choose the appropriate prefix for the next level
        if is_last:
            next_prefix = prefix + "    "  # space after last item
            current_connector = "└──"
        else:
            next_prefix = prefix + "│   "  # vertical line for items that have siblings
            current_connector = "├──"
        
        if item.is_file():
            result.append(f"{prefix}{current_connector} {item.name}")
        else:
            # For directories, recursively process
            child_prefix = next_prefix
            child_tree = get_directory_tree(item, level+1, child_prefix, max_depth)
            result.extend(child_tree)
    
    return result

# Example usage
def print_tree(directory, max_depth=None):
    tree = get_directory_tree(directory, max_depth=max_depth)
    for line in tree:
        print(line)

# Use with your supporting_files path
print_tree(current_dir.parents[1], 2)

# Or with depth limit
# print_tree(supporting_files, max_depth=2)

│   ├── .git/
│   │   ├── ... 13 more items: 13 samples
│   │   ├── ... 1 more items: 1 no extension
│   │   ├── ... 2 more items: 1 folder, 1 no extension
│   │   ├── ... 2 more items: 2 folders
│   │   ├── ... 32 more items: 32 folders
│   │   ├── ... 3 more items: 3 folders
│   ├── COMMIT_EDITMSG
│   ├── HEAD
│   ├── config
│   ├── description
│   ├── index
│   └── packed-refs
│   ├── .venv/
│   │   ├── ... 30 more items: 21 no extensions, 3 bats, 1 ps1, 1 fish, 1 py, 1 nu, 1 13, 1 csh
│   │   ├── ... 1 more items: 1 folder
│   │   ├── ... 1 more items: 1 folder
│   │   ├── ... 2 more items: 2 folders
│   ├── .gitignore
│   ├── CACHEDIR.TAG
│   └── pyvenv.cfg
│   ├── 00-dev-log/
│   └── 00-template.md
│   ├── 00-supporting-files/
│   │   ├── ... 2 more items: 2 folders
│       ├── ... 0 more items: 
│   ├── 01-dev-onboarding/
│   │   ├── ... 2 more items: 2 mds
│   │   ├── ... 3 more items: 2 mds, 1 code-profile
│   │   ├── ... 4 more items: 2 tomls, 1 md, 1 lock
│   │   ├── ... 1 m

In [101]:
print_tree(linux_dir, 3)

│   ├── Documentation/
│   │   ├── ABI/
│   │   │   ├── ... 22 more items: 22 no extensions
│   │   │   ├── ... 16 more items: 16 no extensions
│   │   │   ├── ... 48 more items: 48 no extensions
│   │   │   ├── ... 564 more items: 564 no extensions
│   │   └── README
│   │   ├── PCI/
│   │   │   ├── ... 2 more items: 2 rsts
│   │   │   ├── ... 11 more items: 1 folder, 10 rsts
│   │   ├── acpi-info.rst
│   │   ├── boot-interrupts.rst
│   │   ├── index.rst
│   │   ├── msi-howto.rst
│   │   ├── pci-error-recovery.rst
│   │   ├── pci-iov-howto.rst
│   │   ├── pci.rst
│   │   ├── pcieaer-howto.rst
│   │   ├── pciebus-howto.rst
│   │   ├── sysfs-pci.rst
│   │   └── tph.rst
│   │   ├── RCU/
│   │   │   ├── ... 4 more items: 4 folders
│   │   ├── NMI-RCU.rst
│   │   ├── RTFP.txt
│   │   ├── UP.rst
│   │   ├── checklist.rst
│   │   ├── index.rst
│   │   ├── listRCU.rst
│   │   ├── lockdep-splat.rst
│   │   ├── lockdep.rst
│   │   ├── rcu.rst
│   │   ├── rcu_dereference.rst
│   │   ├── rcubarri

In [6]:
files = [file.name for file in Path(supporting_files).iterdir()]
files

['images', 'data']

# Loading environment files

In [2]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Determine the base directory relative to the file's path or current working directory
current_dir = Path(__file__).resolve().parent if '__file__' in locals() else Path.cwd()

# Adjust the path to point to the supporting_files/data/.env file
env_file_path = os.path.join(current_dir.parents[1], "00-supporting-files", "data", ".env")
load_dotenv(env_file_path)

False

# Loading files as nodes in a graph

In [4]:
import os
from pathlib import Path
import kuzu

# ─── 1. Paths ────────────────────────────────────────────────────────────────────
# Ensure `supporting_files` is defined, e.g.:
# supporting_files = Path("/absolute/path/to/your/project")
current_dir = Path(__file__).resolve().parent if "__file__" in locals() else Path.cwd()
supporting_files = current_dir.parents[1]  # adjust if needed
linux_fp = os.path.join(supporting_files, "data", "linux-kernel")

# ─── 2. Kùzu DB Setup ─────────────────────────────────────────────────────────────
db_dir   = os.path.join(supporting_files, "data", "kuzu_db")
os.makedirs(db_dir, exist_ok=True)

db   = kuzu.Database(str(db_dir))
conn = kuzu.Connection(db)

# ─── 3. Define Schema (idempotent) ───────────────────────────────────────────────
conn.execute("""
CREATE NODE TABLE IF NOT EXISTS Dir (
    path STRING,
    name STRING,
    PRIMARY KEY(path)
);
""")  # :contentReference[oaicite:0]{index=0}

conn.execute("""
CREATE NODE TABLE IF NOT EXISTS File (
    path STRING,
    name STRING,
    ext STRING,
    size INT64,
    PRIMARY KEY(path)
);
""")  # :contentReference[oaicite:1]{index=1}

conn.execute("""
CREATE REL TABLE IF NOT EXISTS SUBDIR(FROM Dir TO Dir);
""")  # :contentReference[oaicite:2]{index=2}

conn.execute("""
CREATE REL TABLE IF NOT EXISTS CONTAINS(FROM Dir TO File);
""")  # :contentReference[oaicite:3]{index=3}

# ─── 4. Indexing Function (idempotent via MERGE) ─────────────────────────────────
def index_repo(root_path: Path):
    root_str = str(root_path)
    for dirpath, dirnames, filenames in os.walk(root_path):
        dir_str  = str(Path(dirpath))
        dir_name = Path(dirpath).name

        # 1) MERGE the Dir node
        conn.execute(f"""
        MERGE (d:Dir {{path: '{dir_str}'}})
        ON CREATE SET d.name = '{dir_name}';
        """)  # :contentReference[oaicite:4]{index=4}

        # 2) MERGE SUBDIR relationship to parent
        parent = Path(dirpath).parent
        parent_str = str(parent)
        if parent_str.startswith(root_str):
            conn.execute(f"""
            MATCH (p:Dir {{path: '{parent_str}'}}), (c:Dir {{path: '{dir_str}'}})
            MERGE (p)-[:SUBDIR]->(c);
            """)  # :contentReference[oaicite:5]{index=5}

        # 3) For each file, MERGE File node then CONTAINS edge
        for fname in filenames:
            fpath      = Path(dirpath) / fname
            fpath_str  = str(fpath)
            ext        = fpath.suffix or ""
            size       = fpath.stat().st_size

            # a) MERGE File node
            conn.execute(f"""
            MERGE (f:File {{path: '{fpath_str}'}})
            ON CREATE SET f.name = '{fname}', f.ext = '{ext}', f.size = {size}
            ON MATCH      SET f.size = {size};
            """)  # :contentReference[oaicite:6]{index=6}

            # b) MERGE CONTAINS relationship
            conn.execute(f"""
            MATCH (d:Dir {{path: '{dir_str}'}}), (f:File {{path: '{fpath_str}'}})
            MERGE (d)-[:CONTAINS]->(f);
            """)  # :contentReference[oaicite:7]{index=7}

# ─── 5. Run Indexing ────────────────────────────────────────────────────────────
print(f"Indexing Linux kernel repo at: {linux_fp}")
index_repo(linux_fp)
print("Indexing complete!")


Indexing Linux kernel repo at: /Users/progressedd/personal-projects/glma/data/linux-kernel
Indexing complete!


In [13]:
import os
from pathlib import Path
import ast
import kuzu

# ─── Paths ───────────────────────────────────────────────────────────────────────
current_dir   = Path(__file__).resolve().parent if "__file__" in locals() else Path.cwd()
supporting    = os.path.join(current_dir.parents[1], "00-supporting-files")
linux_fp      = os.path.join(supporting, "data", "linux-kernel")
db_dir        = os.path.join(supporting, "data", "kuzu_db")
os.makedirs(db_dir, exist_ok=True)

# ─── Kùzu DB Setup ───────────────────────────────────────────────────────────────
db   = kuzu.Database(str(db_dir))
conn = kuzu.Connection(db)

# ─── 1) Define / ensure schema ────────────────────────────────────────────────────
conn.execute("""
CREATE NODE TABLE IF NOT EXISTS Dir (
    path STRING, name STRING,
    PRIMARY KEY(path)
);
""")
conn.execute("""
CREATE NODE TABLE IF NOT EXISTS File (
    path STRING, name STRING, ext STRING, size INT64,
    PRIMARY KEY(path)
);
""")
conn.execute("CREATE REL TABLE IF NOT EXISTS SUBDIR(FROM Dir TO Dir);")
conn.execute("CREATE REL TABLE IF NOT EXISTS CONTAINS(FROM Dir TO File);")

# new tables for functions & variables
conn.execute("""
CREATE NODE TABLE IF NOT EXISTS Function (
    id STRING, name STRING, filePath STRING,
    PRIMARY KEY(id)
);
""")
conn.execute("""
CREATE NODE TABLE IF NOT EXISTS Variable (
    id STRING, name STRING, filePath STRING,
    PRIMARY KEY(id)
);
""")
conn.execute("CREATE REL TABLE IF NOT EXISTS HAS_FUNCTION(FROM File TO Function);")
conn.execute("CREATE REL TABLE IF NOT EXISTS HAS_VARIABLE(FROM File TO Variable);")

# ─── 2) Helper: parse Python file for defs & assignments ─────────────────────────
def extract_py_symbols(py_path: Path):
    """
    Returns two lists of names: (functions, variables)
    - functions: all top-level `def` names
    - variables: all top-level assignment targets
    """
    src = py_path.read_text(encoding="utf8", errors="ignore")
    tree = ast.parse(src)
    funcs = []
    vars_ = []
    for node in tree.body:
        if isinstance(node, ast.FunctionDef):
            funcs.append(node.name)
        elif isinstance(node, ast.Assign):
            # only simple names, skip destructuring
            for target in node.targets:
                if isinstance(target, ast.Name):
                    vars_.append(target.id)
    return funcs, vars_

# ─── 3) Indexing function ────────────────────────────────────────────────────────
def index_repo(root_path: Path):
    root_str = str(root_path)
    for dirpath, dirnames, filenames in os.walk(root_path):
        dir_str  = str(dirpath)
        dir_name = Path(dirpath).name

        # MERGE Dir node
        conn.execute(f"""
        MERGE (d:Dir {{path: '{dir_str}'}})
        ON CREATE SET d.name = '{dir_name}';
        """)

        # MERGE SUBDIR relation
        parent = Path(dirpath).parent
        pstr   = str(parent)
        if pstr.startswith(root_str):
            conn.execute(f"""
            MATCH (p:Dir {{path: '{pstr}'}}), (c:Dir {{path: '{dir_str}'}})
            MERGE (p)-[:SUBDIR]->(c);
            """)

        # For each file, MERGE File node then CONTAINS edge
        for fname in filenames:
            fpath    = Path(dirpath) / fname
            fpath_str= str(fpath)
            size     = fpath.stat().st_size
            ext      = fpath.suffix.lower()

            # a) MERGE File node
            conn.execute(f"""
            MERGE (f:File {{path: '{fpath_str}'}})
            ON CREATE SET f.name = '{fname}', f.ext = '{ext}', f.size = {size}
            ON MATCH      SET f.size = {size};
            """)

            # b) MERGE CONTAINS
            conn.execute(f"""
            MATCH (d:Dir {{path: '{dir_str}'}}), (f:File {{path: '{fpath_str}'}})
            MERGE (d)-[:CONTAINS]->(f);
            """)

            # c) If Python, extract symbols & create Function/Variable nodes + edges
            if ext == '.py':
                funcs, vars_ = extract_py_symbols(fpath)
                for name in funcs:
                    uid = f"{fpath_str}::func::{name}"
                    conn.execute(f"""
                    MERGE (g:Function {{id: '{uid}'}})
                    ON CREATE SET g.name = '{name}', g.filePath = '{fpath_str}';
                    """)
                    conn.execute(f"""
                    MATCH (f:File {{path: '{fpath_str}'}}), (g:Function {{id: '{uid}'}})
                    MERGE (f)-[:HAS_FUNCTION]->(g);
                    """)
                for name in vars_:
                    vid = f"{fpath_str}::var::{name}"
                    conn.execute(f"""
                    MERGE (v:Variable {{id: '{vid}'}})
                    ON CREATE SET v.name = '{name}', v.filePath = '{fpath_str}';
                    """)
                    conn.execute(f"""
                    MATCH (f:File {{path: '{fpath_str}'}}), (v:Variable {{id: '{vid}'}})
                    MERGE (f)-[:HAS_VARIABLE]->(v);
                    """)

# ─── 4) Run it ──────────────────────────────────────────────────────────────────
print(f"Indexing repo at: {linux_fp}")
index_repo(linux_fp)
print("Done indexing!")


Indexing repo at: /Users/progressedd/personal-projects/glma/00-supporting-files/data/linux-kernel




Done indexing!


In [55]:
# All node labels and how many you have of each
q = """
MATCH (n) 
RETURN DISTINCT labels(n)[0]          AS label,
       COUNT(*)                       AS cnt
ORDER BY cnt DESC;
"""
res = conn.execute(q)



In [56]:
res.get_schema()

{'label': 'STRING', 'cnt': 'INT64'}

In [62]:
res.get_as_df()

Unnamed: 0,label,cnt
0,,98429


In [75]:
q = """
MATCH (d:Dir)-[:CONTAINS]->(f:File)
RETURN d.path AS dir, f.name AS file, f.size
ORDER BY dir, file
"""
res = conn.execute(q)

In [76]:
df_request = res.get_as_df()
df_request

Unnamed: 0,dir,file,f.size
0,/Users/progressedd/personal-projects/glma/00-s...,.clang-format,24229
1,/Users/progressedd/personal-projects/glma/00-s...,.clippy.toml,374
2,/Users/progressedd/personal-projects/glma/00-s...,.cocciconfig,59
3,/Users/progressedd/personal-projects/glma/00-s...,.editorconfig,575
4,/Users/progressedd/personal-projects/glma/00-s...,.get_maintainer.ignore,229
...,...,...,...
89855,/Users/progressedd/personal-projects/glma/00-s...,vfio.c,7932
89856,/Users/progressedd/personal-projects/glma/00-s...,vfio.h,289
89857,/Users/progressedd/personal-projects/glma/00-s...,Kconfig,76
89858,/Users/progressedd/personal-projects/glma/00-s...,Makefile,88


In [81]:
df_request.columns

Index(['dir', 'file', 'f.size'], dtype='object')

In [90]:
nodes = []
for label in ["Dir", "File", "Function", "Variable"]:
    q = f"""
    MATCH (n:{label})
    RETURN n
    """
    res = conn.execute(q)
    while res.has_next():
        # Each row is a list with one element: the node dict
        row = res.get_next()
        node_dict = row[0]  # because RETURN n gives a single dict in a list
        node_dict['label'] = label
        nodes.append(node_dict)


In [95]:
nodes

[{'_id': {'offset': 0, 'table': 0},
  '_label': 'Dir',
  'path': '/Users/progressedd/personal-projects/glma/00-supporting-files/data/linux-kernel',
  'name': 'linux-kernel',
  'label': 'Dir'},
 {'_id': {'offset': 1, 'table': 0},
  '_label': 'Dir',
  'path': '/Users/progressedd/personal-projects/glma/00-supporting-files/data/linux-kernel/init',
  'name': 'init',
  'label': 'Dir'},
 {'_id': {'offset': 2, 'table': 0},
  '_label': 'Dir',
  'path': '/Users/progressedd/personal-projects/glma/00-supporting-files/data/linux-kernel/crypto',
  'name': 'crypto',
  'label': 'Dir'},
 {'_id': {'offset': 3, 'table': 0},
  '_label': 'Dir',
  'path': '/Users/progressedd/personal-projects/glma/00-supporting-files/data/linux-kernel/crypto/asymmetric_keys',
  'name': 'asymmetric_keys',
  'label': 'Dir'},
 {'_id': {'offset': 4, 'table': 0},
  '_label': 'Dir',
  'path': '/Users/progressedd/personal-projects/glma/00-supporting-files/data/linux-kernel/crypto/async_tx',
  'name': 'async_tx',
  'label': 'Dir'},

In [93]:
import pandas as pd
df_nodes = pd.DataFrame(nodes)
df_nodes.head()


Unnamed: 0,_id,_label,path,name,label,ext,size,id,filePath
0,"{'offset': 0, 'table': 0}",Dir,/Users/progressedd/personal-projects/glma/00-s...,linux-kernel,Dir,,,,
1,"{'offset': 1, 'table': 0}",Dir,/Users/progressedd/personal-projects/glma/00-s...,init,Dir,,,,
2,"{'offset': 2, 'table': 0}",Dir,/Users/progressedd/personal-projects/glma/00-s...,crypto,Dir,,,,
3,"{'offset': 3, 'table': 0}",Dir,/Users/progressedd/personal-projects/glma/00-s...,asymmetric_keys,Dir,,,,
4,"{'offset': 4, 'table': 0}",Dir,/Users/progressedd/personal-projects/glma/00-s...,async_tx,Dir,,,,


In [94]:
import networkx as nx

G = nx.DiGraph()
for node in nodes:
    G.add_node(node['path'], name=node['name'], label=node['label'])


KeyError: 'path'

# Another attempt

## Crawling the repository

In [2]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Determine the base directory relative to the file's path or current working directory
current_dir = Path(__file__).resolve().parent if '__file__' in locals() else Path.cwd()

# Adjust the path to point to the supporting_files/data/.env file
supporting_files = os.path.join(current_dir.parents[1], "00-supporting-files")
linux_dir = os.path.join(supporting_files, "data", "linux-kernel")
ag2_dir = os.path.join(supporting_files, "data", "ag2-framework")

In [3]:
import os

def crawl_fs(root):
    """
    Recursively crawl folders/files from root.
    Returns: nodes (list of dict), edges (list of dict)
    """
    nodes = []
    edges = []

    def add_dir(path, parent=None):
        node = {
            "type": "Dir",
            "path": path,
            "name": os.path.basename(path)
        }
        nodes.append(node)
        if parent:
            edges.append({"from": parent, "to": path, "type": "SUBDIR"})
        # Recurse
        for entry in sorted(os.listdir(path)):
            full_path = os.path.join(path, entry)
            if os.path.isdir(full_path):
                add_dir(full_path, path)
            elif os.path.isfile(full_path):
                add_file(full_path, path)

    def add_file(path, parent):
        with open(path, "r", errors="ignore") as f:
            content = f.read()
        node = {
            "type": "File",
            "path": path,
            "name": os.path.basename(path),
            "content": content[:10000]  # Limit content for demo
        }
        nodes.append(node)
        edges.append({"from": parent, "to": path, "type": "CONTAINS"})

    add_dir(root)
    return nodes, edges

In [None]:
# Example usage
nodes, edges = crawl_fs(ag2_dir)

In [4]:
# Example usage
nodes, edges = crawl_fs(linux_dir)

## Crawling and chunking the repository

In [52]:
import tree_sitter_python as tspython
import tree_sitter_c as tsc
import tree_sitter_cpp as tscpp
from tree_sitter import Language, Parser

PY_LANGUAGE = Language(tspython.language()) 
C_LANGUAGE = Language(tsc.language())
CPP_LANGUAGE = Language(tscpp.language())

parser = Parser()

def crawl_fs(path):
    print()

### Exploring the tree-split library

In [None]:
for item in os.walk(linux_dir):
    print(item)
    # break

('/Users/progressedd/personal-projects/glma/00-supporting-files/data/linux-kernel', ['init', 'crypto', 'Documentation', 'usr', 'tools', 'net', 'drivers', 'LICENSES', 'security', 'include', 'rust', 'virt', 'samples', 'certs', 'arch', 'scripts', 'lib', 'mm', 'ipc', 'io_uring', 'fs', 'sound', 'kernel', 'block'], ['.pylintrc', '.get_maintainer.ignore', '.cocciconfig', 'Makefile', 'Kconfig', 'Kbuild', 'MAINTAINERS', 'README', '.editorconfig', '.clippy.toml', 'COPYING', '.gitignore', '.clang-format', '.gitattributes', '.rustfmt.toml', '.mailmap', '.git', 'CREDITS'])
('/Users/progressedd/personal-projects/glma/00-supporting-files/data/linux-kernel/init', [], ['initramfs_internal.h', 'do_mounts_rd.c', 'do_mounts.c', 'do_mounts_initrd.c', 'Makefile', 'initramfs.c', 'initramfs_test.c', 'version-timestamp.c', 'Kconfig', '.kunitconfig', 'init_task.c', 'main.c', '.gitignore', 'calibrate.c', 'do_mounts.h', 'version.c', 'noinitramfs.c'])
('/Users/progressedd/personal-projects/glma/00-supporting-files

In [4]:
from pathlib import Path

linux_dir = Path(supporting_files) / "data" / "linux-kernel"


In [5]:
example_file = linux_dir / "arch" / "alpha" / "boot" / "bootp.c"
example_file

PosixPath('/Users/progressedd/personal-projects/glma/00-supporting-files/data/linux-kernel/arch/alpha/boot/bootp.c')

In [53]:
tree = Parser(CPP_LANGUAGE).parse(example_file.read_bytes())
root = tree.root_node

In [None]:
for i, node in enumerate(tree.root_node.children):
    text = node.text.decode('utf-8')
    print(f"Node {i+1}/{len(tree.root_node.children)}:")
    print(f"  Type: {node.type}")
    print(f"  Start: {node.start_point}")
    print(f"  End: {node.end_point}")
    print(f"  Text: {text[:100]}{'...' if len(text) > 100 else ''}")
    print("-" * 40)
    
    if input("Press Enter to continue, or type 'q' to quit: ") == 'q':
        break

Node 1/28:
  Type: comment
  Start: Point(row=0, column=0)
  End: Point(row=0, column=35)
  Text: // SPDX-License-Identifier: GPL-2.0
----------------------------------------
Node 2/28:
  Type: comment
  Start: Point(row=1, column=0)
  End: Point(row=9, column=3)
  Text: /*
 * arch/alpha/boot/bootp.c
 *
 * Copyright (C) 1997 Jay Estabrook
 *
 * This file is used for cre...
----------------------------------------
Node 3/28:
  Type: preproc_include
  Start: Point(row=10, column=0)
  End: Point(row=11, column=0)
  Text: #include <linux/kernel.h>

----------------------------------------
Node 4/28:
  Type: preproc_include
  Start: Point(row=11, column=0)
  End: Point(row=12, column=0)
  Text: #include <linux/slab.h>

----------------------------------------
Node 5/28:
  Type: preproc_include
  Start: Point(row=12, column=0)
  End: Point(row=13, column=0)
  Text: #include <linux/string.h>

----------------------------------------
Node 6/28:
  Type: preproc_include
  Start: Point(row=13, co

In [23]:
root = tree.root_node

def token_count(text):
    # super-naïve: split on whitespace
    return len(text.decode('utf-8').split())

for node in root.children:
    txt = node.text
    count = token_count(txt)
    print(node.type, token_count(txt))
    
    sz = token_count(node.text)
    print(f"Node type: {node.type}, Size: {sz} tokens")

    start_row, start_col = node.start_point
    # convert to 1-indexed line number
    line_no = start_row + 1

    print(f"{node.type} (line {line_no}): {count} tokens")
    print("-" * 40)

    if node.type == "ERROR":
        snippet = node.text.decode("utf-8")
        print(f"ERROR node at {node.start_point}–{node.end_point}:")
        print(repr(snippet))
        print("-" * 40)


    if input("Press Enter to continue, or type 'q' to quit: ") == 'q':
        break

ERROR 2
Node type: ERROR, Size: 2 tokens
ERROR (line 1): 2 tokens
----------------------------------------
ERROR node at Point(row=0, column=0)–Point(row=0, column=27):
'// SPDX-License-Identifier:'
----------------------------------------
expression_statement 1
Node type: expression_statement, Size: 1 tokens
expression_statement (line 1): 1 tokens
----------------------------------------
ERROR 21
Node type: ERROR, Size: 21 tokens
ERROR (line 2): 21 tokens
----------------------------------------
ERROR node at Point(row=1, column=0)–Point(row=6, column=46):
'/*\n * arch/alpha/boot/bootp.c\n *\n * Copyright (C) 1997 Jay Estabrook\n *\n * This file is used for creating a bootp file'
----------------------------------------
for 1
Node type: for, Size: 1 tokens
for (line 7): 1 tokens
----------------------------------------
ERROR 1
Node type: ERROR, Size: 1 tokens
ERROR (line 7): 1 tokens
----------------------------------------
ERROR node at Point(row=6, column=51)–Point(row=6, column=54)

In [55]:
for node in root.children:
    txt = node.text.decode('utf-8')
    raw_lines = txt.splitlines()

    # drop empty lines at start/end
    lines = [l for l in raw_lines if l.strip()]
    if not lines:
        continue

    # compute line numbers
    start_line = node.start_point[0] + 1
    end_line   = node.end_point[0]   + 1

    # first/last content snippets
    first = lines[0].strip()
    last  = lines[-1].strip()
    first_snip = first if len(first) <= 30 else first[:30] + '...'
    last_snip  = last  if len(last)  <= 20 else last[:20]   + '...'

    # output
    print(node)
    print(txt)
    print("-" * 40)

    if input("Enter to continue, or 'q' to quit: ") == 'q':
        break


(comment)
// SPDX-License-Identifier: GPL-2.0
----------------------------------------
(comment)
/*
 * arch/alpha/boot/bootp.c
 *
 * Copyright (C) 1997 Jay Estabrook
 *
 * This file is used for creating a bootp file for the Linux/AXP kernel
 *
 * based significantly on the arch/alpha/boot/main.c of Linus Torvalds
 */
----------------------------------------
(preproc_include path: (system_lib_string))
#include <linux/kernel.h>

----------------------------------------
(preproc_include path: (system_lib_string))
#include <linux/slab.h>

----------------------------------------
(preproc_include path: (system_lib_string))
#include <linux/string.h>

----------------------------------------
(preproc_include path: (system_lib_string))
#include <generated/utsrelease.h>

----------------------------------------
(preproc_include path: (system_lib_string))
#include <linux/mm.h>

----------------------------------------
(preproc_include path: (system_lib_string))
#include <asm/console.h>

--------

## Loading into kuzu

### AG2

In [11]:
import kuzu
# ─── Paths ───────────────────────────────────────────────────────────────────────
current_dir   = Path(__file__).resolve().parent if "__file__" in locals() else Path.cwd()
supporting    = os.path.join(current_dir.parents[1], "00-supporting-files")
linux_fp      = os.path.join(supporting, "data", "linux-kernel")
db_dir        = os.path.join(supporting, "data", "kuzu_db", "ag2")
os.makedirs(db_dir, exist_ok=True)

# ─── Kùzu DB Setup ───────────────────────────────────────────────────────────────
db   = kuzu.Database(str(db_dir))
conn = kuzu.Connection(db)


In [14]:
# intialize tables
conn.execute("CREATE NODE TABLE Dir(name STRING, path STRING, PRIMARY KEY(path));")
conn.execute("CREATE NODE TABLE File(name STRING, path STRING, content STRING, PRIMARY KEY(path));")
conn.execute("CREATE REL TABLE SUBDIR(FROM Dir TO Dir);")
conn.execute("CREATE REL TABLE CONTAINS(FROM Dir TO File);")

<kuzu.query_result.QueryResult at 0x1069dfa80>

In [15]:
for node in nodes:
    if node["type"] == "Dir":
        conn.execute(
            "CREATE (:Dir {name: $name, path: $path});",
            parameters={"name": node["name"], "path": node["path"]}
        )
    elif node["type"] == "File":
        conn.execute(
            "CREATE (:File {name: $name, path: $path, content: $content});",
            parameters={"name": node["name"], "path": node["path"], "content": node["content"]}
        )

for edge in edges:
    if edge["type"] == "SUBDIR":
        conn.execute(
            """
            MATCH (a:Dir {path: $from}), (b:Dir {path: $to})
            CREATE (a)-[:SUBDIR]->(b);
            """,
            parameters={"from": edge["from"], "to": edge["to"]}
        )
    elif edge["type"] == "CONTAINS":
        conn.execute(
            """
            MATCH (a:Dir {path: $from}), (b:File {path: $to})
            CREATE (a)-[:CONTAINS]->(b);
            """,
            parameters={"from": edge["from"], "to": edge["to"]}
        )


#### Visualizations

##### Using pyviz

In [8]:
# Fetch Dir nodes
dirs = []
res = conn.execute("MATCH (d:Dir) RETURN d.name AS name, d.path AS path")
while res.has_next():
    row = res.get_next()
    dirs.append({"name": row[0], "path": row[1]})

# Fetch File nodes
files = []
res = conn.execute("MATCH (f:File) RETURN f.name AS name, f.path AS path")
while res.has_next():
    row = res.get_next()
    files.append({"name": row[0], "path": row[1]})

# Fetch SUBDIR edges (Dir→Dir)
subdirs = []
res = conn.execute("MATCH (a:Dir)-[:SUBDIR]->(b:Dir) RETURN a.path AS src, b.path AS tgt")
while res.has_next():
    row = res.get_next()
    subdirs.append({"src": row[0], "tgt": row[1]})

# Fetch CONTAINS edges (Dir→File)
contains = []
res = conn.execute("MATCH (a:Dir)-[:CONTAINS]->(b:File) RETURN a.path AS src, b.path AS tgt")
while res.has_next():
    row = res.get_next()
    contains.append({"src": row[0], "tgt": row[1]})


In [None]:
from pyvis.network import Network

# Limit number of nodes for demo (optional, prevents overload for huge graphs)
MAX_NODES = 200
displayed_nodes = set()
node_count = 0

net = Network(notebook=True, height="700px", width="100%", bgcolor="#222222", font_color="white")

# Add Dir nodes (blue boxes)
for d in dirs:
    if node_count >= MAX_NODES: break
    net.add_node(d['path'], label=d['name'], color="deepskyblue", shape="box")
    displayed_nodes.add(d['path'])
    node_count += 1

# Add File nodes (orange ellipses)
for f in files:
    if node_count >= MAX_NODES: break
    net.add_node(f['path'], label=f['name'], color="orange", shape="ellipse")
    displayed_nodes.add(f['path'])
    node_count += 1

# Add SUBDIR edges
for e in subdirs:
    if e['src'] in displayed_nodes and e['tgt'] in displayed_nodes:
        net.add_edge(e['src'], e['tgt'], title="SUBDIR")

# Add CONTAINS edges
for e in contains:
    if e['src'] in displayed_nodes and e['tgt'] in displayed_nodes:
        net.add_edge(e['src'], e['tgt'], title="CONTAINS")

# Show in notebook or save as HTML
net.show("ag2_graph.html")

#### Using yfiles

In [17]:
from yfiles_jupyter_graphs import GraphWidget
dirs_df = conn.execute("MATCH (d:Dir) RETURN d.name, d.path").get_as_df()
files_df = conn.execute("MATCH (f:File) RETURN f.name, f.path").get_as_df()
subdirs_df = conn.execute("MATCH (a:Dir)-[:SUBDIR]->(b:Dir) RETURN a.path, b.path").get_as_df()
contains_df = conn.execute("MATCH (a:Dir)-[:CONTAINS]->(b:File) RETURN a.path, b.path").get_as_df()

nodes = []
edges = []
node_ids = {}
index = 0

# Add Dir nodes
for _, row in dirs_df.iterrows():
    node_ids[f"Dir:{row['d.path']}"] = index
    nodes.append({"id": index, "properties": {"label": row['d.name'], "type": "Dir"}})
    index += 1

# Add File nodes
for _, row in files_df.iterrows():
    node_ids[f"File:{row['f.path']}"] = index
    nodes.append({"id": index, "properties": {"label": row['f.name'], "type": "File"}})
    index += 1

# Add SUBDIR edges (Dir to Dir)
for _, row in subdirs_df.iterrows():
    edges.append({
        "start": node_ids[f"Dir:{row['a.path']}"],
        "end": node_ids[f"Dir:{row['b.path']}"],
        "properties": {"label": "SUBDIR"}
    })

# Add CONTAINS edges (Dir to File)
for _, row in contains_df.iterrows():
    edges.append({
        "start": node_ids[f"Dir:{row['a.path']}"],
        "end": node_ids[f"File:{row['b.path']}"],
        "properties": {"label": "CONTAINS"}
    })

w = GraphWidget()
w.nodes = nodes
w.edges = edges

# Set custom node styles for type clarity
w.node_styles = {
    "Dir": {"color": "#2980b9", "shape": "rectangle"},
    "File": {"color": "#f39c12", "shape": "ellipse"},
}

w.set_graph_layout("hierarchic")  # or "organic" for a force-directed look
w.show()



GraphWidget(layout=Layout(height='800px', width='100%'))

In [4]:
dir_nodes = conn.execute("MATCH (d:Dir) RETURN d.path, d.name").get_as_df()
file_nodes = conn.execute("MATCH (f:File) RETURN f.path, f.name").get_as_df()
subdir_rels = conn.execute("MATCH (a:Dir)-[:SUBDIR]->(b:Dir) RETURN a.path, b.path").get_as_df()
contains_rels = conn.execute("MATCH (a:Dir)-[:CONTAINS]->(b:File) RETURN a.path, b.path").get_as_df()


In [None]:
nodes = []
node_ids = {}
index = 0

# Dir nodes (type "Dir")
for _, row in dir_nodes.iterrows():
    node_ids[f"Dir:{row['d.path']}"] = index
    nodes.append({"id": index, "properties": {"label": row['d.name'], "type": "Dir"}})
    index += 1

# File nodes (type "File")
for _, row in file_nodes.iterrows():
    node_ids[f"File:{row['f.path']}"] = index
    nodes.append({"id": index, "properties": {"label": row['f.name'], "type": "File"}})
    index += 1


In [6]:
edges = []
# SUBDIR (Dir→Dir)
for _, row in subdir_rels.iterrows():
    edges.append({
        "start": node_ids[f"Dir:{row['a.path']}"],
        "end": node_ids[f"Dir:{row['b.path']}"],
        "properties": {"label": "SUBDIR"}
    })
# CONTAINS (Dir→File)
for _, row in contains_rels.iterrows():
    edges.append({
        "start": node_ids[f"Dir:{row['a.path']}"],
        "end": node_ids[f"File:{row['b.path']}"],
        "properties": {"label": "CONTAINS"}
    })


In [7]:
import ipywidgets as widgets
widgets.IntSlider()
from yfiles_jupyter_graphs import GraphWidget

w = GraphWidget()
w.nodes = nodes
w.edges = edges

# Node styles by type
w.node_styles = {
    "Dir": {"color": "deepskyblue", "shape": "rectangle"},
    "File": {"color": "orange", "shape": "ellipse"}
}

w.set_graph_layout("organic")  # Or 'organic'
w.show()


GraphWidget(layout=Layout(height='800px', width='100%'))

## Linux Kernel

In [5]:
import kuzu
# ─── Paths ───────────────────────────────────────────────────────────────────────
current_dir   = Path(__file__).resolve().parent if "__file__" in locals() else Path.cwd()
supporting    = os.path.join(current_dir.parents[1], "00-supporting-files")
linux_fp      = os.path.join(supporting, "data", "linux-kernel")
db_dir        = os.path.join(supporting, "data", "kuzu_db", "linux")
os.makedirs(db_dir, exist_ok=True)

# ─── Kùzu DB Setup ───────────────────────────────────────────────────────────────
db   = kuzu.Database(str(db_dir))
conn = kuzu.Connection(db)


In [None]:
# intialize tables
conn.execute("CREATE NODE TABLE Dir(name STRING, path STRING, PRIMARY KEY(path));")
conn.execute("CREATE NODE TABLE File(name STRING, path STRING, content STRING, PRIMARY KEY(path));")
conn.execute("CREATE REL TABLE SUBDIR(FROM Dir TO Dir);")
conn.execute("CREATE REL TABLE CONTAINS(FROM Dir TO File);")

In [12]:
for node in nodes:
    if node["type"] == "Dir":
        conn.execute(
            "CREATE (:Dir {name: $name, path: $path});",
            parameters={"name": node["name"], "path": node["path"]}
        )
    elif node["type"] == "File":
        conn.execute(
            "CREATE (:File {name: $name, path: $path, content: $content});",
            parameters={"name": node["name"], "path": node["path"], "content": node["content"]}
        )

for edge in edges:
    if edge["type"] == "SUBDIR":
        conn.execute(
            """
            MATCH (a:Dir {path: $from}), (b:Dir {path: $to})
            CREATE (a)-[:SUBDIR]->(b);
            """,
            parameters={"from": edge["from"], "to": edge["to"]}
        )
    elif edge["type"] == "CONTAINS":
        conn.execute(
            """
            MATCH (a:Dir {path: $from}), (b:File {path: $to})
            CREATE (a)-[:CONTAINS]->(b);
            """,
            parameters={"from": edge["from"], "to": edge["to"]}
        )


In [23]:
q = """
MATCH (f:File)
RETURN f.name, COUNT(*) AS count
"""

res = conn.execute(q)
df = res.get_as_df()

# Extract extensions in Python
df['extension'] = df['f.name'].apply(lambda name: 
                                     name.split('.')[-1] if '.' in name 
                                     else name)  # Use full filename if no extension

extension_counts = df.groupby('extension').sum().reset_index().sort_values('count', ascending=False)
display(extension_counts)

Unnamed: 0,extension,f.name,count
184,c,setup.cmemcpy.csrm_printk.carc_hostlink.cfpu.c...,35587
432,h,err_ev6.hextable.hserial.hunistd.himx6q-pinfun...,26992
337,dts,abilis_tb100_dvk.dtsvdk_hs38_smp.dtssun4i-a10-...,6611
338,dtsi,sun5i-gr8.dtsiarm-realview-eb-mp.dtsiaspeed-g5...,4954
1525,yaml,"kho.yamlsnps,archs-pct.yamlarm,coresight-etb10...",4854
...,...,...,...
570,miniconfig,Makefile.miniconfig,1
569,milbeaut_m10v_defconfig,milbeaut_m10v_defconfig,1
568,migor_defconfig,migor_defconfig,1
567,microwatt_defconfig,microwatt_defconfig,1


In [29]:
extension_counts["count"].value_counts()

count
1        1357
2          80
3          13
5           8
6           6
4           5
8           5
10          5
7           3
11          3
12          2
15          2
35          2
29          2
33          1
31          1
27          1
30          1
13          1
24          1
23          1
22          1
14          1
26992       1
9           1
46          1
35587       1
54          1
383         1
6611        1
4954        1
4854        1
3785        1
3236        1
1769        1
1353        1
1300        1
976         1
971         1
355         1
59          1
295         1
187         1
158         1
141         1
137         1
131         1
82          1
79          1
74          1
72          1
53          1
Name: count, dtype: int64

In [35]:
over_5 = extension_counts[extension_counts["count"]>=5]
over_5

Unnamed: 0,extension,f.name,count
184,c,setup.cmemcpy.csrm_printk.carc_hostlink.cfpu.c...,35587
432,h,err_ev6.hextable.hserial.hunistd.himx6q-pinfun...,26992
337,dts,abilis_tb100_dvk.dtsvdk_hs38_smp.dtssun4i-a10-...,6611
338,dtsi,sun5i-gr8.dtsiarm-realview-eb-mp.dtsiaspeed-g5...,4954
1525,yaml,"kho.yamlsnps,archs-pct.yamlarm,coresight-etb10...",4854
...,...,...,...
316,defconfig,defconfig,5
1477,uc,altivec.ucneon.ucint.ucs390vx.ucvpermxor.uc,5
769,scr,vmlinux.scr53c700.scr,5
745,rules,xe_wa_oob.rulesMakefile.rulesrules99-nx-gzip.r...,5


In [47]:
print(over_5[["extension", "count", ]].to_markdown(index=False))

| extension       |   count |
|:----------------|--------:|
| c               |   35587 |
| h               |   26992 |
| dts             |    6611 |
| dtsi            |    4954 |
| yaml            |    4854 |
| rst             |    3785 |
| Makefile        |    3236 |
| Kconfig         |    1769 |
| S               |    1353 |
| txt             |    1300 |
| json            |     976 |
| sh              |     971 |
| gitignore       |     383 |
| dtso            |     355 |
| py              |     295 |
| config          |     187 |
| rs              |     158 |
| j2              |     141 |
| Kbuild          |     137 |
| tc              |     131 |
| svg             |      82 |
| Build           |      79 |
| cocci           |      74 |
| pkt             |      72 |
| README          |      59 |
| pl              |      54 |
| settings        |      53 |
| litmus          |      46 |
| TODO            |      35 |
| boot            |      35 |
| tbl             |      33 |
| debug   

#### Visualizations

In [4]:
dir_nodes = conn.execute("MATCH (d:Dir) RETURN d.path, d.name").get_as_df()
file_nodes = conn.execute("MATCH (f:File) RETURN f.path, f.name").get_as_df()
subdir_rels = conn.execute("MATCH (a:Dir)-[:SUBDIR]->(b:Dir) RETURN a.path, b.path").get_as_df()
contains_rels = conn.execute("MATCH (a:Dir)-[:CONTAINS]->(b:File) RETURN a.path, b.path").get_as_df()

In [5]:
nodes = []
node_ids = {}
index = 0

# Dir nodes (type "Dir")
for _, row in dir_nodes.iterrows():
    node_ids[f"Dir:{row['d.path']}"] = index
    nodes.append({"id": index, "properties": {"label": row['d.name'], "type": "Dir"}})
    index += 1

# File nodes (type "File")
for _, row in file_nodes.iterrows():
    node_ids[f"File:{row['f.path']}"] = index
    nodes.append({"id": index, "properties": {"label": row['f.name'], "type": "File"}})
    index += 1


In [6]:
edges = []
# SUBDIR (Dir→Dir)
for _, row in subdir_rels.iterrows():
    edges.append({
        "start": node_ids[f"Dir:{row['a.path']}"],
        "end": node_ids[f"Dir:{row['b.path']}"],
        "properties": {"label": "SUBDIR"}
    })
# CONTAINS (Dir→File)
for _, row in contains_rels.iterrows():
    edges.append({
        "start": node_ids[f"Dir:{row['a.path']}"],
        "end": node_ids[f"File:{row['b.path']}"],
        "properties": {"label": "CONTAINS"}
    })


In [7]:
import ipywidgets as widgets
widgets.IntSlider()
from yfiles_jupyter_graphs import GraphWidget

w = GraphWidget()
w.nodes = nodes
w.edges = edges

# Node styles by type
w.node_styles = {
    "Dir": {"color": "deepskyblue", "shape": "rectangle"},
    "File": {"color": "orange", "shape": "ellipse"}
}

w.set_graph_layout("circular")  # Or 'organic'
# w.show()


In [None]:
supporting    = os.path.join(current_dir.parents[1], "00-supporting-files")
linux_graph = os.path.join(supporting, "graphs", "linuz-kernel.graphml")
graph_json = w.to_json()
with open(linux_graph, "w") as f:
    f.write(graph_json)

In [13]:
w.show()

GraphWidget(layout=Layout(height='800px', width='100%'))

In [10]:
from yfiles_jupyter_graphs_for_kuzu import KuzuGraphWidget
g = KuzuGraphWidget(conn)


In [11]:
g.show_cypher("""
  MATCH (d:Dir)-[r:SUBDIR|CONTAINS]->(x)
  RETURN d, r, x
  LIMIT 1000
""")


GraphWidget(layout=Layout(height='800px', width='100%'))

In [None]:
g.show_cypher("""
  MATCH (d:Dir)-[r:SUBDIR|CONTAINS]->(x)
  RETURN d, r, x
""")
