## Imports

In [3]:
import os
import json
from pathlib import Path
import yaml
import json
import os
import time
from google import genai
from google.genai import types
from dotenv import load_dotenv
from pathlib import Path
from tqdm import tqdm
import random
from helix.client import Query, Client
from helix.instance import Instance
from helix.types import Payload
from collections import defaultdict

load_dotenv()

True

## Set up paths and create the output folder

In [4]:
# Markdown sources are read from the sibling "docs" folder one level up;
# generated artifacts are written to a "processed_docs" folder in the working directory.
DOCS_DIR = Path("..") / "docs"
OUTPUT_DIR = Path("processed_docs")

# Make sure the output folder exists (no error if it is already there).
OUTPUT_DIR.mkdir(exist_ok=True)

In [5]:
# Parse the mkdocs configuration, which holds the documentation's navigation tree.
# The encoding is pinned to UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("../mkdocs.yml", "r", encoding="utf-8") as f:
    mkdocs_config = yaml.safe_load(f)

mkdocs_config

{'INHERIT': './docs/_global/mkdocs.yml',
 'site_name': 'After Effects Scripting Guide',
 'site_url': 'https://ae-scripting.docsforadobe.dev/',
 'repo_url': 'https://github.com/docsforadobe/after-effects-scripting-guide/',
 'repo_name': 'after-effects-scripting-guide',
 'nav': [{'Home': 'index.md'},
  {'Introduction': [{'Overview': 'introduction/overview.md'},
    {'Javascript for After Effects': 'introduction/javascript.md'},
    {'After Effects Scripting Changlog': 'introduction/changelog.md'},
    {'After Effects Object Model': 'introduction/objectmodel.md'},
    {'After Effects Class Hierarchy': 'introduction/classhierarchy.md'}]},
  {'General': [{'Globals': 'general/globals.md'},
    {'Application': 'general/application.md'},
    {'Project': 'general/project.md'},
    {'System': 'general/system.md'}]},
  {'Item': [{'Item object': 'item/item.md'},
    {'ItemCollection': 'item/itemcollection.md'},
    {'AVItem': 'item/avitem.md'},
    {'CompItem': 'item/compitem.md'},
    {'FolderIte

In [6]:
# "nav" is a list of sections/pages; fall back to an empty list (not a dict) so the
# default has the same type as the real value when the key is missing.
nav = mkdocs_config.get("nav", [])
nav

[{'Home': 'index.md'},
 {'Introduction': [{'Overview': 'introduction/overview.md'},
   {'Javascript for After Effects': 'introduction/javascript.md'},
   {'After Effects Scripting Changlog': 'introduction/changelog.md'},
   {'After Effects Object Model': 'introduction/objectmodel.md'},
   {'After Effects Class Hierarchy': 'introduction/classhierarchy.md'}]},
 {'General': [{'Globals': 'general/globals.md'},
   {'Application': 'general/application.md'},
   {'Project': 'general/project.md'},
   {'System': 'general/system.md'}]},
 {'Item': [{'Item object': 'item/item.md'},
   {'ItemCollection': 'item/itemcollection.md'},
   {'AVItem': 'item/avitem.md'},
   {'CompItem': 'item/compitem.md'},
   {'FolderItem': 'item/folderitem.md'},
   {'FootageItem': 'item/footageitem.md'}]},
 {'Layer': [{'Layer object': 'layer/layer.md'},
   {'LayerCollection': 'layer/layercollection.md'},
   {'AVLayer': 'layer/avlayer.md'},
   {'CameraLayer': 'layer/cameralayer.md'},
   {'LightLayer': 'layer/lightlayer.md'

In [7]:
#extract all markdown files from the mkdocs.yml file
def extract_md_files(nav_item, parent_section=""):
    """Recursively collect every ``.md`` entry of a mkdocs ``nav`` structure.

    Args:
        nav_item: A nav node: a dict (label -> node), a list of nodes, or a
            string path. Any other value contributes nothing.
        parent_section: Label attached to string leaves found directly under
            this node. For a dict, each key becomes the label of its value.

    Returns:
        A list of ``{"path": ..., "section": ...}`` dicts in traversal order.
        Strings that do not end in ``.md`` are ignored.
    """
    # Leaf: a path string, labelled with the key (or list parent) that led to it.
    if isinstance(nav_item, str):
        if nav_item.endswith(".md"):
            return [{"path": nav_item, "section": parent_section}]
        return []

    # Mapping: every key is the label for whatever hangs below it.
    if isinstance(nav_item, dict):
        return [
            entry
            for label, child in nav_item.items()
            for entry in extract_md_files(child, label)
        ]

    # Sequence: children inherit the label of the enclosing node.
    if isinstance(nav_item, list):
        return [
            entry
            for child in nav_item
            for entry in extract_md_files(child, parent_section)
        ]

    return []

# Flatten the entries found under every top-level nav item into one list.
all_md_files = [
    md_entry
    for nav_entry in nav
    for md_entry in extract_md_files(nav_entry)
]


In [8]:
all_md_files

[{'path': 'index.md', 'section': 'Home'},
 {'path': 'introduction/overview.md', 'section': 'Overview'},
 {'path': 'introduction/javascript.md',
  'section': 'Javascript for After Effects'},
 {'path': 'introduction/changelog.md',
  'section': 'After Effects Scripting Changlog'},
 {'path': 'introduction/objectmodel.md',
  'section': 'After Effects Object Model'},
 {'path': 'introduction/classhierarchy.md',
  'section': 'After Effects Class Hierarchy'},
 {'path': 'general/globals.md', 'section': 'Globals'},
 {'path': 'general/application.md', 'section': 'Application'},
 {'path': 'general/project.md', 'section': 'Project'},
 {'path': 'general/system.md', 'section': 'System'},
 {'path': 'item/item.md', 'section': 'Item object'},
 {'path': 'item/itemcollection.md', 'section': 'ItemCollection'},
 {'path': 'item/avitem.md', 'section': 'AVItem'},
 {'path': 'item/compitem.md', 'section': 'CompItem'},
 {'path': 'item/folderitem.md', 'section': 'FolderItem'},
 {'path': 'item/footageitem.md', 'sect

## Chunk MD files

In [9]:
import re

# chunk by separators
def chunk_by_separators(content):
    """Split markdown text on ``---`` separator lines.

    Args:
        content: Markdown text to split.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Every chunk except the
        piece that preceded the first separator is prefixed with ``"---\\n\\n"``.
    """
    stripped_pieces = (piece.strip() for piece in re.split(r'\n---\n', content))
    return [
        # Position 0 is the text before the first separator; it gets no prefix.
        piece if position == 0 else "---\n\n" + piece
        for position, piece in enumerate(stripped_pieces)
        if piece
    ]


# determine which chunking method to use
def smart_api_chunking(content, file_path):
    """Pick a chunking strategy for one markdown document.

    Args:
        content: Full markdown text of the document.
        file_path: Path of the document (accepted but not used by this function).

    Returns:
        A list of chunk strings. Documents with at least three ``---`` separator
        lines are split on those separators; all others are split by chonkie's
        ``SentenceChunker``.
    """
    has_enough_separators = len(re.findall(r'\n---\n', content)) >= 3

    if not has_enough_separators:
        # Too few separators to rely on: use sentence-based chunking instead.
        from chonkie import SentenceChunker
        sentence_chunker = SentenceChunker(chunk_size=1024, chunk_overlap=0)
        return [piece.text for piece in sentence_chunker.chunk(content)]

    return chunk_by_separators(content)

In [10]:
#extract embeddable content and code blocks
def extract_embeddable_content_and_code(chunk_text):
    """Separate a markdown chunk into embeddable prose, fenced code, and metadata.

    Args:
        chunk_text: Markdown text of a single chunk.

    Returns:
        A ``(embeddable_text, code_blocks, metadata)`` tuple:

        * ``embeddable_text``: the chunk with fenced code blocks and ``¶``
          markers removed, blank-line runs collapsed, and outer whitespace stripped.
        * ``code_blocks``: every fenced block found, fences included.
        * ``metadata``: dict with ``title`` (text of the first markdown heading
          outside code fences, or ``"Unknown"``), ``signatures`` (inline-code
          spans that look like calls), ``has_code_examples`` and
          ``has_inline_code``.
    """
    code_fence_pattern = r'```[\s\S]*?```'

    # Pull the fenced blocks out first so they can be stored separately.
    code_blocks = re.findall(code_fence_pattern, chunk_text)

    # Prose only: drop the fences, the paragraph markers, and extra blank lines.
    embeddable_text = re.sub(code_fence_pattern, '', chunk_text)
    embeddable_text = embeddable_text.replace('¶', '')
    embeddable_text = re.sub(r'\n\s*\n', '\n\n', embeddable_text)
    embeddable_text = embeddable_text.strip()

    # Inline code that contains a parenthesised argument list, e.g. `obj.method(arg)`.
    signatures = re.findall(r'`[^`\n]+\([^)]*\)`', embeddable_text)

    # First heading of any level (1-6 hashes followed by a space or tab).
    # Searching the fence-free text keeps '#' comment lines inside code blocks
    # from being mistaken for headings, and requiring the separator keeps the
    # hashes of deeper headings out of the captured title.
    title_match = re.search(r'^#{1,6}[ \t]+(.+)', embeddable_text, re.MULTILINE)
    title = title_match.group(1).strip() if title_match else "Unknown"

    metadata = {
        'title': title,
        'signatures': signatures,
        'has_code_examples': len(code_blocks) > 0,
        'has_inline_code': len(re.findall(r'`[^`\n]+`', embeddable_text)) > 0
    }

    return embeddable_text, code_blocks, metadata

In [11]:
processed_docs = []

# Turn every page listed in the nav into a list of chunk records.
for md_file in all_md_files:
    file_path = DOCS_DIR / md_file["path"]

    # Pages named in the nav but absent on disk are reported and ignored.
    if not file_path.exists():
        print(f"Warning: File {file_path} not found, skipping.")
        continue

    # The landing page is left out.
    if file_path.name == "index.md":
        print(f"Skipping index.md file: {file_path}")
        continue

    # Anything under an "introduction" folder is left out as well.
    if "introduction" in file_path.parts:
        print(f"Skipping introduction folder file: {file_path}")
        continue

    content = file_path.read_text(encoding="utf-8")
    chunks = smart_api_chunking(content, file_path)
    chunk_total = len(chunks)

    for chunk_id, chunk_text in enumerate(chunks):
        embeddable_text, code_blocks, metadata = extract_embeddable_content_and_code(chunk_text)

        # "content" is the text that gets embedded; "full_content" keeps the raw chunk.
        record = {
            "title": metadata['title'],
            "content": embeddable_text,
            "full_content": chunk_text,
            "code_blocks": code_blocks,
            "signatures": metadata['signatures'],
            "has_code_examples": metadata['has_code_examples'],
            "path": str(file_path),
            "section": md_file["section"],
            "category": file_path.parent.name,
            "chunk_id": chunk_id,
            "total_chunks": chunk_total
        }
        processed_docs.append(record)


print(f"\nTotal processed documents: {len(processed_docs)}")

Skipping index.md file: ../docs/index.md
Skipping introduction folder file: ../docs/introduction/overview.md
Skipping introduction folder file: ../docs/introduction/javascript.md
Skipping introduction folder file: ../docs/introduction/changelog.md
Skipping introduction folder file: ../docs/introduction/objectmodel.md
Skipping introduction folder file: ../docs/introduction/classhierarchy.md


  from .autonotebook import tqdm as notebook_tqdm



Total processed documents: 748


In [12]:
#save to processed_docs.json
# Persist every chunk record for the embedding step.
with open(OUTPUT_DIR / "processed_docs.json", "w", encoding="utf-8") as f:
    json.dump(processed_docs, f, indent=2, ensure_ascii=False)

# Tally, per category, how many chunks were produced and which files they came from.
summary = {}
for doc in processed_docs:
    entry = summary.setdefault(doc['category'], {'count': 0, 'files': set()})
    entry['count'] += 1
    # Path(...).name extracts the file name with the platform's own separator
    # rules instead of assuming '/'.
    entry['files'].add(Path(doc['path']).name)

# Sets are not JSON-serializable; sorting also keeps the summary file stable between runs.
for entry in summary.values():
    entry['files'] = sorted(entry['files'])

# Save the summary next to the processed docs.
with open(OUTPUT_DIR / "processing_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)


print("summary:")
for category, info in summary.items():
    print(f"  {category}: {info['count']} chunks from {len(info['files'])} files")

summary:
  general: 116 chunks from 4 files
  item: 86 chunks from 6 files
  layer: 150 chunks from 10 files
  property: 98 chunks from 4 files
  renderqueue: 51 chunks from 5 files
  sources: 21 chunks from 4 files
  text: 130 chunks from 6 files
  other: 73 chunks from 10 files
  effects: 23 chunks from 1 files


## Generate Gemini Embeddings For Chunks

In [13]:
# Files used by the embedding stage, all inside the output folder.
INPUT_FILE = OUTPUT_DIR / "processed_docs.json"
OUTPUT_FILE = OUTPUT_DIR / "embedded_docs.json"
CHECKPOINT_FILE = OUTPUT_DIR / "embedding_checkpoint.json"

# The Gemini key comes from the environment; stop right away when it is missing or empty.
api_key = os.environ.get("GOOGLE_API_KEY")
if api_key:
    client = genai.Client(api_key=api_key)
else:
    raise ValueError("GOOGLE_API_KEY environment variable not set")

In [14]:
# Read the chunk records back from the saved JSON file.
processed_docs = json.loads(INPUT_FILE.read_text(encoding="utf-8"))

print(f"{len(processed_docs)} processed documents")

748 processed documents


In [15]:
# Start from whatever a previous run saved in the checkpoint file, if anything.
embedded_docs = []
processed_indices = set()
if CHECKPOINT_FILE.exists():
    try:
        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
            embedded_docs = json.load(f)
        # Positions (in processed_docs) of the documents that already have an embedding.
        processed_indices = {doc["original_index"] for doc in embedded_docs if "original_index" in doc}
    except json.JSONDecodeError:
        # The checkpoint is not valid JSON: keep it for inspection under a backup
        # name and start again with an empty state.
        embedded_docs = []
        processed_indices = set()
        backup_file = CHECKPOINT_FILE.with_suffix('.json.bak')
        # replace() overwrites a backup left by an earlier run instead of failing on it.
        CHECKPOINT_FILE.replace(backup_file)
        print(f"checkpoint was unreadable; moved it to {backup_file} and starting from scratch")

In [16]:
# Gemini rate-limits requests, so documents are embedded in fixed-size batches
# with a pause (in seconds) between consecutive batches.
BATCH_SIZE = 145
BATCH_PAUSE = 60
total_docs = len(processed_docs)

In [17]:
def _write_json_atomically(target_file, payload):
    """Dump ``payload`` as JSON to a sibling ``.tmp`` file, then move it over ``target_file``."""
    temp_file = target_file.with_suffix('.tmp')
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False)
    temp_file.replace(target_file)


num_batches = (len(processed_docs) + BATCH_SIZE - 1) // BATCH_SIZE

for batch_start in range(0, len(processed_docs), BATCH_SIZE):
    batch_end = min(batch_start + BATCH_SIZE, len(processed_docs))
    current_batch = processed_docs[batch_start:batch_end]
    batch_number = batch_start // BATCH_SIZE + 1

    print(f"\nbatch {batch_number}/{num_batches}")
    print(f"documents {batch_start} to {batch_end-1}")

    # Tracks whether this batch actually called the API; a batch restored
    # entirely from the checkpoint does not need the rate-limit pause.
    requests_made = False

    #process each document in the current batch
    for idx, doc in enumerate(tqdm(current_batch, desc=f"batch {batch_number}")):
        doc_index = batch_start + idx

        # Resume support: documents already present in the checkpoint are not
        # embedded (and appended) a second time.
        if doc_index in processed_indices:
            continue

        success = False
        max_retries = 3
        retry_count = 0

        #retry if rate limit is hit
        while not success and retry_count < max_retries:
            try:
                requests_made = True
                #generate the embedding
                result = client.models.embed_content(
                    model="models/text-embedding-004",
                    contents=doc["content"],
                    config=types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT")
                )
                #extract the embedding
                embedding_list = result.embeddings[0].values
                #add the embedding to a copy so processed_docs stays unchanged
                doc_copy = doc.copy()
                doc_copy["embedding"] = embedding_list
                doc_copy["original_index"] = doc_index
                embedded_docs.append(doc_copy)
                processed_indices.add(doc_index)
                success = True

                #save the checkpoint every 10 embedded documents
                if len(embedded_docs) % 10 == 0:
                    _write_json_atomically(CHECKPOINT_FILE, embedded_docs)

            except Exception as e:
                error_message = str(e)
                if "RESOURCE_EXHAUSTED" in error_message or "429" in error_message:
                    # Rate limited: exponential backoff with a little jitter.
                    retry_count += 1
                    sleep_time = (2 ** retry_count) + random.uniform(0, 1)
                    print(f"rate limit hit, retrying in {sleep_time:.1f} seconds")
                    time.sleep(sleep_time)
                else:
                    print(f"error generating embedding for document {doc_index}: {error_message}")
                    retry_count += 1
                    time.sleep(1)

        if not success:
            print(f"failed to generate embedding for document {doc_index} after {max_retries} retries")

    # Always checkpoint at the end of a batch.
    _write_json_atomically(CHECKPOINT_FILE, embedded_docs)

    if batch_end < len(processed_docs) and requests_made:
        print(f"\nbatch complete, waiting {BATCH_PAUSE} seconds before starting next batch...")
        for remaining in range(BATCH_PAUSE, 0, -1):
            print(f"Next batch starting in {remaining} seconds...", end="\r")
            time.sleep(1)
        print("\nstarting next batch...")


_write_json_atomically(OUTPUT_FILE, embedded_docs)

# "\n" here replaces the invalid escape "\g", which printed a literal backslash
# and raised a SyntaxWarning.
print(f"\ngenerated embeddings for {len(embedded_docs)}/{total_docs} documents")
print(f"saved to: {OUTPUT_FILE}")


  print(f"\generated embeddings for {len(embedded_docs)}/{total_docs} documents")
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



batch 1/6
documents 0 to 144


batch 1: 100%|██████████| 145/145 [00:31<00:00,  4.63it/s]



batch complete, waiting 60 seconds before starting next batch...
Next batch starting in 1 seconds....
starting next batch...

batch 2/6
documents 145 to 289


batch 2: 100%|██████████| 145/145 [00:30<00:00,  4.73it/s]



batch complete, waiting 60 seconds before starting next batch...
Next batch starting in 1 seconds....
starting next batch...

batch 3/6
documents 290 to 434


batch 3: 100%|██████████| 145/145 [00:36<00:00,  3.94it/s]



batch complete, waiting 60 seconds before starting next batch...
Next batch starting in 1 seconds....
starting next batch...

batch 4/6
documents 435 to 579


batch 4: 100%|██████████| 145/145 [00:32<00:00,  4.52it/s]



batch complete, waiting 60 seconds before starting next batch...
Next batch starting in 1 seconds....
starting next batch...

batch 5/6
documents 580 to 724


batch 5: 100%|██████████| 145/145 [00:32<00:00,  4.52it/s]



batch complete, waiting 60 seconds before starting next batch...
Next batch starting in 1 seconds....
starting next batch...

batch 6/6
documents 725 to 747


batch 6: 100%|██████████| 23/23 [00:06<00:00,  3.78it/s]


\generated embeddings for 748/748 documents
saved to: processed_docs/embedded_docs.json


## Setup HelixDB

In [19]:
# Create the Helix client in the SDK's local mode; `db` is used for every query below.
db = Client(local=True)

[32m[HELIX][0m Helix instance found at 'http://0.0.0.0:6969'


## Python SDK

In [20]:
class load_docs_rag(Query):
    """Query object carrying the chapter data that is sent to HelixDB for loading.

    NOTE(review): the lowercase class name presumably has to match the name of
    the Helix query it targets — confirm before renaming.
    """

    def __init__(self, chapters):
        super().__init__()
        self.chapters = chapters

    def query(self):
        # The request payload is wrapped in a single-element list.
        payload = {"chapters": self.chapters}
        return [payload]

    def response(self, response):
        # The response is handed back unchanged.
        return response
    
class get_chapter_content(Query):
    """Query object that asks for a chapter by its id.

    NOTE(review): the lowercase class name presumably has to match the name of
    the Helix query it targets — confirm before renaming.
    """

    def __init__(self, chapter_id):
        super().__init__()
        self.chapter_id = chapter_id

    def query(self):
        # The request payload is wrapped in a single-element list.
        payload = {"chapter_id": self.chapter_id}
        return [payload]

    def response(self, response):
        # The response is handed back unchanged.
        return response

class search_docs_rag(Query):
    """Query object for a vector search: a query vector plus the number of results ``k``.

    NOTE(review): the lowercase class name presumably has to match the name of
    the Helix query it targets — confirm before renaming.
    """

    def __init__(self, query_vector, k=5):
        super().__init__()
        self.query_vector = query_vector
        self.k = k

    def query(self):
        # The request payload is wrapped in a single-element list.
        payload = {"query": self.query_vector, "k": self.k}
        return [payload]

    def response(self, response):
        # The response is handed back unchanged.
        return response

In [21]:
#organize the data for helixdb
def organize_all_data_for_helix(docs):
    """Group embedded chunk records into chapters (categories) and subchapters (files).

    Args:
        docs: Iterable of dicts with at least the keys ``category``, ``path``,
            ``content`` and ``embedding``.

    Returns:
        A ``(helix_chapters, category_names)`` tuple. ``helix_chapters`` is a
        list of ``{'id', 'subchapters'}`` dicts, one per category in first-seen
        order, where ``id`` is the category's position. Each subchapter has a
        ``title`` (the file name), a generated ``content`` description and its
        ``chunks`` (``{'chunk', 'vector'}`` dicts in input order).
        ``category_names[i]`` is the category of the chapter with id ``i``.
    """
    # category -> file name -> chunk list; plain dicts preserve first-seen order.
    grouped = {}
    for doc in docs:
        # Path(...).name takes the last path component using the platform's
        # separator rules instead of assuming '/'.
        file_name = Path(doc['path']).name
        grouped.setdefault(doc['category'], {}).setdefault(file_name, []).append({
            'chunk': doc['content'],
            'vector': doc['embedding']
        })

    helix_chapters = [
        {
            'id': chapter_idx,
            'subchapters': [
                {
                    'title': file_name,
                    'content': f"After Effects {category} documentation for {file_name}",
                    'chunks': chunks
                }
                for file_name, chunks in files.items()
            ]
        }
        for chapter_idx, (category, files) in enumerate(grouped.items())
    ]

    category_names = list(grouped)
    return helix_chapters, category_names

all_helix_data, category_list = organize_all_data_for_helix(embedded_docs)

# Report how many files and chunks ended up in each chapter.
total_chunks = 0
for chapter, category_name in zip(all_helix_data, category_list):
    subchapters = chapter['subchapters']
    chapter_chunks = sum(len(sub['chunks']) for sub in subchapters)
    total_chunks += chapter_chunks
    print(f"  Chapter {chapter['id']} ({category_name}): {len(subchapters)} files, {chapter_chunks} chunks")

print(f"\n{len(all_helix_data)} chapters, {total_chunks} chunks across all categories")

# Send everything to HelixDB in one query; a failure is printed with its traceback
# rather than stopping the notebook.
try:
    load_all_query = load_docs_rag(all_helix_data)
    result = db.query(load_all_query)
except Exception as e:
    print(f"error loading all data: {e}")
    import traceback
    traceback.print_exc()
else:
    print(f"✅ HelixDB full load result: {result}")

  Chapter 0 (general): 4 files, 116 chunks
  Chapter 1 (item): 6 files, 86 chunks
  Chapter 2 (layer): 10 files, 150 chunks
  Chapter 3 (property): 4 files, 98 chunks
  Chapter 4 (renderqueue): 5 files, 51 chunks
  Chapter 5 (sources): 4 files, 21 chunks
  Chapter 6 (text): 6 files, 130 chunks
  Chapter 7 (other): 10 files, 73 chunks
  Chapter 8 (effects): 1 files, 23 chunks

9 chapters, 748 chunks across all categories


[32m[HELIX][0m Querying 'http://0.0.0.0:6969/load_docs_rag': 100%|██████████| 1/1 [00:01<00:00,  1.91s/it]

✅ HelixDB full load result: [{'Success': 'Success'}]





In [22]:
#search the adobe after effects docs
def search_ae_docs(user_question, top_k=5, total_chunks=748):
    """Embed a question, query HelixDB with it, and print the best matching chunks.

    Args:
        user_question: Natural-language question to embed.
        top_k: Number of results to print.
        total_chunks: Number of neighbours requested from HelixDB. It defaults
            to 748, the value previously hard-coded as a workaround for the
            HelixDB bug noted in the body; pass the actual number of loaded
            chunks if the corpus changes.

    Returns:
        None. Results (or "No results found") are printed. Any exception is
        caught and printed together with its traceback.
    """
    try:
        #embed the user question
        embed_response = client.models.embed_content(
            model="models/text-embedding-004",
            contents=user_question,
            config=types.EmbedContentConfig(task_type="QUESTION_ANSWERING")
        )
        query_embedding = embed_response.embeddings[0].values

        ## NOTE: there is a little bug in helixdb that is currently being fixed -
        ## every chunk is requested (k=total_chunks) and the top_k are picked here
        search_query = search_docs_rag(query_embedding, k=total_chunks)
        search_results = db.query(search_query)

        if search_results and search_results[0]:
            all_results = search_results[0].get('embedding_edges', [])

            # NOTE: bug that is being fixed - the results are reversed for now (list is flipped currently in Rust implementation)
            reversed_results = list(reversed(all_results))
            top_results = reversed_results[:top_k]

            #print the results
            for i, hit in enumerate(top_results):
                # Fields may come back wrapped in a list; unwrap the first element.
                chunk = hit.get('chunk', 'No chunk content')
                if isinstance(chunk, list) and len(chunk) > 0:
                    chunk = chunk[0]

                subchapter_title = hit.get('subchapter_title', 'Unknown file')
                if isinstance(subchapter_title, list) and len(subchapter_title) > 0:
                    subchapter_title = subchapter_title[0]

                print(f"Result {i+1}: {subchapter_title}")
                print(f"Content: {chunk}")
                print("─" * 80)
        else:
            print("No results found")

    except Exception as e:
        print(f"Error searching: {e}")
        import traceback
        traceback.print_exc()

# Sample question used to exercise the search end to end.
sample_question = "I imported a bunch of footage into my After Effects project but some of the files got moved or deleted from their original location. Now I have these red question mark placeholders showing up in my project panel. Can you write a script that goes through all my footage items and tells me which ones have broken file links? I also want to see the original file paths of the missing footage so I know where they used to be located."
search_ae_docs(sample_question, top_k=5)

[32m[HELIX][0m Querying 'http://0.0.0.0:6969/search_docs_rag': 100%|██████████| 1/1 [00:00<00:00, 28.92it/s]

Result 1: avitem.md
Content: ---

### AVItem.footageMissing

`app.project.item(index).footageMissing`

#### Description

When `true`, the AVItem is a placeholder, or represents footage with a source file that cannot be found. In this case, the path of the missing source file is in the `missingFootagePath` attribute of the footage item's source-file object. See [FootageItem.mainSource](footageitem.md#footageitemmainsource) and [FileSource.missingFootagePath](../sources/filesource.md#filesourcemissingfootagepath).

#### Type

Boolean; read-only.
────────────────────────────────────────────────────────────────────────────────
Result 2: avitem.md
Content: ---

## Methods

### AVItem.setProxy()

`app.project.item(index).setProxy(file)`

#### Description

Sets a file as the proxy of this AVItem.

Loads the specified file into a new FileSource object, sets this as the value of the `proxySource` attribute, and sets `useProxy` to `true`.

It does not preserve the interpretation parameters, inst




In [25]:
# Drop the plain-text key variable from the namespace; nothing happens if it is
# already gone. (A statement is used on purpose so no value is echoed as cell output.)
try:
    del api_key
except NameError:
    pass