In [2]:
import os
from collections import defaultdict
import json

def notebook_to_markdown(path: str = None, notebook: str = None) -> str:
    """Convert a Jupyter notebook to a single Markdown string.

    The notebook is read from ``path`` when one is given; otherwise the
    already-loaded ``notebook`` argument is converted.

    Args:
        path: Path of a notebook file (JSON, read as UTF-8). Takes precedence
            over ``notebook`` when both are supplied.
        notebook: Notebook content used when ``path`` is None. May be the
            parsed notebook mapping (with a ``'cells'`` list) or its raw JSON
            text.

    Returns:
        The cell sources joined by blank lines: code cells are wrapped in a
        ```python fence, markdown cells are emitted verbatim, and cells of
        any other type are skipped. Cell outputs are not included.

    Raises:
        ValueError: If neither ``path`` nor ``notebook`` is provided.
    """
    if path is not None:
        with open(path, 'r', encoding='utf-8') as file:
            notebook = json.load(file)
    elif notebook is None:
        raise ValueError("notebook_to_markdown() needs either `path` or `notebook`")
    elif isinstance(notebook, (str, bytes, bytearray)):
        # Raw JSON text was passed instead of a parsed notebook.
        notebook = json.loads(notebook)

    markdown_content = []
    for cell in notebook['cells']:
        # "source" is joined so that a list of line fragments becomes one string.
        source = "".join(cell["source"])
        if cell['cell_type'] == 'code':          # Combine code into one block
            markdown_content.append(f'```python\n{source}\n```')
        elif cell['cell_type'] == 'markdown':    # Directly append markdown source
            markdown_content.append(source)
        # Cell outputs (cell['outputs']) are intentionally left out of the result.
    return '\n\n'.join(markdown_content)

def process_files(directory, file_filter):
    """
    Generator that finds, processes, and yields file content.

    Entries of ``directory`` are visited in sorted name order. Only regular
    files for which ``file_filter(filename)`` is true are processed;
    sub-directories are not descended into.

    Args:
        directory (str): The directory to scan.
        file_filter (callable): A function that returns True if a file should be processed.

    Yields:
        tuple: (content_type, filename, processed_content) where content_type is
            'notebook' for ``.ipynb`` files (converted with notebook_to_markdown),
            'script' for ``.srt`` files (dialogue lines joined by single spaces),
            and 'code' for everything else (file text unchanged).
    """
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not (os.path.isfile(path) and file_filter(filename)):
            continue

        if filename.endswith(".ipynb"):
            # notebook_to_markdown opens and parses the file itself, so the
            # notebook is not read a second time here.
            yield 'notebook', filename, notebook_to_markdown(path)
            continue

        with open(path, 'r', encoding='utf-8') as f:
            if filename.endswith(".srt"):
                content_type = 'script'
                # Join dialogue, skipping blank lines, numeric cue indices and
                # "start --> end" timestamp lines.
                stripped_lines = (line.strip() for line in f)
                processed_content = " ".join(
                    text for text in stripped_lines
                    if text and not text.isdigit() and '-->' not in text
                )
            else:
                content_type = 'code'
                processed_content = f.read()

        yield content_type, filename, processed_content

In [3]:
def combine_and_write_files(group, directory, file_filter=(lambda f: True), orig_path=None):
    """
    Orchestrates the processing and writing of combined files by consuming the generator.

    Every file yielded by ``process_files(directory, file_filter)`` is wrapped in
    a ``<FILENAME>`` header block and appended to the buffer for its content
    type. Each non-empty buffer is then written to ``{group}_{content_type}.txt``
    in the current working directory, and progress is printed along the way.

    Args:
        group (str): Prefix of the output file names.
        directory (str): Directory whose files are combined.
        file_filter (callable): Receives a filename and returns True if the file
            should be included. Defaults to accepting every file.
        orig_path (str, optional): Path shown in the ``<FILENAME>`` headers in
            place of the absolute path of ``directory``.
    """
    # Blocks are collected in lists and joined once per output file, which
    # avoids re-copying an ever-growing string on every append.
    blocks_by_name = defaultdict(list)

    orig_path = orig_path or os.path.abspath(directory)
    print(f"Observing Files in {orig_path}")
    for content_type, filename, content in process_files(directory, file_filter):
        block = f"################\n### <FILENAME>{os.path.join(orig_path, filename)}</FILENAME>\n\n{content}\n\n"
        blocks_by_name[f'{group}_{content_type}'].append(block)

    # Write final files
    for name, blocks in blocks_by_name.items():
        with open(f"{name}.txt", "w", encoding='utf-8') as f:
            # Replace non-breaking spaces (U+00A0) with plain spaces. The escape
            # is spelled out because a literal NBSP is indistinguishable from a
            # regular space in source and is easily lost when the code is copied.
            f.write("".join(blocks).replace("\u00a0", " "))
        print(f" - Wrote {name}.txt")
    print("Context Generation Complete.")

def non_nb_filter(f):
    """Return True for any filename that does not end in ".ipynb"."""
    return not f.endswith(".ipynb")


# Run the process
combine_and_write_files(group='content', directory="../..")

Observing Files in /dli/task
 - Wrote content_notebook.txt
Context Generation Complete.


In [4]:
# Combine every non-notebook file found directly in the parent directory;
# output goes to compose_<content_type>.txt in the current working directory.
combine_and_write_files('compose', directory="..", file_filter=non_nb_filter)

Observing Files in /dli/task/composer
 - Wrote compose_code.txt
Context Generation Complete.
