# Original

## Code

In [25]:
import json
import os
from typing import List, Dict, Any

from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
import ollama
from litellm import completion, success_callback

from prefect import flow, task
from prefect.logging import get_logger

# Import Rich components
from rich.console import Console
from rich.table import Table
from rich import box
from pprint import pprint

# Initialize Rich console
console = Console()

# Initialize Prefect logger
logger = get_logger()

# Load environment variables
load_dotenv('.env.local') 

# Define model-related classes (unchanged)
class ModelDetails:
    def __init__(self, parent_model, format, family, families, parameter_size, quantization_level):
        self.parent_model = parent_model
        self.format = format
        self.family = family
        self.families = families
        self.parameter_size = parameter_size
        self.quantization_level = quantization_level

class Model:
    def __init__(self, model, modified_at, digest, size, details):
        self.model = model
        self.modified_at = modified_at
        self.digest = digest
        self.size = size
        self.details = details

class ListResponse:
    def __init__(self, models):
        self.models = models

# Global cost tracker
COST_TRACKER = {"cost": 0.0}

@task
def list_ollama_models():
    """
    Fetch and display Ollama models using Rich Table.
    """
    try:
        ollama_models = ollama.list()
        if not ollama_models.models:
            logger.warning("No Ollama models found.")
            console.print("[bold yellow]No Ollama models found.[/bold yellow]")
            return
        
        # Create a Rich table
        table = Table(title="Available Ollama Models", box=box.MINIMAL_DOUBLE_HEAD)
        table.add_column("Model Name", style="cyan", no_wrap=True)
        table.add_column("Modified At", style="magenta")
        table.add_column("Digest", style="green")
        table.add_column("Size (bytes)", justify="right", style="yellow")
        table.add_column("Param Size", justify="right", style="yellow")
        table.add_column("Quant Level", style="blue")
        table.add_column("Family", style="red")
        table.add_column("Families", style="red")

        for model in ollama_models.models:
            table.add_row(
                model.model,
                str(model.modified_at),
                model.digest,
                str(model.size),
                str(model.details.parameter_size),
                model.details.quantization_level,
                model.details.family,
                ", ".join(model.details.families)
            )
        
        console.print(table)
    except Exception as e:
        logger.error(f"Error fetching Ollama models: {e}")
        console.print(f"[bold red]Error fetching Ollama models:[/bold red] {e}")

@task
def track_cost_callback(kwargs, completion_response, start_time, end_time, stream=False):
    """
    Callback function to track and calculate the cost based on token usage.
    """
    try:
        response_dict = completion_response.to_dict() if hasattr(completion_response, "to_dict") else json.loads(str(completion_response))
        usage = response_dict.get("usage", {})
        total_tokens = usage.get("total_tokens", 0)
        COST_TRACKER["cost"] = (total_tokens / 1000.0) * 0.003  # Example cost calculation
        logger.info(f"Calculated cost: {COST_TRACKER['cost']}")
        console.print(f"[bold green]Calculated cost:[/bold green] [yellow]{COST_TRACKER['cost']}[/yellow]")
    except Exception as e:
        logger.error(f"Error in track_cost_callback: {e}")
        console.print(f"[bold red]Error in track_cost_callback:[/bold red] {e}")

@task
def set_success_callback():
    """
    Set the global success callback for cost tracking.
    """
    try:
        success_callback[:] = [track_cost_callback]
        logger.info("Success callback set successfully.")
    except Exception as e:
        logger.error(f"Error setting success callback: {e}")
        console.print(f"[bold red]Error setting success callback:[/bold red] {e}")

@task
def load_documents(path: str) -> List[Dict[str, Any]]:
    """
    Load documents from the specified path.
    """
    try:
        reader = SimpleDirectoryReader(input_dir=path)
        documents = reader.load_data()
        logger.info(f"Loaded {len(documents)} documents from {path}")
        console.print(f"[bold green]Loaded {len(documents)} documents from {path}[/bold green]")
        return [{"content": d.text, **d.metadata} for d in documents]
    except Exception as e:
        logger.error(f"Error loading documents from {path}: {e}")
        console.print(f"[bold red]Error loading documents from {path}:[/bold red] {e}")
        return []

@task
def process_metadata(doc_dicts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Process metadata to remove duplicate file entries.
    """
    try:
        file_seen = set()
        metadata_list = []
        for doc in doc_dicts:
            if doc["file_path"] not in file_seen:
                file_seen.add(doc["file_path"])
                metadata_list.append(doc)
        logger.info(f"Processed metadata: {len(metadata_list)} unique documents")
        console.print(f"[bold green]Processed metadata: {len(metadata_list)} unique documents[/bold green]")
        return metadata_list
    except Exception as e:
        logger.error(f"Error processing metadata: {e}")
        console.print(f"[bold red]Error processing metadata:[/bold red] {e}")
        return []

@task
def query_summaries(
    doc_dicts: List[Dict[str, Any]],
    host: str,
    port: int,
    model: str,
    api_base: str = None,
    stream: bool = False
) -> Dict[str, Any]:
    """
    Generate summaries for the provided document dictionaries.
    """
    PROMPT = f""" 
    The following is a list of file contents, along with their metadata. For each file, provide a summary of the contents. The purpose of the summary is to organize files based on their content. To this end provide a concise but informative summary. Try to make the summary as specific to the file as possible. {doc_dicts}
    
    Return a JSON object with the following schema:
    
    ```json
    {{
      "files": [
        {{
          "file_path": "path to the file including name",
          "summary": "summary of the content"
        }}
      ]
    }}
    ```
    """.strip()

    if not api_base:
        api_base = f"http://{host}:{port}"
        logger.info(f"API Base set to: {api_base}")
        console.print(f"[bold blue]API Base set to: {api_base}[/bold blue]")

    try:
        response = completion(
            model=model, 
            messages=[
                {
                    "role": "system", 
                    "content": "Always return JSON. Do not include any other text or formatting characters."
                },
                {
                    "role": "user", 
                    "content": PROMPT
                }
            ],
            api_base=api_base,
            stream=stream
        )
    except Exception as e:
        logger.error(f"LiteLLM Error >>> {e}")
        console.print(f"[bold red]LiteLLM Error:[/bold red] {e}")
        return {"files": [], "cost": COST_TRACKER["cost"]}

    if response is None:
        logger.warning("No response received from the API.")
        console.print("[bold yellow]No response received from the API.[/bold yellow]")
        return {"files": [], "cost": COST_TRACKER["cost"]}

    try:
        response_dict = response.to_dict() if hasattr(response, "to_dict") else json.loads(str(response))
    except (TypeError, json.JSONDecodeError) as e:
        logger.error(f"Error parsing response: {e}")
        console.print(f"[bold red]Error parsing response:[/bold red] {e}")
        return {"files": [], "cost": COST_TRACKER["cost"]}

    content = response_dict.get("choices", [{}])[0].get("message", {}).get("content", "")
    print(">>>> Content summary:")
    print(content)

    try:
        summaries = json.loads(content)
        print("Summaries:")
        pprint(summaries)
    except json.JSONDecodeError:
        logger.error("Error decoding JSON content from summaries.")
        console.print("[bold red]Error decoding JSON content from summaries.[/bold red]")
        summaries = {"files": []}

    if isinstance(summaries, list) and summaries and isinstance(summaries[0], dict):
        summaries = summaries[0]

    usage = response_dict.get("usage", {})

    if usage and isinstance(summaries, dict):
        summaries["usage"] = {
            "completion_tokens": usage.get("completion_tokens"),
            "prompt_tokens": usage.get("prompt_tokens"),
            "total_tokens": usage.get("total_tokens")
        }

    if isinstance(summaries, dict):
        summaries["cost"] = COST_TRACKER["cost"]
    else:
        summaries = {
            "files": [],
            "cost": COST_TRACKER["cost"]
        }

    logger.info(f"Generated summaries for {len(summaries.get('files', []))} files with cost {summaries.get('cost')}")
    console.print(f"[bold green]Generated summaries for {len(summaries.get('files', []))} files with cost {summaries.get('cost')}[/bold green]")
    return summaries

@task
def create_file_tree(
    summaries: List[Dict[str, Any]],
    host: str,
    port: int,
    source_path: str,
    destination_path: str,
    model: str = "llama-3.1-70b-versatile",
    api_base: str = None,
    stream: bool = False
) -> List[Dict[str, str]]:
    """
    Create a file tree based on the provided summaries.
    Returns src_path, dst_path, and dst_path_new for each file.
    """
    # Define the helper function within the task
    def find_key(obj: Any, key: str) -> Any:
        if isinstance(obj, dict):
            if key in obj:
                return obj[key]
            for value in obj.values():
                result = find_key(value, key)
                if result is not None:
                    return result
        elif isinstance(obj, list):
            for item in obj:
                result = find_key(item, key)
                if result is not None:
                    return result
        return None

    # Update the prompt with explicit instructions
    PROMPT = f"""
    You will be provided with a list of source files and a summary of their contents. The source files are located in '{source_path}', and the destination directory is '{destination_path}'.
    
    For each file, propose:
    1. 'dst_path': A new file path under the destination directory with the same file name.
    2. 'dst_path_new': A new file path under the destination directory with an updated file name (e.g., adding a version number or timestamp).
    
    Follow good naming conventions and organizational best practices. Here are guidelines:
    - Group related files together.
    - Incorporate metadata such as date, version, or experiment details into folder names.
    - Use clear and descriptive names without spaces or special characters.
    - Do not change the file extension.
    - If the file is already well-named or follows a known convention, retain its name for 'dst_path'.
    
    **Example**:
    ```json
    {{
        "files": [
            {{
                "src_path": "/home/user/source/file1.txt",
                "dst_path": "/home/user/destination/2024/04/file1.txt",
                "dst_path_new": "/home/user/destination/2024/04/file1_v2.txt"
            }}
        ]
    }}
    ```
    
    **Important:** Your response **must** be a JSON object with the following schema **at the top level**:
    ```json
    {{
        "files": [
            {{
                "src_path": "original file path",
                "dst_path": "new file path under destination directory with same file name",
                "dst_path_new": "new file path under destination directory with updated file name"
            }}
        ]
    }}
    ```
    
    Do **not** wrap the "files" key inside any other keys.
    """.strip()

    if not api_base:
        api_base = f"http://{host}:{port}"
        logger.info(f"API Base set to: {api_base}")
        console.print(f"[bold blue]API Base set to: {api_base}[/bold blue]")

    try:
        response = completion(
            model=model,
            messages=[
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": json.dumps(summaries)},
            ],
            response_format={"type": "json_object"},
            api_base=api_base,
            stream=stream
        )
    except Exception as e:
        logger.error(f"LiteLLM Error >>> {e}")
        console.print(f"[bold red]LiteLLM Error:[/bold red] {e}")
        return []

    if response is None:
        logger.warning("No response received from the API.")
        console.print("[bold yellow]No response received from the API.[/bold yellow]")
        return []

    try:
        response_dict = response.to_dict() if hasattr(response, "to_dict") else json.loads(str(response))
    except (TypeError, json.JSONDecodeError) as e:
        logger.error(f"Error parsing response: {e}")
        console.print(f"[bold red]Error parsing response:[/bold red] {e}")
        return []

    content = response_dict.get("choices", [{}])[0].get("message", {}).get("content", "")

    try:
        parsed_content = json.loads(content)
        file_tree = find_key(parsed_content, "files")
        if file_tree is None:
            raise KeyError("'files' key not found in the response.")
    except (json.JSONDecodeError, KeyError) as e:
        logger.error(f"Error decoding JSON content: {e}")
        console.print(f"[bold red]Error decoding JSON content:[/bold red] {e}")
        # Optionally, print the raw content for debugging
        console.print(f"[bold yellow]Raw Content:[/bold yellow]\n{content}")
        return []

    logger.info(f"Created file tree for {len(file_tree)} files")
    console.print(f"[bold green]Created file tree for {len(file_tree)} files[/bold green]")
    return file_tree

@task
def concatenate_summaries_and_file_tree(
    summaries: List[Dict[str, Any]],
    file_tree: List[Dict[str, str]]
) -> List[Dict[str, Any]]:
    """
    Concatenate summaries and file_tree into a single dictionary for each file.
    """
    concatenated = []
    summary_dict = {item['file_path']: item['summary'] for item in summaries}

    for item in file_tree:
        src_path = item.get("src_path")
        dst_path = item.get("dst_path")
        dst_path_new = item.get("dst_path_new")
        summary = summary_dict.get(src_path, "No summary available.")
        concatenated.append({
            "file_path": src_path,
            "summary": summary,
            "dst_path": dst_path,
            "dst_path_new": dst_path_new
        })

    logger.info(f"Concatenated summary and file tree for {len(concatenated)} files")
    console.print(f"[bold green]Concatenated summary and file tree for {len(concatenated)} files[/bold green]")
    return concatenated

@task
def create_subdirectories(file_tree: List[Dict[str, str]]):
    """
    Create all necessary subdirectories in the destination paths.
    """
    try:
        for file in file_tree:
            dst_path = file.get("dst_path")
            dst_path_new = file.get("dst_path_new")
            
            # Extract directories from the destination paths
            dst_dir = os.path.dirname(dst_path)
            dst_new_dir = os.path.dirname(dst_path_new)
            
            # Create the directories if they don't exist
            os.makedirs(dst_dir, exist_ok=True)
            os.makedirs(dst_new_dir, exist_ok=True)
        
        logger.info("All necessary subdirectories created.")
        console.print("[bold green]All necessary subdirectories created.[/bold green]")
    except Exception as e:
        logger.error(f"Error creating subdirectories: {e}")
        console.print(f"[bold red]Error creating subdirectories:[/bold red] {e}")

@task
def display_organized_files(organized_files: List[Dict[str, str]]):
    """
    Display organized files using Rich Table.
    """
    if not organized_files:
        console.print("[bold red]No organized files to display.[/bold red]")
        return

    table = Table(title="Organized Files", box=box.MINIMAL_DOUBLE_HEAD)
    table.add_column("Source Path", style="cyan", no_wrap=True)
    table.add_column("Destination Path", style="green")

    for file in organized_files:
        table.add_row(file.get("file_path", ""), file.get("dst_path", ""))
    
    console.print(table)

@task
def display_concatenated_dict(concatenated_dict: List[Dict[str, Any]]):
    """
    Display concatenated summaries and file tree using Rich Table.
    """
    if not concatenated_dict:
        console.print("[bold red]No concatenated data to display.[/bold red]")
        return

    table = Table(title="Summaries and Organized Files", box=box.MINIMAL_DOUBLE_HEAD)
    table.add_column("File Path", style="cyan", no_wrap=True)
    table.add_column("Summary", style="green")
    table.add_column("Destination Path", style="magenta")
    table.add_column("Destination Path New", style="yellow")

    for item in concatenated_dict:
        table.add_row(
            item.get("file_path", ""), 
            item.get("summary", ""), 
            item.get("dst_path", ""), 
            item.get("dst_path_new", "")
        )
    
    console.print(table)

@flow(name="Document Processing Workflow")
def document_processing_workflow(
    source_path: str,
    destination_path: str,
    api_host: str,
    api_port: int,
    summary_model: str,
    tree_model: str,
    api_base: str = None,
    stream: bool = False
) -> Dict[str, Any]:
    """
    Orchestrates the document processing workflow: loading documents, querying summaries, creating a file tree, and concatenating results.

    Args:
        source_path (str): Path to the source documents directory.
        destination_path (str): Path to the destination directory for organized files.
        api_host (str): API host address.
        api_port (int): API port number.
        summary_model (str): Model name for summarizing documents.
        tree_model (str): Model name for creating file tree.
        api_base (str, optional): Base URL for the API. Defaults to None.
        stream (bool, optional): Whether to use streaming. Defaults to False.

    Returns:
        Dict[str, Any]: Dictionary containing summaries, file_tree, and concatenated data.
    """
    # Initial setup
    set_success_callback()
    list_ollama_models()

    # Load and process documents
    loaded_docs = load_documents(source_path)
    unique_docs = process_metadata(loaded_docs)

    # Generate summaries
    summaries = query_summaries(
        doc_dicts=unique_docs,
        host=api_host,
        port=api_port,
        model=summary_model,
        api_base=api_base,
        stream=stream
    )

    # Create file tree
    file_tree = create_file_tree(
        summaries=summaries.get("files", []),
        host=api_host,
        port=api_port,
        source_path=source_path,
        destination_path=destination_path,
        model=tree_model,
        api_base=api_base,
        stream=stream
    )

    # Create necessary subdirectories
    create_subdirectories(file_tree)

    # Concatenate summaries and file_tree
    concatenated_dict = concatenate_summaries_and_file_tree(summaries.get("files", []), file_tree)

    # Display organized files using Rich
    display_organized_files(file_tree)

    # Display concatenated summaries and file_tree
    display_concatenated_dict(concatenated_dict)

    # Return all results
    return {
        "summaries": summaries,
        "file_tree": file_tree,
        "concatenated_data": concatenated_dict
    }


## >>>>> LLM Tests

### Anthropic - claude-3-haiku-20240307

In [2]:

if __name__ == "__main__":
    # Define your parameters
    source_directory = "/home/rakesh/Downloads/sample_data/four/"  # Replace with your actual source documents path
    destination_directory = "/home/rakesh/Downloads/sample_data/four_organized/"  # Replace with your desired destination path
    api_host = "localhost"                                         # API host address (not needed if api_base is provided)
    api_port = 8111                                                # API port number
    summary_model = "anthropic/claude-3-haiku-20240307"
    tree_model = "anthropic/claude-3-haiku-20240307"
    api_base_url = "https://api.anthropic.com"                     # Base URL for the API
    use_streaming = False                                          # Set to True if you want streaming

    # Ensure destination directory exists
    os.makedirs(destination_directory, exist_ok=True)

    # Run the workflow
    organized_files_result = document_processing_workflow(
        source_path=source_directory,
        destination_path=destination_directory,
        api_host=api_host,
        api_port=api_port,
        summary_model=summary_model,
        tree_model=tree_model,
        api_base=api_base_url,                                  # Providing the API base URL
        stream=use_streaming
    )

    # Extract the results
    summaries = organized_files_result.get("summaries", {})
    file_tree = organized_files_result.get("file_tree", [])
    concatenated_data = organized_files_result.get("concatenated_data", [])

    # Optionally, pretty-print the results using Rich
    console.print("[bold yellow]Summaries:[/bold yellow]")
    pprint(json.dumps(summaries, indent=4))

    console.print("\n[bold yellow]File Tree:[/bold yellow]")
    pprint(json.dumps(file_tree, indent=4))

    console.print("\n[bold yellow]Concatenated Data:[/bold yellow]")
    pprint(json.dumps(concatenated_data, indent=4))


Summaries:
{'files': [{'file_path': '/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt',
            'summary': 'This file discusses the rise of Vertical Large '
                       'Language Model (LLM) Agents as the next big wave in '
                       'Software as a Service (SaaS) innovations. It '
                       'highlights how these AI agents tailored to specific '
                       'industries can revolutionize workflows and complex '
                       "tasks, using the success story of Case Text's "
                       'Co-Counsel as an example.'},
           {'file_path': '/home/rakesh/Downloads/sample_data/four/random_file.txt',
            'summary': 'This file contains a test file with some basic '
                       'information, including a bank account number.'},
           {'file_path': '/home/rakesh/Downloads/sample_data/four/shad.txt',
            'summary': 'This file provides an overview of various Next.js '
                      

('{\n'
 '    "files": [\n'
 '        {\n'
 '            "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",\n'
 '            "summary": "This file discusses the rise of Vertical Large '
 'Language Model (LLM) Agents as the next big wave in Software as a Service '
 '(SaaS) innovations. It highlights how these AI agents tailored to specific '
 'industries can revolutionize workflows and complex tasks, using the success '
 'story of Case Text\'s Co-Counsel as an example."\n'
 '        },\n'
 '        {\n'
 '            "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/random_file.txt",\n'
 '            "summary": "This file contains a test file with some basic '
 'information, including a bank account number."\n'
 '        },\n'
 '        {\n'
 '            "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/shad.txt",\n'
 '            "summary": "This file provides an overview of various Next.js '
 'starter projects, including information about their f

('[\n'
 '    {\n'
 '        "src_path": '
 '"/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized/technology/vertical_llms/dsflsdflj.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized/technology/vertical_llms/dsflsdflj_v1.txt"\n'
 '    },\n'
 '    {\n'
 '        "src_path": '
 '"/home/rakesh/Downloads/sample_data/four/random_file.txt",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized/misc/random_file.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized/misc/random_file_v1.txt"\n'
 '    },\n'
 '    {\n'
 '        "src_path": "/home/rakesh/Downloads/sample_data/four/shad.txt",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized/technology/nextjs_starters/shad.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized/technology/nextjs_starters

('[\n'
 '    {\n'
 '        "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",\n'
 '        "summary": "This file discusses the rise of Vertical Large Language '
 'Model (LLM) Agents as the next big wave in Software as a Service (SaaS) '
 'innovations. It highlights how these AI agents tailored to specific '
 'industries can revolutionize workflows and complex tasks, using the success '
 'story of Case Text\'s Co-Counsel as an example.",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized/technology/vertical_llms/dsflsdflj.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized/technology/vertical_llms/dsflsdflj_v1.txt"\n'
 '    },\n'
 '    {\n'
 '        "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/random_file.txt",\n'
 '        "summary": "This file contains a test file with some basic '
 'information, including a bank account number.",\n'
 '        "dst_path": '
 '"/home/rakesh/Download

### Openrouter - gemini-2.0-flash-exp:free

In [26]:

if __name__ == "__main__":
    # Define your parameters
    source_directory = "/home/rakesh/Downloads/sample_data/four/"  # Replace with your actual source documents path
    destination_directory = "/home/rakesh/Downloads/sample_data/four_organized_or/"  # Replace with your desired destination path
    api_host = "localhost"                                         # API host address (not needed if api_base is provided)
    api_port = 8111                                                # API port number
    summary_model = "openrouter/google/gemini-2.0-flash-exp:free"
    tree_model = "openrouter/google/gemini-2.0-flash-exp:free"
    api_base_url = "https://openrouter.ai/api/v1"                     # Base URL for the API
    use_streaming = False                                          # Set to True if you want streaming
    # model="ollama/smollm2:135m",

    # Ensure destination directory exists
    os.makedirs(destination_directory, exist_ok=True)

    # Run the workflow
    organized_files_result = document_processing_workflow(
        source_path=source_directory,
        destination_path=destination_directory,
        api_host=api_host,
        api_port=api_port,
        summary_model=summary_model,
        tree_model=tree_model,
        api_base=api_base_url,                                  # Providing the API base URL
        stream=use_streaming
    )

    # Extract the results
    summaries = organized_files_result.get("summaries", {})
    file_tree = organized_files_result.get("file_tree", [])
    concatenated_data = organized_files_result.get("concatenated_data", [])

    # Optionally, pretty-print the results using Rich
    console.print("[bold yellow]Summaries:[/bold yellow]")
    pprint(json.dumps(summaries, indent=4))

    console.print("\n[bold yellow]File Tree:[/bold yellow]")
    pprint(json.dumps(file_tree, indent=4))

    console.print("\n[bold yellow]Concatenated Data:[/bold yellow]")
    pprint(json.dumps(concatenated_data, indent=4))


>>>> Content summary:
```json
{
  "files": [
    {
      "file_path": "/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",
      "summary": "Discusses the emergence of Vertical Large Language Model (LLM) Agents in SaaS, using Case Text's Co-Counsel as an example of success in the legal industry."
    },
    {
      "file_path": "/home/rakesh/Downloads/sample_data/four/random_file.txt",
      "summary": "Contains a short text with a bank account number: 1234567890."
    },
    {
      "file_path": "/home/rakesh/Downloads/sample_data/four/shad.txt",
      "summary": "Lists and describes 8 Next.js starter projects, including their tech stacks (framework, styling, UI libraries, databases, and other features)"
    }
  ]
}
```



('{\n'
 '    "files": [],\n'
 '    "usage": {\n'
 '        "completion_tokens": 221,\n'
 '        "prompt_tokens": 1629,\n'
 '        "total_tokens": 1850\n'
 '    },\n'
 '    "cost": 0.0\n'
 '}')


'[]'


'[]'


### Ollama

In [7]:

if __name__ == "__main__":
    # Define your parameters
    source_directory = "/home/rakesh/Downloads/sample_data/four/"  # Replace with your actual source documents path
    destination_directory = "/home/rakesh/Downloads/sample_data/four_organized_o/"  # Replace with your desired destination path
    api_host = "localhost"                                         # API host address (not needed if api_base is provided)
    api_port = 8111                                                # API port number
    summary_model = "ollama/llama3.2:latest"
    tree_model = "ollama/llama3.2:latest"
    api_base_url = "http://localhost:11434"                     # Base URL for the API
    use_streaming = False                                          # Set to True if you want streaming
    # model="ollama/smollm2:135m",

    # Ensure destination directory exists
    os.makedirs(destination_directory, exist_ok=True)

    # Run the workflow
    organized_files_result = document_processing_workflow(
        source_path=source_directory,
        destination_path=destination_directory,
        api_host=api_host,
        api_port=api_port,
        summary_model=summary_model,
        tree_model=tree_model,
        api_base=api_base_url,                                  # Providing the API base URL
        stream=use_streaming
    )

    # Extract the results
    summaries = organized_files_result.get("summaries", {})
    file_tree = organized_files_result.get("file_tree", [])
    concatenated_data = organized_files_result.get("concatenated_data", [])

    # Optionally, pretty-print the results using Rich
    console.print("[bold yellow]Summaries:[/bold yellow]")
    pprint(json.dumps(summaries, indent=4))

    console.print("\n[bold yellow]File Tree:[/bold yellow]")
    pprint(json.dumps(file_tree, indent=4))

    console.print("\n[bold yellow]Concatenated Data:[/bold yellow]")
    pprint(json.dumps(concatenated_data, indent=4))



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



('{\n'
 '    "files": [],\n'
 '    "usage": {\n'
 '        "completion_tokens": 140,\n'
 '        "prompt_tokens": 1436,\n'
 '        "total_tokens": 1576\n'
 '    },\n'
 '    "cost": 0.0\n'
 '}')


'[]'


'[]'


### Groq

In [None]:

if __name__ == "__main__":
    # Define your parameters
    source_directory = "/home/rakesh/Downloads/sample_data/four/"  # Replace with your actual source documents path
    destination_directory = "/home/rakesh/Downloads/sample_data/four_organized/"  # Replace with your desired destination path
    api_host = "localhost"                                         # API host address (not needed if api_base is provided)
    api_port = 8111                                                # API port number
    summary_model = "groq/llama3-8b-8192"
    tree_model = "groq/llama3-8b-8192"
    api_base_url = "https://api.groq.com/openai/v1"                     # Base URL for the API
    use_streaming = False                                          # Set to True if you want streaming

    # Ensure destination directory exists
    os.makedirs(destination_directory, exist_ok=True)

    # Run the workflow
    organized_files_result = document_processing_workflow(
        source_path=source_directory,
        destination_path=destination_directory,
        api_host=api_host,
        api_port=api_port,
        summary_model=summary_model,
        tree_model=tree_model,
        api_base=api_base_url,                                  # Providing the API base URL
        stream=use_streaming
    )

    # Extract the results
    summaries = organized_files_result.get("summaries", {})
    file_tree = organized_files_result.get("file_tree", [])
    concatenated_data = organized_files_result.get("concatenated_data", [])

    # Optionally, pretty-print the results using Rich
    console.print("[bold yellow]Summaries:[/bold yellow]")
    pprint(json.dumps(summaries, indent=4))

    console.print("\n[bold yellow]File Tree:[/bold yellow]")
    pprint(json.dumps(file_tree, indent=4))

    console.print("\n[bold yellow]Concatenated Data:[/bold yellow]")
    pprint(json.dumps(concatenated_data, indent=4))


# OLLAMA TEST

In [27]:
import json
import os
from typing import List, Dict, Any

from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
import ollama
from litellm import completion, success_callback

from prefect import flow, task
from prefect.logging import get_logger

# Import Rich components
from rich.console import Console
from rich.table import Table
from rich import box
from pprint import pprint

# Initialize Rich console
console = Console()

# Initialize Prefect logger
logger = get_logger()

# Load environment variables
load_dotenv('.env.local') 

# Define model-related classes (unchanged)
class ModelDetails:
    def __init__(self, parent_model, format, family, families, parameter_size, quantization_level):
        self.parent_model = parent_model
        self.format = format
        self.family = family
        self.families = families
        self.parameter_size = parameter_size
        self.quantization_level = quantization_level

class Model:
    def __init__(self, model, modified_at, digest, size, details):
        self.model = model
        self.modified_at = modified_at
        self.digest = digest
        self.size = size
        self.details = details

class ListResponse:
    def __init__(self, models):
        self.models = models

# Global cost tracker
COST_TRACKER = {"cost": 0.0}

@task
def list_ollama_models():
    """
    Fetch and display Ollama models using Rich Table.
    """
    try:
        ollama_models = ollama.list()
        if not ollama_models.models:
            logger.warning("No Ollama models found.")
            console.print("[bold yellow]No Ollama models found.[/bold yellow]")
            return
        
        # Create a Rich table
        table = Table(title="Available Ollama Models", box=box.MINIMAL_DOUBLE_HEAD)
        table.add_column("Model Name", style="cyan", no_wrap=True)
        table.add_column("Modified At", style="magenta")
        table.add_column("Digest", style="green")
        table.add_column("Size (bytes)", justify="right", style="yellow")
        table.add_column("Param Size", justify="right", style="yellow")
        table.add_column("Quant Level", style="blue")
        table.add_column("Family", style="red")
        table.add_column("Families", style="red")

        for model in ollama_models.models:
            table.add_row(
                model.model,
                str(model.modified_at),
                model.digest,
                str(model.size),
                str(model.details.parameter_size),
                model.details.quantization_level,
                model.details.family,
                ", ".join(model.details.families)
            )
        
        console.print(table)
    except Exception as e:
        logger.error(f"Error fetching Ollama models: {e}")
        console.print(f"[bold red]Error fetching Ollama models:[/bold red] {e}")

@task
def track_cost_callback(kwargs, completion_response, start_time, end_time, stream=False):
    """
    Callback function to track and calculate the cost based on token usage.
    """
    try:
        if hasattr(completion_response, "to_dict"):
            response_dict = completion_response.to_dict()
        elif isinstance(completion_response, dict):
            response_dict = completion_response
        else:
            response_dict = json.loads(str(completion_response))

        usage = response_dict.get("usage", {})
        total_tokens = usage.get("total_tokens", 0)
        COST_TRACKER["cost"] = (total_tokens / 1000.0) * 0.003  # Example cost calculation
        logger.info(f"Calculated cost: {COST_TRACKER['cost']}")
        console.print(f"[bold green]Calculated cost:[/bold green] [yellow]{COST_TRACKER['cost']}[/yellow]")
    except Exception as e:
        logger.error(f"Error in track_cost_callback: {e}")
        console.print(f"[bold red]Error in track_cost_callback:[/bold red] {e}")

@task
def set_success_callback():
    """
    Set the global success callback for cost tracking.
    """
    try:
        # Ensure success_callback is treated as a list
        # If success_callback is a list-like structure from litellm, we can do this:
        success_callback.clear()
        success_callback.append(track_cost_callback)
        logger.info("Success callback set successfully.")
    except Exception as e:
        logger.error(f"Error setting success callback: {e}")
        console.print(f"[bold red]Error setting success callback:[/bold red] {e}")

@task
def load_documents(path: str) -> List[Dict[str, Any]]:
    """
    Load documents from the specified path.
    """
    try:
        reader = SimpleDirectoryReader(input_dir=path)
        documents = reader.load_data()
        logger.info(f"Loaded {len(documents)} documents from {path}")
        console.print(f"[bold green]Loaded {len(documents)} documents from {path}[/bold green]")
        return [{"content": d.text, **d.metadata} for d in documents]
    except Exception as e:
        logger.error(f"Error loading documents from {path}: {e}")
        console.print(f"[bold red]Error loading documents from {path}:[/bold red] {e}")
        return []

@task
def process_metadata(doc_dicts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Process metadata to remove duplicate file entries.
    """
    try:
        file_seen = set()
        metadata_list = []
        for doc in doc_dicts:
            if doc["file_path"] not in file_seen:
                file_seen.add(doc["file_path"])
                metadata_list.append(doc)
        logger.info(f"Processed metadata: {len(metadata_list)} unique documents")
        console.print(f"[bold green]Processed metadata: {len(metadata_list)} unique documents[/bold green]")
        return metadata_list
    except Exception as e:
        logger.error(f"Error processing metadata: {e}")
        console.print(f"[bold red]Error processing metadata:[/bold red] {e}")
        return []

@task
def query_summaries(
    doc_dicts: List[Dict[str, Any]],
    host: str,
    port: int,
    model: str,
    api_base: str = None,
    stream: bool = False
) -> Dict[str, Any]:
    PROMPT = f""" 
    The following is a list of file contents, along with their metadata. For each file, provide a summary of the contents. The purpose of the summary is to organize files based on their content. To this end provide a concise but informative summary. Try to make the summary as specific to the file as possible. {doc_dicts}
    
    Do not call any functions. Do not return a function call. Only return the requested JSON.
    Return a JSON object with the following schema:
    
    ```json
    {{
      "files": [
        {{
          "file_path": "path to the file including name",
          "summary": "summary of the content"
        }}
      ]
    }}
    ```
    """.strip()

    if not api_base:
        api_base = f"http://{host}:{port}"
        logger.info(f"API Base set to: {api_base}")
        console.print(f"[bold blue]API Base set to: {api_base}[/bold blue]")

    try:
        response = completion(
            model=model, 
            messages=[
                {
                    "role": "system", 
                    "content": "Always return JSON. Do not include any other text or formatting characters."
                },
                {
                    "role": "user", 
                    "content": PROMPT
                }
            ],
            api_base=api_base,
            stream=stream,
            response_format={"type": "json_object"},  # Ensures the response is JSON
            # functions=[],  # Disable function calls
            # function_call="none"  # Do not allow the LLM to return a function call
        )
    except Exception as e:
        logger.error(f"LiteLLM Error >>> {e}")
        console.print(f"[bold red]LiteLLM Error:[/bold red] {e}")
        return {"files": [], "cost": COST_TRACKER["cost"]}

    if response is None:
        logger.warning("No response received from the API.")
        console.print("[bold yellow]No response received from the API.[/bold yellow]")
        return {"files": [], "cost": COST_TRACKER["cost"]}

    try:
        response_dict = response.to_dict() if hasattr(response, "to_dict") else json.loads(str(response))
    except (TypeError, json.JSONDecodeError) as e:
        logger.error(f"Error parsing response: {e}")
        console.print(f"[bold red]Error parsing response:[/bold red] {e}")
        return {"files": [], "cost": COST_TRACKER["cost"]}

    content = response_dict.get("choices", [{}])[0].get("message", {}).get("content", "")
    print(">>>> Content summary:")
    print(content)

    try:
        summaries = json.loads(content)
        print("Summaries:")
        print(summaries)
    except json.JSONDecodeError:
        logger.error("Error decoding JSON content from summaries.")
        console.print("[bold red]Error decoding JSON content from summaries.[/bold red]")
        summaries = {"files": []}

    if isinstance(summaries, list) and summaries and isinstance(summaries[0], dict):
        summaries = summaries[0]

    usage = response_dict.get("usage", {})
    if usage and isinstance(summaries, dict):
        summaries["usage"] = {
            "completion_tokens": usage.get("completion_tokens"),
            "prompt_tokens": usage.get("prompt_tokens"),
            "total_tokens": usage.get("total_tokens")
        }

    if isinstance(summaries, dict):
        summaries["cost"] = COST_TRACKER["cost"]
    else:
        summaries = {"files": [], "cost": COST_TRACKER["cost"]}

    logger.info(f"Generated summaries for {len(summaries.get('files', []))} files with cost {summaries.get('cost')}")
    console.print(f"[bold green]Generated summaries for {len(summaries.get('files', []))} files with cost {summaries.get('cost')}[/bold green]")
    return summaries


@task
def create_file_tree(
    summaries: List[Dict[str, Any]],
    host: str,
    port: int,
    source_path: str,
    destination_path: str,
    model: str = "llama-3.1-70b-versatile",
    api_base: str = None,
    stream: bool = False
) -> List[Dict[str, str]]:
    # Define the helper function within the task
    def find_key(obj: Any, key: str) -> Any:
        if isinstance(obj, dict):
            if key in obj:
                return obj[key]
            for value in obj.values():
                result = find_key(value, key)
                if result is not None:
                    return result
        elif isinstance(obj, list):
            for item in obj:
                result = find_key(item, key)
                if result is not None:
                    return result
        return None

    PROMPT = f"""
    You will be provided with a list of source files and a summary of their contents. The source files are located in '{source_path}', and the destination directory is '{destination_path}'.
    
    For each file, propose:
    1. 'dst_path': A new file path under the destination directory with the same file name.
    2. 'dst_path_new': A new file path under the destination directory with an updated file name (e.g., adding a version number or timestamp).
    
    Follow good naming conventions and organizational best practices. Here are guidelines:
    - Group related files together.
    - Incorporate metadata such as date, version, or experiment details into folder names.
    - Use clear and descriptive names without spaces or special characters.
    - Do not change the file extension.
    - If the file is already well-named or follows a known convention, retain its name for 'dst_path'.
    
    **Example**:
    ```json
    {{
        "files": [
            {{
                "src_path": "/home/user/source/file1.txt",
                "dst_path": "/home/user/destination/2024/04/file1.txt",
                "dst_path_new": "/home/user/destination/2024/04/file1_v2.txt"
            }}
        ]
    }}
    ```
    
    **Important:** Your response **must** be a JSON object with the following schema **at the top level**:
    ```json
    {{
        "files": [
            {{
                "src_path": "original file path",
                "dst_path": "new file path under destination directory with same file name",
                "dst_path_new": "new file path under destination directory with updated file name"
            }}
        ]
    }}
    ```
    
    Do **not** wrap the "files" key inside any other keys.
    """.strip()

    if not api_base:
        api_base = f"http://{host}:{port}"
        logger.info(f"API Base set to: {api_base}")
        console.print(f"[bold blue]API Base set to: {api_base}[/bold blue]")

    try:
        response = completion(
            model=model,
            messages=[
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": json.dumps(summaries)},
            ],
            api_base=api_base,
            stream=stream,
            response_format={"type": "json_object"}  # Ensures the response is JSON
        )
    except Exception as e:
        logger.error(f"LiteLLM Error >>> {e}")
        console.print(f"[bold red]LiteLLM Error:[/bold red] {e}")
        return []

    if response is None:
        logger.warning("No response received from the API.")
        console.print("[bold yellow]No response received from the API.[/bold yellow]")
        return []

    try:
        response_dict = response.to_dict() if hasattr(response, "to_dict") else json.loads(str(response))
    except (TypeError, json.JSONDecodeError) as e:
        logger.error(f"Error parsing response: {e}")
        console.print(f"[bold red]Error parsing response:[/bold red] {e}")
        return []

    content = response_dict.get("choices", [{}])[0].get("message", {}).get("content", "")

    try:
        parsed_content = json.loads(content)
        file_tree = find_key(parsed_content, "files")
        if file_tree is None:
            raise KeyError("'files' key not found in the response.")
    except (json.JSONDecodeError, KeyError) as e:
        logger.error(f"Error decoding JSON content: {e}")
        console.print(f"[bold red]Error decoding JSON content:[/bold red] {e}")
        console.print(f"[bold yellow]Raw Content:[/bold yellow]\n{content}")
        return []

    logger.info(f"Created file tree for {len(file_tree)} files")
    console.print(f"[bold green]Created file tree for {len(file_tree)} files[/bold green]")
    return file_tree

    """
    Create a file tree based on the provided summaries.
    Returns src_path, dst_path, and dst_path_new for each file.
    """
    # Define the helper function within the task
    def find_key(obj: Any, key: str) -> Any:
        if isinstance(obj, dict):
            if key in obj:
                return obj[key]
            for value in obj.values():
                result = find_key(value, key)
                if result is not None:
                    return result
        elif isinstance(obj, list):
            for item in obj:
                result = find_key(item, key)
                if result is not None:
                    return result
        return None

    PROMPT = f"""
    You will be provided with a list of source files and a summary of their contents. The source files are located in '{source_path}', and the destination directory is '{destination_path}'.
    
    For each file, propose:
    1. 'dst_path': A new file path under the destination directory with the same file name.
    2. 'dst_path_new': A new file path under the destination directory with an updated file name (e.g., adding a version number or timestamp).
    
    Follow good naming conventions and organizational best practices. Here are guidelines:
    - Group related files together.
    - Incorporate metadata such as date, version, or experiment details into folder names.
    - Use clear and descriptive names without spaces or special characters.
    - Do not change the file extension.
    - If the file is already well-named or follows a known convention, retain its name for 'dst_path'.
    
    **Example**:
    ```json
    {{
        "files": [
            {{
                "src_path": "/home/user/source/file1.txt",
                "dst_path": "/home/user/destination/2024/04/file1.txt",
                "dst_path_new": "/home/user/destination/2024/04/file1_v2.txt"
            }}
        ]
    }}
    ```
    
    **Important:** Your response **must** be a JSON object with the following schema **at the top level**:
    ```json
    {{
        "files": [
            {{
                "src_path": "original file path",
                "dst_path": "new file path under destination directory with same file name",
                "dst_path_new": "new file path under destination directory with updated file name"
            }}
        ]
    }}
    ```
    
    Do **not** wrap the "files" key inside any other keys.
    """.strip()

    if not api_base:
        api_base = f"http://{host}:{port}"
        logger.info(f"API Base set to: {api_base}")
        console.print(f"[bold blue]API Base set to: {api_base}[/bold blue]")

    try:
        response = completion(
            model=model,
            messages=[
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": json.dumps(summaries)},
            ],
            api_base=api_base,
            stream=stream
        )
    except Exception as e:
        logger.error(f"LiteLLM Error >>> {e}")
        console.print(f"[bold red]LiteLLM Error:[/bold red] {e}")
        return []

    if response is None:
        logger.warning("No response received from the API.")
        console.print("[bold yellow]No response received from the API.[/bold yellow]")
        return []

    try:
        if hasattr(response, "to_dict"):
            response_dict = response.to_dict()
        elif isinstance(response, dict):
            response_dict = response
        else:
            response_dict = json.loads(str(response))
    except (TypeError, json.JSONDecodeError) as e:
        logger.error(f"Error parsing response: {e}")
        console.print(f"[bold red]Error parsing response:[/bold red] {e}")
        return []

    content = response_dict.get("choices", [{}])[0].get("message", {}).get("content", "")

    try:
        parsed_content = json.loads(content)
        file_tree = find_key(parsed_content, "files")
        if file_tree is None:
            raise KeyError("'files' key not found in the response.")
    except (json.JSONDecodeError, KeyError) as e:
        logger.error(f"Error decoding JSON content: {e}")
        console.print(f"[bold red]Error decoding JSON content:[/bold red] {e}")
        console.print(f"[bold yellow]Raw Content:[/bold yellow]\n{content}")
        return []

    logger.info(f"Created file tree for {len(file_tree)} files")
    console.print(f"[bold green]Created file tree for {len(file_tree)} files[/bold green]")
    return file_tree

@task
def concatenate_summaries_and_file_tree(
    summaries: List[Dict[str, Any]],
    file_tree: List[Dict[str, str]]
) -> List[Dict[str, Any]]:
    """
    Concatenate summaries and file_tree into a single dictionary for each file.
    """
    concatenated = []
    summary_dict = {item['file_path']: item['summary'] for item in summaries}

    for item in file_tree:
        src_path = item.get("src_path")
        dst_path = item.get("dst_path")
        dst_path_new = item.get("dst_path_new")
        summary = summary_dict.get(src_path, "No summary available.")
        concatenated.append({
            "file_path": src_path,
            "summary": summary,
            "dst_path": dst_path,
            "dst_path_new": dst_path_new
        })

    logger.info(f"Concatenated summary and file tree for {len(concatenated)} files")
    console.print(f"[bold green]Concatenated summary and file tree for {len(concatenated)} files[/bold green]")
    return concatenated

@task
def create_subdirectories(file_tree: List[Dict[str, str]]):
    """
    Create all necessary subdirectories in the destination paths.
    """
    try:
        for file in file_tree:
            dst_path = file.get("dst_path")
            dst_path_new = file.get("dst_path_new")
            
            # Extract directories from the destination paths
            dst_dir = os.path.dirname(dst_path)
            dst_new_dir = os.path.dirname(dst_path_new)
            
            # Create the directories if they don't exist
            os.makedirs(dst_dir, exist_ok=True)
            os.makedirs(dst_new_dir, exist_ok=True)
        
        logger.info("All necessary subdirectories created.")
        console.print("[bold green]All necessary subdirectories created.[/bold green]")
    except Exception as e:
        logger.error(f"Error creating subdirectories: {e}")
        console.print(f"[bold red]Error creating subdirectories:[/bold red] {e}")

@task
def display_organized_files(organized_files: List[Dict[str, str]]):
    """
    Display organized files using Rich Table.
    """
    if not organized_files:
        console.print("[bold red]No organized files to display.[/bold red]")
        return

    table = Table(title="Organized Files", box=box.MINIMAL_DOUBLE_HEAD)
    table.add_column("Source Path", style="cyan", no_wrap=True)
    table.add_column("Destination Path", style="green")

    for file in organized_files:
        table.add_row(file.get("file_path", ""), file.get("dst_path", ""))
    
    console.print(table)

@task
def display_concatenated_dict(concatenated_dict: List[Dict[str, Any]]):
    """
    Display concatenated summaries and file tree using Rich Table.
    """
    if not concatenated_dict:
        console.print("[bold red]No concatenated data to display.[/bold red]")
        return

    table = Table(title="Summaries and Organized Files", box=box.MINIMAL_DOUBLE_HEAD)
    table.add_column("File Path", style="cyan", no_wrap=True)
    table.add_column("Summary", style="green")
    table.add_column("Destination Path", style="magenta")
    table.add_column("Destination Path New", style="yellow")

    for item in concatenated_dict:
        table.add_row(
            item.get("file_path", ""), 
            item.get("summary", ""), 
            item.get("dst_path", ""), 
            item.get("dst_path_new", "")
        )
    
    console.print(table)

@flow(name="Document Processing Workflow")
def document_processing_workflow(
    source_path: str,
    destination_path: str,
    api_host: str,
    api_port: int,
    summary_model: str,
    tree_model: str,
    api_base: str = None,
    stream: bool = False
) -> Dict[str, Any]:
    """
    Orchestrates the document processing workflow: loading documents, querying summaries, creating a file tree, and concatenating results.

    Args:
        source_path (str): Path to the source documents directory.
        destination_path (str): Path to the destination directory for organized files.
        api_host (str): API host address.
        api_port (int): API port number.
        summary_model (str): Model name for summarizing documents.
        tree_model (str): Model name for creating file tree.
        api_base (str, optional): Base URL for the API. Defaults to None.
        stream (bool, optional): Whether to use streaming. Defaults to False.

    Returns:
        Dict[str, Any]: Dictionary containing summaries, file_tree, and concatenated data.
    """
    # Initial setup
    set_success_callback()
    list_ollama_models()

    # Load and process documents
    loaded_docs = load_documents(source_path)
    unique_docs = process_metadata(loaded_docs)

    # Generate summaries
    summaries = query_summaries(
        doc_dicts=unique_docs,
        host=api_host,
        port=api_port,
        model=summary_model,
        api_base=api_base,
        stream=stream
    )

    # Create file tree
    file_tree = create_file_tree(
        summaries=summaries.get("files", []),
        host=api_host,
        port=api_port,
        source_path=source_path,
        destination_path=destination_path,
        model=tree_model,
        api_base=api_base,
        stream=stream
    )

    # Create necessary subdirectories
    create_subdirectories(file_tree)

    # Concatenate summaries and file_tree
    concatenated_dict = concatenate_summaries_and_file_tree(summaries.get("files", []), file_tree)

    # Display organized files using Rich
    display_organized_files(file_tree)

    # Display concatenated summaries and file_tree
    display_concatenated_dict(concatenated_dict)

    # Return all results
    return {
        "summaries": summaries,
        "file_tree": file_tree,
        "concatenated_data": concatenated_dict
    }


## LLM Tests

### OLLama 

In [30]:

if __name__ == "__main__":
    # Define your parameters
    source_directory = "/home/rakesh/Downloads/sample_data/four/"  # Replace with your actual source documents path
    destination_directory = "/home/rakesh/Downloads/sample_data/four_organized_o/"  # Replace with your desired destination path
    api_host = "localhost"                                         # API host address (not needed if api_base is provided)
    api_port = 8111                                                # API port number
    summary_model = "ollama/smollm2:135m"
    # tree_model = "ollama/llama3.2:latest"
    tree_model = "ollama/smollm2:135m"
    api_base_url = "http://localhost:11434"                        # Base URL for the API
    use_streaming = False                                          # Set to True if you want streaming

    # Ensure destination directory exists
    os.makedirs(destination_directory, exist_ok=True)

    # Run the workflow
    organized_files_result = document_processing_workflow(
        source_path=source_directory,
        destination_path=destination_directory,
        api_host=api_host,
        api_port=api_port,
        summary_model=summary_model,
        tree_model=tree_model,
        api_base=api_base_url,
        stream=use_streaming
    )

    # Extract the results
    summaries = organized_files_result.get("summaries", {})
    file_tree = organized_files_result.get("file_tree", [])
    concatenated_data = organized_files_result.get("concatenated_data", [])

    # Optionally, pretty-print the results using Rich
    console.print("[bold yellow]Summaries:[/bold yellow]")
    pprint(json.dumps(summaries, indent=4))

    console.print("\n[bold yellow]File Tree:[/bold yellow]")
    pprint(json.dumps(file_tree, indent=4))

    console.print("\n[bold yellow]Concatenated Data:[/bold yellow]")
    pprint(json.dumps(concatenated_data, indent=4))



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.




[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.



'{\n    "files": [],\n    "cost": 0.003129\n}'


'[]'


'[]'


### OpenRouter - gemini-2.0-flash-exp:free

In [28]:
if __name__ == "__main__":
    # Define your parameters
    source_directory = "/home/rakesh/Downloads/sample_data/four/"  # Replace with your actual source documents path
    destination_directory = "/home/rakesh/Downloads/sample_data/four_organized_or/"  # Replace with your desired destination path
    api_host = "localhost"                                         # API host address (not needed if api_base is provided)
    api_port = 8111                                                # API port number
    summary_model = "openrouter/google/gemini-2.0-flash-exp:free"
    tree_model = "openrouter/google/gemini-2.0-flash-exp:free"
    api_base_url = "https://openrouter.ai/api/v1"                     # Base URL for the API
    use_streaming = False                                          # Set to True if you want streaming
    # model="ollama/smollm2:135m",

    # Ensure destination directory exists
    os.makedirs(destination_directory, exist_ok=True)

    # Run the workflow
    organized_files_result = document_processing_workflow(
        source_path=source_directory,
        destination_path=destination_directory,
        api_host=api_host,
        api_port=api_port,
        summary_model=summary_model,
        tree_model=tree_model,
        api_base=api_base_url,                                  # Providing the API base URL
        stream=use_streaming
    )

    # Extract the results
    summaries = organized_files_result.get("summaries", {})
    file_tree = organized_files_result.get("file_tree", [])
    concatenated_data = organized_files_result.get("concatenated_data", [])

    # Optionally, pretty-print the results using Rich
    console.print("[bold yellow]Summaries:[/bold yellow]")
    pprint(json.dumps(summaries, indent=4))

    console.print("\n[bold yellow]File Tree:[/bold yellow]")
    pprint(json.dumps(file_tree, indent=4))

    console.print("\n[bold yellow]Concatenated Data:[/bold yellow]")
    pprint(json.dumps(concatenated_data, indent=4))


>>>> Content summary:
{
"files": [
    {
        "file_path": "/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",
        "summary": "This article discusses the rise of Vertical Large Language Model (LLM) Agents in SaaS, highlighting Case Text’s Co-Counsel as a successful example. It explains why these agents are becoming billion-dollar opportunities and how organizations can use them."
    },
    {
        "file_path": "/home/rakesh/Downloads/sample_data/four/random_file.txt",
        "summary": "This is a test file containing a bank account number: 1234567890."
    },
    {
        "file_path": "/home/rakesh/Downloads/sample_data/four/shad.txt",
        "summary": "This article introduces Next.js and discusses several open-source Next.js starter projects for web development, including tailwind-nextjs-starter-blog, Vercel Platforms, next-forge, fragments, next-shadcn-dashboard-starter, langchain-nextjs-template, and nextjs-starter-kit. It details the tech stack for each project."

('{\n'
 '    "files": [\n'
 '        {\n'
 '            "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",\n'
 '            "summary": "This article discusses the rise of Vertical Large '
 'Language Model (LLM) Agents in SaaS, highlighting Case Text\\u2019s '
 'Co-Counsel as a successful example. It explains why these agents are '
 'becoming billion-dollar opportunities and how organizations can use them."\n'
 '        },\n'
 '        {\n'
 '            "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/random_file.txt",\n'
 '            "summary": "This is a test file containing a bank account '
 'number: 1234567890."\n'
 '        },\n'
 '        {\n'
 '            "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/shad.txt",\n'
 '            "summary": "This article introduces Next.js and discusses '
 'several open-source Next.js starter projects for web development, including '
 'tailwind-nextjs-starter-blog, Vercel Platforms, next-forge, fragmen

('[\n'
 '    {\n'
 '        "src_path": '
 '"/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/llm_agents/dsflsdflj.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/llm_agents/dsflsdflj_v1.txt"\n'
 '    },\n'
 '    {\n'
 '        "src_path": '
 '"/home/rakesh/Downloads/sample_data/four/random_file.txt",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/misc/random_file.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/misc/random_file_sensitive.txt"\n'
 '    },\n'
 '    {\n'
 '        "src_path": "/home/rakesh/Downloads/sample_data/four/shad.txt",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/nextjs_projects/shad.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/nextjs_projects/shad_project_list.txt"\n

('[\n'
 '    {\n'
 '        "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",\n'
 '        "summary": "This article discusses the rise of Vertical Large '
 'Language Model (LLM) Agents in SaaS, highlighting Case Text\\u2019s '
 'Co-Counsel as a successful example. It explains why these agents are '
 'becoming billion-dollar opportunities and how organizations can use them.",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/llm_agents/dsflsdflj.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/llm_agents/dsflsdflj_v1.txt"\n'
 '    },\n'
 '    {\n'
 '        "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/random_file.txt",\n'
 '        "summary": "This is a test file containing a bank account number: '
 '1234567890.",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/misc/random_file.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/

### OpenRouter - gpt-4o-mini

In [31]:
if __name__ == "__main__":
    # Define your parameters
    source_directory = "/home/rakesh/Downloads/sample_data/four/"  # Replace with your actual source documents path
    destination_directory = "/home/rakesh/Downloads/sample_data/four_organized_or/"  # Replace with your desired destination path
    api_host = "localhost"                                         # API host address (not needed if api_base is provided)
    api_port = 8111                                                # API port number
    summary_model = "openrouter/openai/gpt-4o-mini"
    tree_model = "openrouter/openai/gpt-4o-mini"
    api_base_url = "https://openrouter.ai/api/v1"                     # Base URL for the API
    use_streaming = False                                          # Set to True if you want streaming
    # model="ollama/smollm2:135m",

    # Ensure destination directory exists
    os.makedirs(destination_directory, exist_ok=True)

    # Run the workflow
    organized_files_result = document_processing_workflow(
        source_path=source_directory,
        destination_path=destination_directory,
        api_host=api_host,
        api_port=api_port,
        summary_model=summary_model,
        tree_model=tree_model,
        api_base=api_base_url,                                  # Providing the API base URL
        stream=use_streaming
    )

    # Extract the results
    summaries = organized_files_result.get("summaries", {})
    file_tree = organized_files_result.get("file_tree", [])
    concatenated_data = organized_files_result.get("concatenated_data", [])

    # Optionally, pretty-print the results using Rich
    console.print("[bold yellow]Summaries:[/bold yellow]")
    pprint(json.dumps(summaries, indent=4))

    console.print("\n[bold yellow]File Tree:[/bold yellow]")
    pprint(json.dumps(file_tree, indent=4))

    console.print("\n[bold yellow]Concatenated Data:[/bold yellow]")
    pprint(json.dumps(concatenated_data, indent=4))


>>>> Content summary:
{
  "files": [
    {
      "file_path": "/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",
      "summary": "This file discusses the rise of Vertical Large Language Model (LLM) Agents in the SaaS space, highlighting the success of Case Text’s AI-driven legal assistant, Co-Counsel, and the potential for billion-dollar opportunities in this technology."
    },
    {
      "file_path": "/home/rakesh/Downloads/sample_data/four/random_file.txt",
      "summary": "This is a test file containing critical information, including a bank account number, which should be handled with care."
    },
    {
      "file_path": "/home/rakesh/Downloads/sample_data/four/shad.txt",
      "summary": "The file provides a detailed overview of Next.js, an open source web development framework, along with a list of eight recommended boilerplates for building Next.js applications, including features and technology stacks for each starter project."
    }
  ]
}
Summaries:
{'files': [{'fi

('{\n'
 '    "files": [\n'
 '        {\n'
 '            "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",\n'
 '            "summary": "This file discusses the rise of Vertical Large '
 'Language Model (LLM) Agents in the SaaS space, highlighting the success of '
 'Case Text\\u2019s AI-driven legal assistant, Co-Counsel, and the potential '
 'for billion-dollar opportunities in this technology."\n'
 '        },\n'
 '        {\n'
 '            "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/random_file.txt",\n'
 '            "summary": "This is a test file containing critical information, '
 'including a bank account number, which should be handled with care."\n'
 '        },\n'
 '        {\n'
 '            "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/shad.txt",\n'
 '            "summary": "The file provides a detailed overview of Next.js, an '
 'open source web development framework, along with a list of eight '
 'recommended boilerplates f

('[\n'
 '    {\n'
 '        "src_path": '
 '"/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/2023/10/Vertical_Large_Language_Model_LLM_Agents.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/2023/10/Vertical_Large_Language_Model_LLM_Agents_v1.txt"\n'
 '    },\n'
 '    {\n'
 '        "src_path": '
 '"/home/rakesh/Downloads/sample_data/four/random_file.txt",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/2023/10/Critical_Information_Test_File.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/2023/10/Critical_Information_Test_File_v1.txt"\n'
 '    },\n'
 '    {\n'
 '        "src_path": "/home/rakesh/Downloads/sample_data/four/shad.txt",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/2023/10/Next.js_Overview.txt",\n'
 '        "dst_path_new": '

('[\n'
 '    {\n'
 '        "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/dsflsdflj.txt",\n'
 '        "summary": "This file discusses the rise of Vertical Large Language '
 'Model (LLM) Agents in the SaaS space, highlighting the success of Case '
 'Text\\u2019s AI-driven legal assistant, Co-Counsel, and the potential for '
 'billion-dollar opportunities in this technology.",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/2023/10/Vertical_Large_Language_Model_LLM_Agents.txt",\n'
 '        "dst_path_new": '
 '"/home/rakesh/Downloads/sample_data/four_organized_or/2023/10/Vertical_Large_Language_Model_LLM_Agents_v1.txt"\n'
 '    },\n'
 '    {\n'
 '        "file_path": '
 '"/home/rakesh/Downloads/sample_data/four/random_file.txt",\n'
 '        "summary": "This is a test file containing critical information, '
 'including a bank account number, which should be handled with care.",\n'
 '        "dst_path": '
 '"/home/rakesh/Downloads/sample_dat