<a href="https://colab.research.google.com/github/reeperx/AI-Image-Art-Starter-File/blob/main/n8n_Chunking_(Public).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pinecone tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [None]:
import os
import re
import openai
from google.colab import drive
from pinecone import Pinecone, ServerlessSpec

################################################################################
# 1) Mount Google Drive
################################################################################
# The workflow files processed later are read from a path under /content/drive.
drive.mount("/content/drive")

################################################################################
# 2) Set API Keys
################################################################################
# Placeholders: replace them with real values before running the cell.
PINECONE_API_KEY = "ENTER_YOUR_KEY"
PINECONE_REGION = "ENTER_LOCATION"
OPENAI_API_KEY = "ENTER_YOUR_KEY"

# Name of the Pinecone index that receives the vectors.
INDEX_NAME = "ENTER_NAME"
# Embedding model used for both tokenization and embedding ('3-large', ~8k context).
EMBED_MODEL = "text-embedding-3-large"

# The module-level openai client (used for embeddings) picks the key up from here.
openai.api_key = OPENAI_API_KEY

################################################################################
# 3) Initialize Pinecone (New Approach)
################################################################################
# NOTE(review): confirm the installed Pinecone client still makes use of the
# `environment` keyword; the region is also passed to ServerlessSpec below.
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_REGION)

# Create the index only when no index with this name exists yet.
existing_index_names = pc.list_indexes().names()
if INDEX_NAME not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region=PINECONE_REGION)
    pc.create_index(
        name=INDEX_NAME,
        dimension=3072,  # same dimension as 'text-embedding-3-large'
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used by the processing loop for upserts.
index = pc.Index(INDEX_NAME)

################################################################################
# 4) Utility Functions
################################################################################

def sanitize_vector_id(text: str) -> str:
    """
    Build a vector-ID-safe string by dropping every non-ASCII character.

    Characters with code points 0-127 are kept, in their original order;
    everything else (emoji, accented letters, CJK, ...) is removed. Nothing is
    substituted for removed characters, so the result may be shorter than the
    input or even empty.
    """
    ascii_chars = [ch for ch in text if ord(ch) <= 0x7F]
    return "".join(ascii_chars)

def chunk_text_by_tokens(text: str, chunk_size: int = 6000, model_name: str = EMBED_MODEL) -> list:
    """
    Cut 'text' into consecutive pieces of at most 'chunk_size' tokens.

    The text is tokenized with the tiktoken encoding registered for
    'model_name'; each window of token IDs is decoded back to a string.
    The chunks do not overlap, and only the last one may be shorter than
    'chunk_size'. Text that encodes to zero tokens yields an empty list.
    """
    import tiktoken

    try:
        encoder = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # tiktoken does not know this model name: use the cl100k_base encoding instead.
        encoder = tiktoken.get_encoding("cl100k_base")

    token_ids = encoder.encode(text)
    window_starts = range(0, len(token_ids), chunk_size)
    # NOTE(review): windows are cut at arbitrary token positions; confirm that
    # decoding a window that ends mid-character is acceptable for this data.
    return [
        encoder.decode(token_ids[start:start + chunk_size])
        for start in window_starts
    ]

def generate_title_from_json(json_text: str) -> str:
    """
    Ask the "o3-mini" chat model for a short, descriptive title of a workflow.

    The workflow JSON is embedded in a prompt that requests a lowercase,
    underscore-separated title such as "agent_google_sheet_slack".

    Returns the model's reply with surrounding whitespace stripped. If anything
    in the request fails, the error is printed and None is returned instead.
    """
    title_prompt = f"""
You are given a JSON representation of an automation workflow.
Generate a concise, descriptive, and uniform title that captures the automation's main functionality.
The title should be in lowercase and use underscores to separate words.
For example, if the automation reads from Google Sheets, calls an LLM, and sends a message to Slack,
you might return: agent_google_sheet_slack.
Only output the title.

JSON Content:
{json_text}
    """
    title = None
    try:
        from openai import OpenAI

        llm_client = OpenAI(api_key=OPENAI_API_KEY)
        completion = llm_client.chat.completions.create(
            messages=[{"role": "user", "content": title_prompt}],
            model="o3-mini",
            reasoning_effort="medium",
        )
        first_choice = completion.choices[0]
        title = first_choice.message.content.strip()
    except Exception as e:
        # Best effort: the caller falls back to the file name when no title comes back.
        print(f"Error generating title: {e}")
    return title

def generate_tldr_from_json(json_text: str) -> str:
    """
    Ask the "o3-mini" chat model for a one-sentence, plain-language summary
    (TLDR) of what the workflow described by the JSON does.

    Returns the model's reply with surrounding whitespace stripped. If anything
    in the request fails, the error is printed and None is returned instead.
    """
    tldr_prompt = f"""
You are given a JSON representation of an automation workflow.
Generate a concise one-sentence summary (TLDR) of what this automation does.
It should capture the core functionality in plain language.
Only output the summary.

JSON Content:
{json_text}
    """
    summary = None
    try:
        from openai import OpenAI

        llm_client = OpenAI(api_key=OPENAI_API_KEY)
        completion = llm_client.chat.completions.create(
            messages=[{"role": "user", "content": tldr_prompt}],
            model="o3-mini",
            reasoning_effort="medium",
        )
        first_choice = completion.choices[0]
        summary = first_choice.message.content.strip()
    except Exception as e:
        # Best effort: the caller substitutes an empty summary when none comes back.
        print(f"Error generating TLDR: {e}")
    return summary

################################################################################
# 5) Process Files: Generate Title & TLDR, Chunk by Tokens, Embed & Upsert
################################################################################
# For every .txt/.json file in the Drive folder: ask the LLM for a title and a
# one-sentence summary, split the file into token-bounded chunks, embed each
# chunk and upsert it into the Pinecone index together with its metadata.
# Failures on a single file or chunk are printed and skipped so that one bad
# input does not abort the whole run.
folder_path = "/content/drive/MyDrive/n8n Workflows"  # Adjust if needed

# Base IDs already handed out during this run. Upserting under an ID that
# already exists replaces the stored vector, so two files that end up with the
# same generated title must not share a base ID.
used_base_ids = set()

# os.listdir() returns names in arbitrary order; sorting makes the processing
# order (and therefore the duplicate suffixes below) reproducible.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.lower().endswith((".txt", ".json")):
        continue

    full_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(full_path):
        # e.g. a sub-folder whose name happens to end in .txt/.json
        continue

    try:
        with open(full_path, "r", encoding="utf-8") as f:
            file_text = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading '{file_name}': {e}")
        continue

    if not file_text.strip():
        # Nothing to title, summarize or embed: skip before spending LLM calls.
        print(f"Skipping empty file '{file_name}'.")
        continue

    # Generate a descriptive title for this automation
    generated_title = generate_title_from_json(file_text)
    # A title made up only of non-ASCII characters sanitizes to "", which is
    # unusable as a vector ID, so it is treated like a missing title.
    base_vector_id = sanitize_vector_id(generated_title) if generated_title else ""
    if base_vector_id:
        print(f"\nGenerated title for '{file_name}': {generated_title} (vector base ID: {base_vector_id})")
    else:
        base_vector_id = sanitize_vector_id(file_name)
        generated_title = file_name
        print(f"\nUsing fallback title for '{file_name}': {generated_title} (vector base ID: {base_vector_id})")

    # Keep base IDs unique within this run so a later file cannot overwrite
    # the vectors of an earlier file that received the same title.
    unique_base_id = base_vector_id
    duplicate_no = 2
    while unique_base_id in used_base_ids:
        unique_base_id = f"{base_vector_id}-{duplicate_no}"
        duplicate_no += 1
    if unique_base_id != base_vector_id:
        print(f"Vector base ID '{base_vector_id}' is already used in this run; using '{unique_base_id}' for '{file_name}'.")
        base_vector_id = unique_base_id
    used_base_ids.add(base_vector_id)

    # Generate a TLDR (agent summary) for this automation
    generated_tldr = generate_tldr_from_json(file_text)
    if generated_tldr:
        print(f"Generated TLDR for '{file_name}': {generated_tldr}")
    else:
        generated_tldr = ""
        print(f"Using empty TLDR for '{file_name}'.")

    # Token-based chunking: up to 6000 tokens each
    chunks = chunk_text_by_tokens(file_text, chunk_size=6000, model_name=EMBED_MODEL)
    print(f"Processing {len(chunks)} chunk(s) for file '{file_name}'.")

    # Process each chunk: embed and upsert into Pinecone
    for idx, chunk in enumerate(chunks):
        # Single-chunk files use the bare base ID; multi-chunk files get an index suffix.
        vector_id = base_vector_id if len(chunks) == 1 else f"{base_vector_id}_{idx}"

        try:
            embed_resp = openai.embeddings.create(
                input=[chunk],
                model=EMBED_MODEL
            )
            embedding = embed_resp.data[0].embedding
        except Exception as e:
            print(f"Error embedding chunk {idx} of '{file_name}': {e}")
            continue

        # Metadata stored next to the vector; the chunk text itself is kept
        # under "json_file" so it can be returned with query results.
        metadata = {
            "generated_title": generated_title,
            "agent_summary": generated_tldr,   # renamed from agent_breakdown
            "chunk_index": idx,
            "json_file": chunk                 # renamed snippet -> json_file
        }

        try:
            index.upsert(vectors=[(vector_id, embedding, metadata)])
            print(f"Upserted chunk {idx} of '{file_name}' as vector ID '{vector_id}'.")
        except Exception as e:
            print(f"Error upserting chunk {idx} of '{file_name}': {e}")

print("\nAll done! Your files have been processed with token-based chunking, descriptive titles, and agent summaries.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Generated title for '🤖🧠 AI Agent Chatbot + LONG TERM Memory + Note Storage + Telegram.txt': agent_chat_memory_notes_telegram (vector base ID: agent_chat_memory_notes_telegram)
Generated TLDR for '🤖🧠 AI Agent Chatbot + LONG TERM Memory + Note Storage + Telegram.txt': This automation listens for chat messages, retrieves and stores long-term memories and notes in Google Docs, processes the conversation with an AI agent, and sends responses via Telegram.
Processing 1 chunk(s) for file '🤖🧠 AI Agent Chatbot + LONG TERM Memory + Note Storage + Telegram.txt'.
Upserted chunk 0 of '🤖🧠 AI Agent Chatbot + LONG TERM Memory + Note Storage + Telegram.txt' as vector ID 'agent_chat_memory_notes_telegram'.

Generated title for 'Open Deep Research - AI-Powered Autonomous Research Workflow.txt': agent_llm_autonomous_research (vector base ID: agent_llm_autonomous_research)
Gener