## Prepare NLP

In [1]:
import google.generativeai as genai


class GeminiNLP:
    """Small convenience wrapper around a Gemini client.

    ``gemini_client`` must expose a ``GenerativeModel(model_name=...,
    system_instruction=...)`` factory (the notebook passes the
    ``google.generativeai`` module itself). A fresh model object is built for
    every request, so instances hold no per-conversation state.
    """

    def __init__(self, gemini_client):
        # The parameter is intentionally unannotated: the object passed in is
        # a module, which is not usable as a type annotation.
        self.genai = gemini_client

    def _build_model(self, model_name, instructions):
        """Return a model for ``model_name`` using ``instructions`` as system prompt."""
        return self.genai.GenerativeModel(
            model_name=model_name, system_instruction=instructions
        )

    @staticmethod
    def _first_function_call(response):
        """Return the first non-empty function call in the first candidate, or None."""
        for part in response.candidates[0].content.parts:
            call = getattr(part, "function_call", None)
            if call:
                return call
        return None

    def chat(self, model_name, instructions, messages):
        """Send ``messages`` to the model and return the reply's ``text``."""
        model = self._build_model(model_name, instructions)
        response = model.generate_content(messages)
        return response.text

    def struct_output(self, model_name, instructions, messages, structure):
        """Request a JSON reply shaped like ``structure`` and parse it.

        ``structure`` is passed to the model as the response schema and must
        provide ``model_validate_json`` (e.g. a pydantic model class); the
        parsed object is returned.
        """
        model = self._build_model(model_name, instructions)
        response = model.generate_content(
            contents=messages,
            generation_config={
                "response_mime_type": "application/json",
                "response_schema": structure,
            },
        )
        return structure.model_validate_json(response.text)

    def func_call(self, model_name, messages, instructions, func):
        """Offer ``func`` to the model as a tool and execute the requested call.

        Note the parameter order: ``messages`` comes before ``instructions``
        here, unlike ``chat`` and ``struct_output``.

        Returns:
            * the return value of ``func`` when the model requested a call,
            * ``None`` when the reply contains no function call,
            * an ``"Error during model generation: ..."`` string when the
              request or reading the reply fails,
            * an ``"Error when calling ..."`` string when ``func`` itself raises.
        """
        model = self._build_model(model_name, instructions)
        try:
            response = model.generate_content(messages, tools=[func])
            call = self._first_function_call(response)
        except Exception as e:
            return f"Error during model generation: {e}"

        if not call:
            return None

        # Materialise the arguments once, outside the tool try-block, so a
        # missing/empty argument map can never be misreported as a model
        # generation failure.
        args_dict = dict(call.args or {})
        try:
            return func(**args_dict)
        except Exception as e:
            return f"Error when calling {call.name} with args {args_dict}: {e}"

In [2]:
import numpy as np


def normalize_embeddings(vectors: list[list[float]]) -> list[list[float]]:
    """Scale every vector to unit L2 norm.

    Zero vectors cannot be scaled and are returned with their values
    unchanged. Every element of the result is a plain list of floats, so the
    return type no longer depends on whether a vector happened to be zero.

    Args:
        vectors: Embedding vectors, one sequence of numbers per item.

    Returns:
        A new list with one normalized vector per input vector (empty input
        gives an empty list).
    """
    normalized = []
    for vec in vectors:
        arr = np.asarray(vec, dtype=float)
        norm = np.linalg.norm(arr)  # computed once per vector
        if norm != 0:
            arr = arr / norm
        normalized.append(arr.tolist())
    return normalized


class CohereEmbeddings:
    """Batched, normalized text embeddings through an async Cohere client."""

    def __init__(self, cohere_client):
        # Client object providing an awaitable ``embed(...)`` method.
        self.cohere_client = cohere_client

    async def embed(
        self,
        list_of_text: list[str],
        model_name="embed-v4.0",
        batch_size=10,
        input_type="search_document",
        output_dimension=1024,
    ) -> list[list[float]]:
        """Embed ``list_of_text`` in batches and return normalized vectors.

        Args:
            list_of_text: Texts to embed; results keep the same order.
            model_name: Embedding model passed to the client.
            batch_size: Number of texts sent per request (must be >= 1).
            input_type: Forwarded to the client. Defaults to
                ``"search_document"`` (the previous hard-coded value); pass a
                different value when embedding search queries.
            output_dimension: Requested vector size, forwarded to the client.

        Returns:
            One normalized vector per input text.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would make range() empty and silently return [].
            raise ValueError("batch_size must be a positive integer")

        vectors = []
        for start in range(0, len(list_of_text), batch_size):
            batch = list_of_text[start : start + batch_size]
            response = await self.cohere_client.embed(
                texts=batch,
                model=model_name,
                input_type=input_type,
                embedding_types=["float"],
                output_dimension=output_dimension,
            )
            vectors.extend(normalize_embeddings(response.embeddings.float))
        return vectors

In [3]:
import os

import google.generativeai as genai
from cohere import AsyncClientV2

# Credentials are read from the environment instead of being committed with
# the notebook. Set GOOGLE_API_KEY and CO_API_KEY before running this cell; a
# missing variable fails fast with a KeyError.
# NOTE(review): the keys that were previously hard-coded here have been
# exposed in the notebook history and should be revoked and rotated.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
nlp = GeminiNLP(gemini_client=genai)

cohere_client = AsyncClientV2(api_key=os.environ["CO_API_KEY"])
embedding = CohereEmbeddings(cohere_client=cohere_client)

## Prepare Data

In [4]:
import chromadb


class ChromaProvider:
    """Minimal helper around a persistent Chroma client stored at ``path``.

    ``connect()`` must be called before any other method; until then
    ``client`` is ``None`` and the other methods raise ``RuntimeError``.
    """

    def __init__(self, path):
        self.path = path
        self.client = None

    def connect(self):
        """Open (or create) the persistent client for ``self.path``."""
        self.client = chromadb.PersistentClient(path=self.path)

    def _require_client(self):
        """Return the connected client or fail with an explicit message."""
        if self.client is None:
            raise RuntimeError(
                "ChromaProvider is not connected; call connect() first."
            )
        return self.client

    def _collection(self, name):
        """Look up an existing collection by name."""
        return self._require_client().get_collection(name=name)

    def create_collection(self, name):
        """Create collection ``name`` with a cosine-distance HNSW configuration."""
        self._require_client().create_collection(
            name=name,
            configuration={"hnsw": {"space": "cosine", "ef_construction": 200}},
        )

    def add_points(self, collection_name, ids, embeddings, metadata):
        """Add ``embeddings`` with their ``ids`` and ``metadata`` to a collection."""
        self._collection(collection_name).add(
            ids=ids, embeddings=embeddings, metadatas=metadata
        )

    def semantic_search(self, collection_name, vector, top_k):
        """Query the collection and return only the ``"metadatas"`` of the result.

        ``vector`` is forwarded as ``query_embeddings``; ``top_k`` is the
        number of results requested per query embedding.
        """
        results = self._collection(collection_name).query(
            query_embeddings=vector, n_results=top_k
        )
        return results["metadatas"]

    def metadata_filter(self, collection_name, key, value):
        """Return the metadata of every entry whose ``key`` equals ``value``."""
        results = self._collection(collection_name).get(where={key: value})
        return results["metadatas"]

In [5]:
import json


# Read the question/SQL examples. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open("data/qa.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [6]:
# One 1-based string id ("id_1", "id_2", ...) per record in `data`.
ids = [f"id_{position}" for position in range(1, len(data) + 1)]
# The "text" field of every record, in the same order as `ids`.
text_chunks = [record["text"] for record in data]

In [7]:
import asyncio

embeddings_list = list()
for i in range(0, len(text_chunks), 20):
    batch = text_chunks[i : i + 20]
    batch_embeddings = await embedding.embed(batch)
    embeddings_list.extend(batch_embeddings)
    await asyncio.sleep(5)

In [8]:
# Vector store persisted in the local "chromadb" directory; connect() opens
# the underlying client so the collection helpers can be used.
vectordb = ChromaProvider("chromadb")
vectordb.connect()

In [9]:
# Create the collection, then store one embedding per record with the full
# record attached as metadata.
vectordb.create_collection(name="sales_qa_2")
vectordb.add_points(
    collection_name="sales_qa_2",
    ids=ids,
    embeddings=embeddings_list,
    metadata=data,
)

## Agentic Workflow

### Prompts

In [10]:
# Planner prompt template. Placeholder: {user_message} — the raw user question.
PROMPT_PLAN = """You are an AI agent that enhances a user question about an ERP system.
Take user question and generate an Expanded question for a semantic search process.
user question: {user_message}
"""


# SQL-generation prompt template. Placeholders: {user_message} — the user
# question; {examples} — the retrieved question/SQL examples text.
PROMPT_SQL = """You are a SQL agent, You will take user question with relevant examples to take inspiration from.
You need to use wanted columns only to generate the SQL statement. Make sure that your query follows **Oracle 11g**.

User Question: {user_message}

Examples:

{examples}
"""

### Agents

In [11]:
from typing_extensions import TypedDict
from pydantic import BaseModel, Field


# Structured-output schema for the planner step: the expanded question(s)
# produced from the user message. Documented with comments rather than a
# docstring so the class object handed to the model as a schema is unchanged.
class Queries(BaseModel):
    # Required list of strings; the description text is part of the field definition.
    queries: list[str] = Field(
        ..., description="Expanded ERP question."
    )


# Structured-output schema for the SQL step: a single required SQL string.
# Documented with comments rather than a docstring so the class object handed
# to the model as a schema is unchanged.
class SQL(BaseModel):
    sql: str = Field(..., description="Correct Oracle 11g SQL statement")


# Shared state passed between the workflow nodes. Each node returns only the
# key(s) it produces.
class State(TypedDict):
    user_message: str  # question supplied when the graph is invoked
    queries: Queries  # written by planner()
    schema: str  # written by search(): retrieved question/SQL examples as text
    sql_query: SQL  # written by sql()

In [12]:
def planner(state: State) -> State:
    """Expand the user's question into search queries.

    Reads ``user_message`` from the state (empty string when absent), fills
    ``PROMPT_PLAN`` with it and asks the model for a ``Queries`` object, which
    is returned under the ``"queries"`` key.
    """
    question = state.get("user_message", "")
    expanded = nlp.struct_output(
        model_name="gemini-2.5-flash",
        instructions="you are an sql generation planner agent",
        messages=PROMPT_PLAN.format(user_message=question),
        structure=Queries,
    )
    return {"queries": expanded}

In [13]:
async def search(state: State) -> State:
    """Retrieve example question/SQL pairs similar to the planned queries.

    Embeds every query in ``state["queries"].queries``, fetches the 5 nearest
    entries per query from the vector store, removes duplicates (by their
    ``"text"``) while keeping first-seen order, and returns them formatted as
    one text block under the ``"schema"`` key.
    """
    queries = state.get("queries").queries
    vectors = await embedding.embed(queries)

    # Query the collection this notebook actually creates and fills
    # ("sales_qa_2"); the previous name "sales_qa" is never created here, so
    # it only worked when left over from an earlier run.
    results = vectordb.semantic_search("sales_qa_2", vectors, 5)

    # `results` holds one list of metadata dicts per query; flatten them and
    # keep the first occurrence of each distinct question text.
    unique = {}
    for per_query in results:
        for res in per_query:
            unique.setdefault(res["text"], res)

    schema_list = [
        f"Question: {res['text']}\nSQL Statement: {res['sql']}"
        for res in unique.values()
    ]
    schema_text = "\n\n---\n\n".join(schema_list)

    return {"schema": "Examples\n\n" + schema_text}

In [14]:
def sql(state: State) -> State:
    """Generate the SQL statement for the user's question.

    Fills ``PROMPT_SQL`` with ``user_message`` and the retrieved examples
    stored under ``schema``, asks the model for an ``SQL`` object and returns
    it under the ``"sql_query"`` key.
    """
    filled_prompt = PROMPT_SQL.format(
        user_message=state.get("user_message"),
        examples=state.get("schema"),
    )
    generated = nlp.struct_output(
        model_name="gemini-2.5-flash",
        instructions="you are an Oracle 11g sql generation agent",
        messages=filled_prompt,
        structure=SQL,
    )
    return {"sql_query": generated}

In [15]:
from langgraph.graph import StateGraph, END

# Linear pipeline: plan -> search -> sql -> END.
workflow = StateGraph(State)

# Register the three nodes in pipeline order.
for node_name, node_fn in (("plan", planner), ("search", search), ("sql", sql)):
    workflow.add_node(node_name, node_fn)

workflow.set_entry_point("plan")

# Chain the nodes; the last edge terminates the run.
for source, target in (("plan", "search"), ("search", "sql"), ("sql", END)):
    workflow.add_edge(source, target)

graph = workflow.compile()

In [16]:
# Run the compiled graph on a sample user question and print every event as
# it is streamed back. Top-level `async for` relies on the notebook's
# support for awaiting at cell level.
async for event in graph.astream({"user_message": "عايز اعرف الربح اخر شهر"}):
    print(event)

{'plan': {'queries': Queries(queries=['ما هو صافي الربح للشهر الماضي؟', 'كم كان الربح الإجمالي للشهر السابق؟', 'عرض تقرير الربح والخسارة للشهر الأخير.', 'استعلام عن أداء الربحية للشهر المنصرم.', 'ما هي إيرادات ومصروفات الشهر الماضي؟', 'أرغب في رؤية ملخص الربح التشغيلي للشهر الماضي.', 'معرفة الأرباح المحققة خلال آخر فترة محاسبية مغلقة.'])}}
{'search': {'schema': 'Examples\n\nQuestion: Calculate the profit margin for each item sold in the last year by subtracting total purchase cost from sales amount, using sales and purchase details.\nSQL Statement: SELECT SBD.ITM_CODE AS "Item Code", SUM(SBD.ITM_TOTL_AMT - SBD.ITM_TOTL_STK_CST) AS "Total Profit", (SUM(SBD.ITM_TOTL_AMT - SBD.ITM_TOTL_STK_CST) / SUM(SBD.ITM_TOTL_AMT)) * 100 AS "Profit Margin %" FROM SALES_BILL_DTL_AI_VW SBD WHERE EXISTS (SELECT 1 FROM PURCHS_BILL_DTL_AI_VW PBD WHERE PBD.ITM_CODE = SBD.ITM_CODE) AND SBD.YR_NO = EXTRACT(YEAR FROM SYSDATE) - 1 GROUP BY SBD.ITM_CODE HAVING SUM(SBD.ITM_TOTL_AMT) > 0 ORDER BY "Profit Margin %"

In [None]:
# SQL statement that had been pasted into this cell as bare text (with a
# dangling `')`), which is not valid Python and breaks "Run All". It is kept
# as a string so the cell executes and the statement stays available.
generated_sql = (
    "SELECT SUM(ITM_TOTL_AMT - ITM_TOTL_STK_CST) AS Total_Profit_Last_Month "
    "FROM SALES_BILL_DTL_AI_VW "
    "WHERE DOC_DATE >= ADD_MONTHS(SYSDATE, -1)"
)