### Chat with Markdown

In [1]:
import os
from langchain_core.documents import Document
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.schema import BaseRetriever
from typing import List
import oracledb
import base64
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from collections import Counter


from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.chat_models import ChatOCIGenAI
from langchain_community.document_loaders import UnstructuredExcelLoader

# Pull settings from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())

# Initialise the Oracle client libraries before any connection is opened.
oracledb.init_oracle_client()

# Database credentials and the OCI compartment, all read from the environment.
# Each value is None when the corresponding variable is not set.
UN, PW, DSN, OCI_COMPARTMENT_ID = (
    os.environ.get(setting_name)
    for setting_name in ("UN", "PW", "DSN", "OCI_COMPARTMENT_ID")
)

In [2]:

# Utils

def get_embedding(text: str) -> list:
  """Embed ``text`` with the OCI Generative AI ``cohere.embed-multilingual-v3.0`` model.

  A new ``OCIGenAIEmbeddings`` client is built on every call, configured with
  the module-level ``OCI_COMPARTMENT_ID``.

  Args:
    text: The string to embed.

  Returns:
    The value returned by ``OCIGenAIEmbeddings.embed_query`` for ``text``.
  """
  client_settings = {
    "model_id": "cohere.embed-multilingual-v3.0",
    "service_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    "compartment_id": OCI_COMPARTMENT_ID,
  }
  return OCIGenAIEmbeddings(**client_settings).embed_query(text)

In [3]:
def chat_with_image(image_path: str, prompt: str, system_prompt: str = None) -> str:
  """Send a text prompt plus one image to the OCI vision chat model.

  The image is read from ``/home/opc/multimodal_oci_genai/`` + ``image_path``,
  base64-encoded, and attached to the human message as a ``data:image/png``
  URL.

  Args:
    image_path: Path of the image, appended to the fixed base directory.
    prompt: The user's text question about the image.
    system_prompt: Optional system instruction. When ``None`` (the default) no
      system message is sent; a ``SystemMessage`` cannot be built from ``None``.

  Returns:
    The ``content`` of the model's reply.

  Raises:
    OSError: If the image file cannot be opened or read.
  """
  with open("/home/opc/multimodal_oci_genai/" + image_path, "rb") as img_file:
    image_data = base64.b64encode(img_file.read()).decode("utf-8")

  human_message = HumanMessage(
    content=[
      {"type": "text", "text": prompt},
      {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64," + image_data},
      },
    ]
  )

  # Only prepend a system message when the caller actually supplied one.
  prompt_with_image = [human_message]
  if system_prompt is not None:
    prompt_with_image.insert(0, SystemMessage(content=system_prompt))

  llm = ChatOCIGenAI(
    model_id="meta.llama-3.2-90b-vision-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id=OCI_COMPARTMENT_ID,
  )
  result = llm.invoke(prompt_with_image)
  print(f"Result: {result}")
  return result.content

#### Create Markdown Retriever


In [4]:
class CustomMarkdownRetriever(BaseRetriever):
    """
    Retriever that ranks the rows of ``docs_contents`` by vector similarity.

    The query is embedded with ``get_embedding`` and every row of the
    ``docs_contents`` table is returned, ordered by the cosine distance between
    its ``embedding`` column and the query vector (nearest first). The SQL has
    no row limit, so the whole table is returned on each call.
    """

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Return one ``Document`` per row of ``docs_contents``, nearest first.

        Args:
            query: Natural-language query to embed and search with.
            run_manager: Callback manager supplied by LangChain (unused here).

        Returns:
            Documents whose ``page_content`` is the row's ``markdown`` value and
            whose metadata holds ``file_id`` and ``vector_index`` (the 0-based
            rank in the distance ordering). An empty list is returned when a
            non-database error occurs; that error is printed, not raised.

        Raises:
            oracledb.DatabaseError: Printed and re-raised when the database
                reports an error.
        """

        def _as_text(value) -> str:
            # LOB columns expose .read(); NULL arrives as None; anything else
            # (e.g. a plain string) is used as-is.
            if value is None:
                return ""
            return value.read() if hasattr(value, "read") else str(value)

        docs: List[Document] = []
        # The bind value is the string form of the embedding; to_vector() parses it.
        embed_query = str(get_embedding(query))
        select_sql = """
                        SELECT
                            file_id,
                            markdown
                        FROM
                            docs_contents
                        ORDER BY VECTOR_DISTANCE(embedding, to_vector(:1, 1024, FLOAT32), COSINE)
                    """
        try:
            # Both context managers release their resources on exit, so no
            # explicit close() call is needed.
            with oracledb.connect(user=UN, password=PW, dsn=DSN) as connection:
                with connection.cursor() as cursor:
                    cursor.setinputsizes(oracledb.DB_TYPE_VECTOR)
                    cursor.execute(select_sql, [embed_query])
                    # LOB values must be read while the connection is still
                    # open, so the documents are materialised inside the block.
                    docs = [
                        Document(
                            page_content=_as_text(markdown),
                            metadata={'file_id': file_id, 'vector_index': rank},
                        )
                        for rank, (file_id, markdown) in enumerate(cursor)
                    ]
        except oracledb.DatabaseError as e:
            print(f"Database error: {e}")
            raise
        except Exception as e:
            # Best-effort: report the failure and fall through with no documents.
            print("Error Vector Search:", e)

        return docs

#### Create Chain

In [5]:
def get_text_by_text_with_markdown(query: str):
    """Answer ``query`` using the Markdown stored in the database as context.

    Documents are fetched with ``CustomMarkdownRetriever``, their
    ``page_content`` is joined into one text block, and that block is placed in
    the prompt's ``{context}`` slot before calling the ``cohere.command-a-03-2025``
    chat model.

    Args:
        query: The question to ask.

    Returns:
        The model's reply as a string.
    """
    llm = ChatOCIGenAI(
        model_id="cohere.command-a-03-2025",
        service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
        compartment_id=OCI_COMPARTMENT_ID,
        )

    prompt = ChatPromptTemplate([
        ("system", "あなたは質疑応答のAIアシスタントです。必ず日本語で答えてください。"),
        ("human", """
         以下のMarkdownのコンテキストに基づいて質問に答えてください。
         回答は数字だけを回答してください。
         ** 質問 **
          {query} 
          
        ** コンテキスト **
        {context}
        """),
    ])

    def _join_markdown(docs) -> str:
        # Without this step the prompt would receive the repr of the Document
        # list (metadata included, newlines escaped) instead of the Markdown.
        return "\n\n".join(doc.page_content for doc in docs)

    retriever = CustomMarkdownRetriever()
    chain = (
        {'query': RunnablePassthrough(), 'context': retriever | _join_markdown}
        | prompt
        | llm
        | StrOutputParser()
    )

    return chain.invoke(query)

In [6]:
# Ask the Markdown-backed chain a single question and show its answer.
# Alternative question: "2024年5月のハードウェアシステムズのTotalの売上を教えてください。"
response = get_text_by_text_with_markdown(
    "2024/5のQ4のFacilityの経費を教えてください。"
)
print(response)

コンテキストに情報がないため、回答できません。


In [None]:
# results = []
# for _ in range(30):
#     res = get_text_by_text_with_markdown("2024/5のQ4のFacilityの経費を教えてください。")
#     results.append(res)

# result_counts = Counter(results)
# result_list = list(result_counts.items())

# for value, count in result_list:
#     print(f"出現回数: {count} 回\n値:\n{value}\n{'-'*40}")
    

出現回数: 13 回
値:
コンテキストに情報がないため、回答できません。
----------------------------------------
出現回数: 10 回
値:
情報が不足しています。
----------------------------------------
出現回数: 7 回
値:
情報が不足しているため、回答できません。
----------------------------------------


In [8]:
def get_text_by_text(query: str, file_path: str = "../../data/fy25q3-supplemental.xlsx"):
    """Answer ``query`` using the raw text of an Excel workbook as context.

    The workbook is loaded with ``UnstructuredExcelLoader`` on every call, the
    ``page_content`` of the loaded documents is joined with newlines, and the
    result is passed to the ``cohere.command-a-03-2025`` chat model through the
    prompt's ``{context}`` variable.

    Args:
        query: The question to ask.
        file_path: Path of the workbook to load. Defaults to the notebook's
            original spreadsheet.

    Returns:
        The model's reply as a string.
    """
    llm = ChatOCIGenAI(
        model_id="cohere.command-a-03-2025",
        service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
        compartment_id=OCI_COMPARTMENT_ID,
        )
    loader = UnstructuredExcelLoader(file_path)
    docs = loader.load()
    context = "\n".join([doc.page_content for doc in docs])

    # The spreadsheet text is supplied as a template variable rather than being
    # concatenated into the template, so braces in the data are not parsed as
    # placeholders.
    prompt = ChatPromptTemplate([
        ("system", "あなたは質疑応答のAIアシスタントです。必ず日本語で答えてください。"),
        ("human", """
         以下のコンテキストに基づいて質問に答えてください。
         回答は数字だけを回答してください。
         ** 質問 **
          {query} 
          
        ** コンテキスト **
        {context}"""),
    ])
    chain = (
        {'query': RunnablePassthrough(), 'context': lambda _question: context}
        | prompt
        | llm
        | StrOutputParser()
    )

    return chain.invoke(query)

In [9]:
# Ask the spreadsheet-text chain a single question and show its answer.
# Alternative question: "2025年のハードウェアシステムズのTotalの売上を教えてください。"
response = get_text_by_text(
    "2024/5のQ4のFacilityの経費を教えてください。"
)
print(response)

KeyboardInterrupt: 

In [None]:
# results = []
# for _ in range(30):
#     res = get_text_by_text("2024/5のQ4のFacilityの経費を教えてください。")
#     results.append(res)

# result_counts = Counter(results)
# result_list = list(result_counts.items())

# for value, count in result_list:
#     print(f"出現回数: {count} 回\n値:\n{value}\n{'-'*40}")

出現回数: 30 回
値:
1622
----------------------------------------


In [None]:
query = "2024/5のQ4のFacilityの経費を教えてください。"

# Ask the same question 20 times per pipeline: first the Markdown-backed chain,
# then the spreadsheet-text chain.
results_with_markdown = [get_text_by_text_with_markdown(query) for _ in range(20)]
results_without_markdown = [get_text_by_text(query) for _ in range(20)]

# Tally how often each distinct answer was produced.
result_counts_with_markdown = Counter(results_with_markdown)
result_counts_without_markdown = Counter(results_without_markdown)

for heading, answer_counts in (
    ("Results for get_text_by_text_with_markdown:", result_counts_with_markdown),
    ("\nResults for get_text_by_text:", result_counts_without_markdown),
):
    print(heading)
    for value, count in answer_counts.items():
        print(f"出現回数: {count} 回\n値:\n{value}\n{'-'*40}")