### Chat with Markdown

In [16]:
import os
from langchain_core.documents import Document
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.schema import BaseRetriever
from typing import List
import oracledb
import base64
import json
import pandas as pd
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as XLImage
from PIL import Image
import io
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from collections import Counter


from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import PromptTemplate
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.chat_models import ChatOCIGenAI
from langchain_community.document_loaders import UnstructuredExcelLoader

# Load variables from the nearest .env file into the process environment.
_ = load_dotenv(find_dotenv())
# Initialize the Oracle Client libraries for python-oracledb before any connection is made.
oracledb.init_oracle_client()

# Database credentials / connect string and the OCI compartment, all read from
# the environment. os.getenv returns None when a variable is not set.
UN = os.getenv("UN")
PW = os.getenv("PW")
DSN = os.getenv("DSN")
OCI_COMPARTMENT_ID = os.getenv("OCI_COMPARTMENT_ID")

In [17]:

# Utils
def get_embedding(text: str) -> list:
  """Embed ``text`` with the OCI Generative AI Cohere multilingual model.

  A fresh ``OCIGenAIEmbeddings`` client is built on every call, bound to the
  compartment named by ``OCI_COMPARTMENT_ID``.

  Args:
    text: The string to embed.

  Returns:
    Whatever ``embed_query`` returns for ``text`` (the embedding vector).
  """
  client_options = dict(
    model_id="cohere.embed-multilingual-v3.0",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id=OCI_COMPARTMENT_ID,
  )
  embedder = OCIGenAIEmbeddings(**client_options)
  vector = embedder.embed_query(text)
  return vector

In [18]:
def extract_tables_and_images(file_path, output_dir="images"):
  """Convert every worksheet of an Excel workbook into a Markdown table.

  Despite the name, this function currently extracts tables only: the
  ``output_dir`` directory is created, but no image is written to it.

  Args:
    file_path: Path of the Excel workbook to read.
    output_dir: Directory created (if missing) for extracted images.

  Returns:
    A list with one dict per worksheet, in workbook order, with the keys
    ``type`` (always ``"text"``), ``sheet`` (worksheet name), ``markdown``
    (the sheet rendered by ``DataFrame.to_markdown``) and ``raw`` (the
    DataFrame itself).
  """
  os.makedirs(output_dir, exist_ok=True)
  wb = load_workbook(file_path)
  result = []

  try:
    # The context manager releases the file handle that pd.ExcelFile keeps
    # open; previously it stayed open until garbage collection.
    with pd.ExcelFile(file_path) as e_wb:
      for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        print(f"worksheet: {ws}")
        # --- Table extraction ---
        df = pd.read_excel(e_wb, sheet_name=sheet_name)
        markdown = df.to_markdown(index=False)
        print(f"Markdown Table:\n{markdown}")
        result.append({
          "type": "text",
          "sheet": sheet_name,
          "markdown": markdown,
          "raw": df
        })
  finally:
    # Release the openpyxl workbook even when a sheet fails to parse.
    wb.close()
  return result

In [19]:
def summarize_text(text: str) -> str:
  """Ask the OCI Generative AI chat model for a summary of ``text``.

  The text is substituted into a fixed Japanese "please summarize the
  following" prompt and sent to the ``cohere.command-a-03-2025`` model with
  temperature 0.7 and a 500-token output limit.

  Args:
    text: The content to summarize.

  Returns:
    The model's reply as a plain string.
  """
  summarizer = ChatOCIGenAI(
    model_id="cohere.command-a-03-2025",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id=OCI_COMPARTMENT_ID,
    model_kwargs={"temperature": 0.7, "max_tokens": 500},
  )
  summary_prompt = PromptTemplate(
    template="以下の内容を要約してください:\n{text}",
    input_variables=["text"],
  )

  # The raw input string is routed into the prompt's "text" variable.
  pipeline = {"text": RunnablePassthrough()} | summary_prompt | summarizer | StrOutputParser()
  return pipeline.invoke(text)

In [20]:
def save_markdown_content(file_name: str, file_path: str, summary: str, embedding: list, markdown: str):
  """Insert one summarized sheet into the ``embedding_contents`` table.

  The row is stored with ``content_type`` fixed to ``'text'``. Errors are
  reported on stdout and swallowed (best effort), so callers processing
  several sheets continue with the next one.

  Args:
    file_name: Value for the ``name`` column (the caller passes the sheet name).
    file_path: Path of the source workbook.
    summary: Summary text of the sheet.
    embedding: Embedding vector of the summary.
    markdown: Markdown rendering of the sheet.

  Returns:
    None, whether or not the insert succeeded.
  """
  sql = """
        INSERT INTO embedding_contents (name, summary, embedding, file_path, content_type, markdown)
        VALUES (:name, :summary, :embedding, :file_path, :content_type, :markdown)
      """
  params = {
    'name': file_name,
    'summary': summary,
    # Bound as text, e.g. "[0.1, 0.2, ...]".
    # NOTE(review): presumably the database converts this to the column's vector type — confirm.
    'embedding': str(embedding),
    'file_path': file_path,
    'content_type': 'text',
    'markdown': markdown
  }

  try:
    with oracledb.connect(user=UN, password=PW, dsn=DSN) as conn:
      with conn.cursor() as cursor:
        cursor.execute(sql, params)
      conn.commit()
      # Report success only once the row has actually been committed.
      print(f"Success insert {file_name} into embedding_contents")
  except oracledb.DatabaseError as e:
    error, = e.args
    print(f"Error at save_markdown_content: {error.code}")
    print(f"Oracle error code: {error.code}")
    print(f"Oracle error message: {error.message}")
  except Exception as e:
    # The message previously named save_image_content, a different function.
    print(f"Error:save_markdown_content: {e}")
  return None

In [21]:
def process_excel(file_path):
  """Summarize, embed and persist every text item extracted from a workbook.

  For each extracted item of type ``"text"``, the Markdown table is summarized,
  the summary is embedded, and the result is stored through
  ``save_markdown_content`` using the sheet name as the record name.

  Args:
    file_path: Path of the Excel workbook to process.

  Returns:
    The list produced by ``extract_tables_and_images`` for ``file_path``.
  """
  contents = extract_tables_and_images(file_path)

  text_items = (item for item in contents if item["type"] == "text")
  for item in text_items:
    sheet = item["sheet"]
    table_md = item["markdown"]

    print(f"table: {sheet}")
    summary = summarize_text(table_md)
    print(summary)

    # The summary, not the full table, is what gets embedded.
    embedding = get_embedding(summary)
    print(f"file_path: {file_path}")
    print(f"file_name: {sheet}")

    save_markdown_content(
      file_name=sheet,
      file_path=file_path,
      summary=summary,
      embedding=embedding,
      markdown=table_md,
    )
  return contents
      


In [22]:
# Ingest the supplemental workbook: each sheet is summarized, embedded and stored.
process_excel("../../data/fy25q3-supplemental.xlsx")

worksheet: <Worksheet "Cover">
Markdown Table:
|   Unnamed: 0 | Unnamed: 1                                                       | Unnamed: 2                                                 |   Unnamed: 3 |   Unnamed: 4 |   Unnamed: 5 |   Unnamed: 6 |   Unnamed: 7 | Unnamed: 8                                                       |
|-------------:|:-----------------------------------------------------------------|:-----------------------------------------------------------|-------------:|-------------:|-------------:|-------------:|-------------:|:-----------------------------------------------------------------|
|          nan | nan                                                              | nan                                                        |          nan |          nan |          nan |          nan |          nan | nan                                                              |
|          nan | nan                                                              | nan     

[{'type': 'text',
  'sheet': 'Cover',
  'markdown': '|   Unnamed: 0 | Unnamed: 1                                                       | Unnamed: 2                                                 |   Unnamed: 3 |   Unnamed: 4 |   Unnamed: 5 |   Unnamed: 6 |   Unnamed: 7 | Unnamed: 8                                                       |\n|-------------:|:-----------------------------------------------------------------|:-----------------------------------------------------------|-------------:|-------------:|-------------:|-------------:|-------------:|:-----------------------------------------------------------------|\n|          nan | nan                                                              | nan                                                        |          nan |          nan |          nan |          nan |          nan | nan                                                              |\n|          nan | nan                                                              |

#### Create Embedding Retriever


In [23]:
class CustomEmbeddingRetriever(BaseRetriever):
    """
    Retriever over the ``embedding_contents`` table.

    The query is embedded with ``get_embedding`` and the three rows whose
    ``embedding`` is closest by cosine distance are returned as documents.
    """

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Return up to three documents ranked by vector similarity to ``query``.

        Each document carries the row's ``file_path`` as ``page_content`` and
        ``id``, ``name``, ``content_type``, ``file_path``, ``summary`` and the
        1-based rank ``vector_index`` in its metadata.

        Raises:
            oracledb.DatabaseError: re-raised after being printed. Any other
                exception is printed and the documents collected so far are
                returned.
        """
        docs: List[Document] = []
        # The vector is sent as text and converted in SQL by to_vector().
        embed_query = str(get_embedding(query))
        select_sql = """
                        SELECT
                            id,
                            name,
                            content_type,
                            file_path,
                            summary
                        FROM
                            embedding_contents
                        ORDER BY VECTOR_DISTANCE(embedding, to_vector(:1, 1024, FLOAT32), COSINE)
                        FETCH FIRST 3 ROWS ONLY
                    """
        try:
            # The with-blocks close the cursor and connection; no explicit close() is needed.
            with oracledb.connect(user=UN, password=PW, dsn=DSN) as connection:
                with connection.cursor() as cursor:
                    cursor.setinputsizes(oracledb.DB_TYPE_VECTOR)
                    cursor.execute(select_sql, [embed_query])
                    for index, row in enumerate(cursor, start=1):
                        content_id, name, content_type, file_path, summary = row
                        # A LOB column yields an object that must be read while
                        # the connection is still open; plain strings pass through.
                        if hasattr(summary, "read"):
                            summary = summary.read()
                        # NOTE(review): page_content is the file path, as before;
                        # the summary may be the more useful content — confirm.
                        docs.append(
                            Document(
                                page_content=file_path,
                                metadata={
                                    'id': content_id,
                                    'name': name,
                                    'content_type': content_type,
                                    'file_path': file_path,
                                    'summary': summary,
                                    'vector_index': index
                                },
                            )
                        )

        except oracledb.DatabaseError as e:
            print(f"Database error: {e}")
            raise
        except Exception as e:
            print("Error Vector Search:", e)

        return docs

### Create Chain

In [24]:
def get_text_with_markdown(query: str):
    """Answer ``query`` using the stored Markdown of the best-matching sheet.

    The closest record is found with ``CustomEmbeddingRetriever``; its
    ``markdown`` column is loaded by id and handed to the chat model as context.

    Args:
        query: The question to answer.

    Returns:
        The model's reply as a string.

    Raises:
        IndexError: if the retriever returns no document, or the matched row
            has no Markdown content.
        oracledb.DatabaseError: re-raised after being printed.
    """
    retriever = CustomEmbeddingRetriever()
    result_content = retriever.invoke(query)
    if not result_content:
        raise IndexError(f"No stored content matched the query: {query!r}")
    content_id = result_content[0].metadata['id']

    select_sql = """
                    SELECT
                        id,
                        markdown
                    FROM
                        embedding_contents
                    WHERE id = :1
                """
    markdown = None
    try:
        # The with-blocks close the cursor and connection on exit.
        with oracledb.connect(user=UN, password=PW, dsn=DSN) as connection:
            with connection.cursor() as cursor:
                # The bind is a row id, so no vector input size is declared.
                cursor.execute(select_sql, [content_id])
                row = cursor.fetchone()
                if row is not None and row[1] is not None:
                    # LOB values must be read while the connection is open.
                    markdown = row[1].read() if hasattr(row[1], "read") else row[1]

    except oracledb.DatabaseError as e:
        print(f"Database error: {e}")
        raise

    if markdown is None:
        raise IndexError(f"No markdown stored for embedding_contents id {content_id!r}")

    llm = ChatOCIGenAI(
        model_id="cohere.command-a-03-2025",
        service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
        compartment_id=OCI_COMPARTMENT_ID,
        )

    # The Markdown is supplied as a template variable rather than spliced into
    # the template text, so braces inside the data cannot be mistaken for
    # placeholders.
    prompt = ChatPromptTemplate([
        ("system", "あなたは質疑応答のAIアシスタントです。必ず日本語で答えてください。"),
        ("human", """
         以下のMarkdownのコンテキストに基づいて質問に答えてください。
         回答は数字だけを回答してください。
         ** 質問 **
          {query} 
          
        ** コンテキスト **
        {context}"""
        ),
    ])

    chain = prompt | llm | StrOutputParser()

    result = chain.invoke({"query": query, "context": markdown})
    return result

In [25]:
# Ask for the Facility expense of Q4 2024/5 through the Markdown-backed chain.
# An alternative question (the Hardware Systems "Total" revenue for May 2024)
# can be passed to get_text_with_markdown in the same way.
response = get_text_with_markdown("2024/5のQ4のFacilityの経費を教えてください。")
print(response)

コンテキストに2024/5のQ4のFacilityの経費に関する


In [26]:
def get_text_by_text(query: str, file_path: str = "../../data/fy25q3-supplemental.xlsx"):
    """Answer ``query`` using the plain-text content of an Excel workbook.

    The workbook is loaded with ``UnstructuredExcelLoader`` on every call and
    the text of all returned documents is joined into a single context.

    Args:
        query: The question to answer.
        file_path: Workbook to load; defaults to the supplemental workbook
            used throughout this notebook.

    Returns:
        The model's reply as a string.
    """
    llm = ChatOCIGenAI(
        model_id="cohere.command-a-03-2025",
        service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
        compartment_id=OCI_COMPARTMENT_ID,
        )
    loader = UnstructuredExcelLoader(file_path)
    docs = loader.load()
    context = "\n".join(doc.page_content for doc in docs)

    # The context is supplied as a template variable rather than concatenated
    # into the template text, so braces inside the workbook text cannot be
    # mistaken for placeholders.
    prompt = ChatPromptTemplate([
        ("system", "あなたは質疑応答のAIアシスタントです。必ず日本語で答えてください。"),
        ("human", """
         以下のコンテキストに基づいて質問に答えてください。
         回答は数字だけを回答してください。
         ** 質問 **
          {query} 
          
        ** コンテキスト **
        {context}
        ** Output **
        value: int
        """),
    ])
    chain = prompt | llm | StrOutputParser()

    result = chain.invoke({"query": query, "context": context})
    return result

In [27]:
# Ask the same Facility-expense question through the plain-text chain.
# An alternative question (the Hardware Systems "Total" revenue for 2025)
# can be passed to get_text_by_text in the same way.
response = get_text_by_text("2024/5のQ4のFacilityの経費を教えてください。")
print(response)

431


In [28]:
# Same plain-text chain, with the question phrased in English.
response = get_text_by_text("Please tell me the facility expenses for Q4 of 2024/5.")
print(response)

1622


In [29]:
# Ask both chains the same question repeatedly and tabulate how often each
# distinct answer comes back.
query = "2024/5のQ4のFacilityの経費を教えてください。"

# Number of times each chain is asked the question.
N_TRIALS = 20


def _counts_to_frame(counts):
    """Turn an answer -> frequency mapping into a DataFrame sorted by descending count."""
    return pd.DataFrame(
        list(counts.items()),
        columns=["Result", "Count"]
    ).sort_values(by="Count", ascending=False)


# All Markdown-chain trials run before the plain-text ones, as before.
results_with_markdown = [get_text_with_markdown(query) for _ in range(N_TRIALS)]
results_with_text = [get_text_by_text(query) for _ in range(N_TRIALS)]

result_counts_with_markdown = Counter(results_with_markdown)
result_counts_with_text = Counter(results_with_text)

df_with_markdown = _counts_to_frame(result_counts_with_markdown)
df_with_text = _counts_to_frame(result_counts_with_text)

# The first caption previously named a function that does not exist in this notebook.
display(df_with_markdown.style.set_caption("Results for get_text_with_markdown"))
display(df_with_text.style.set_caption("Results for get_text_by_text"))

Unnamed: 0,Result,Count
0,データに2024年5月のQ4のFacilityの経費に関する情報は,4
2,該当する情報は見つかりませんでした。,4
1,該当するデータはありません。,3
3,コンテキストに2024/5のQ4のFacilityの経費に関する,2
7,コンテキストに2024年5月のQ4のFacilityの経費に関する,2
4,データに2024/5のQ4のFacilityの経費はありません,1
5,コンテキストには2024/5のQ4のFacilityの経費に関する,1
6,データに2024/5のQ4のFacilityの経費は記載されて,1
8,コンテキストには、2024年5月のQ4のFacilityの経費,1
9,コンテキスト内に2024/5のQ4のFacilityの経費に関する,1


Unnamed: 0,Result,Count
0,1622,13
1,431,7
