In [1]:
import re
import os
import random
import json
from functools import lru_cache
from typing import List, Any
import numpy as np

from openai import OpenAI

from langchain_openai import ChatOpenAI
from langchain_community.embeddings import InfinityEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

from src.utils import load_env
from src.parser import ExcelParser
from src.processor import TableProcessor
from src.prompts import (
    TRANSFORM_DATA_TEMPLATE
)

load_env(".env")

# Setup components

In [2]:
# Chat-completion endpoint settings, read from the process environment.
# Each value is None when the corresponding variable is unset.
LLM_BASE_URL = os.environ.get("LLM_BASE_URL")
LLM_MODEL = os.environ.get("LLM_MODEL")
LLM_API_KEY = os.environ.get("LLM_API_KEY")

# Embedding endpoint settings, read the same way.
EMBED_BASE_URL = os.environ.get("EMBED_BASE_URL")
EMBED_MODEL = os.environ.get("EMBED_MODEL")


@lru_cache()
def get_llm_model():
    """
    Return the shared non-thinking chat model.

    The function takes no arguments, so `lru_cache` builds the client once
    and hands back the same instance on every later call.

    Returns:
        ChatOpenAI: client configured from LLM_MODEL / LLM_BASE_URL /
        LLM_API_KEY with temperature 0.7, top_p 0.8 and presence_penalty 1.
    """
    return ChatOpenAI(
        model=LLM_MODEL,
        base_url=LLM_BASE_URL,
        api_key=LLM_API_KEY,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1,
        # Extra request-body fields for the serving backend.
        # NOTE(review): these look like vLLM-style extensions — confirm the
        # backend accepts them.
        extra_body={
            "chat_template_kwargs": {"enable_thinking": False},
            "top_k": 20,
            # The sampling parameter is spelled "min_p"; "mip_p" is not a
            # recognised key.
            "min_p": 0,
        },
    )

@lru_cache()
def get_thinking_llm_model():
    """
    Return the shared chat model with thinking enabled.

    Same endpoint and credentials as `get_llm_model`, but the chat template
    is asked to enable thinking and the sampling settings differ
    (temperature 0.6, top_p 0.95). The instance is built once and cached by
    `lru_cache`.

    Returns:
        ChatOpenAI: client configured from LLM_MODEL / LLM_BASE_URL /
        LLM_API_KEY.
    """
    return ChatOpenAI(
        model=LLM_MODEL,
        base_url=LLM_BASE_URL,
        api_key=LLM_API_KEY,
        temperature=0.6,
        top_p=0.95,
        presence_penalty=1,
        # Extra request-body fields for the serving backend.
        # NOTE(review): these look like vLLM-style extensions — confirm the
        # backend accepts them.
        extra_body={
            "chat_template_kwargs": {"enable_thinking": True},
            "top_k": 20,
            # The sampling parameter is spelled "min_p"; "mip_p" is not a
            # recognised key.
            "min_p": 0,
        },
    )

@lru_cache()
def get_embedding_model():
    """
    Return the shared embedding client.

    Built from EMBED_MODEL and EMBED_BASE_URL on first call; `lru_cache`
    returns the same instance afterwards.
    """
    embedding_settings = {
        "model": EMBED_MODEL,
        "infinity_api_url": EMBED_BASE_URL,
    }
    return InfinityEmbeddings(**embedding_settings)


@lru_cache()
def get_vector_store():
    """
    Return the shared Qdrant-backed vector store for the "demo" collection.

    Connects to a Qdrant server on localhost (gRPC preferred, port 6334) and
    creates the collection only when it does not exist yet, so the function
    keeps working after a kernel restart against a server that already holds
    the collection. The result is cached by `lru_cache`.

    Returns:
        QdrantVectorStore: store bound to the "demo" collection and the
        embedding model from `get_embedding_model()`.
    """
    collection_name = "demo"
    client = QdrantClient(
        url="http://localhost",
        grpc_port=6334,
        prefer_grpc=True,
    )
    embedding_model = get_embedding_model()

    # create_collection raises when the collection is already present, so
    # guard it; this also skips the probe embedding call when not needed.
    if not client.collection_exists(collection_name):
        # The vector size is derived by embedding a short probe string.
        vector_size = len(embedding_model.embed_query("Hello"))
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=vector_size,
                distance=Distance.COSINE,
            ),
        )

    return QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embedding_model,
    )


# Process data

## Excel

In [2]:
# Example 4: Parse worksheets of the menu workbook.
# The workbook location can be overridden with the EXCEL_FILE_PATH environment
# variable; the default is the original local path.
file_path = os.getenv(
    "EXCEL_FILE_PATH",
    "/Users/vinhnguyen/Projects/Chatbot_code/DATA_hokkaido/data_raw/hokkaido_sachi_data_final.xlsx",
)
# file_path = "/Users/vinhnguyen/Projects/ext-chatbot/resources/aaai-26.xlsx"

# Only the first sheet is parsed (later cells use `tables` from sheet 1).
# Add more indices here to process additional sheets; note that `tables`
# will then hold the result of the last sheet parsed successfully.
sheet_indices = (1,)

all_sheets_data = {}

with ExcelParser(file_path) as parser:
    # Parse each sheet (you can get sheet names from openpyxl if needed)
    for sheet_idx in sheet_indices:  # or use sheet names
        try:
            tables = parser.parse_sheet(sheet_name=sheet_idx)
            all_sheets_data[f"Sheet_{sheet_idx}"] = tables
            print(f"Sheet {sheet_idx}: Found {len(tables)} tables")
        except Exception as e:
            # Best effort: report the failure and move on to the next sheet.
            print(f"Could not parse sheet {sheet_idx}: {e}")

Sheet 1: Found 1 tables


In [3]:
# Inspect the first table parsed from the sheet.
tables[0]

TableInfo(data=array([['Danh mục', 'Tên món', 'Giá', 'Ghi chú'],
       ['Sashimi', 'HOKKAIDO DELUXE', '566.000',
        'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
       ['Sashimi', 'TORO SALMON SASHIMI', '155.000',
        'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
       ...,
       ['Drink', 'Matcha Iced Blend', '79.000',
        'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
       ['Drink', 'Dalat Cider House Apple', '67.000',
        'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
       ['Drink', 'Fujiwa Hydrogen Water', '36.000',
        'Tất cả giá trên chưa bao gồm 10% thuế VAT.']],
      shape=(425, 4), dtype=object), start_row=0, end_row=424, start_col=0, num_cols=4, num_data_rows=424, merged_cells=[])

In [4]:
# Processor used below to turn a parsed table into structured rows.
# NOTE(review): max_concurrent_requests presumably caps parallel LLM calls — confirm in src.processor.
table_processor = TableProcessor(max_concurrent_requests=10)

In [5]:
# Run the processor on the first table (top-level await, so this needs an
# async-capable kernel). The log below shows later attempts re-processing a
# shrinking set of rows — presumably the ones that failed; verify in src.processor.
result_dict = await table_processor(
    tables[0],
    max_retries=100
)

Identifying header and footer...
Designing schema...
{'title': 'RowData', 'type': 'object', 'properties': {'Danh mục': {'type': 'string', 'description': "Loại danh mục của món ăn, ví dụ: 'Drink', 'Sushi', 'Sushi'."}, 'Tên món': {'type': 'string', 'description': "Tên món ăn, ví dụ: 'Fruit Jumbo - Dragon Fruit', 'KOMOCHI AVOCADO ROLL', 'TOKACHI GRILLED ROLL'."}, 'Giá': {'type': 'number', 'description': 'Giá của món ăn (đơn vị VND, ví dụ: 89000.0, 157000.0, 180000.0).'}, 'Ghi chú': {'type': 'string', 'description': "Ghi chú chung cho tất cả các dòng, ví dụ: 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'"}}, 'required': ['Danh mục', 'Tên món', 'Giá', 'Ghi chú']}
Transforming data...
Attempt 1/100: Processing 424 rows...


Transforming data: 100%|██████████| 424/424 [01:30<00:00,  4.67it/s]


Attempt 2/100: Processing 4 rows...


Transforming data: 100%|██████████| 4/4 [00:11<00:00,  3.00s/it]


Attempt 3/100: Processing 2 rows...


Transforming data: 100%|██████████| 2/2 [00:22<00:00, 11.33s/it]


In [10]:
# Spot-check one raw row: index 21 of the full table (row 0 is the header row).
ind = 21

tables[0].data.tolist()[ind]

['Sashimi',
 'TAI ARANABE',
 '176.000',
 'Tất cả giá trên chưa bao gồm 10% thuế VAT.']

In [11]:
# The same menu item after transformation. The -1 offset assumes the single
# header row (index 0) is not part of transformed_data — TODO confirm against TableProcessor.
result_dict["transformed_data"][ind-1]

{'Danh mục': 'Sashimi',
 'Tên món': 'TAI ARANABE',
 'Giá': 176000,
 'Ghi chú': 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'}

In [40]:
# Row schema stored in the processor result.
result_dict["pydantic_schema"]

{'title': 'RowData',
 'type': 'object',
 'properties': {'Danh mục': {'type': 'string',
   'description': 'Loại danh mục của món ăn (ví dụ: Sushi, Drink, Dessert).'},
  'Tên món': {'type': 'string',
   'description': 'Tên đầy đủ của món ăn, bao gồm các loại hương vị (nếu có).'},
  'Giá': {'type': 'string',
   'description': 'Giá của món ăn trước thuế VAT, được lưu dưới dạng chuỗi để giữ nguyên định dạng.'},
  'Ghi chú': {'type': 'string',
   'description': 'Chú thích chung cho tất cả các mục trong bảng.'},
  'price_value': {'type': 'number',
   'description': 'Giá trị số của giá món ăn, đã chuyển đổi thành số thực.'},
  'price_currency': {'type': 'string',
   'description': 'Đơn vị tiền tệ của giá món ăn (VND).'},
  'Item Name': {'type': 'string',
   'description': 'Tên chính của món ăn, không bao gồm hương vị.'},
  'Flavor': {'type': 'string',
   'description': 'Danh sách các hương vị đi kèm với món ăn.'}},
 'required': ['Danh mục', 'Tên món', 'Giá', 'Ghi chú']}

In [20]:
# Header / footer row indices stored in the processor result.
result_dict["header_footer_info"]

{'header_indices': [0], 'footer_indices': []}

In [36]:
def _format_table_data_snippet(
    table_data: np.ndarray,
    sample_size: int = 3,
) -> str:
    """
    Format a snippet of the table data for the LLM prompt.
    Shows the first `sample_size` rows and the last `sample_size` rows.
    """
    table_data_snippet = "["
    rows = table_data.tolist()
    total_rows = len(rows)
    
    # If table is small enough, just show the whole thing
    if total_rows <= (sample_size * 3):
        for i, row in enumerate(rows):
            table_data_snippet += str(row)
            if i < total_rows - 1:
                table_data_snippet += ",\n "
        table_data_snippet += "]"
        return table_data_snippet

    # Format top rows
    top_rows = rows[:sample_size]
    table_data_snippet += str(top_rows[0])
    for row in top_rows[1:]:
        table_data_snippet += (",\n "+str(row))

    # Format middle rows
    middle_rows = random.sample(
        rows[sample_size:-sample_size], 
        sample_size
    )
    for row in middle_rows:
        table_data_snippet += (",\n "+str(row))
            
    # Format bottom rows
    bottom_rows = rows[-sample_size:]
    table_data_snippet += ",\n ...\n " + str(bottom_rows[0])
    for row in bottom_rows[1:]:
        table_data_snippet += (",\n "+str(row))
    table_data_snippet += "]"
    return table_data_snippet

def _format_rows(rows: List[List[Any]]) -> str:
    """
    Format the rows for the LLM prompt.
    """
    if len(rows) == 0:
        return "[]"
    if len(rows) == 1:
        return str(rows[0])
    rows_snippet = "[" + str(rows[0])
    for row in rows[1:]:
        rows_snippet += ",\n " + str(row)
    rows_snippet += "]"
    return rows_snippet

In [35]:
# Preview the prompt formatting on the header row plus the first four data rows.
print(_format_rows(tables[0].data.tolist()[:5]))

[['Danh mục', 'Tên món', 'Giá', 'Ghi chú'],
 ['Sashimi', 'HOKKAIDO DELUXE', '566.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ['Sashimi', 'TORO SALMON SASHIMI', '155.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ['Sashimi', 'SHAKO SASHIMI', '172.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ['Sashimi', 'SANSHUMORI PREMIUM', '345.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.']]


In [25]:
# Preview the sampled snippet; the middle rows are drawn with random.sample,
# so this output changes from run to run.
print(_format_table_data_snippet(tables[0].data))

[['Danh mục', 'Tên món', 'Giá', 'Ghi chú'],
 ['Sashimi', 'HOKKAIDO DELUXE', '566.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ['Sashimi', 'TORO SALMON SASHIMI', '155.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ['Sushi', 'SHIME SABA NIGIRI', '73.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ['Special Set', 'SANMA TERIYAKI OR SHIOYAKI', '87.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ['Dessert', 'ICE CREAM PARFAIT - MATCHA', '99.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ...
 ['Drink', 'Matcha Iced Blend', '79.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ['Drink', 'Dalat Cider House Apple', '67.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.'],
 ['Drink', 'Fujiwa Hydrogen Water', '36.000', 'Tất cả giá trên chưa bao gồm 10% thuế VAT.']]


In [14]:
# Ask the thinking-enabled model for the table structure and a row schema,
# then print the last ```json fenced block of its reply.

# NOTE(review): EXTRACT_STRUCTURE_TEMPLATE is not imported by the import cell.
# It presumably lives next to TRANSFORM_DATA_TEMPLATE in src.prompts — confirm,
# then move this import into the top import cell.
from src.prompts import EXTRACT_STRUCTURE_TEMPLATE

client = OpenAI(
    base_url=os.getenv("LLM_BASE_URL"),
    api_key=os.getenv("LLM_API_KEY"),
)

resp_content = client.chat.completions.create(
    model=os.getenv("LLM_MODEL"),
    messages=[{"role": "user", "content": EXTRACT_STRUCTURE_TEMPLATE.replace(
        "{{table_data_snippet}}",
        _format_table_data_snippet(tables[0].data)
    )}],
    temperature=0.6,
    top_p=0.95,
    presence_penalty=1,
    extra_body={
        "chat_template_kwargs": {'enable_thinking': True},
        "top_k": 20,
        # The sampling parameter is spelled "min_p"; "mip_p" is not a
        # recognised key.
        "min_p": 0,
    },
).choices[0].message.content


JSON_PATTERN = re.compile(r"```json\n(.*?)\n```", re.DOTALL)

# Fail with a readable message instead of a bare IndexError/TypeError when the
# reply is empty or contains no fenced JSON block.
json_blocks = JSON_PATTERN.findall(resp_content or "")
if not json_blocks:
    raise ValueError("No ```json fenced block found in the model response")
print(json_blocks[-1])

{
  "table_structure": {
    "header_indices": [0],
    "footer_indices": null
  },
  "pydantic_schema": {
    "title": "RowData",
    "type": "object",
    "properties": {
      "Danh_muc": {
        "type": "string",
        "description": "Category of the menu item (e.g., Sashimi, Special Set, Drink)"
      },
      "Ten_mon": {
        "type": "string",
        "description": "Name of the menu item"
      },
      "Gia": {
        "type": "number",
        "description": "Price of the item in Vietnamese Dong (VND), formatted as float"
      },
      "Ghi_chu": {
        "type": "string",
        "description": "General note about pricing (all prices exclude 10% VAT)"
      }
    },
    "required": ["Danh_muc", "Ten_mon", "Gia", "Ghi_chu"]
  }
}


In [18]:
# Parse the last fenced JSON block of the reply into a dict.
# Raises IndexError when the reply has no such block.
tmp = json.loads(JSON_PATTERN.findall(resp_content)[-1])

In [32]:
# Header / footer row indices proposed by the model.
tmp["table_structure"]

{'header_indices': [0], 'footer_indices': None}

In [37]:
# The schema as a Python-repr string, the same form substituted into the
# transform prompt below.
str(tmp["pydantic_schema"])

"{'title': 'RowData', 'type': 'object', 'properties': {'Danh_muc': {'type': 'string', 'description': 'Category of the menu item (e.g., Sashimi, Special Set, Drink)'}, 'Ten_mon': {'type': 'string', 'description': 'Name of the menu item'}, 'Gia': {'type': 'number', 'description': 'Price of the item in Vietnamese Dong (VND), formatted as float'}, 'Ghi_chu': {'type': 'string', 'description': 'General note about pricing (all prices exclude 10% VAT)'}}, 'required': ['Danh_muc', 'Ten_mon', 'Gia', 'Ghi_chu']}"

In [42]:
# Build the transform prompt for one sample row by filling the template
# placeholders.
data_list = tables[0].data.tolist()

# Rows the model identified as the table header.
header_rows = [data_list[i] for i in tmp["table_structure"]["header_indices"]]
# One data row (index 3), kept as a one-element list for _format_rows.
sample_rows = data_list[3:4]

transform_prompt = (
    TRANSFORM_DATA_TEMPLATE
    .replace("{{raw_header}}", _format_rows(header_rows))
    .replace("{{raw_row}}", _format_rows(sample_rows))
    .replace("{{pydantic_schema}}", str(tmp["pydantic_schema"]))
)

In [45]:
# Send the transform prompt with thinking disabled and keep the reply text in
# `content` for the cells below.
response = client.chat.completions.create(
    model=os.getenv("LLM_MODEL"),
    messages=[{"role": "user", "content": transform_prompt}],
    temperature=0.7,
    top_p=0.8,
    presence_penalty=1,
    extra_body={
        "chat_template_kwargs": {'enable_thinking': False},
        "top_k": 20,
        # The sampling parameter is spelled "min_p"; "mip_p" is not a
        # recognised key.
        "min_p": 0,
        # NOTE(review): presumably a server-side structured-output option that
        # constrains the reply to this schema — confirm the backend supports it.
        "guided_json": tmp["pydantic_schema"],
    },
)

content = response.choices[0].message.content



    

In [48]:
# Check which Python type the parsed reply holds for the "Gia" field.
type(json.loads(content)["Gia"])

int

In [None]:
# import asyncio
# from typing import List, Dict, Any
# from openai import AsyncOpenAI

# # Initialize the async client
# # The endpoint and API key are passed explicitly from the LLM_BASE_URL / LLM_API_KEY environment variables
# async_client = AsyncOpenAI(
#     base_url=os.getenv("LLM_BASE_URL"),
#     api_key=os.getenv("LLM_API_KEY"),
# )


# def format_user_prompt(input_data: Any) -> str:
#     """
#     Format the user input into a prompt.
#     You should implement this function based on your needs.
    
#     Args:
#         input_data: The input data to format
        
#     Returns:
#         str: The formatted prompt
#     """
#     # TODO: Implement your format_user_prompt logic here
#     # For now, returning a placeholder
#     return str(input_data)


# async def process_single_input(
#     input_data: Any,
# ) -> Dict[str, Any]:
#     """
#     Process a single input asynchronously.
    
#     Args:
#         input_data: The input data to process
        
#     Returns:
#         Dict containing:
#             - success: bool indicating if the request succeeded
#             - input_data: The original input data
#             - response: The API response content (if successful)
#             - error: Error message (if failed)
#     """
#     try:
#         # Format the user prompt
#         prompt = format_user_prompt(input_data)
        
#         # Make the async API call
#         chat_completion = await async_client.chat.completions.create(
#             model=os.getenv("LLM_MODEL"),
#             messages=[{"role": "user", "content": prompt}],
#         )
        
#         return {
#             "success": True,
#             "input_data": input_data,
#             "response": chat_completion.choices[0].message.content,
#             "error": None
#         }
#     except Exception as e:
#         return {
#             "success": False,
#             "input_data": input_data,
#             "response": None,
#             "error": str(e)
#         }


# async def process_queue_batch(
#     queue: List[Any],
# ) -> List[Dict[str, Any]]:
#     """
#     Process all items in the queue asynchronously.
    
#     Args:
#         queue: List of input items to process
        
#     Returns:
#         List of result dictionaries from process_single_input
#     """
#     # Create tasks for all items in the queue
#     tasks = [process_single_input(item) for item in queue]
    
#     # Run all tasks concurrently and wait for completion
#     results = await asyncio.gather(*tasks)
    
#     return results


# def get_error_samples(results: List[Dict[str, Any]]) -> List[Any]:
#     """
#     Extract input data from failed samples.
    
#     Args:
#         results: List of result dictionaries
        
#     Returns:
#         List of input data that failed
#     """
#     return [
#         result["input_data"]
#         for result in results
#         if not result["success"]
#     ]


# async def main(
#     initial_queue: List[Any],
#     max_retries: int = 3,
# ) -> List[Dict[str, Any]]:
#     """
#     Main function that processes a queue with retry logic.
    
#     Args:
#         initial_queue: Initial list of items to process
#         max_retries: Maximum number of retry attempts
        
#     Returns:
#         List of all result dictionaries
#     """
#     all_results = []
#     queue = initial_queue.copy()
    
#     for attempt in range(max_retries):
#         if not queue:
#             print(f"No more items to process after attempt {attempt + 1}")
#             break
        
#         print(f"Attempt {attempt + 1}/{max_retries}: Processing {len(queue)} items...")
        
#         # Process all items in the queue asynchronously
#         results = await process_queue_batch(queue)
#         all_results.extend(results)
        
#         # Check for error samples
#         error_samples = get_error_samples(results)
        
#         if not error_samples:
#             print(f"All items processed successfully after attempt {attempt + 1}")
#             break
        
#         print(f"Found {len(error_samples)} error samples. Retrying...")
        
#         # Push error samples back to queue for retry
#         queue = error_samples
    
#     # Print summary
#     successful = sum(1 for r in all_results if r["success"])
#     failed = len(all_results) - successful
#     print(f"\nSummary: {successful} successful, {failed} failed out of {len(all_results)} total")
    
#     return all_results


# # Example usage
# if __name__ == "__main__":
#     # Example initial queue
#     initial_queue = [
#         "Explain asynchronous programming in Python",
#         "What is machine learning?",
#         "Tell me about Python decorators",
#     ]
    
#     # Run the main function
#     results = asyncio.run(main(initial_queue, max_retries=3))
    
#     # Print results
#     for i, result in enumerate(results):
#         if result["success"]:
#             print(f"\nResult {i+1} (Success):")
#             print(result["response"])
#         else:
#             print(f"\nResult {i+1} (Failed):")
#             print(f"Error: {result['error']}")

# Utils

# Chains