In [None]:
import os
from functools import lru_cache

from langchain_openai import ChatOpenAI
from langchain_community.embeddings import InfinityEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

from dotenv import load_dotenv

load_dotenv()

True

# Setup components

In [2]:
# Endpoint settings are read from environment variables; each value is None
# when the corresponding variable is not set.

# Chat-completion endpoint.
LLM_BASE_URL = os.environ.get("LLM_BASE_URL")
LLM_MODEL = os.environ.get("LLM_MODEL")
LLM_API_KEY = os.environ.get("LLM_API_KEY")

# Embedding endpoint.
EMBED_BASE_URL = os.environ.get("EMBED_BASE_URL")
EMBED_MODEL = os.environ.get("EMBED_MODEL")


@lru_cache()
def get_llm_model():
    """Return the shared chat model with thinking disabled.

    The function takes no arguments, so ``lru_cache`` builds the client on the
    first call and returns that same instance on every later call.

    Returns:
        A ``ChatOpenAI`` client pointed at ``LLM_BASE_URL`` / ``LLM_MODEL``.
    """
    return ChatOpenAI(
        model=LLM_MODEL,
        base_url=LLM_BASE_URL,
        api_key=LLM_API_KEY,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1,
        # Fields the OpenAI client has no named argument for; they are sent
        # verbatim in the request body for the serving backend to interpret.
        extra_body={
            'chat_template_kwargs': {'enable_thinking': False},
            "top_k": 20,
            # Was misspelled "mip_p", which is not a sampling parameter, so the
            # intended min-p setting never reached the server.
            "min_p": 0,
        },
    )

@lru_cache()
def get_thinking_llm_model():
    """Return the shared chat model with thinking enabled.

    The function takes no arguments, so ``lru_cache`` builds the client on the
    first call and returns that same instance on every later call.

    Returns:
        A ``ChatOpenAI`` client pointed at ``LLM_BASE_URL`` / ``LLM_MODEL``.
    """
    return ChatOpenAI(
        model=LLM_MODEL,
        base_url=LLM_BASE_URL,
        api_key=LLM_API_KEY,
        temperature=0.6,
        top_p=0.95,
        presence_penalty=1,
        # Fields the OpenAI client has no named argument for; they are sent
        # verbatim in the request body for the serving backend to interpret.
        extra_body={
            'chat_template_kwargs': {'enable_thinking': True},
            "top_k": 20,
            # Was misspelled "mip_p", which is not a sampling parameter, so the
            # intended min-p setting never reached the server.
            "min_p": 0,
        },
    )

@lru_cache()
def get_embedding_model():
    """Return the shared embedding client.

    Built on the first call from ``EMBED_MODEL`` and ``EMBED_BASE_URL``; the
    cached instance is handed back on every later call.
    """
    embedding_settings = {
        "model": EMBED_MODEL,
        "infinity_api_url": EMBED_BASE_URL,
    }
    return InfinityEmbeddings(**embedding_settings)


@lru_cache()
def get_vector_store():
    """Return the shared Qdrant-backed vector store for the "demo" collection.

    Connects to the local Qdrant server over gRPC and creates the collection
    only when it is missing. ``lru_cache`` lives only as long as the kernel,
    so an unconditional ``create_collection`` would raise after a restart
    because the collection created by the previous session is still there.

    Returns:
        A ``QdrantVectorStore`` bound to the "demo" collection and the cached
        embedding model.
    """
    collection_name = "demo"
    client = QdrantClient(
        url="http://localhost",
        grpc_port=6334,
        prefer_grpc=True,
    )
    embedding_model = get_embedding_model()

    if not client.collection_exists(collection_name):
        # Embed a throwaway string to learn the model's vector dimension.
        vector_size = len(embedding_model.embed_query("Hello"))
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                size=vector_size,
                distance=Distance.COSINE,
            ),
        )

    return QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embedding_model,
    )


# Process data

## Excel

In [36]:
import glob
import json

# Load every processed table description (one JSON file per table).
PROCESSED_TABLES_GLOB = "/Users/vinhnguyen/Projects/ext-chatbot/resources/processed_data/batdongsan/*.json"

tables = []
# glob returns files in filesystem order; sorting keeps the load order stable
# between runs and machines.
for filepath in sorted(glob.glob(PROCESSED_TABLES_GLOB)):
    # Table name = file name without its directory and final extension.
    table_name = os.path.splitext(os.path.basename(filepath))[0]
    # Read as UTF-8 explicitly: the platform default encoding is not
    # guaranteed to be UTF-8 and the table data uses Vietnamese text.
    with open(filepath, "r", encoding="utf-8") as f:
        table = json.load(f)
    # The schema title is later used as the SQLite table name.
    table["pydantic_schema"]["title"] = table_name
    tables.append(table)

In [None]:
import sqlite3
import re
from typing import Dict, Any, List, Optional

from src.utils import pydantic_to_sqlite_type


def _quote_sqlite_identifier(name: str) -> str:
    """Return ``name`` as a double-quoted SQLite identifier, doubling embedded double quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def _sqlite_table_exists(cursor: sqlite3.Cursor, table_name: str) -> bool:
    """Return True when ``sqlite_master`` lists a table called ``table_name``."""
    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table_name,),
    )
    return cursor.fetchone() is not None


def create_sqlite_table_from_schema(
    schema: Dict[str, Any],
    data: List[Dict[str, Any]],
    db_path: str,
    table_name: Optional[str] = None,
    if_exists: str = "replace"  # "replace", "append", or "fail"
) -> str:
    """
    Create a SQLite table from a Pydantic schema and import data.

    All work (drop, create, insert) happens inside one transaction: if any
    step fails, the database is left exactly as it was before the call.

    Args:
        schema: Pydantic schema in JSON schema format with 'title', 'type', and 'properties'
        data: List of dictionaries containing data to import. Keys missing
              from a row are inserted as NULL.
        db_path: Path to the SQLite database file
        table_name: Name of the table. If None, uses schema['title'] or 'RowData'
        if_exists: What to do if table exists: "replace" (drop and recreate),
                  "append" (add to existing), or "fail" (raise error)

    Returns:
        The name of the created table

    Raises:
        ValueError: If the schema has no 'properties', if ``if_exists`` is not
            one of the accepted values, or if ``if_exists == "fail"`` and the
            table already exists.

    Example:
        schema = {
            'title': 'RowData',
            'type': 'object',
            'properties': {
                'Loại thống kê': {'type': 'string', ...},
                'Giá trị': {'type': 'string', ...}
            }
        }
        data = [
            {'Giá trị': '500', 'Loại thống kê': 'Tổng số BĐS cho thuê'},
            ...
        ]
        create_sqlite_table_from_schema(schema, data, 'database.db')
    """
    # Determine table name
    if table_name is None:
        table_name = schema.get('title', 'RowData')

    # Get properties from schema
    properties = schema.get('properties', {})
    if not properties:
        raise ValueError("Schema must contain 'properties'")

    # Reject a bad mode before touching the database file.
    if if_exists not in ("replace", "append", "fail"):
        raise ValueError(f"Invalid if_exists value: {if_exists}. Must be 'replace', 'append', or 'fail'")

    # Original property names are kept as column names; quoting (with escaped
    # embedded quotes) makes spaces, slashes and other punctuation safe in SQL.
    column_names = list(properties)
    column_defs = [
        f"{_quote_sqlite_identifier(name)} {pydantic_to_sqlite_type(spec.get('type', 'string'))}"
        for name, spec in properties.items()
    ]
    quoted_table = _quote_sqlite_identifier(table_name)

    create_table_sql = "CREATE TABLE IF NOT EXISTS {table} (\n\t{columns}\n)".format(
        table=quoted_table,
        columns=',\n\t'.join(column_defs),
    )
    insert_sql = "INSERT INTO {table} ({columns}) VALUES ({placeholders})".format(
        table=quoted_table,
        columns=', '.join(_quote_sqlite_identifier(name) for name in column_names),
        placeholders=', '.join('?' for _ in column_names),
    )

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # sqlite3 only opens a transaction implicitly for INSERT/UPDATE/DELETE.
        # Start one explicitly so DROP/CREATE are also undone by rollback();
        # otherwise a failed insert in "replace" mode would lose the old table.
        cursor.execute("BEGIN")

        if if_exists == "replace":
            cursor.execute(f"DROP TABLE IF EXISTS {quoted_table}")
        elif if_exists == "fail" and _sqlite_table_exists(cursor, table_name):
            raise ValueError(f"Table '{table_name}' already exists")

        # IF NOT EXISTS makes this a no-op for "append" onto an existing table.
        cursor.execute(create_table_sql)

        if data:
            # Values are ordered to match the column list of the INSERT.
            cursor.executemany(
                insert_sql,
                [[row.get(name) for name in column_names] for row in data],
            )

        conn.commit()
        return table_name
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()

In [38]:
# Write each loaded table into the SQLite database. The table name comes from
# the schema title, and the default if_exists="replace" recreates the table on
# every run.
BATDONGSAN_DB_PATH = "/Users/vinhnguyen/Projects/ext-chatbot/resources/database/batdongsan.db"

for table in tables:
    create_sqlite_table_from_schema(
        table["pydantic_schema"],
        table["transformed_data"],
        BATDONGSAN_DB_PATH,
    )

In [39]:
from langchain_community.utilities import SQLDatabase

# "sqlite:///" followed by an absolute path gives four slashes in total.
sqlite_uri = "sqlite:////Users/vinhnguyen/Projects/ext-chatbot/resources/database/batdongsan.db"
db = SQLDatabase.from_uri(sqlite_uri)

In [41]:
# Sanity check: show what the database wrapper reports for one imported table.
table_info = db.get_table_info(["BĐS Bán 500"])
print(table_info.strip())

CREATE TABLE "BĐS Bán 500" (
	"ID" TEXT, 
	"Dự án" TEXT, 
	"Giá (tỷ VNĐ)" REAL, 
	"Giá/m²" TEXT, 
	"Chiều ngang (m)" REAL, 
	"Chiều dài (m)" REAL, 
	"Giá/m²_so_sanh" REAL, 
	"Giá/m²_don_vi" TEXT, 
	"Số phòng ngủ" INTEGER, 
	"Ngày hết hạn" TEXT, 
	"Số phòng tắm" INTEGER, 
	"Số tầng" INTEGER, 
	"Hướng nhà" TEXT, 
	"Hướng ban công" TEXT, 
	"Nội thất" TEXT, 
	"Pháp lý" TEXT, 
	"Tình trạng" TEXT, 
	"Năm xây dựng" TEXT, 
	"Chủ đầu tư" TEXT, 
	"Bãi đỗ xe" TEXT, 
	"Email" TEXT, 
	"Hoa hồng" TEXT, 
	"Ghi chú" TEXT, 
	"Diện tích (m²)" INTEGER, 
	"Loại BĐS" TEXT, 
	"Phường/Xã" TEXT, 
	"Quận/Huyện" TEXT, 
	"Tiêu đề" TEXT, 
	"Tỉnh/TP" TEXT, 
	"Địa chỉ" TEXT, 
	"Địa chỉ_Tên đường" TEXT, 
	"View" TEXT, 
	"Tiện ích" TEXT, 
	"Khoảng cách tới trung tâm" REAL, 
	"Ngày đăng" TEXT, 
	"Người liên hệ" TEXT, 
	"SĐT liên hệ" TEXT
)

/*
3 rows from BĐS Bán 500 table:
ID	Dự án	Giá (tỷ VNĐ)	Giá/m²	Chiều ngang (m)	Chiều dài (m)	Giá/m²_so_sanh	Giá/m²_don_vi	Số phòng ngủ	Ngày hết hạn	Số phòng tắm	Số tầng	Hướng nhà	H

In [38]:
# Request more files from PO to test

# Utils

# Chains