In [3]:
from dotenv import load_dotenv
import os

In [None]:
load_dotenv()

In [None]:
# Convert PDF to MD

import os
from markitdown import MarkItDown

input_dir = "./files/pdf"
output_dir = "./files/md"
os.makedirs(output_dir, exist_ok=True)

md = MarkItDown(enable_plugins=False)

for filename in os.listdir(input_dir):
    if filename.lower().endswith(".pdf"):
        input_path = os.path.join(input_dir, filename)
        result = md.convert(input_path)
        output_filename = os.path.splitext(filename)[0] + ".md"
        output_path = os.path.join(output_dir, output_filename)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(result.text_content)

In [None]:
# Split MD

from langchain_text_splitters import RecursiveCharacterTextSplitter
import pandas as pd

md_dir = "./files/md"
chunks_dir = "./files/chunks"
os.makedirs(chunks_dir, exist_ok=True)

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=250,
)

for filename in os.listdir(md_dir):
    if filename.lower().endswith(".md"):
        file_path = os.path.join(md_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        chunks = splitter.split_text(text)
        df = pd.DataFrame({"chunk": chunks})
        output_csv = os.path.splitext(filename)[0] + ".csv"
        output_path = os.path.join(chunks_dir, output_csv)
        df.to_csv(output_path, index=False, encoding="utf-8")

In [None]:
# Init Embedding Function

from pymilvus import model

gemini_ef = model.dense.GeminiEmbeddingFunction(
    model_name="text-embedding-004",
    api_key=os.getenv("GOOGLE_API_KEY"),
)

In [None]:
# Embedding

import os
import pandas as pd
import time


chunks_dir = "./files/chunks"
embeddings_dir = "./files/embeddings"
os.makedirs(embeddings_dir, exist_ok=True)

BATCH_SIZE = 100
SLEEP_TIME = 1

for filename in os.listdir(chunks_dir):
    if filename.lower().endswith(".csv"):
        file_path = os.path.join(chunks_dir, filename)
        df = pd.read_csv(file_path)
        document_name = os.path.splitext(filename)[0]
        chunks = df["chunk"].tolist()
        all_embeddings = []
        for i in range(0, len(chunks), BATCH_SIZE):
            batch = chunks[i:i+BATCH_SIZE]
            embeddings = gemini_ef.encode_documents(batch)
            all_embeddings.extend([emb.tolist() for emb in embeddings])
            time.sleep(SLEEP_TIME)
        out_df = pd.DataFrame({
            "document_name": [document_name] * len(df),
            "chunk": df["chunk"],
            "embeddings": all_embeddings
        })
        output_path = os.path.join(
            embeddings_dir, f"{document_name}_embeddings.csv")
        out_df.to_csv(output_path, index=False, encoding="utf-8")

In [11]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

connections.connect(
    uri="https://" + os.getenv("MILVUS_ENDPOINT"),
    token=os.getenv("MILVUS_API_KEY"))

In [None]:
# Create Schema

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="document_name", dtype=DataType.VARCHAR, max_length=256, is_primary=False, auto_id=False),
    FieldSchema(name="chunk", dtype=DataType.VARCHAR, max_length=8192, is_primary=False, auto_id=False),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=768, is_primary=False)
]
schema = CollectionSchema(fields, description="Document chunks and embeddings")

In [None]:
# Create Collection

from pymilvus import utility

collection_name = "llm_paper"
if collection_name not in utility.list_collections():
    collection = Collection(name=collection_name, schema=schema)
else:
    collection = Collection(name=collection_name)

In [None]:
# Insert embeddings

import pandas as pd

embeddings_dir = "./files/embeddings"
for filename in os.listdir(embeddings_dir):
    if filename.endswith("_embeddings.csv"):
        df = pd.read_csv(os.path.join(embeddings_dir, filename))
        df["embeddings"] = df["embeddings"].apply(eval)
        data = [
            df["document_name"].tolist(),
            df["chunk"].tolist(),
            df["embeddings"].tolist()
        ]
        collection.insert(data)

In [None]:
# Create index on embeddings column

collection.create_index(
    field_name="embeddings",
    index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}}
)

Status(code=0, message=)

In [5]:
# Query content

from pymilvus import Collection, MilvusClient, model

query_text = "best practice for text2sql"
collection_name = "llm_paper"
max_results = 5
gemini_ef = model.dense.GeminiEmbeddingFunction(
    model_name="text-embedding-004",
    api_key=os.getenv("GOOGLE_API_KEY"),
)
query_embedding = gemini_ef.encode_documents([query_text])[0]

client = MilvusClient(
    uri="https://" + os.getenv("MILVUS_ENDPOINT"),
    token=os.getenv("MILVUS_API_KEY")
)

results = client.search(
            collection_name=collection_name,
            anns_field="embeddings",
            data=[query_embedding],
            limit=max_results,
            output_fields=["document_name", "chunk"]
        )

for hits in results:
    for hit in hits:
        print(f"Score: {hit.distance:.4f}")
        print("Document:", hit.entity.get("document_name"))
        print("Chunk:", hit.entity.get("chunk"))
        print("---")

start to install package: google-genai>=1.7.0
successfully installed package: google-genai>=1.7.0
Score: 0.6788
Document: Zhang et al. - 2024 - Benchmarking the Text-to-SQL Capability of Large L
Chunk: from scratch, with less effort. They typically use smarter decoding techniques (e.g., constraining the
predictions of the decoder [39], or schema-aware denoising [50]) to prevent the production of invalid
SQLs. RESDSQL [21] further decouples the intertwined process of schema linking (determining
the schema items like tables and columns in a SQL) and skeleton parsing (determining the SQL
keywords) , which alleviates the difficulty of Text-to-SQL. Apart from network architectures, the
paradigms employed by these methods vary greatly in terms of input encoding, output decoding,
neural training, output refinement, making Text-to-SQL a flourishing research area [17]. However,
the highest accuracy on the Spider leaderboard achieved by traditional learning-based methods is
79.9%, which is still