In [1]:
def read_markdown_file(file_path):
    """
    Read the entire contents of a text/Markdown file as UTF-8.

    Failures are reported on stdout instead of being raised, so callers
    must check the result for None.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        str: The full file contents, or None if the file does not exist
        or any other exception occurs while opening/reading it.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            # Returning from inside the with-block still closes the handle.
            return handle.read()
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as err:
        # Deliberate catch-all: any other failure is reported and mapped to None.
        print(f"An unexpected error occurred: {err}")
    # Reached only when one of the handlers above ran.
    return None

# Example usage.
# Forward slashes are accepted by open() on Windows as well as on POSIX
# systems, whereas a backslash-separated literal only resolves on Windows.
file_name = 'data/resume.txt'

# To experiment without the real input, a small sample file can be created
# first. CAUTION: this writes to file_name and overwrites whatever is there.
# with open(file_name, 'w', encoding='utf-8') as f:
#     f.write("# Hello, World!\n\nThis is a sample markdown file.")

# read_markdown_file prints a message and returns None when the file cannot
# be read; later cells pass this value straight to the text splitters.
markdown_content = read_markdown_file(file_name)

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Character-based splitter: chunk_size=150 and chunk_overlap=50, both
# measured with len (i.e. in characters); separators are treated as literal
# strings rather than regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=50,
    chunk_size=150,
)

# Split the loaded text; each result exposes its text as .page_content.
texts = text_splitter.create_documents([markdown_content])

print(f"Total chunks: {len(texts)}\n")

# Plain dict records: a 1-based "doc1#chunk<N>" id plus the chunk text.
formatted_output = [
    {"id": f"doc1#chunk{position}", "text": chunk.page_content}
    for position, chunk in enumerate(texts, start=1)
]

# Preview the first record.
print(formatted_output[0])

Total chunks: 40

{'id': 'doc1#chunk1', 'text': '<!-- image -->\n\n## ABOUT ME'}


In [16]:
from langchain_text_splitters import ExperimentalMarkdownSyntaxTextSplitter

# Header markers to split on, each paired with the label used for that level.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]

# Markdown-aware splitter driven by the header rules above.
# NOTE: this rebinds text_splitter (and formatted_output below) from the
# character-splitter cell, so later cells see whichever cell ran last.
text_splitter = ExperimentalMarkdownSyntaxTextSplitter(
    return_each_line=False,
    headers_to_split_on=headers_to_split_on,
)

# Split the loaded text along its markdown structure.
docs = text_splitter.split_text(markdown_content)

print(f"Total semantic chunks: {len(docs)}\n")

# Plain dict records: a 1-based "doc1#chunk<N>" id plus the chunk text with
# surrounding whitespace removed.
# NOTE(review): in the recorded run the first chunk is only '<!-- image -->';
# consider filtering placeholder-only chunks before they are indexed.
formatted_output = [
    {"id": f"doc1#chunk{position}", "text": section.page_content.strip()}
    for position, section in enumerate(docs, start=1)
]

# Preview the first record.
print("First semantic chunk:")
print(formatted_output[0])


Total semantic chunks: 8

First semantic chunk:
{'id': 'doc1#chunk1', 'text': '<!-- image -->'}


In [17]:
# Show every formatted chunk record (a bare last expression is rendered as
# the cell's output) so the records can be inspected before they are used.
formatted_output

[{'id': 'doc1#chunk1', 'text': '<!-- image -->'},
 {'id': 'doc1#chunk2',
  'text': 'As a recent Artificial Intelligence graduate, I am passionate about developing AI models and leveraging data to solve real-world  problems.  My  project  experience  includes  practical  applications  like  an  exam  test  for  pharmacists  and computer  vision  solutions  for  safety  compliance,  such  as  detecting  helmet  usage.  I  thrive  in  collaborative  team environments, enjoying theoretical discussions on optimizing algorithm selection for practical implementation.'},
 {'id': 'doc1#chunk3',
  'text': 'Huachiew Chalermprakiet University Bachelor of Artificial Intelligence ( GPAX 3.89 )'},
 {'id': 'doc1#chunk4',
  'text': '- Programming Languages: Python, JavaScript, TypeScript, HTML, CSS, Shell Script\n- Frameworks &amp; Libraries: YOLO, Hugging Face, Pandas, Scikit-Learn, TensorFlow, PyTorch (CUDA), LangChain, OpenCV, Next.js, React.js, Flask, Pyspark\n- Databases: MySQL, PostgreSQL (via ps

In [18]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone

# Load settings via python-dotenv so the API key does not have to be
# hard-coded in the notebook (presumably from a local .env file — confirm).
load_dotenv()

# Build the Pinecone client with the key read from the environment; the
# lookup yields None when PINECONE_API_KEY is not set.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Handle to the index named "production".
# NOTE(review): confirm that experimental notebook data belongs in this index.
index = pc.Index("production")

In [19]:
# Upsert the formatted chunk records into the "markdown-namespace" namespace
# of the index (the recorded run reports upserted_count=8).
# NOTE(review): each record carries only "id" and "text"; presumably the
# index embeds the "text" field server-side — confirm the index field mapping.
index.upsert_records("markdown-namespace", formatted_output)

UpsertResponse(upserted_count=8, _response_info={'raw_headers': {'date': 'Mon, 01 Dec 2025 07:43:24 GMT', 'content-length': '0', 'connection': 'keep-alive', 'x-pinecone-request-lsn': '1', 'x-pinecone-api-version': '2025-10', 'x-envoy-upstream-service-time': '1064', 'server': 'envoy'}})