In [33]:
# Chuẩn bị dataset SFT từ Excel
import pandas as pd
from datasets import Dataset

file_path = "data.xlsx"  # file Excel vừa lưu ở bước 1
df_raw = pd.read_excel(file_path)

train_prompts = []
train_responses = []

for _, row in df_raw.iterrows():
    sender_email = str(row["Sender Email"]).strip()
    email_body = str(row["Body"]).strip()

    if not email_body:
        continue

In [4]:
import os, re
from typing import List, Set
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# ========= 1) Embedding =========
embed_model = SentenceTransformer(r"C:\Users\asus\venv\all-MiniLM-L6-v2", device="cpu")

class CustomEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        cleaned = [clean_text(t) for t in texts]
        return embed_model.encode(cleaned).tolist()
    def embed_query(self, text: str) -> List[float]:
        return embed_model.encode([clean_text(text)])[0].tolist()

def clean_text(text: str) -> str:
    text = re.sub(r"\s+", " ", str(text))
    return text.strip()

embeddings = CustomEmbeddings()

# ========= 2) FAISS + Dữ liệu =========
raw_docs = [Document(page_content=email_body)]

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80)
doc_chunks = splitter.split_documents(raw_docs)

index_path = "faiss_index"
if os.path.exists(index_path):
    vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
else:
    vectorstore = FAISS.from_documents(doc_chunks, embeddings)
    vectorstore.save_local(index_path)

# ========= 3) Model sinh văn bản =========
gen_model_id = r"D:\flan-t5-base"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_id, device_map="cpu")
gen_pipe = pipeline("text2text-generation", model=gen_model, tokenizer=gen_tokenizer)

# ========= 4) Prompt RAG =========
RAG_PROMPT_TMPL = """Bạn là trợ lý AI chính xác và ngắn gọn.
Dựa trên NGỮ CẢNH dưới đây, trả lời CÂU HỎI. Chỉ sử dụng thông tin trong NGỮ CẢNH.
Nếu không có thông tin, trả lời: "Không tìm thấy thông tin".

# NGỮ CẢNH:
{context}

# CÂU HỎI:
{question}
"""
rag_prompt = PromptTemplate.from_template(RAG_PROMPT_TMPL)

# ========= 5) Regex patterns =========
ORDER_ID_PATTERNS = [
    r"\b(?:Mã(?:\s*đơn\s*hàng)?|Đơn(?:\s*hàng)?|Order(?:\s*ID)?|Mã)\s*[:#-]?\s*([A-Z]{1,5}[-_ ]?\d{3,})\b",
    r"#\s?(\d{3,})\b",
    r"\b(ORD[-_ ]?\d{3,})\b",
    r"\b(SO[-_ ]?\d{3,})\b"
]

PRODUCT_PATTERNS = [
    r"Subject:\s*(.*?)\s*(?:Order|Completed|Thank|$)",
    r"purchase\s+(?:of|from)\s+(?:the\s+)?([^\n\.]+)"
]

def extract_order_ids(text: str) -> List[str]:
    found: Set[str] = set()
    for pat in ORDER_ID_PATTERNS:
        for m in re.finditer(pat, text, flags=re.IGNORECASE):
            code = m.group(1).strip()
            code = re.sub(r"\s+", "-", code).upper()
            found.add(code)
    return sorted(found)

def extract_product_names(text: str) -> List[str]:
    found: Set[str] = set()
    for pat in PRODUCT_PATTERNS:
        for m in re.finditer(pat, text, flags=re.IGNORECASE):
            name = m.group(1).strip()
            found.add(name)
    return sorted(found)

# ========= 6) RAG & Extraction =========
def retrieve_context(question: str, k: int = 8) -> List[Document]:
    return vectorstore.similarity_search(question, k=k)

def format_context(docs: List[Document]) -> str:
    seen, parts = set(), []
    for d in docs:
        t = clean_text(d.page_content)
        if t not in seen:
            parts.append(t)
            seen.add(t)
    return "\n".join(parts)[:3500]

def answer_question(question: str) -> str:
    docs = retrieve_context(question, k=12)
    ctx = " \n".join([d.page_content for d in docs])

    # 1) Nếu hỏi mã đơn hàng
    if re.search(r"\bmã\b|\border\b", question, flags=re.IGNORECASE):
        ids = extract_order_ids(ctx)
        if ids:
            return "Mã đơn hàng tìm được:\n" + "\n".join(f"- {i}" for i in ids)
        return "Không tìm thấy mã đơn hàng."

    # 2) Nếu hỏi tên sản phẩm
    if re.search(r"\bsản\s*phẩm\b|\btên\b", question, flags=re.IGNORECASE):
        names = extract_product_names(ctx)
        if names:
            return "Tên sản phẩm tìm được:\n" + "\n".join(f"- {n}" for n in names)
        return "Không tìm thấy tên sản phẩm."

    # 3) Câu hỏi chung → dùng RAG
    prompt_full = rag_prompt.format(context=format_context(docs), question=question)
    out = gen_pipe(prompt_full, max_new_tokens=256, temperature=0.0, num_beams=4, early_stopping=True)
    return out[0]["generated_text"]

# ========= 7) Thử nghiệm =========
q1 = "Trích tất cả đơn hàng"
q2 = "Trích tên sản phẩm trong đơn hàng"

print(answer_question(q1))
print(answer_question(q2))


Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (1529 > 512). Running this sequence through the model will result in indexing errors
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Subject: Ch s dng thông tin trong NG CNH. Nu không có thông tin, tr li: "Không tm thy thông tin". # NG CNH: You can change your order details and view the status of your order at https://nextix.cloud/nha-hat-nghe-thuat-phuong-nam/vo-xiec-vung-dat-ky-bi-4082/order/BLT9U/keqzoot284autzhh/open/273a1b844/
Tên sản phẩm tìm được:
- Global Clearance Warehouse
- Payment received for your


In [15]:
import re
import pandas as pd

# Đọc file Excel
file_path = "data.xlsx"
df_raw = pd.read_excel(file_path)

# Mở rộng patterns để match tốt hơn dựa trên dữ liệu
ORDER_ID_PATTERNS = [
    r"\b(?:Mã(?:\s*đơn\s*hàng)?|Đơn(?:\s*hàng)?|Order(?:\s*ID|\s*number|\s*code)?|Mã)\s*[:#-]?\s*([\w-]{5,})\b",
    r"#\s?([\w-]{5,})\b",
    r"\b(ORD[-_ ]?\d{3,})\b",
    r"\b(SO[-_ ]?\d{3,})\b",
    r"order\s*:\s*([\w-]{5,})\b",
    r"order\s*code\s*:\s*([\w-]{5,})\b"
]

PRODUCT_PATTERNS = [
    r"Subject:\s*(.*?)\s*(?:Order|Completed|Thank|confirmed|Payment|with|$)",
    r"purchase\s+(?:of|from)\s+(?:the\s+)?([^\n\.]+)",
    r"Event:\s*([^\n]+)",
    r"Order Summary.*?\n([^\n]+)",
    r"Thank you for your purchase!\s*\n\s*([^\n]+)"
]

def extract_order_ids(text: str) -> list:
    found = set()
    for pat in ORDER_ID_PATTERNS:
        for m in re.finditer(pat, text, flags=re.IGNORECASE):
            code = m.group(1).strip()
            code = re.sub(r"\s+", "-", code).upper()
            found.add(code)
    return sorted(found)

def extract_product_names(text: str) -> list:
    found = set()
    for pat in PRODUCT_PATTERNS:
        for m in re.finditer(pat, text, flags=re.IGNORECASE):
            name = m.group(1).strip()
            if name and len(name) > 5:  # Lọc tên ngắn
                found.add(name)
    return sorted(found)

# Extract từ từng row
results = []
for idx, row in df_raw.iterrows():
    sender_email = str(row["Sender Email"]).strip()
    email_body = str(row["Body"]).strip()
    if not email_body:
        continue
    order_ids = extract_order_ids(email_body)
    products = extract_product_names(email_body)
    results.append({
        'Sender Email': sender_email,
        'Order IDs': order_ids,
        'Product Names': products
    })

# In kết quả dưới dạng table
if results:
    df_results = pd.DataFrame(results)
    print(df_results.to_markdown(index=False))
else:
    print("Không tìm thấy patterns nào.")

|   Sender Email | Order IDs                                                    | Product Names                                                                                      |
|---------------:|:-------------------------------------------------------------|:---------------------------------------------------------------------------------------------------|
|            nan | ['GREETINGNET82730', 'NUMBER']                               | ['Greetings From Africa service on', 'Greetings from Africa Video:']                               |
|            nan | ['402752216', 'M_5867432731087361829_']                      | ['Library of Congress!']                                                                           |
|            nan | ['GREETINGNET82730', 'SUMMARY']                              | ['African Fitness Team × 1', 'View your order']                                                    |
|            nan | ['BLT9U', 'DETAILS']                                         | ['*

In [21]:
import re
from typing import Dict, List
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Load LLM để fallback khi regex thất bại
gen_model_id = r"D:\flan-t5-base"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_id)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_id, device_map="cpu")
gen_pipe = pipeline("text2text-generation", model=gen_model, tokenizer=gen_tokenizer)

def extract_info_advanced(email_text: str) -> Dict[str, List[str]]:
    # 1) Order ID
    order_ids = re.findall(r"(?:Order\s*(?:ID|Number)?[:#-]?\s*)([A-Z0-9\-_]+)", email_text, flags=re.IGNORECASE)

    # 2) Product Name: lấy câu sau từ khóa, cắt tại ., !, hoặc xuống dòng
    product_patterns = [
        r"(?:Product|Purchase(?:d)?|Order\s+for|Item)[:\-]?\s*([A-Za-z0-9\s\-\+]+?)(?:[.!?\n]|$)"
    ]
    products = []
    for pat in product_patterns:
        for m in re.finditer(pat, email_text, flags=re.IGNORECASE):
            name = m.group(1).strip()
            if len(name.split()) <= 10:  # tránh lấy nguyên đoạn văn
                products.append(name)

    # 3) Date
    date_matches = re.findall(r"\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|[A-Za-z]+\s+\d{1,2},\s*\d{4})\b", email_text)

    # 4) Fallback LLM cho Product nếu regex thất bại
    if not products:
        prompt = f"Extract product names from this email:\n{email_text}\nOnly return product names."
        out = gen_pipe(prompt, max_new_tokens=64, temperature=0.0, num_beams=4, early_stopping=True)
        products = [out[0]["generated_text"].strip()]

    return {
        "order_ids": list(set(order_ids)),
        "products": list(set(products)),
        "dates": list(set(date_matches))
    }

def answer_question_with_info(question: str, info: Dict):
    if re.search(r"order|id", question, flags=re.IGNORECASE):
        return "Order IDs:\n" + "\n".join(info["order_ids"]) if info["order_ids"] else "No order IDs found."
    elif re.search(r"product|item", question, flags=re.IGNORECASE):
        return "Products:\n" + "\n".join(info["products"]) if info["products"] else "No products found."
    elif re.search(r"date|time", question, flags=re.IGNORECASE):
        return "Dates:\n" + "\n".join(info["dates"]) if info["dates"] else "No dates found."
    else:
        context = f"Orders: {info['order_ids']}\nProducts: {info['products']}\nDates: {info['dates']}"
        prompt = f"Based on this info:\n{context}\nAnswer: {question}"
        out = gen_pipe(prompt, max_new_tokens=128, temperature=0.0, num_beams=4, early_stopping=True)
        return out[0]["generated_text"]

# ==== Test ====
email_text = email_body

info = extract_info_advanced(email_text)
print(answer_question_with_info("What product was purchased?", info))
print(answer_question_with_info("What is the order ID?", info))
print(answer_question_with_info("When was the email sent?", info))


Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Products:
Global Clearance Warehouse Order Confirmed
Order IDs:
Summary
Confirmed
Detail
product
GWJSL021983
Mark
s
thg 7, 2025


In [40]:
import subprocess


In [41]:
import spacy
from typing import List, Dict

# Load mô hình NER sẵn có
nlp = spacy.load("en_core_web_sm")

def extract_order_info_ner(email_text: str) -> List[Dict[str, str]]:
    blocks = re.split(r"-{5,}\s*Forwarded message\s*-{5,}", email_text, flags=re.IGNORECASE)
    results = []

    for block in blocks:
        if not block.strip():
            continue

        doc = nlp(block)
        order_id, product, date = None, None, None

        for ent in doc.ents:
            if ent.label_ in ["DATE", "TIME"] and not date:
                date = ent.text
            # Giả định Order ID là số hoặc mã viết hoa
            elif ent.label_ in ["CARDINAL", "PRODUCT", "ORG"] and not order_id:
                if re.match(r"[A-Z0-9\-]{4,}", ent.text):
                    order_id = ent.text
            # Product thường là tên tổ chức hoặc tên riêng
            elif ent.label_ in ["PRODUCT", "ORG", "WORK_OF_ART"] and not product:
                product = ent.text

        if order_id or product or date:
            results.append({
                "order_id": order_id,
                "product": product,
                "date": date
            })

    return results


def answer_question_ner(orders: List[Dict[str, str]], question: str) -> str:
    q = question.lower()

    if "product" in q:
        products = [o["product"] for o in orders if o["product"]]
        return "Purchased products: " + ", ".join(products) if products else "No product info found."

    elif "order id" in q or "order number" in q:
        ids = [o["order_id"] for o in orders if o["order_id"]]
        return "Order IDs: " + ", ".join(ids) if ids else "No order IDs found."

    elif "date" in q:
        dates = [o["date"] for o in orders if o["date"]]
        return "Order dates: " + ", ".join(dates) if dates else "No dates found."

    else:
        return "I don't have information for that question."


# ---- Ví dụ chạy thử ----
email_text = """(Dán nội dung email vào đây)"""
orders = extract_order_info_ner(email_text)

print(answer_question_ner(orders, "What product was purchased?"))
print(answer_question_ner(orders, "What is the order ID?"))
print(answer_question_ner(orders, "When was the order placed?"))


ModuleNotFoundError: No module named 'spacy'