In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")


In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
import re

# Load PDF and remove a known unwanted boilerplate sentence on load
loader = PyPDFLoader("INTENSIVE GRAMMAR.pdf")
docs = loader.load()

# Exact sentence to remove (leading newline optional). Case-insensitive match.
target = "M·ªçi thng tin v·ªÅ kho√° h·ªçc vui l√≤ng li√™n h·ªá Zalo official: The Forum Education ‚Äì c√≥ tick xanh t·∫°i √¥ search"
pattern = re.compile(r"\n?" + re.escape(target), flags=re.IGNORECASE)

cleaned_docs = []
for d in docs:
    text = getattr(d, 'page_content', str(d))
    # remove the unwanted sentence if present
    new_text = pattern.sub('', text)
    # normalize excessive blank lines
    new_text = re.sub(r"\n{3,}", "\n\n", new_text)
    if new_text != text:
        meta = d.metadata if hasattr(d, 'metadata') else {}
        cleaned_docs.append(Document(page_content=new_text, metadata=meta))
    else:
        cleaned_docs.append(d)

docs = cleaned_docs
print(f"Loaded {len(docs)} pages; removed unwanted boilerplate where found.")

Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)
Ignoring wrong pointing object 92 0 (offset 0)
Ignoring wrong pointing object 137 0 (offset 0)


Loaded 41 pages; removed unwanted boilerplate where found.


In [4]:
# Inspect the loaded documents from the PDF
print("Total pages loaded:", len(docs))
print(docs[0].page_content[:1000])
print(docs[1].page_content[:1000])


Total pages loaded: 41
       1  
PH·∫¶N 1: ƒê·ªòNG T·ª™ V√Ä TH√å (VERB TENSES) 1.1. T·ªïng quan v·ªÅ th√¨ ƒë·ªông t·ª´ trong ti·∫øng Anh Th√¨ ƒë·ªông t·ª´ trong ti·∫øng Anh th·ªÉ hi·ªán th·ªùi gian v√† tr·∫°ng th√°i c·ªßa h√†nh ƒë·ªông ho·∫∑c tr·∫°ng th√°i. C√≥ 12 th√¨ c∆° b·∫£n trong ti·∫øng Anh, chia th√†nh ba th·ªùi ch√≠nh: hi·ªán t·∫°i, qu√° kh·ª© v√† t∆∞∆°ng lai, m·ªói th·ªùi c√≥ b·ªën th√¨ nh·ªè (ƒë∆°n, ti·∫øp di·ªÖn, ho√†n th√†nh, ho√†n th√†nh ti·∫øp di·ªÖn). 1.1.1. Th√¨ hi·ªán t·∫°i (Present Tenses) ‚Ä¢ Hi·ªán t·∫°i ƒë∆°n (Present Simple): Di·ªÖn t·∫£ th√≥i quen, s·ª± th·∫≠t hi·ªÉn nhi√™n, l·ªãch tr√¨nh. o C·∫•u tr√∫c:  ¬ß Kh·∫≥ng ƒë·ªãnh: S + V(s/es) ¬ß Ph·ªß ƒë·ªãnh: S + do/does + not + V-nguy√™n th·ªÉ ¬ß Nghi v·∫•n: Do/Does + S + V-nguy√™n th·ªÉ? o V√≠ d·ª•:  ¬ß She works every day. ¬ß They do not play football. ¬ß Does he like coffee? ‚Ä¢ Hi·ªán t·∫°i ti·∫øp di·ªÖn (Present Continuous): Di·ªÖn t·∫£ h√†nh ƒë·ªông ƒëang di·ªÖn ra t·∫°i th·ªùi ƒëi·ªÉm n√≥i ho·∫∑c k·∫ø ho·∫°ch

In [5]:
# Imports for text chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List
from langchain.schema import Document

In [6]:
# --- Chunk the loaded PDF documents and test examples ---
print('Docs loaded:', len(docs))
# Use RecursiveCharacterTextSplitter to create chunks sized for embeddings/LLMs
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)
print('Total chunks produced:', len(chunks))

# Show a few sample chunks (preview)
for i, c in enumerate(chunks[:5]):
    src = c.metadata.get('source') if isinstance(c.metadata, dict) else None
    print(f'\n--- Chunk {i} (source={src}) ---')
    print(c.page_content[:500].replace('\n', ' '))

# Basic statistics about chunk sizes (word counts)
lengths = [len(c.page_content.split()) for c in chunks]
import statistics
if lengths:
    print('Chunk words: min', min(lengths), 'median', statistics.median(lengths), 'max', max(lengths))

# Save chunks to a JSONL file for later indexing (UTF-8)
import json
out_path = 'INTENSIVE_GRAMMAR_chunks.jsonl'
with open(out_path, 'w', encoding='utf-8') as f:
    for c in chunks:
        meta = c.metadata if hasattr(c, 'metadata') else {}
        obj = {'text': c.page_content, 'metadata': meta}
        f.write(json.dumps(obj, ensure_ascii=False) + '\n')
print('Saved chunks to', out_path)

# Quick test: load the saved JSONL and print first item to verify
with open(out_path, 'r', encoding='utf-8') as f:
    first = f.readline()
    print('First saved chunk preview:', first[:500])

Docs loaded: 41
Total chunks produced: 54

--- Chunk 0 (source=INTENSIVE GRAMMAR.pdf) ---
1   PH·∫¶N 1: ƒê·ªòNG T·ª™ V√Ä TH√å (VERB TENSES) 1.1. T·ªïng quan v·ªÅ th√¨ ƒë·ªông t·ª´ trong ti·∫øng Anh Th√¨ ƒë·ªông t·ª´ trong ti·∫øng Anh th·ªÉ hi·ªán th·ªùi gian v√† tr·∫°ng th√°i c·ªßa h√†nh ƒë·ªông ho·∫∑c tr·∫°ng th√°i. C√≥ 12 th√¨ c∆° b·∫£n trong ti·∫øng Anh, chia th√†nh ba th·ªùi ch√≠nh: hi·ªán t·∫°i, qu√° kh·ª© v√† t∆∞∆°ng lai, m·ªói th·ªùi c√≥ b·ªën th√¨ nh·ªè (ƒë∆°n, ti·∫øp di·ªÖn, ho√†n th√†nh, ho√†n th√†nh ti·∫øp di·ªÖn). 1.1.1. Th√¨ hi·ªán t·∫°i (Present Tenses) ‚Ä¢ Hi·ªán t·∫°i ƒë∆°n (Present Simple): Di·ªÖn t·∫£ th√≥i quen, s·ª± th·∫≠t hi·ªÉn nhi√™n, l·ªãch tr√¨nh. o C·∫•u tr√∫c:  ¬ß Kh·∫≥ng ƒë·ªãnh: S + V

--- Chunk 1 (source=INTENSIVE GRAMMAR.pdf) ---
2   ‚Ä¢ Hi·ªán t·∫°i ho√†n th√†nh (Present Perfect): Di·ªÖn t·∫£ h√†nh ƒë·ªông ƒë√£ x·∫£y ra v√† c√≥ k·∫øt qu·∫£ ƒë·∫øn hi·ªán t·∫°i ho·∫∑c tr·∫£i nghi·ªám. o C·∫•u tr√∫c:  ¬ß S + have/has + V3/V-ed o V√≠ d·ª•:  ¬ß She has visite

In [7]:
# --- Create embeddings and FAISS vector store ---
print("‚è≥ Creating embeddings for chunks (this may take a minute)...")

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Use a lightweight local embedding model (no API calls)
# all-MiniLM-L6-v2: 384-dim embeddings, fast and good quality
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create FAISS vector store from chunks
vector_store = FAISS.from_documents(chunks, embedding_model)
print(f"‚úÖ Vector store created with {len(chunks)} chunks")

# Save the vector store to disk for reuse
vector_store_path = "INTENSIVE_GRAMMAR_faiss_index"
vector_store.save_local(vector_store_path)
print(f"‚úÖ Saved vector store to {vector_store_path}/")

# --- Test similarity search ---
print("\n" + "="*50)
print("Testing similarity search...")
print("="*50)

# Example queries to test
test_queries = [
    "Th√¨ qu√° kh·ª© ti·∫øp di·ªÖn",

]

for query in test_queries:
    print(f"\nüîç Query: '{query}'")
    # search: returns top-k most similar chunks
    results = vector_store.similarity_search(query, k=5)
    for i, result in enumerate(results, 1):
        content_preview = result.page_content[:300].replace('\n', ' ')
        meta = result.metadata if hasattr(result, 'metadata') else {}
        print(f"  Result {i} (page={meta.get('page')}):")
        print(f"    {content_preview}...")

# --- Test similarity_search_with_scores ---
print("\n" + "="*50)
print("Search with scores (lower is better)...")
print("="*50)

query = "grammar rules"
results_with_scores = vector_store.similarity_search_with_score(query, k=5)
for i, (doc, score) in enumerate(results_with_scores, 1):
    content_preview = doc.page_content[:200].replace('\n', ' ')
    print(f"  Result {i} (score={score:.4f}):")
    print(f"    {content_preview}...")

print("\n‚úÖ FAISS vector store is ready for RAG!")
print(f"   - {len(chunks)} chunks indexed")
print(f"   - Saved to: {vector_store_path}/")
print("   - Use vector_store.similarity_search(query, k=5) to retrieve top-k chunks")

‚è≥ Creating embeddings for chunks (this may take a minute)...


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


‚úÖ Vector store created with 54 chunks
‚úÖ Saved vector store to INTENSIVE_GRAMMAR_faiss_index/

Testing similarity search...

üîç Query: 'Th√¨ qu√° kh·ª© ti·∫øp di·ªÖn'
  Result 1 (page=7):
    1.2.5. Qu√° kh·ª© ti·∫øp di·ªÖn (Past Continuous) L·ªói 1: Kh√¥ng s·ª≠ d·ª•ng qu√° kh·ª© ti·∫øp di·ªÖn ƒë√∫ng ng·ªØ c·∫£nh Sai: When I arrived, they played football. S·ª≠a: When I arrived, they were playing football. Gi·∫£i th√≠ch: H√†nh ƒë·ªông ƒëang di·ªÖn ra trong qu√° kh·ª© ("they were playing") khi m·ªôt h√†nh ƒë·ªông kh√°c x·∫£y ƒë·∫øn ("I arrive...
  Result 2 (page=4):
    hi·ªán t·∫°i ƒë∆°n cho k·∫ø ho·∫°ch t∆∞∆°ng lai Sai: I go to the dentist tomorrow. S·ª≠a: I am going to the dentist tomorrow. Gi·∫£i th√≠ch: Khi n√≥i v·ªÅ k·∫ø ho·∫°ch t∆∞∆°ng lai ƒë√£ ƒë·ªãnh tr∆∞·ªõc, s·ª≠ d·ª•ng hi·ªán t·∫°i ti·∫øp di·ªÖn....
  Result 3 (page=7):
    ho√†n th√†nh khi kh√¥ng c·∫ßn thi·∫øt Sai: She had gone to the store yesterday. S·ª≠a: She went to the store yesterday. Gi·∫£i th√≠ch: N·∫øu ch·ªâ c√

In [8]:
# --- Create RAG chain with OpenAI and custom prompt ---
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough

if not openai_api_key:
    print("‚ö†Ô∏è OPENAI_API_KEY not found in environment. Please set it before running this cell.")
else:
    # Initialize OpenAI LLM
    llm = ChatOpenAI(
        model="gpt-4o",
        temperature=0.3,
        api_key=openai_api_key
    )
    
    # Custom prompt template for the grammar teacher role
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template=""""You are an English grammar teacher. "
            "A Vietnamese student has asked you a question about grammar.\n\n"
            "RETRIEVED CONTEXT:\n{context}\n\n"
            "STUDENT QUESTION:\n{question}\n\n"
            "Please answer using Vietnamese language following these steps:\n"
            "1. Carefully read the CONTEXT retrieved from the database. Only use information that appears in the CONTEXT."
            "2. Give a short and clear explanation of the grammar point the student is asking about. Explain the meaning, Explain the usage, Explain the structure (if included in the context).\n"
            "3. Provide an example (use examples from the context if available). If the retrieved context contains examples, include at least one example verbatim and label it exactly as 'V√≠ d·ª•:'.\n"
            "4. Re-explain the concept using simpler Vietnamese so that a language learner can understand it easily.\n"
            "5. If the concept does not exist in the retrieved context, tell me honestly.\n\n"
            "Your response:"
"""
    )
    
    # Create a retrieval function that gets context from vector store
    def get_context(query, k=5):
        """Retrieve top-k chunks from FAISS and format as context.
        If none of the top-k contain example markers (e.g. 'v√≠ d·ª•', 'ƒë√°p √°n'),
        try a lightweight fallback by scanning the saved JSONL chunks for a chunk
        containing example markers and append it to the context.
        """
        results = vector_store.similarity_search(query, k=5)
        texts = [doc.page_content for doc in results]

        # Check for example markers in retrieved chunks
        markers = ['v√≠ d·ª•', 'ƒë√°p √°n', 'v√≠-d·ª•', 'v√≠du', 'example', 'ans:']
        def has_example(text):
            t = text.lower()
            return any(m in t for m in markers)

        contains_example = any(has_example(t) for t in texts)

        # Fallback: scan saved JSONL for a chunk with an example marker and append it
        if not contains_example:
            try:
                import json
                with open('INTENSIVE_GRAMMAR_chunks.jsonl', 'r', encoding='utf-8') as f:
                    for ln in f:
                        obj = json.loads(ln)
                        txt = obj.get('text', '').lower()
                        if any(m in txt for m in markers):
                            # append this chunk's original text to the context and stop
                            texts.append(obj.get('text', ''))
                            contains_example = True
                            break
            except FileNotFoundError:
                # Saved JSONL not found; skip fallback
                pass

        context = "\n\n---\n\n".join(texts)
        return context
    
    # Strengthen prompt: require verbatim example labeled 'V√≠ d·ª•:' if available
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template=(
            "You are an English grammar teacher. "
            "A Vietnamese student has asked you a question about grammar.\n\n"
            "RETRIEVED CONTEXT:\n{context}\n\n"
            "STUDENT QUESTION:\n{question}\n\n"
            "Please answer using Vietnamese language following these steps:\n"
            "1. Carefully read the CONTEXT retrieved from the database. Only use information that appears in the CONTEXT."
            "2. Give a short and clear explanation of the grammar point the student is asking about. Explain the meaning, Explain the usage, Explain the structure (if included in the context).\n"
            "3. Provide an example (use examples from the context if available). If the retrieved context contains examples, include at least one example verbatim and label it exactly as 'V√≠ d·ª•:'.\n"
            "4. Re-explain the concept using simpler Vietnamese so that a language learner can understand it easily.\n"
            "5. If the concept does not exist in the retrieved context, tell me honestly.\n\n"
            "Your response:"
        )
    )

    # Build RAG chain using LangChain's pipe operator
    rag_chain = (
        {
            "context": lambda x: get_context(x["question"]),
            "question": RunnablePassthrough()
        }
        | prompt_template
        | llm
    )

    # Diagnostic helper to print top-k results and whether they contain example markers
    def diagnose_question(question, k=5):
        print(f"\n[DIAGNOSTIC] Running similarity_search_with_score for: '{question}' (k={k})")
        try:
            results_with_scores = vector_store.similarity_search_with_score(question, k=k)
        except Exception as e:
            print("Error running similarity_search_with_score:", e)
            return

        markers = ['v√≠ d·ª•', 'ƒë√°p √°n', 'v√≠-d·ª•', 'v√≠du', 'example', 'ans:']
        for i, (doc, score) in enumerate(results_with_scores, start=1):
            text = getattr(doc, 'page_content', str(doc))
            preview = text[:800].replace('\n', ' ')
            lowered = text.lower()
            contains_example = any(m in lowered for m in markers)
            print(f"--- Result {i} (score={score:.4f}) contains_example={contains_example} ---")
            print("metadata:", getattr(doc, 'metadata', {}))
            print(preview)
            print('\n')

        # Quick JSONL search for a common tense phrase to show where examples live
        try:
            import json
            pattern = 'hi·ªán t·∫°i ho√†n th√†nh'
            found = 0
            with open('INTENSIVE_GRAMMAR_chunks.jsonl', 'r', encoding='utf-8') as f:
                for ln in f:
                    obj = json.loads(ln)
                    txt = obj.get('text', '').lower()
                    if pattern in txt:
                        found += 1
                        print('\n--- Found chunk containing pattern (preview) ---')
                        print(obj.get('text', '')[:1200].replace('\n', ' '))
                        break
            if found == 0:
                print(f"No saved chunk containing '{pattern}' found in INTENSIVE_GRAMMAR_chunks.jsonl")
        except FileNotFoundError:
            print('INTENSIVE_GRAMMAR_chunks.jsonl not found; run the chunking cell first')
    
    print("‚úÖ RAG chain initialized!")
    print("\n" + "="*70)
    print("INTERACTIVE GRAMMAR TEACHER - Testing with Examples")
    print("="*70)
    
    # Test queries to demonstrate the RAG chain
    test_questions = [
        "s·ª≠ d·ª•ng th√¨ hi·ªán t·∫°i ho√†n th√†nh nh∆∞ th·∫ø n√†o?",
        "th√¨ t∆∞∆°ng lai ƒë∆°n l√† g√¨ ? "
    ]
    
    for i, question in enumerate(test_questions, 1):
        print(f"\n{'='*70}")
        print(f"‚ùì Student Question {i}: {question}")
        print(f"{'='*70}")
        
        try:
            # Retrieve context
            context = get_context(question)
            print(f"\nüìö Retrieved Context (first 500 chars):")
            print(context[:500] + "..." if len(context) > 500 else context)
            
            # Get response from RAG chain
            print(f"\nüéì Teacher Response:")
            print("-" * 70)
            # Pass a mapping matching the chain's input contract: {'question': ...}
            response = rag_chain.invoke({"question": question})
            # response may be a LangChain message object, a dict, or other runnable result.
            # Extract text content safely.
            if hasattr(response, 'content'):
                answer_text = response.content
            elif isinstance(response, dict) and 'content' in response:
                answer_text = response['content']
            else:
                answer_text = str(response)
            print(answer_text)
            print("-" * 70)
            
        except Exception as e:
            print(f"‚ùå Error processing question: {e}")
    
    print("\n" + "="*70)
    print("‚úÖ RAG Grammar Teacher is ready!")
    print("="*70)


  llm = ChatOpenAI(


‚úÖ RAG chain initialized!

INTERACTIVE GRAMMAR TEACHER - Testing with Examples

‚ùì Student Question 1: s·ª≠ d·ª•ng th√¨ hi·ªán t·∫°i ho√†n th√†nh nh∆∞ th·∫ø n√†o?

üìö Retrieved Context (first 500 chars):
16  
‚Ä¢ Khi nh·∫•n m·∫°nh ƒë·∫øn t·ª´ng th√†nh vi√™n, d√πng ƒë·ªông t·ª´ s·ªë nhi·ªÅu.  o V√≠ d·ª•: The team are arguing among themselves. L·ªói th∆∞·ªùng g·∫∑p: Kh√¥ng nh·∫•t qu√°n trong vi·ªác s·ª≠ d·ª•ng ƒë·ªông t·ª´ Sai: The staff is preparing their reports. S·ª≠a: The staff are preparing their reports. (N·∫øu nh·∫•n m·∫°nh t·ª´ng ng∆∞·ªùi trong nh√≥m) 2.2.9. Ch·ªß ng·ªØ l√† ph√¢n s·ªë ho·∫∑c ph·∫ßn trƒÉm Quy t·∫Øc: Khi ch·ªß ng·ªØ l√† ph√¢n s·ªë ho·∫∑c ph·∫ßn trƒÉm, ƒë·ªông t·ª´ ph√π h·ª£p v·ªõi danh t·ª´ theo sau "of". V√≠ d·ª•: ‚Ä¢ Fifty percent of the work is completed. ‚Ä¢ Two-thirds of the s...

üéì Teacher Response:
----------------------------------------------------------------------
1. Th√¨ hi·ªán t·∫°i ho√†n th√†nh (Present Perfect) kh√¥ng ƒë∆∞·ª£c ƒë·ªÅ c·∫≠p chi 