In [None]:
!pip install pdf2image Pillow opencv-python
!pip install pytesseract easyocr 
!pip install PyMuPDF pdfplumber
!pip install langchain langchain-community langchain-openai sentence_transformers
!pip install pinecone-client langchain-pinecone
!pip install openai tiktoken python-dotenv ragas
!pip install torch torchvision torchaudio 


In [1]:
import torch
print("Torch CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA Device:", torch.cuda.get_device_name(0))

from dotenv import load_dotenv
import os

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['PINECONE_API_KEY'] = os.getenv('PINECONE_API_KEY')
os.environ['PINECONE_ENVIRONMENT'] = os.getenv('PINECONE_ENVIRONMENT')


Torch CUDA available: True
CUDA Device: NVIDIA GeForce RTX 4080


In [8]:
from pdf2image import convert_from_path
import os

pdf_path = "Bangla1st.pdf"
output_img_dir = "pages/"
os.makedirs(output_img_dir, exist_ok=True)

images = convert_from_path(pdf_path, dpi=300, fmt="PNG")
for idx, img in enumerate(images):
    gray = img.convert("L")
    gray.save(f"{output_img_dir}/page_{idx+1:03d}.png")
print(f"{len(images)} grayscale images saved.")


49 grayscale images saved.


In [2]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())


2.5.1+cu121
12.1
True


In [None]:
pip install easyocr[bn]


In [3]:
import easyocr
reader = easyocr.Reader(['bn'], gpu=True)


In [None]:
pip install pytesseract

In [None]:
pdf_path = "Bangla1st.pdf"
output_img_dir = "pages/"
images = convert_from_path(pdf_path, dpi=300, fmt="PNG")

In [5]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
print("Current device:", torch.cuda.current_device())
print("CUDA device name:", torch.cuda.get_device_name(0))


CUDA available: True
CUDA device count: 1
Current device: 0
CUDA device name: NVIDIA GeForce RTX 4080


In [19]:
import torch
print("CUDA available:", torch.cuda.is_available())
import fitz  # PyMuPDF
import pytesseract

import easyocr
import numpy as np
from PIL import Image

import pytesseract

# Init OCR models

easyocr_reader = easyocr.Reader(['bn'], gpu=True)

pdf_doc = fitz.open(pdf_path)

extractions = []
for idx, img in enumerate(images):
    img_path = f"pages/page_{idx+1:03d}.png"
    
    
    pil_img = Image.open(img_path)
    np_img = np.array(pil_img)
    
    # 1. PyMuPDF text (if any)
    pdf_text = pdf_doc[idx].get_text()
    # 2. Tesseract OCR
    tesseract_text = pytesseract.image_to_string(pil_img, lang='ben', config='--psm 4')
    print("tesseract_text  is running")

    # 4. EasyOCR (GPU)
    easy_text = "\n".join(easyocr_reader.readtext(np_img, detail=0))
    print("easy_text  is running")
    extractions.append({
        "page": idx+1,
        "pdf_text": pdf_text,
        "tesseract": tesseract_text,
        "easy": easy_text
    })
print("Text extraction (all engines) complete.")


CUDA available: True
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running
tesseract_text  is running
easy_text  is running

In [20]:
import json
with open('ocr_results.json', 'w', encoding='utf-8') as f:
    json.dump(extractions, f, ensure_ascii=False, indent=2)


In [6]:
import json
with open('ocr_results.json', 'r', encoding='utf-8') as f:
    extractions = json.load(f)

In [None]:
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def clean_page_w_gpt(page_texts):
    prompt = f"""
You are an expert at reconstructing Bengali educational material from OCR/PDF extractions.

Below are 3 attempts from the same page:
PDF: '''{page_texts['pdf_text']}'''
Tesseract OCR: '''{page_texts['tesseract']}'''
EasyOCR: '''{page_texts['easy']}'''

Your job: Merge, correct, and output a clean markdown with all paragraph, MCQs, tables, and sections as in the original page.These outputs will be used for RAG system.So format it in a way that it becomes appropriate for Rag pipeline.
"""
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content



In [None]:
cleaned_dir = "cleaned_pages/"
os.makedirs(cleaned_dir, exist_ok=True)
cleaned_pages = []
for i, page_texts in enumerate(extractions):
    cleaned_md = clean_page_w_gpt(page_texts)
    with open(f"{cleaned_dir}/page_{i+1:03d}.md", "w", encoding="utf-8") as f:
        f.write(cleaned_md)
    cleaned_pages.append(cleaned_md)
    print(f"Page {i+1} cleaned.")


In [9]:
import traceback

start_page = 10  # Page 11 (0-indexed)
cleaned_dir = "cleaned_pages/"
os.makedirs(cleaned_dir, exist_ok=True)
cleaned_pages = []

for i, page_texts in enumerate(extractions[start_page:], start=start_page):
    print(f"\n--- Processing Page {i+1} ---")
    try:
        cleaned_md = clean_page_w_gpt(page_texts)
        print(f"Page {i+1} cleaned. Length: {len(cleaned_md)} characters.")
        with open(f"{cleaned_dir}/page_{i+1:03d}.md", "w", encoding="utf-8") as f:
            f.write(cleaned_md)
        cleaned_pages.append(cleaned_md)
    except Exception as e:
        print(f"Error cleaning Page {i+1}: {e}")
        traceback.print_exc()
        # Optionally, write an error log file for that page:
        with open(f"{cleaned_dir}/page_{i+1:03d}_error.log", "w", encoding="utf-8") as err_f:
            err_f.write(str(e) + "\n" + traceback.format_exc())



--- Processing Page 11 ---
Page 11 cleaned. Length: 2081 characters.

--- Processing Page 12 ---
Page 12 cleaned. Length: 2421 characters.

--- Processing Page 13 ---
Page 13 cleaned. Length: 2040 characters.

--- Processing Page 14 ---
Page 14 cleaned. Length: 2084 characters.

--- Processing Page 15 ---
Page 15 cleaned. Length: 2561 characters.

--- Processing Page 16 ---
Page 16 cleaned. Length: 1431 characters.

--- Processing Page 17 ---
Page 17 cleaned. Length: 1262 characters.

--- Processing Page 18 ---
Page 18 cleaned. Length: 2128 characters.

--- Processing Page 19 ---
Page 19 cleaned. Length: 1866 characters.

--- Processing Page 20 ---
Page 20 cleaned. Length: 1417 characters.

--- Processing Page 21 ---
Page 21 cleaned. Length: 3586 characters.

--- Processing Page 22 ---
Page 22 cleaned. Length: 1566 characters.

--- Processing Page 23 ---
Page 23 cleaned. Length: 2323 characters.

--- Processing Page 24 ---
Page 24 cleaned. Length: 2150 characters.

--- Processing Page

ADVANCED PROMPTING 

In [11]:
def clean_page_w_gpt(page_texts):
    prompt = f"""
You are an expert at reconstructing clean, well-structured Bengali educational material from noisy OCR/PDF extractions, specifically for use in a Retrieval-Augmented Generation (RAG) system.

Below are 3 extraction attempts for the same textbook page:
PDF: '''{page_texts['pdf_text']}'''
Tesseract OCR: '''{page_texts['tesseract']}'''
EasyOCR: '''{page_texts['easy']}'''

Your task:
- Merge and correct these extracts into a single, accurate markdown representation.
- **Format the output for RAG:**  
    * Each paragraph, MCQ, and table should be clearly separated and kept self-contained.
    * MCQs must have the full question text and all options together, formatted as a block.
    * Tables must preserve header rows and row structure, ideally in markdown table syntax.
    * Section headings must be present, and each logical unit (e.g., question, table, section) should be easy to split for semantic chunking.
    * Do not skip any content that could be useful for semantic retrieval.
    * Ensure all Bengali words, grammar, and symbols are correct.
    * Omit page numbers or headers/footers not part of the educational content.

Remember: The output will be embedded, chunked, and indexed for QA, MCQ lookup, and table retrieval in a RAG pipeline, so clarity and structural integrity are vital. Output only the cleaned markdown, no explanations.
"""
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content


In [None]:
import traceback

start_page = 0  # Page 11 (0-indexed)
cleaned_dir = "cleaned_pages_advanced/"
os.makedirs(cleaned_dir, exist_ok=True)
cleaned_pages = []

for i, page_texts in enumerate(extractions[start_page:], start=start_page):
    print(f"\n--- Processing Page {i+1} ---")
    try:
        cleaned_md = clean_page_w_gpt(page_texts)
        print(f"Page {i+1} cleaned. Length: {len(cleaned_md)} characters.")
        with open(f"{cleaned_dir}/page_{i+1:03d}.md", "w", encoding="utf-8") as f:
            f.write(cleaned_md)
        cleaned_pages.append(cleaned_md)
    except Exception as e:
        print(f"Error cleaning Page {i+1}: {e}")
        traceback.print_exc()
        # Optionally, write an error log file for that page:
        with open(f"{cleaned_dir}/page_{i+1:03d}_error.log", "w", encoding="utf-8") as err_f:
            err_f.write(str(e) + "\n" + traceback.format_exc())


Cleaning-chunk

In [None]:
import re

def custom_bengali_chunker(md_text, page_number):
    # Pattern for Markdown headings
    section_pattern = re.compile(r'(^#{1,3} .+?$)', re.MULTILINE)
    # Find all heading start points
    splits = [m.start() for m in section_pattern.finditer(md_text)]
    splits.append(len(md_text))  # To get the last section

    chunks = []
    for i in range(len(splits) - 1):
        chunk = md_text[splits[i]:splits[i+1]].strip()
        # Classify chunk type
        if 'প্রশ্নাবলী' in chunk or 'উত্তর' in chunk:
            chunk_type = "qa"
        elif re.search(r'\|.+\|', chunk):  # crude table detection
            chunk_type = "table"
        elif 'শব্দার্থ' in chunk or 'টীকা' in chunk:
            chunk_type = "glossary"
        else:
            chunk_type = "narrative"

        # Attach metadata
        chunks.append({
            'content': chunk,
            'metadata': {
                'page': page_number,
                'type': chunk_type,
                # Optionally parse for more metadata
            }
        })
    return chunks


In [None]:
import os
import re
import glob

from langchain.schema import Document  # or just use dict
from markdown_it import MarkdownIt

# Settings
CHUNK_TOKEN_LIMIT = 512  # Or 300, adjust as needed
MCQ_PAT = re.compile(r'(\d+[\.\)]\s+.*?\n(?:[^(?:\d+\.)].*?\n)*?(```|\*\*উত্তর:\*\*|---)', re.DOTALL)
VOCAB_HEADING_PAT = re.compile(r'#? ?শব্দার্থ ও টীকা', re.IGNORECASE)
SECTION_HEADING_PAT = re.compile(r'^(#+ .+|---+)$', re.MULTILINE)
PAGE_NUMBER_PAT = re.compile(r'page_(\d+)\.md')

def chunk_md_file(md_path):
    with open(md_path, encoding='utf-8') as f:
        text = f.read()
    page = int(PAGE_NUMBER_PAT.findall(md_path)[0])
    chunks = []
    # 1. Split by main sections/headings
    sections = re.split(SECTION_HEADING_PAT, text)
    section_title = None
    for i, sec in enumerate(sections):
        if i % 2 == 1:
            section_title = sec.strip()
            continue
        content = sec.strip()
        if not content:
            continue
        # 2. MCQ blocks
        for m in MCQ_PAT.finditer(content):
            chunk_text = m.group(0).strip()
            if len(chunk_text) < 20:
                continue
            chunks.append(Document(
                page_content=chunk_text,
                metadata={"page": page, "type": "mcq", "section": section_title}
            ))
            content = content.replace(chunk_text, '')  # Remove so not re-chunked

        # 3. Vocab block (after headings)
        if VOCAB_HEADING_PAT.search(section_title or ""):
            vocab_lines = [l.strip() for l in content.split('\n') if l.strip()]
            # group 3 at a time if short, or per entry
            for i in range(0, len(vocab_lines), 3):
                vocab_chunk = '\n'.join(vocab_lines[i:i+3])
                if len(vocab_chunk) > 10:
                    chunks.append(Document(
                        page_content=vocab_chunk,
                        metadata={"page": page, "type": "vocab", "section": section_title}
                    ))
            continue

        # 4. Story/narrative, or other
        paras = [p.strip() for p in content.split('\n\n') if p.strip()]
        cur_chunk = []
        cur_tokens = 0
        for p in paras:
            cur_chunk.append(p)
            cur_tokens += len(p)
            if cur_tokens > CHUNK_TOKEN_LIMIT:
                chunks.append(Document(
                    page_content='\n\n'.join(cur_chunk),
                    metadata={"page": page, "type": "story", "section": section_title}
                ))
                cur_chunk, cur_tokens = [], 0
        if cur_chunk:
            chunks.append(Document(
                page_content='\n\n'.join(cur_chunk),
                metadata={"page": page, "type": "story", "section": section_title}
            ))
    return chunks

# Example: Process all md files
all_md_files = sorted(glob.glob("/mnt/data/page_*.md"))
all_chunks = []
for mdfile in all_md_files:
    all_chunks.extend(chunk_md_file(mdfile))
print(f"Total chunks: {len(all_chunks)}")


In [None]:
import re
import glob
from langchain.schema import Document

def refine_bengali_md_chunker(md_text, page_number):
    chunks = []
    lines = md_text.split('\n')
    cur_chunk = []
    cur_type = None
    cur_uddeepok = None
    cur_question_num = None
    section_heading = None

    for line in lines:
        # Detect headings
        heading_match = re.match(r'^#\s*(.+)', line)
        if heading_match:
            section_heading = heading_match.group(1).strip()
            continue

        # Detect Uddeepok
        if 'উদ্দীপক' in line:
            if cur_chunk:
                chunks.append(Document(
                    page_content='\n'.join(cur_chunk).strip(),
                    metadata={
                        "page": page_number,
                        "type": cur_type or "unknown",
                        "uddeepok": cur_uddeepok,
                        "question": cur_question_num,
                        "section": section_heading
                    }
                ))
                cur_chunk = []
                cur_type = None
                cur_question_num = None
            cur_uddeepok = line.strip()
            cur_type = "uddeepok"
            cur_chunk = [line]
            continue

        # Detect Question Number
        q_match = re.match(r'^(###|#)?\s*প্রশ্ন\s*(\d+)', line)
        if q_match:
            # Save previous chunk if exists
            if cur_chunk:
                chunks.append(Document(
                    page_content='\n'.join(cur_chunk).strip(),
                    metadata={
                        "page": page_number,
                        "type": cur_type or "unknown",
                        "uddeepok": cur_uddeepok,
                        "question": cur_question_num,
                        "section": section_heading
                    }
                ))
                cur_chunk = []
            cur_type = "mcq"
            cur_question_num = q_match.group(2)
            cur_chunk = [line]
            continue

        # Detect MCQ Answer (e.g., "**উত্তর:**")
        if re.match(r'^\*\*উত্তর:', line):
            cur_chunk.append(line)
            # Save MCQ chunk
            chunks.append(Document(
                page_content='\n'.join(cur_chunk).strip(),
                metadata={
                    "page": page_number,
                    "type": cur_type or "mcq",
                    "uddeepok": cur_uddeepok,
                    "question": cur_question_num,
                    "section": section_heading
                }
            ))
            cur_chunk = []
            cur_type = None
            cur_question_num = None
            continue

        # Detect Explanations ("**ব্যাখ্যা:**", "ব্যাখ্যা:", etc.)
        if re.match(r'^\*\*ব্যাখ্যা:', line) or re.match(r'^ব্যাখ্যা:', line):
            if cur_chunk:
                # Save any previous chunk (usually MCQ or uddeepok)
                chunks.append(Document(
                    page_content='\n'.join(cur_chunk).strip(),
                    metadata={
                        "page": page_number,
                        "type": cur_type or "unknown",
                        "uddeepok": cur_uddeepok,
                        "question": cur_question_num,
                        "section": section_heading
                    }
                ))
                cur_chunk = []
            cur_type = "explanation"
            cur_chunk = [line]
            continue

        # Delimiter ("---") means end of logical chunk
        if line.strip() == "---":
            if cur_chunk:
                chunks.append(Document(
                    page_content='\n'.join(cur_chunk).strip(),
                    metadata={
                        "page": page_number,
                        "type": cur_type or "unknown",
                        "uddeepok": cur_uddeepok,
                        "question": cur_question_num,
                        "section": section_heading
                    }
                ))
                cur_chunk = []
                cur_type = None
                cur_question_num = None
            continue

        # Accumulate content in current chunk
        cur_chunk.append(line)

    # Catch last chunk
    if cur_chunk:
        chunks.append(Document(
            page_content='\n'.join(cur_chunk).strip(),
            metadata={
                "page": page_number,
                "type": cur_type or "unknown",
                "uddeepok": cur_uddeepok,
                "question": cur_question_num,
                "section": section_heading
            }
        ))

    return chunks

# Example: Process all md files in your directory
import os
all_md_files = sorted(glob.glob("/mnt/data/page_*.md"))
all_chunks = []
for mdfile in all_md_files:
    with open(mdfile, 'r', encoding='utf-8') as f:
        text = f.read()
        page_num = int(re.findall(r'page_(\d+)\.md', mdfile)[0])
        new_chunks = refine_bengali_md_chunker(text, page_num)
        all_chunks.extend(new_chunks)

print(f"Total refined chunks: {len(all_chunks)}")


In [13]:
import os
import re
import glob
from langchain.schema import Document

def refined_bengali_chunker(md_text, page_number):
    chunks = []
    # Split by top-level headings (if any)
    sections = re.split(r'^(# .+)$', md_text, flags=re.MULTILINE)
    current_section = None
    for part in sections:
        if part.startswith("# "):
            current_section = part.strip("# ").strip()
            continue
        text = part.strip()
        if not text:
            continue

        # MCQ blocks (### or line starting with "প্রশ্ন", or numbered Q, or | MCQ Table)
        mcq_blocks = re.split(r'(?=(?:^#\s*প্রশ্ন|\n---|\n## |\n### ))', text, flags=re.MULTILINE)
        for block in mcq_blocks:
            block = block.strip()
            if not block:
                continue

            # Detect type
            if block.startswith("প্রশ্ন") or block.startswith("###") or re.search(r'\([কখগঘ]\)', block):
                block_type = "mcq"
                # Try to extract question number if present
                q_match = re.search(r'প্রশ্ন\s*-?(\d+)', block)
                qnum = q_match.group(1) if q_match else None
            elif block.startswith("|") and "উত্তর" in block:
                block_type = "table"
                qnum = None
            elif block.startswith("উদ্দীপক"):
                block_type = "uddeepok"
                qnum = None
            elif block.startswith("সমাধান") or block.startswith("উত্তর"):
                block_type = "explanation"
                qnum = None
            else:
                block_type = "narrative"
                qnum = None

            # Combine consecutive lines up to a token limit (e.g. 512)
            lines = block.split('\n')
            cur_lines = []
            cur_tokens = 0
            for line in lines:
                cur_lines.append(line)
                cur_tokens += len(line)
                if cur_tokens > 512:  # adjust as needed
                    chunk_text = '\n'.join(cur_lines).strip()
                    if chunk_text:
                        chunks.append(Document(
                            page_content=chunk_text,
                            metadata={
                                "page": page_number,
                                "type": block_type,
                                "section": current_section,
                                "question": qnum
                            }
                        ))
                    cur_lines, cur_tokens = [], 0
            # Any leftover
            if cur_lines:
                chunk_text = '\n'.join(cur_lines).strip()
                if chunk_text:
                    chunks.append(Document(
                        page_content=chunk_text,
                        metadata={
                            "page": page_number,
                            "type": block_type,
                            "section": current_section,
                            "question": qnum
                        }
                    ))
    return chunks

# Batch process all .md files:
# all_md_files = sorted(glob.glob("/mnt/data/page_*.md"))
all_md_files = sorted(glob.glob(r"D:\files\nafi\personal projects\bangla_book_ocr_based_rag\project-root\cleaned_pages_advanced\page_*.md"))

all_chunks = []
for mdfile in all_md_files:
    with open(mdfile, "r", encoding="utf-8") as f:
        md_text = f.read()
        page_num = int(re.findall(r'page_(\d+)\.md', mdfile)[0])
        these_chunks = refined_bengali_chunker(md_text, page_num)
        all_chunks.extend(these_chunks)

print(f"Total chunks created: {len(all_chunks)}")
# Each `Document` in all_chunks is RAG- and metadata-ready!


Total chunks created: 517


In [14]:
import json

output_path = r"D:\files\nafi\personal projects\bangla_book_ocr_based_rag\project-root\final_chunks.jsonl"
with open(output_path, "w", encoding="utf-8") as fout:
    for chunk in all_chunks:
        item = {
            "content": chunk.page_content,
            "metadata": chunk.metadata
        }
        fout.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Saved {len(all_chunks)} chunks to {output_path}")


Saved 517 chunks to D:\files\nafi\personal projects\bangla_book_ocr_based_rag\project-root\final_chunks.jsonl


In [15]:
import pickle
output_path = r"D:\files\nafi\personal projects\bangla_book_ocr_based_rag\project-root\final_chunks.pkl"
with open(output_path, "wb") as f:
    pickle.dump(all_chunks, f)


further cleaning optional

In [16]:
import json
import re

# 1. Read the JSONL
chunks = []
with open("D:/files/nafi/personal projects/bangla_book_ocr_based_rag/project-root/final_chunks.jsonl", encoding='utf-8') as f:
    for line in f:
        chunks.append(json.loads(line))

refined_chunks = []
current_section = None

for c in chunks:
    content = c['content'].strip()
    meta = c['metadata']

    # 1. Skip non-content/delimiters
    if len(content) < 10 or re.match(r'^[`\-\*]+$', content):
        continue

    # 2. Type detection by content
    # MCQ: Looks for numbered question and options
    if re.search(r'[\d\.]+[\u09F0-\u09FF].*?\n[ক-ঘ][)]', content) or "উত্তর" in content:
        new_type = "mcq"
        # Try to extract question number (if possible)
        q_match = re.search(r'(\d+)[\.।)]', content)
        question = q_match.group(1) if q_match else meta.get("question")
    # Table
    elif re.search(r'\|.*\|', content):
        new_type = "table"
        question = None
    # Glossary or vocabulary
    elif meta.get('section') and ("শব্দার্থ" in meta['section'] or "টীকা" in meta['section']):
        new_type = "glossary"
        question = None
    # Otherwise narrative
    else:
        new_type = "narrative"
        question = None

    # 3. Section propagation (optional, but good)
    section = meta.get("section")
    if section:
        current_section = section
    else:
        section = current_section

    # 4. Build new chunk dict
    refined_chunks.append({
        "content": content,
        "metadata": {
            "page": meta.get("page"),
            "type": new_type,
            "section": section,
            "question": question
        }
    })

print(f"Refined usable chunks: {len(refined_chunks)}")

# 5. Save back to JSONL (overwrite or use new name)
with open("D:/files/nafi/personal projects/bangla_book_ocr_based_rag/project-root/refined_chunks.jsonl", "w", encoding="utf-8") as f:
    for chunk in refined_chunks:
        f.write(json.dumps(chunk, ensure_ascii=False) + "\n")


Refined usable chunks: 376


In [17]:
for c in refined_chunks[:10]:
    print(c['metadata'], c['content'][:200])
    print("="*50)


{'page': 1, 'type': 'narrative', 'section': None, 'question': None} ```markdown
{'page': 1, 'type': 'narrative', 'section': 'অনলাইন ব্যাচ', 'question': None} **বাংলা - ইংরেজি * আইসিটি**  
**বাংলা**  
**১ম পত্র**
{'page': 1, 'type': 'narrative', 'section': 'অনলাইন ব্যাচ', 'question': None} ## আলোচ্য বিষয়

অপরিচিতা
{'page': 1, 'type': 'narrative', 'section': 'অনলাইন ব্যাচ', 'question': None} ## অনলাইন ব্যাচ সম্পর্কিত যেকোনো জিজ্ঞাসায়

কল করো: 16910
```
{'page': 2, 'type': 'narrative', 'section': 'অনলাইন ব্যাচ', 'question': None} ```markdown
{'page': 2, 'type': 'narrative', 'section': 'শিখনফল', 'question': None} - নিম্নবিত্ত ব্যক্তির হঠাৎ বিত্তশালী হয়ে ওঠার ফলে সমাজে পরিচয় সংকট সম্পর্কে ধারণা লাভ করবে।  
- তৎকালীন সমাজ-সভ্যতা ও মানবতার অবমাননা সম্পর্কে জানতে পারবে।  
- তৎকালীন সমাজের পণপ্রথার কুপ্রভাব সম্পর
{'page': 2, 'type': 'narrative', 'section': 'প্রশ্নাবলী', 'question': None} **১। অনুপমের বাবা কী করে জীবিকা নির্বাহ করতেন?**  
ক) ডাক্তারি  
খ) ওকালতি  
গ) মাস্টারি  
ঘ) ব্যবসা
{'p

further refinement

In [18]:
import json
import re

fin = "final_chunks.jsonl"
fout = "final_chunks_refined.jsonl"

def is_mcq(content):
    # Looks for Bangla MCQ formats and options
    mcq_patterns = [
        r'উত্তর[:：]',  # Any 'উত্তর:' marker
        r'\*\*উত্তর:',  # Markdown bold
        r'^[০-৯]+\.',  # Bangla numbered
        r'^\d+\.',     # English numbered
        r'^\([ক-ঘ]\)', # Bangla option brackets
        r'^(ক|খ|গ|ঘ)\)', # Bangla options
        r'প্রশ্ন\s*[-:：]?\s*\d+',  # 'প্রশ্ন ১', 'প্রশ্ন-১'
        r'উত্তর\s*[:-：]' # Sometimes just 'উত্তর:'
    ]
    for pat in mcq_patterns:
        if re.search(pat, content, re.MULTILINE):
            return True
    return False

def is_table(content):
    # Detect Markdown-style tables (must have multiple pipes in a row)
    return content.count('|') > 2 and ('---' in content or re.match(r'\|[- ]+\|', content))

def is_glossary(meta, content):
    # Look for glossary markers or sections
    glossary_keywords = ["Glossary", "শব্দার্থ", "টীকা", "অর্থ", "ব্যাখ্যা"]
    # Section field check
    if meta.get("section") and any(kw in meta["section"] for kw in glossary_keywords):
        return True
    # Inline word explanation marker
    if re.search(r'(\*\*|\-)\s*[\u0980-\u09FF]+[।:]', content):
        return True
    # If it starts with several bold/term+meaning lines
    return False

refined = []
with open(fin, encoding="utf-8") as f:
    for line in f:
        chunk = json.loads(line)
        content = chunk["content"].strip()
        meta = chunk["metadata"]

        # Remove formatting-only lines
        if content.lower() in ["```markdown", "```", "---"]:
            continue

        # Classify chunk type, one-hot
        if is_table(content):
            meta["type"] = "table"
        elif is_mcq(content):
            meta["type"] = "mcq"
            # Try to get question number
            qn = re.search(r'প্রশ্ন\s*[-:：]?\s*(\d+)', content)
            if qn: meta["question"] = qn.group(1)
            else: meta["question"] = None
        elif is_glossary(meta, content):
            meta["type"] = "glossary"
        else:
            meta["type"] = "narrative"

        # Clean up content (strip stray markdown)
        content = re.sub(r"```markdown", "", content).strip("` \n")
        refined.append({"content": content, "metadata": meta})

with open(fout, "w", encoding="utf-8") as f:
    for item in refined:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("Refinement complete! Chunks:", len(refined))


Refinement complete! Chunks: 350


Load embedding

In [2]:
import os
import json
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import pinecone
from langchain_pinecone import Pinecone as LC_Pinecone

# Make sure your .env file or environment variables are set!
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY') or "YOUR_PINECONE_API_KEY"
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT') or "YOUR_PINECONE_ENV"


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# refined_chunks_path = "final_chunks_refined.jsonl"  # Update path if needed

# chunks = []
# with open(refined_chunks_path, encoding="utf-8") as f:
#     for line in f:
#         data = json.loads(line)
#         # Sanity check: minimal length
#         if len(data["content"].strip()) > 10:
#             chunks.append(data)
# print(f"Loaded {len(chunks)} refined chunks.")
# print(chunks)

In [21]:
from langchain.schema import Document

chunks = []
with open("refined_chunks.jsonl", encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        chunks.append(Document(page_content=item["content"], metadata=item["metadata"]))
print(f"Loaded {len(chunks)} chunks as LangChain Documents.")


Loaded 376 chunks as LangChain Documents.


In [22]:
print(chunks)

[Document(metadata={'page': 1, 'type': 'narrative', 'section': None, 'question': None}, page_content='```markdown'), Document(metadata={'page': 1, 'type': 'narrative', 'section': 'অনলাইন ব্যাচ', 'question': None}, page_content='**বাংলা - ইংরেজি * আইসিটি**  \n**বাংলা**  \n**১ম পত্র**'), Document(metadata={'page': 1, 'type': 'narrative', 'section': 'অনলাইন ব্যাচ', 'question': None}, page_content='## আলোচ্য বিষয়\n\nঅপরিচিতা'), Document(metadata={'page': 1, 'type': 'narrative', 'section': 'অনলাইন ব্যাচ', 'question': None}, page_content='## অনলাইন ব্যাচ সম্পর্কিত যেকোনো জিজ্ঞাসায়\n\nকল করো: 16910\n```'), Document(metadata={'page': 2, 'type': 'narrative', 'section': 'অনলাইন ব্যাচ', 'question': None}, page_content='```markdown'), Document(metadata={'page': 2, 'type': 'narrative', 'section': 'শিখনফল', 'question': None}, page_content='- নিম্নবিত্ত ব্যক্তির হঠাৎ বিত্তশালী হয়ে ওঠার ফলে সমাজে পরিচয় সংকট সম্পর্কে ধারণা লাভ করবে।  \n- তৎকালীন সমাজ-সভ্যতা ও মানবতার অবমাননা সম্পর্কে জানতে পারবে।  

In [6]:
from dotenv import load_dotenv
import os

load_dotenv()  # reads the .env file
hf_token = os.getenv("HF_TOKEN")


In [7]:
import torch
print(torch.__version__)   # Should print 2.6.0 or above


2.6.0+cu126


In [None]:
pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126


In [8]:
import torch
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)


PyTorch version: 2.6.0+cu126
CUDA available: True
CUDA version: 12.6


In [9]:
import os
from dotenv import load_dotenv

load_dotenv()
hf_token = os.getenv("HF_TOKEN")

from huggingface_hub import logging as hf_logging
from transformers.utils import logging as transformers_logging
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Enable verbose logs
hf_logging.set_verbosity_info()         # Shows download/cache actions
transformers_logging.set_verbosity_info()  # Shows model initialization logs

bge = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-m3",
    model_kwargs={"device": "cuda", "token": hf_token},
    encode_kwargs={"normalize_embeddings": True},
    show_progress=True
)

# Trigger downloads and caching
_ = bge.embed_query("Hello world")



  bge = HuggingFaceBgeEmbeddings(
loading configuration file config.json from cache at C:\Users\nasif\.cache\huggingface\hub\models--BAAI--bge-m3\snapshots\5617a9f61b028005a4858fdac845db406aefb181\config.json
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8194,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.53.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file pytorch_model.bin from cache at C:\Users\nasif\.cache\huggingface\hub\model

In [None]:
pip install langchain    sentence-transformers huggingface_hub python-dotenv pydantic tiktoken nltk neo4j langchain-openai langchain-community langchain-experimental
pip install pinecone-client
pip install pinecone

In [None]:
# %run firstpart_baritech.ipynb

import os
from typing import List, Tuple

# Environment variable loader
from dotenv import load_dotenv

# Pydantic
from pydantic import BaseModel, Field

# LangChain Core
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.runnables import (
    RunnableBranch, RunnableLambda, RunnableParallel, RunnablePassthrough
)

# LangChain Document Loaders & Splitters
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter

# LangChain Vectorstores & Embeddings
from langchain_community.vectorstores import Neo4jVector
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# LangChain Graph
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Pinecone

from pinecone import ServerlessSpec

# Neo4j driver
from neo4j import GraphDatabase

# Sentence Transformers
from sentence_transformers import SentenceTransformer

# Hugging Face Hub
from huggingface_hub import login
from langchain_pinecone import PineconeVectorStore
# Other utilities
import glob
import tiktoken
import nltk
from pinecone import Pinecone
import os

In [None]:
os.environ["PINECONE_ENVIRONMENT"] = "us-east-1"

# Fetch the API key from environment variables
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [None]:

index_name = "10mnafi-rag-index"
pc.create_index(
    name=index_name,
    dimension=1024, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)


In [25]:
def sanitize_metadata(doc):
    new_meta = {}
    for k, v in doc.metadata.items():
        # Pinecone only allows string, number, boolean, list of strings
        if v is None:
            new_meta[k] = ""
        elif isinstance(v, list):
            # List must contain only strings
            new_meta[k] = [str(i) for i in v]
        else:
            new_meta[k] = str(v) if not isinstance(v, (str, int, float, bool)) else v
    return Document(page_content=doc.page_content, metadata=new_meta)


In [26]:
from langchain_pinecone import PineconeVectorStore

# 1. Create the vector store object ONCE
vectorstore = PineconeVectorStore(
    index_name=index_name,    # your Pinecone index, e.g. "normal-rag"
    embedding=bge      # your OpenAI or other embedding object
)

from langchain.schema import Document

batch_size = 100

for i in range(0, len(chunks), batch_size):
    batch = [sanitize_metadata(doc) for doc in chunks[i:i + batch_size]]
    vectorstore.add_documents(batch)
    print(f"Upserted batch {i // batch_size + 1}, size: {len(batch)}")

print("All documents upserted to Pinecone!")



Batches: 100%|██████████| 4/4 [00:00<00:00,  6.56it/s]


Upserted batch 1, size: 100


Batches: 100%|██████████| 4/4 [00:00<00:00,  6.87it/s]


Upserted batch 2, size: 100


Batches: 100%|██████████| 4/4 [00:00<00:00, 11.46it/s]


Upserted batch 3, size: 100


Batches: 100%|██████████| 3/3 [00:00<00:00,  7.23it/s]


Upserted batch 4, size: 76
All documents upserted to Pinecone!
