* Important links
  * Tesseract OCR downloads: https://tesseract-ocr.github.io/tessdoc/Downloads
  * Tamil (`tam`) trained-data file for Tesseract: https://github.com/tesseract-ocr/tessdata/blob/main/tam.traineddata

In [3]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os 
import csv
import re

# Point pytesseract at the local Tesseract binary (needed when it is not on PATH)
# and set TESSDATA_PREFIX to the folder holding the language data used below.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
os.environ["TESSDATA_PREFIX"] = r"C:\Program Files\Tesseract-OCR\tessdata"

# Convert the PDF to images at 300 DPI.
images = convert_from_path("sample-tamil-books.pdf", dpi=300)

# OCR every page with the Tamil model and keep all text in a single variable.
# Pages are joined with an explicit newline so the last line of one page can
# never fuse with the first line of the next before the text is split into
# lines (this also matches the per-page "\n" used when dumping the raw text).
full_text = "\n".join(
    pytesseract.image_to_string(img, lang="tam") for img in images
)

# A catalogue row is "<book name><price>", where the price is 2-4 digits
# followed by ".00" at the very end of the (stripped) line.
# NOTE(review): rows whose price has another decimal part, or fewer/more
# digits, do not match and are skipped silently — confirm this is intended.
price_row = re.compile(r"^(.*?)(\d{2,4}\.00)$")

# Split lines and extract book name + price pairs using the regex
lines = full_text.splitlines()
book_data = []
for line in lines:
    match = price_row.match(line.strip())
    if match:
        name = match.group(1).strip()
        price = match.group(2)
        book_data.append([name, price])

# Save to CSV (newline='' lets the csv module control line endings itself)
with open("tamil_booksv2.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Book Name", "Price"])
    writer.writerows(book_data)

print("✅ CSV saved as 'tamil_booksv2.csv'")




✅ CSV saved as 'tamil_booksv2.csv'


In [5]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import os

# Tell pytesseract which Tesseract executable to run, and set the
# TESSDATA_PREFIX environment variable to the language-data folder.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
os.environ["TESSDATA_PREFIX"] = r"C:\Program Files\Tesseract-OCR\tessdata"

# Turn the PDF into images at 300 DPI
images = convert_from_path("sample-tamil-books.pdf", dpi=300)

# Run Tamil OCR on each image; every page's text is terminated with a newline
page_texts = [
    pytesseract.image_to_string(page, lang="tam") + "\n"
    for page in images
]
ocr_text = "".join(page_texts)

# Write the combined text to a UTF-8 .txt file so it can be reviewed
with open("tamilbooktext_converted.txt", "w", encoding="utf-8") as out_file:
    out_file.write(ocr_text)

print("✅ Tamil OCR complete. Text saved to 'tamilbooktext_converted.txt'")


✅ Tamil OCR complete. Text saved to 'tamilbooktext_converted.txt'


In [10]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


# Read the OCR text file written earlier into LlamaIndex documents
reader = SimpleDirectoryReader(input_files=["tamilbooktext_converted.txt"])
documents = reader.load_data()

# Build a vector index over those documents and expose it as a query engine
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()



In [12]:
# Example query written in Tamil: ask for the books priced under 100 rupees
tamil_question = "100 ரூபாய்க்கு குறைவான புத்தகங்களை பட்டியலிடுங்கள்"
response = query_engine.query(tamil_question)
print(response)


The books priced below 100 rupees are:
1. இலக்கிய இலக்குகள்‌ பபப பபபபபபபபப டாக்டா்‌ எம்‌.ஏ. சுசீலா 50.00
2. பெண்‌ இலக்கியம்‌ வாசிப்பு பபப டாக்டர்‌ எம்‌.ஏ. சுசீலா 35.00
3. முல்லைப்பாட்டு பபப பபவவயவவவவவையை முனைவர்‌ மகாராசன்‌ 40.00


In [13]:
# Example query written in English: ask for the books of a given author
author_question = "list books of jeyakandan"
response = query_engine.query(author_question)
print(response)

Jeyakandan's books include "ஞானபீட" and "சிக்கனையாளர்‌ ஊயகாங்கன்‌ நாவல்கள்‌".
