In [None]:
!apt-get install tesseract-ocr -y
!pip install pytesseract
!pip install PyMuPDF
!pip install arabic-reshaper
!pip install python-bidi


In [10]:

# Import required libraries
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import re
import io
from tqdm import tqdm

# Function to remove diacritics from Arabic text
def remove_diacritics(text):
    """Strip Arabic vowel marks and the tatweel from ``text``.

    The following code points are deleted; every other character is
    returned unchanged:

    * U+064B, U+064C, U+064D - tanwin fath / damm / kasr
    * U+064E, U+064F, U+0650 - fatha / damma / kasra
    * U+0651 - shadda (tashdid)
    * U+0652 - sukun
    * U+0640 - tatweel (kashida)

    Args:
        text: The string to process.

    Returns:
        str: ``text`` with the characters listed above removed.
    """
    # The marks are written as escapes instead of literal characters: they
    # are invisible combining characters that editors, copy/paste or Unicode
    # normalization can silently drop or alter.
    arabic_diacritics = re.compile('[\u0640\u064B-\u0652]')
    return arabic_diacritics.sub('', text)

# Function to read and extract text from PDF using OCR
def extract_text_from_pdf(pdf_path, lang='ara', dpi=300):
    """OCR every page of a PDF and return the concatenated text.

    Each page is rasterized with PyMuPDF, decoded into a PIL image and
    passed to Tesseract through pytesseract.

    Args:
        pdf_path: Path of the PDF file to read.
        lang: Tesseract language code(s) passed to ``image_to_string``.
            Defaults to ``'ara'``; the matching language data must be
            installed.
        dpi: Resolution used to render each page. PyMuPDF renders at
            72 dpi unless told otherwise, which is too coarse for reliable
            OCR, so a higher default is used here.

    Returns:
        str: The OCR output of all pages in page order, joined without any
        extra separator.
    """
    # PDF user space is 72 units per inch, so this factor scales the
    # rendering to the requested resolution.
    zoom = dpi / 72
    render_matrix = fitz.Matrix(zoom, zoom)
    page_texts = []
    # The context manager closes the document even if OCR fails midway.
    with fitz.open(pdf_path) as pdf_document:
        # Iterate through the pages with a tqdm progress bar
        for page_num in tqdm(range(len(pdf_document)), desc="Processing pages"):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(matrix=render_matrix)
            with Image.open(io.BytesIO(pix.tobytes())) as img:
                # Collect per-page text; joined once at the end instead of
                # repeatedly growing a string.
                page_texts.append(pytesseract.image_to_string(img, lang=lang))
    return "".join(page_texts)

# Function to clean text by removing invalid characters
def clean_text(text):
    """Return ``text`` with each U+FFFD replacement character turned into a space."""
    # Splitting on the marker and re-joining with a space swaps every
    # occurrence one-for-one.
    invalid_marker = '\ufffd'
    pieces = text.split(invalid_marker)
    return ' '.join(pieces)

# Function to save text to a file
def save_text_to_file(text, file_path):
    """Write ``text`` to ``file_path`` as UTF-8, replacing any existing file."""
    out_handle = open(file_path, 'w', encoding='utf-8')
    try:
        out_handle.write(text)
    finally:
        # Always release the handle, even if the write raises.
        out_handle.close()

In [None]:

# Configure the input/output paths, then run the pipeline:
# OCR the PDF -> replace U+FFFD characters -> strip diacritics -> save.
pdf_path = 'history_book.pdf'  # Replace with your PDF file path
txt_path = 'output.txt'  # Replace with your desired output file path


# Extract text from the PDF (runs OCR on every page, so this is the slow step)
extracted_text = extract_text_from_pdf(pdf_path)
# Clean the extracted text (U+FFFD replacement characters become spaces)
cleaned_text = clean_text(extracted_text)
# Remove diacritics from the cleaned text
final_text = remove_diacritics(cleaned_text)
# Save the final text to a UTF-8 file (an existing file is overwritten)
save_text_to_file(final_text, txt_path)