# Line Segmentation

## OpenCV

In [36]:
import cv2
import numpy as np
from pathlib import Path
import os
from pdf2image import convert_from_path

def segment_lines_arabic(image, min_line_height=20, gap_threshold=0.7):
    """
    Locate printed text lines on a scanned page via a horizontal projection.

    The page is binarized with Otsu's method (inverted, so dark text becomes
    foreground), the foreground pixels of every row are counted, and runs of
    consecutive rows whose normalized count exceeds 0.1 are grouped into line
    bands. Bands separated by a small gap are then merged.

    Args:
        image: Page image as an array. A 2-D array is used as grayscale as is;
            anything else is converted with ``cv2.COLOR_RGB2GRAY``.
        min_line_height: Minimum height, in rows, for a band to be kept.
        gap_threshold: Two consecutive bands are merged when the gap between
            them is smaller than ``gap_threshold`` times the height of the
            earlier (possibly already merged) band.

    Returns:
        List of ``(x0, y0, x1, y1)`` tuples ordered top to bottom. Every box
        spans the full page width (``x0 == 0``, ``x1 == width``) and ``y1`` is
        the first row after the band.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    # Validate before calling OpenCV: a None check placed after cvtColor can
    # never fire because the conversion itself would already have failed.
    if image is None:
        raise ValueError(f"Could not read image {image}")

    image = np.asarray(image)
    img = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Inverted Otsu binarization: text pixels become 255, background 0.
    _, bw = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Horizontal projection: number of foreground pixels in each row.
    proj = np.sum(bw // 255, axis=1)

    # Scale to [0, 1]; the epsilon avoids dividing by zero on a uniform page.
    norm = (proj - proj.min()) / (proj.max() - proj.min() + 1e-6)

    # Rows likely to contain text (tweak the 0.1 cut-off if needed).
    text_rows = norm > 0.1

    # Pad with False on both sides so a band touching the top or bottom edge
    # still produces a rising (+1) and a falling (-1) edge.
    padded = np.concatenate(([False], text_rows, [False])).astype(np.int8)
    edges = np.diff(padded)
    starts = np.flatnonzero(edges == 1)
    ends = np.flatnonzero(edges == -1)
    lines = [
        (int(s), int(e))
        for s, e in zip(starts, ends)
        if e - s >= min_line_height
    ]

    # Convert to bounding boxes spanning the full page width.
    h, w = bw.shape
    bboxes = [(0, s, w, e) for (s, e) in lines]

    # Merge bands whose separating gap is small relative to the height of the
    # band accumulated so far.
    merged = []
    prev = None
    for bbox in bboxes:
        if prev is None:
            prev = bbox
            continue
        _, prev_s, _, prev_e = prev
        _, cur_s, _, cur_e = bbox
        gap = cur_s - prev_e
        if gap < gap_threshold * (prev_e - prev_s):
            prev = (0, prev_s, w, cur_e)
        else:
            merged.append(prev)
            prev = bbox
    if prev is not None:
        merged.append(prev)

    return merged

In [32]:
file_path = '../examples/ifadatul-mustafid-1-page.pdf'
doc = convert_from_path(file_path)
offset = 20  # padding, in pixels, added around every detected line box

# The output directory depends only on the input file name, so it is created
# once up front instead of being recomputed for every page.
ot_path = f"output/{file_path.split('/')[-1].split('.')[0] + '_lines'}"
os.makedirs(ot_path, exist_ok=True)

for page_number, page_data in enumerate(doc):
    print("Processing page number - ", page_number)

    page_data = np.array(page_data)
    page_height, page_width, _ = page_data.shape
    line_boxes = segment_lines_arabic(page_data)

    # Crops are saved in grayscale.
    page_data = cv2.cvtColor(page_data, cv2.COLOR_RGB2GRAY)
    for idx, (x0, y0, x1, y1) in enumerate(line_boxes):
        # Grow the box by `offset` on every side, clamped to the page bounds.
        y0_off = max(0, y0 - offset)
        y1_off = min(page_height, y1 + offset)
        x0_off = max(0, x0 - offset)
        x1_off = min(page_width, x1 + offset)

        line_img = page_data[y0_off:y1_off, x0_off:x1_off]
        # The page number is part of the file name so that the lines of one
        # page do not overwrite those of another page of the same PDF.
        cv2.imwrite(
            f"{ot_path}/page_{page_number:03d}_line_{idx:03d}.png", line_img
        )


Processing page number -  0


## SciPy

In [41]:
from scipy.signal import find_peaks, peak_prominences


def directionalHistogram(img, direction='H'):
    """
    Count the pixels equal to 255 along every row or every column of an image.

    Every row and every column of the image takes part in the count.

    Args:
        img: Two-dimensional image (array-like). Only pixels whose value is
            exactly 255 are counted.
        direction: 'H' yields one count per row; any other value yields one
            count per column.

    Returns:
        A list of ints: one entry per row for 'H', one per column otherwise.

    Raises:
        ValueError: If ``img`` is not two-dimensional.
    """
    pixels = np.asarray(img)
    if pixels.ndim != 2:
        raise ValueError(
            f"expected a 2-D image, got an array with {pixels.ndim} dimension(s)"
        )

    # Summing over axis 1 collapses the columns (per-row counts); summing over
    # axis 0 collapses the rows (per-column counts).
    axis = 1 if direction == 'H' else 0
    return np.count_nonzero(pixels == 255, axis=axis).tolist()

##############################################################

def smoothHist(hist,kernel_size):
  """
  Smooth a histogram with a moving average.

  Args:
      hist: Sequence of histogram values.
      kernel_size: Number of samples in the averaging window.

  Returns:
      NumPy array with the same length as ``hist`` (``mode='same'``
      convolution with a uniform window of ``kernel_size`` samples).
  """
  # Uniform window whose weights add up to one.
  box_filter = np.ones(kernel_size)
  box_filter /= kernel_size
  return np.convolve(hist, box_filter, mode='same')

##############################################################

def thresholding(image, threshold, typee='Binary', param1=0, param2=0):
  """
  Threshold a grey-scale image with OpenCV using the inverted-binary mode.

  Args:
      image: Grey-scale image passed straight to OpenCV.
      threshold: Threshold value for the simple binary mode; not used by the
          adaptive mode.
      typee: 'Binary' (case-insensitive) selects ``cv2.threshold``; any other
          value selects ``cv2.adaptiveThreshold`` with the Gaussian method.
      param1: Adaptive mode only; forwarded as the local region size
          (preferably an odd number).
      param2: Adaptive mode only; forwarded as the constant that OpenCV
          combines with the local weighted mean.

  Returns:
      The thresholded image returned by OpenCV (maximum value 255).
  """
  if typee.lower() != 'binary':
    return cv2.adaptiveThreshold(
        image,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        param1,
        param2,
    )

  # cv2.threshold returns a pair; only the thresholded image is of interest.
  _, mask = cv2.threshold(image, threshold, 255, cv2.THRESH_BINARY_INV)
  return mask

##############################################################

def peakinterp(interp_factor, hist, prominence_factor, min_width=50):
  """
  Upsample a histogram by linear interpolation and locate its peaks.

  Args:
      interp_factor: Integer upsampling factor; the interpolated histogram has
          ``interp_factor * len(hist)`` samples.
      hist: Histogram values, one per original pixel position.
      prominence_factor: A peak is kept only if its prominence is at least
          ``max(interpolated histogram) / prominence_factor``.
      min_width: Minimum peak width, in interpolated samples, handed to
          ``find_peaks``. Defaults to 50.

  Returns:
      Tuple ``(peaks, hist_interp, resampled_pixel_space, Original_pixel_space)``:
      peak indices into ``hist_interp``, the interpolated histogram, the
      positions (in original pixel units) it was sampled at, and the original
      sample positions ``0 .. len(hist) - 1``.
  """
  n_samples = interp_factor * len(hist)
  # Sample positions spaced 1/interp_factor apart, in original pixel units.
  resampled_pixel_space = np.linspace(0, n_samples - 1, n_samples) * (1 / interp_factor)
  Original_pixel_space = np.linspace(0, len(hist) - 1, len(hist))
  hist_interp = np.interp(resampled_pixel_space, Original_pixel_space, hist)

  # Only the peak indices are needed; the properties dict is discarded.
  peaks, _ = find_peaks(
      hist_interp,
      prominence=np.max(hist_interp) / prominence_factor,
      width=min_width,
  )

  return (peaks, hist_interp, resampled_pixel_space, Original_pixel_space)

  ##############################################################

def findGradSignChange(hist_interp, resampled_pixel_space, Original_pixel_space):
    """
    Return the sign pattern of the first derivative of a histogram.

    Args:
        hist_interp: Interpolated histogram to differentiate.
        resampled_pixel_space: Not used; kept for interface compatibility.
        Original_pixel_space: Not used; kept for interface compatibility.

    Returns:
        Integer array of the same length as ``hist_interp`` holding 1 where
        the gradient is non-negative and 0 where it is negative.
    """
    # Differentiate the histogram that was passed in rather than relying on a
    # module-level variable of a particular name being defined.
    hist_grad = np.gradient(hist_interp)
    hist_grad_sign_change = np.where(hist_grad >= 0, 1, 0)
    return hist_grad_sign_change

   ##############################################################

def rle(ia):
    """
    Run-length encode a one-dimensional sequence.

    Useful for the 0/1 vector describing the sign of the first derivative of
    an image intensity histogram.

    Args:
        ia: One-dimensional sequence (list or NumPy array) of values.

    Returns:
        Tuple ``(run_lengths, start_positions, run_values)`` of NumPy arrays,
        one entry per run of equal consecutive values, or
        ``(None, None, None)`` when the input is empty.
    """
    # Coerce to an array so that element-wise comparison and fancy indexing
    # also work when a plain list is passed.
    ia = np.asarray(ia)
    n = len(ia)
    if n == 0:
        return (None, None, None)

    changes = ia[1:] != ia[:-1]                              # True where the next value differs
    run_ends = np.append(np.flatnonzero(changes), n - 1)     # last index of every run
    run_lengths = np.diff(np.append(-1, run_ends))
    run_starts = np.cumsum(np.append(0, run_lengths))[:-1]
    return (run_lengths, run_starts, ia[run_ends])
 ##############################################################

def cutPositions(runlengths, startpositions, values, threshold,interp_factor):
  """
  Suppress short runs in a gradient-sign encoding and derive cut positions.

  Args:
      runlengths: Length of every run in the run-length encoded sign vector.
      startpositions: Start index of every run.
      values: Value (0 or 1) of every run. MODIFIED IN PLACE: each run shorter
          than ``threshold`` takes over the value of the run before it (the
          first run keeps its own value).
      threshold: Runs shorter than this are treated as noise.
      interp_factor: Accepted for interface compatibility; not used here.

  Returns:
      Tuple ``(cutpos, new_hist)``. ``cutpos`` lists the start position of
      every run whose value is 1 while the preceding run's value is 0, plus a
      leading 0 when the first run has value 1 and the second run does not
      start such a 0-to-1 transition. ``new_hist`` is a flat list of floats
      holding, for every run, ``runlength - 1`` copies of the run's
      (smoothed) value.
  """
  # Pass 1: overwrite noisy (short) runs with the value of their predecessor.
  prev_idx = 0
  for idx, run in enumerate(runlengths):
    if run < threshold:
      values[idx] = values[prev_idx]
    prev_idx = idx

  # Pass 2: expand the smoothed runs into a piecewise-constant signal.
  new_hist = []
  for idx in range(len(startpositions)):
    make_fill = np.ones if values[idx] else np.zeros
    new_hist.extend(make_fill(runlengths[idx] - 1).tolist())

  # Pass 3: a cut is placed wherever the sign switches from 0 to 1.
  cutpos = []
  for idx in range(1, len(startpositions)):
    before, after = values[idx - 1], values[idx]
    if before == 0 and after == 1:
      cutpos.append(startpositions[idx])
    elif before == 1 and idx == 1:
      # The signal starts with value 1: cut at the very beginning.
      cutpos.append(0)

  return (cutpos, new_hist)

######################################################
def optimalThreshold(cutpos, runlengths, startpositions, values, new_hist, peaks, init_threshold, interp_factor):
  """
  Raise the noise threshold until there are as many cuts as detected peaks.

  Short runs of the gradient-sign vector are removed with a run-length
  threshold before cut positions are derived. The threshold is increased in
  steps of ``interp_factor`` until ``cutPositions`` yields exactly
  ``len(peaks)`` cuts.

  Args:
      cutpos: Cut positions obtained with the initial threshold.
      runlengths, startpositions, values: Run-length encoding of the
          gradient-sign vector, forwarded to ``cutPositions`` (which modifies
          ``values`` in place).
      new_hist: Piecewise-constant signal obtained with the initial threshold.
      peaks: Peaks detected in the histogram; only their count is used.
      init_threshold: Threshold the search starts from.
      interp_factor: Step by which the threshold is raised, and the factor
          the final cut positions are divided by. Assumed to be positive.

  Returns:
      Tuple ``(cutpos, new_hist)`` where ``cutpos`` is a NumPy array of cut
      positions divided by ``interp_factor``.

  Warns:
      RuntimeWarning: If no threshold produces ``len(peaks)`` cuts; the result
          of the last attempt is returned in that case.
  """
  import warnings

  # Once the threshold exceeds the longest run, every run counts as noise and
  # raising it further cannot change the outcome, so the search stops there
  # instead of looping forever.
  longest_run = np.max(runlengths) if len(runlengths) else 0

  while len(cutpos) != len(peaks):
      init_threshold = init_threshold + interp_factor
      (cutpos, new_hist) = cutPositions(runlengths, startpositions, values, init_threshold, interp_factor)
      if init_threshold > longest_run and len(cutpos) != len(peaks):
          warnings.warn(
              f"optimalThreshold: found {len(cutpos)} cut position(s) for "
              f"{len(peaks)} peak(s); no threshold makes them match",
              RuntimeWarning,
              stacklevel=2,
          )
          break

  (cutpos, new_hist) = cutPositions(runlengths, startpositions, values, np.abs(init_threshold - interp_factor), interp_factor)
  # Convert from interpolated sample indices back to original pixel units.
  cutpos = np.array(cutpos) / interp_factor

  return (cutpos, new_hist)

###################################################

def cropImageToLines(cutpos, image, direction='H'):
  """
  Slice an image into consecutive strips at the given cut positions.

  Strip ``k`` runs from ``cutpos[k]`` up to (not including) ``cutpos[k + 1]``;
  the last strip runs from the last cut to the edge of the image. Strips are
  returned in the order of ``cutpos`` and empty strips are skipped.

  Args:
      cutpos: Cut positions (row indices for 'H', column indices otherwise),
          expected in ascending order. Values are truncated to int.
      image: Image array; the first two axes are rows and columns.
      direction: 'H' cuts the image into horizontal strips (full width); any
          other value cuts it into vertical strips (full height).

  Returns:
      List of array slices of ``image``; empty when ``cutpos`` is empty.
  """
  bounds = [int(pos) for pos in cutpos]
  if not bounds:
    return []

  horizontal = direction == 'H'
  # Close the last strip at the image border along the cutting axis.
  bounds.append(image.shape[0] if horizontal else image.shape[1])

  cropped_images = []
  for begin, end in zip(bounds, bounds[1:]):
    if end <= begin:
      # Nothing between these two cuts.
      continue
    if horizontal:
      cropped_images.append(image[begin:end, :])
    else:
      cropped_images.append(image[:, begin:end])

  return cropped_images

In [47]:
file_path = '../examples/ifadatul-mustafid-1-page.pdf'
doc = convert_from_path(file_path)
offset = 20

# Base name of the input file (no directory, no extension); it names the
# output folder.
file_name = file_path.split('/')[-1].split('.')[0]

# The output folder depends only on the input file, so create it once.
ot_path = f"output/{file_name + '_lines'}"
os.makedirs(ot_path, exist_ok=True)

# Settings shared by every step of the cut-position search:
#   init_threshold - starting run-length threshold for removing noisy sign changes
#   interp_factor  - upsampling factor of the histogram (also the search step)
init_threshold = 50
interp_factor = 100

for page_number, page_data in enumerate(doc):
    print("Processing page number - ", page_number)

    image = np.array(page_data)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    thresh1 = thresholding(image, 240, typee='Binary', param1=0, param2=0)

    # Horizontal histogram of the binarized page, smoothed with a 17-sample window.
    hist_horizontal = directionalHistogram(thresh1)
    hist_horizontal_smooth = smoothHist(hist_horizontal, 17)

    # Peak locations of the smoothed, interpolated histogram.
    (peaks, hist_horizontal_smooth_interp, resampled_pixel_space, Original_pixel_space) = peakinterp(interp_factor, hist_horizontal_smooth, 8)
    hist_grad_sign_change = findGradSignChange(hist_horizontal_smooth_interp, resampled_pixel_space, Original_pixel_space)

    # Piecewise-constant approximation of the sign of the histogram's first derivative.
    runlengths, startpositions, values = rle(hist_grad_sign_change)
    (cutpos, new_hist) = cutPositions(runlengths, startpositions, values, init_threshold, interp_factor)

    # Remove sign changes caused by noise or numerical artifacts rather than real peaks.
    cutpos, new_hist = optimalThreshold(cutpos, runlengths, startpositions, values, new_hist, peaks, init_threshold, interp_factor)

    # Cut the binarized page into line images.
    lines = cropImageToLines(cutpos.astype(int), thresh1)

    for i, line_img in enumerate(lines):
        if line_img.size == 0:
            # An empty crop cannot be written as an image; skip it.
            continue
        # The page number is part of the file name so that pages of the same
        # PDF do not overwrite each other's lines.
        cv2.imwrite(f"{ot_path}/page{page_number}_line{i}.png", line_img)

Processing page number -  0


# OCR

## Tesseract

In [19]:
# Imported in this cell so that it also runs on a fresh kernel; no earlier
# cell brings these names into scope.
from PIL import Image
import pytesseract

# OCR a single line image with Tesseract's Arabic language data.
img = Image.open('../examples/ar_line.png')
ar_text = pytesseract.image_to_string(img, lang="ara")
print(ar_text)

- مثال الأول: أن يقول: والله لا أكلّم أحداء ونوئ زيداء قُصِر



In [None]:
# Run PaddleOCR's TextRecognition model (Arabic PP-OCRv5 mobile) on a single
# line image.
from paddleocr import TextRecognition
model = TextRecognition(model_name="arabic_PP-OCRv5_mobile_rec")
output = model.predict(input="../examples/ar_line.png", batch_size=1)
# Print every result and save it both as an image and as JSON under ./output/.
for res in output:
    res.print()
    res.save_to_img(save_path="./output/")
    res.save_to_json(save_path="./output/res.json")


[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/rizkyagung/.paddlex/official_models/arabic_PP-OCRv5_mobile_rec`.[0m
[32m{'res': {'input_path': '../examples/ar_line.png', 'page_index': None, 'rec_text': 'أّ', 'rec_score': 0.4356656074523926}}[0m


## PaddlePaddle

In [None]:
# Run the PaddleOCR pipeline with the Arabic recognition model on a single
# line image.
from paddleocr import PaddleOCR  

ocr = PaddleOCR(
    text_recognition_model_name="arabic_PP-OCRv5_mobile_rec",
    use_doc_orientation_classify=False, # Disable the document orientation classification model
    use_doc_unwarping=False, # Disable the document unwarping module
    use_textline_orientation=True, # Enable the text line orientation classification model
    device="gpu:0", # Device requested for model inference
)
# Alternative input: a remote sample image.
# result = ocr.predict("https://cdn-uploads.huggingface.co/production/uploads/684ad4f6eb7d8ee8f6a92a3a/lGLRarnLFKJzE_VKOm36T.png")  
result = ocr.predict("../examples/ar_line.png")
# Print every result and save it as an image and as JSON under "output".
for res in result:  
    res.print()  
    res.save_to_img("output")  
    res.save_to_json("output")


[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/rizkyagung/.paddlex/official_models/PP-LCNet_x1_0_textline_ori`.[0m
[33mThe specified device (GPU) is not available! Switching to CPU instead.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/rizkyagung/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[33mThe specified device (GPU) is not available! Switching to CPU instead.[0m
[32mCreating model: ('arabic_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/Users/rizkyagung/.paddlex/official_models/arabic_PP-OCRv5_mobile_rec`.[0m
[33mThe specified device (GPU) is not available! Switching to CPU instead.[0m
[32m{'res': {'input_path': '../examples/a

In [None]:
import os
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfWriter, PdfReader
import io

# Path to the input PDF file. Modify it as needed.
filePath = '../examples/idhah-qawaid-1-page.pdf'

# Convert PDF to images (one image per page)
doc = convert_from_path(filePath)

# Extract file information
path, fileName = os.path.split(filePath)
fileBaseName, fileExtension = os.path.splitext(fileName)

# Set Tesseract OCR command path
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Initialize PDF writer
pdf_writer = PdfWriter()

# List to store OCR text for each page
ocr_text_list = []

# Iterate through each page in the PDF
for page_number, page_data in enumerate(doc):
    print("Processing page number - ", page_number)

    # Perform OCR for Arabic language; the page text is flattened to one line
    arabic_text = pytesseract.image_to_string(page_data, lang="ara")
    arabic_text = arabic_text.replace("\n", " ")
    ocr_text_list.append(arabic_text)

    # Get a searchable PDF from the OCR result
    pdf = pytesseract.image_to_pdf_or_hocr(page_data, extension='pdf', lang="ara")

    # Append the page to the output PDF
    page = PdfReader(io.BytesIO(pdf)).pages[0]
    pdf_writer.add_page(page)

# Write the combined PDF to a file (in the current working directory)
output_pdf_path = f'{fileBaseName}_OCR_combined.pdf'
with open(output_pdf_path, 'wb') as output_pdf:
    pdf_writer.write(output_pdf)

# Write the extracted OCR text to a text file, one labelled section per page
output_text_path = f'{fileBaseName}_Arabic.txt'
with open(output_text_path, 'w', encoding='utf-8') as output_text_file:
    output_text_file.writelines(
        f"Page {page_number + 1}:\n{arabic_text}\n\n"
        for page_number, arabic_text in enumerate(ocr_text_list)
    )

# Print output file paths. The text file holds the OCR output as extracted;
# nothing is translated.
print(f"Combined PDF saved at: {output_pdf_path}")
print(f"Extracted text saved at: {output_text_path}")

Processing page number -  0
Combined PDF saved at: idhah-qawaid-1-page_OCR_combined.pdf
Translated text saved at: idhah-qawaid-1-page_Arabic.txt


## Surya

In [3]:
from PIL import Image
from surya.foundation import FoundationPredictor
from surya.recognition import RecognitionPredictor
from surya.detection import DetectionPredictor

In [4]:
# Page image to run Surya OCR on.
IMAGE_PATH = '../examples/idhah-1.jpg'

image = Image.open(IMAGE_PATH)
# The recognition predictor is built on top of the foundation predictor; the
# detection predictor is created separately and handed over at call time.
foundation_predictor = FoundationPredictor()
recognition_predictor = RecognitionPredictor(foundation_predictor)
detection_predictor = DetectionPredictor()

# Run on a one-image batch; `predictions` is indexed per input image.
predictions = recognition_predictor([image], det_predictor=detection_predictor)
print(predictions)

Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Recognizing Text: 100%|██████████| 22/22 [00:13<00:00,  1.60it/s]

[OCRResult(text_lines=[TextLine(polygon=[[573.0, 230.0], [729.0, 230.0], [729.0, 259.0], [573.0, 259.0]], confidence=0.9774964276482078, text='الأمور بتقاصدها', chars=[TextChar(polygon=[[573.0, 230.0], [573.0, 231.0], [574.0, 231.0], [574.0, 230.0]], confidence=0.9970449805259705, text='', bbox_valid=False, bbox=[573.0, 230.0, 574.0, 231.0]), TextChar(polygon=[[573.0, 230.0], [573.0, 231.0], [574.0, 231.0], [574.0, 230.0]], confidence=0.843285858631134, text='', bbox_valid=False, bbox=[573.0, 230.0, 574.0, 231.0]), TextChar(polygon=[[612.0, 237.0], [690.0, 237.0], [690.0, 251.0], [612.0, 251.0]], confidence=0.9851405620574951, text='ا', bbox_valid=True, bbox=[612.0, 237.0, 690.0, 251.0]), TextChar(polygon=[[612.0, 237.0], [690.0, 237.0], [690.0, 251.0], [612.0, 251.0]], confidence=0.9851405620574951, text='ل', bbox_valid=True, bbox=[612.0, 237.0, 690.0, 251.0]), TextChar(polygon=[[612.0, 237.0], [690.0, 237.0], [690.0, 251.0], [612.0, 251.0]], confidence=0.9851405620574951, text='أ', b




In [6]:
# Text lines recognized in the first (and only) input image.
det_lines = predictions[0].text_lines
# Print the recognized text of every line followed by its bounding box.
for line in det_lines:
    print(line.text)
    print(line.bbox)



الأمور بتقاصدها
[573.0, 230.0, 729.0, 259.0]
تنبيهان
[592.0, 315.0, 699.0, 350.0]
* الأول: ما أوَّلُه من العبادات ذِكْرٌ، وجب اقترانها بكل اللفظ،
[241.0, 375.0, 986.0, 427.0]
وقيل: يكفى بأوَّله.
[840.0, 447.0, 1052.0, 487.0]
_ فمن ذلك: الصلاة، ومعنى اقترانها بكل التكبير أن يوجد جميع
[238.0, 506.0, 985.0, 554.0]
النية المعتبرة عند كل حرفٍ منه. ومعنى الاكتفاء بأوله أنه لا يجب
[237.0, 561.0, 1048.0, 614.0]
استصحابها إلىٰ آخره، واختاره إمام الحرمين والغزالي، وقد تقدُّمْ (١٠).
[298.0, 618.0, 1047.0, 671.0]
ونظير ذلك: نية كناية الطلاق، فإنه يشترط مقارنة النية لجميع اللفظ
[237.0, 689.0, 981.0, 735.0]
علىٰ خلافٍ فيه بين الروضة وغيرها<sup>(۲)</sup>.
[616.0, 752.0, 1047.0, 796.0]
_ ومن ذلك: الوضوء والغسل، فيستحبُّ اقتران النية فيهما مع
[235.0, 814.0, 982.0, 866.0]
التسمية .
[951.0, 892.0, 1043.0, 917.0]
_ ومن ذلك: الإحرام، فينبغي أن يقال بمقارنة النية مع التلبية، وهو
[232.0, 944.0, 982.0, 995.0]
ظاهر كما يفهم من كلامهم وإن لم يصرِّحوا به، قاله السيوطي<math>^{(r)}</math>.
[334.0, 1003.0, 1043.0, 