In [None]:
import os
import numpy as np
import cv2
from scipy.signal import find_peaks, peak_prominences
import matplotlib.pyplot as plt
from google.colab.patches import cv2_imshow
from IPython.display import clear_output
from google.colab import files

%matplotlib inline  
# if you are running this code in Jupyter notebook

In [None]:
# Mount the user's Google Drive inside the Colab runtime at /content/drive.
# The directory listing in the next cell reads from a path under this mount
# point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Names of the entries in the book's page directory on the mounted Drive.
# os.listdir returns bare names (no directory prefix) in arbitrary order.
# NOTE(review): the word-segmentation cell further down rebinds `filenames`
# to the contents of /content/lines, and nothing in the visible cells reads
# this listing before that happens - confirm whether it is still needed.
filenames=os.listdir('/content/drive/MyDrive/KITAB Books/Shamela_0037716/Pages/Shamela_0037716.pdf_dir/')

In [None]:
class cropm:
    """Crop a page image down to the bounding box of its largest content blob."""

    def __init__(self):
        pass

    def cropimage(self, image):
        """Return the region of ``image`` covered by its largest content contour.

        Steps: Otsu-threshold the image (inverted, so dark content becomes
        255), erase long horizontal strokes, dilate vertically so that the
        remaining content merges into blobs, then crop to the bounding
        rectangle of the blob with the largest contour area.

        Parameters
        ----------
        image : numpy.ndarray
            Image to crop. The notebook passes an image read with
            ``cv2.imread(path, 0)``; Otsu thresholding in OpenCV expects an
            8-bit single-channel array. The input is not modified.

        Returns
        -------
        numpy.ndarray
            A slice of a private copy of ``image``. If no contour is found
            the whole (uncropped) copy is returned.

        Notes
        -----
        Changes relative to the previous implementation:

        * The contour list used to be truncated with ``[:-1]``, which threw
          away the smallest contour. When zero or one contour was found the
          result variable was never assigned and the method raised
          ``UnboundLocalError``. The largest contour is now selected
          directly and the no-contour case falls back to the full image.
        * A debug rectangle used to be drawn onto the caller's ``image``;
          that mutation of the input has been removed.
        * A Gaussian blur was computed but never used. It has been dropped;
          thresholding is still applied to the unblurred input, so the
          segmentation result itself is unchanged.
        """
        original = image.copy()
        thresh = cv2.threshold(
            image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )[1]

        # Remove horizontal lines: a morphological opening with a wide, flat
        # kernel keeps only long horizontal strokes, which are then painted
        # out (filled with 0) in the thresholded image.
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
        detected_lines = cv2.morphologyEx(
            thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=1
        )
        line_contours = cv2.findContours(
            detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        # findContours returns 2 values in some OpenCV versions and 3 in
        # others; pick the contour list in either case.
        line_contours = (
            line_contours[0] if len(line_contours) == 2 else line_contours[1]
        )
        for contour in line_contours:
            cv2.drawContours(thresh, [contour], -1, 0, -1)

        # Dilate with a tall, narrow kernel so vertically stacked content
        # merges into a single contour.
        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 25))
        dilate = cv2.dilate(thresh, vertical_kernel, iterations=3)

        # Find the external contours and crop to the largest one.
        content_contours = cv2.findContours(
            dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )[-2]
        if len(content_contours) == 0:
            # Nothing to crop to (e.g. a blank page): return the image as is.
            return original

        largest = max(content_contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(largest)
        return original[y:y + h, x:x + w]
     

In [None]:
def directionalHistogram(img, direction='H'):
    """Count the white (value 255) pixels along each row or column of an image.

    Parameters
    ----------
    img : numpy.ndarray
        2-D image array. Only pixels exactly equal to 255 are counted.
    direction : str, optional
        ``'H'`` (default) returns one count per row; any other value returns
        one count per column.

    Returns
    -------
    list of int
        ``img.shape[0]`` entries for ``'H'``, ``img.shape[1]`` entries
        otherwise.

    Raises
    ------
    ValueError
        If ``img`` is not 2-dimensional.

    Notes
    -----
    The previous implementation looped over ``range(n - 1)`` on both axes,
    so the last row and the last column of the image were never counted and
    the histogram was one entry short. Every row and column is now covered,
    and the per-pixel Python loop is replaced by a vectorized count.
    """
    if img.ndim != 2:
        raise ValueError(
            "directionalHistogram expects a 2-D image, got %d dimensions" % img.ndim
        )

    # axis=1 collapses the columns (one count per row); axis=0 collapses the
    # rows (one count per column).
    axis = 1 if direction == 'H' else 0
    return np.count_nonzero(img == 255, axis=axis).tolist()

##############################################################

def smoothHist(hist,kernel_size):
    """Smooth a histogram with a moving average of ``kernel_size`` samples.

    The histogram is convolved with a uniform kernel whose weights sum to 1.
    ``mode='same'`` is used, so the output has as many samples as ``hist``;
    near the ends the window hangs over the edge and the missing samples
    count as zero.
    """
    box = np.ones(kernel_size)
    box = box / kernel_size
    smoothed = np.convolve(hist, box, mode='same')
    return smoothed

##############################################################

def thresholding(image, threshold, typee='Binary', param1=0, param2=0):
    """Apply inverse thresholding to a grey-scale image.

    Parameters
    ----------
    image : numpy.ndarray
        Grey-scale image to threshold.
    threshold : number
        Cut-off used by the simple binary mode; ignored in adaptive mode.
    typee : str, optional
        ``'binary'`` (case-insensitive) selects a fixed-threshold inverse
        binary threshold. Any other value selects adaptive Gaussian
        thresholding, also inverted.
    param1 : int, optional
        Adaptive mode only: size of the local region (preferably odd).
    param2 : number, optional
        Adaptive mode only: constant passed to ``cv2.adaptiveThreshold``
        alongside the local region size.

    Returns
    -------
    numpy.ndarray
        The thresholded image, with 255 as the maximum output value.
    """
    if typee.lower() != 'binary':
        return cv2.adaptiveThreshold(
            image,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV,
            param1,
            param2,
        )

    _, binary = cv2.threshold(image, threshold, 255, cv2.THRESH_BINARY_INV)
    return binary

##############################################################

def peakinterp(interp_factor, hist, prominence_factor):
    """Upsample a histogram by linear interpolation and locate its peaks.

    Parameters
    ----------
    interp_factor : int
        Number of interpolated samples per original sample.
    hist : sequence of numbers
        Histogram to resample.
    prominence_factor : number
        A peak is kept only if its prominence is at least
        ``max(interpolated histogram) / prominence_factor``.

    Returns
    -------
    tuple
        ``(peaks, hist_interp, resampled_pixel_space, Original_pixel_space)``:
        peak indices into the interpolated histogram, the interpolated
        histogram, the resampled sample positions (in original-pixel units)
        and the original sample positions.

    Notes
    -----
    Peaks must also be at least 50 interpolated samples wide; that width is
    fixed inside the function.
    """
    n_original = len(hist)
    n_resampled = interp_factor * n_original

    # Sample positions 0, 1/f, 2/f, ... expressed in original-pixel units.
    resampled_pixel_space = np.linspace(0, n_resampled - 1, n_resampled) * (1 / interp_factor)
    Original_pixel_space = np.linspace(0, n_original - 1, n_original)

    hist_interp = np.interp(resampled_pixel_space, Original_pixel_space, hist)

    min_prominence = np.max(hist_interp) / prominence_factor
    peaks, _ = find_peaks(hist_interp, prominence=min_prominence, width=50)

    return (peaks, hist_interp, resampled_pixel_space, Original_pixel_space)

  ##############################################################

def findGradSignChange(hist_interp, resampled_pixel_space, Original_pixel_space):
    """Return the sign of the first derivative of a histogram as a 0/1 vector.

    Parameters
    ----------
    hist_interp : array-like
        Interpolated intensity histogram.
    resampled_pixel_space, Original_pixel_space
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        Same length as ``hist_interp``: 1 where the gradient is >= 0 and
        0 where it is negative.

    Notes
    -----
    The previous implementation ignored ``hist_interp`` and differentiated
    the module-level variable ``hist_horizontal_smooth_interp`` instead, so
    the function only worked when that global happened to exist and always
    described that one histogram regardless of the argument passed in.
    """
    hist_grad = np.gradient(hist_interp)
    hist_grad_sign_change = np.where(hist_grad >= 0, 1, 0)
    return hist_grad_sign_change

   ##############################################################

def rle(ia):
    """Run-length encode a sequence of values.

    Parameters
    ----------
    ia : array-like
        1-D sequence (list, tuple or numpy array), e.g. the 0/1 vector that
        describes the sign of a histogram's first derivative.

    Returns
    -------
    tuple
        ``(run_lengths, start_positions, values)`` as numpy arrays, with one
        entry per run of repeated values. For an empty input the result is
        ``(None, None, None)``.

    Notes
    -----
    The input is now converted with ``np.asarray`` first. Without that, a
    plain Python list made ``ia[1:] != ia[:-1]`` a single bool instead of an
    element-wise comparison, and the final fancy-indexing step failed.
    """
    ia = np.asarray(ia)
    n = len(ia)
    if n == 0:
        return (None, None, None)

    y = ia[1:] != ia[:-1]                # True where a value differs from its predecessor
    i = np.append(np.where(y), n - 1)    # index of the last element of every run
    z = np.diff(np.append(-1, i))        # run lengths
    p = np.cumsum(np.append(0, z))[:-1]  # start position of every run
    return (z, p, ia[i])
 ##############################################################

def cutPositions(runlengths, startpositions, values, threshold,interp_factor):
  #Give a vector of ones and zeroes representing the sign change of 1st deriv. of
  # a histogram, this function smoothes out the abrupt changes in gradient sign
  # which might be an artifact of the gradient calculation.

  # This function also gives an estimation of the possible cutting locations to
  # extract lines

  viable_index=0
  for i in range(len(runlengths)):
    current_length=runlengths[i]
    if(current_length

#create the directory that will hold your line images
!mkdir lines

# Loop over the paragraph images.
# NOTE(review): range(1) runs the body exactly once, `m` is never used inside
# it, and the input path below is hard-coded, so a single image is processed.
# cutPositions, optimalThreshold and cropImageToLines are not (fully) visible
# in this part of the notebook; what they do is described here only as far as
# the original comments state it.
for m in range(1):

    # Read the paragraph image (flag 0, i.e. grey-scale), crop it with cropm
    # and apply an inverse binary threshold at 240.
    image = cv2.imread('/content/final pre bleedthru2.png',0)
    # Alternative input kept for reference:
    #/content/drive/MyDrive/KITAB Books/Shamela_0037716/Pages/Shamela_0037716.pdf_dir/51_Shamela_0037716.pdf.jpg
    c = cropm()
    image = c.cropimage(image)
    # w and h are not read again inside this loop.
    (w,h) = image.shape
    thresh1=thresholding(image, 240, typee='Binary', param1=0, param2=0)
 

    # Compute the horizontal (per-row) histogram of the thresholded image and
    # smooth it with a 17-sample moving average.
    hist_horizontal=directionalHistogram(thresh1)
    hist_horizontal_smooth=smoothHist(hist_horizontal,17)

    # Interpolate the smoothed histogram by a factor of interp_factor and find
    # its peaks (prominence factor 8), then compute the 0/1 sign vector of its
    # first derivative.
    init_threshold=50
    interp_factor=100
    (peaks,hist_horizontal_smooth_interp,resampled_pixel_space, Original_pixel_space)=peakinterp(interp_factor, hist_horizontal_smooth, 8)
    hist_grad_sign_change=findGradSignChange(hist_horizontal_smooth_interp, resampled_pixel_space, Original_pixel_space)

    # Run-length encode the sign vector to obtain the piecewise-constant
    # function approximating the sign changes of the histogram's first
    # derivative, then estimate candidate cut positions from it.
    runlengths, startpositions, values =rle(hist_grad_sign_change)
    (cutpos, new_hist)=cutPositions(runlengths, startpositions, values, init_threshold, interp_factor)


    # Remove undesired sign changes from the piecewise function which are the
    # result of noise or numerical artifacts rather than the desired peaks.
    cutpos, new_hist=optimalThreshold(cutpos, runlengths, startpositions, values, new_hist, peaks, 50, 100)
    # Extract the line images from the thresholded image at the (integer) cut
    # positions and write each one to /content/lines.
    # NOTE(review): the lines are saved as .jpg; a later cell reads them back
    # and counts pixels exactly equal to 255, so confirm that a lossy format
    # is acceptable here.
    lines= cropImageToLines(cutpos.astype(int), thresh1)
    for i in range(len(lines)):
        cv2.imwrite(f"/content/lines/img{i}.jpg", lines[i])
    


#downloading the line image directory in zipped format
#!zip -r /content/files_segmented.zip /content/files_segmented
#files.download("/content/files_segmented.zip")

In [None]:
!zip -r /content/linesB.zip /content/lines

In [None]:
def directionalHistogram(img, direction='H'):
    """Count the white (value 255) pixels along each row or column of an image.

    This cell redefines ``directionalHistogram``, which is also defined in an
    earlier cell of this notebook; whichever cell ran last is the one in use.

    Parameters
    ----------
    img : numpy.ndarray
        2-D image array. Only pixels exactly equal to 255 are counted.
    direction : str, optional
        ``'H'`` (default) returns one count per row; any other value returns
        one count per column.

    Returns
    -------
    list of int
        ``img.shape[0]`` entries for ``'H'``, ``img.shape[1]`` entries
        otherwise.

    Raises
    ------
    ValueError
        If ``img`` is not 2-dimensional.

    Notes
    -----
    The previous implementation looped over ``range(n - 1)`` on both axes,
    so the last row and the last column of the image were never counted and
    the histogram was one entry short. Every row and column is now covered,
    and the per-pixel Python loop is replaced by a vectorized count.
    """
    if img.ndim != 2:
        raise ValueError(
            "directionalHistogram expects a 2-D image, got %d dimensions" % img.ndim
        )

    # axis=1 collapses the columns (one count per row); axis=0 collapses the
    # rows (one count per column).
    axis = 1 if direction == 'H' else 0
    return np.count_nonzero(img == 255, axis=axis).tolist()

def cropLineToWords(viable_sequences, image):
    """Cut a line image into vertical strips at the given column positions.

    Parameters
    ----------
    viable_sequences : sequence of int
        Column cut positions in left-to-right order. Each pair of
        consecutive positions ``(a, b)`` produces the strip
        ``image[:, a:b]``.
    image : numpy.ndarray
        2-D line image.

    Returns
    -------
    list of numpy.ndarray
        One strip per consecutive pair of cut positions. With a single cut
        position, one strip running from that column to the right edge of
        the image. With no cut positions, an empty list.

    Notes
    -----
    Two defects of the previous implementation are fixed:

    * Strips were sliced as ``image[0:w-1, ...]``, which dropped the last
      pixel row of every word. Strips now keep the full image height.
    * The single-cut-position branch used ``len(viable_sequences)`` (the
      number of cut positions) as the right-hand column bound; it now uses
      the image width.
    """
    (rows, cols) = image.shape

    if len(viable_sequences) == 1:
        return [image[:, viable_sequences[0]:cols]]

    return [
        image[:, viable_sequences[k - 1]:viable_sequences[k]]
        for k in range(1, len(viable_sequences))
    ]

def removeSpaces(words):
    """Return only the word images that contain at least one pixel above 0.

    Each entry of ``words`` is indexed as a 2-D array; entries whose pixels
    are all 0 are dropped. The order of the remaining entries is preserved
    and the arrays themselves are not copied.
    """
    return [word for word in words if np.sum(word[:, :] > 0)]

In [None]:
#make a directory that will hold the output words
!mkdir words

#loop over all the line images in the line images folder
#(sorted so that the processing order is the same on every run)
filenames = sorted(os.listdir('/content/lines'))
for m in range(len(filenames)):

  #process the m-th line image. The previous version always read filenames[2]
  #and ended with an unconditional break, so only one hard-coded line was
  #ever segmented even though the loop is meant to cover every line.
  filename=filenames[m]
  path='/content/lines/'+filename
  line_name=os.path.splitext(filename)[0]
  words=[]

  #read the line image in grey-scale
  img=cv2.imread(path, 0)
  if img is None:
    #cv2.imread returns None for entries it cannot decode; skip those
    continue
  #get dimensions of the image: w = number of rows, h = number of columns
  (w,h) = img.shape
  #count the white (255) pixels in every column of the line image
  hist_vertical=directionalHistogram(img, direction='V')
  #find the columns where the vertical histogram is zero (background spaces between words)
  zero_sites=np.where(np.asarray(hist_vertical)==0)[0]

  sequences=[]
  sequence_start=0

  #get the start and end of zero sequences in the vertical histogram
  for i in range(1,len(zero_sites)):
    last_zero=zero_sites[i-1]
    current_zero=zero_sites[i]
    if(current_zero!=last_zero+1):
      #gap between zero columns: close the open run and start a new one
      sequences.append([sequence_start,last_zero])
      sequence_start=current_zero
    elif(i==len(zero_sites)-1):
      #the last zero column continues the open run: close it. The previous
      #version first overwrote sequence_start with sequence_end, which was
      #unset when no run had been closed yet (NameError) and otherwise made
      #the final run start at the end of the previous one.
      sequences.append([sequence_start,current_zero])

  sequence_lengths=[seq[1]-seq[0]+1 for seq in sequences]

  #Threshold the size of the zero sequences (whether it is big enough to consider it as
  # an interword spacing or small enough to consider as intraword spacing)
  if(sequence_lengths):
    average_sequence_length=np.sum(sequence_lengths[1:len(sequence_lengths)-1])/len(sequence_lengths)
  else:
    #no zero runs at all: avoid dividing by zero; the whole line becomes one word
    average_sequence_length=0
  overlap_factor=0.75*average_sequence_length
  viable_sequences=[]
  viable_sequences_unrolled=[]

  for i in range(len(sequences)):
      if(sequence_lengths[i]>=average_sequence_length-overlap_factor):
        viable_sequences.append(sequences[i])
        viable_sequences_unrolled.append(sequences[i][0])
        viable_sequences_unrolled.append(sequences[i][1])

  #close the last strip at the right edge of the image. The previous version
  #appended -1, which as a slice bound silently dropped the last column.
  viable_sequences_unrolled.append(h)
  if(viable_sequences_unrolled[0]!=0):
        viable_sequences_unrolled=[0]+viable_sequences_unrolled
  words.append(cropLineToWords(viable_sequences_unrolled, img))

  ordered_words=[]

  #replace the spaces (strips whose pixels are all 0) by the marker 'space'
  for i in range(len(words[0])):
    word=words[0][i]
    #named pixel_total rather than sum so the builtin sum() is not shadowed
    #for the rest of the notebook
    pixel_total=np.sum(word[:,:])
    if(pixel_total):
      ordered_words.append(word)
    else:
      ordered_words.append('space')

  #save word images to the word directory; the line name is part of the file
  #name so that words from different lines do not overwrite each other
  for i in range(len(ordered_words)):
      if(not type(ordered_words[i]) is str):
        cv2.imwrite(f"/content/words/{line_name}_word{i}.jpg", ordered_words[i])

#download word directory as a zipped folder
# !zip -r /content/words.zip /content/words 
# files.download("/content/words.zip") 