! gdown --id 1AmismOZjj96BYsqpcLAwHHQRXiI3Uv9S
! tar -xzvf data.tar.gz

! wget 'https://raw.githubusercontent.com/roshan-shaik-ml/Project-Hermes/main/requirements.txt'

%%time
! pip install -r requirements.txt
! pip install PyPDF2
! pip install pdf2image
! apt-get install -y -qq poppler-utils

In [1]:
! pip install PyPDF2



In [2]:
import os
import cv2
import sys
import glob
import time
import torch
import shutil
import PyPDF2
import logging
import numpy as np
from env import PASSKEY
from PyPDF2 import PdfReader # To open PDF
from PIL import Image, ImageFilter # To open images and sharpen
from pdf2image import convert_from_path # Split PDF pages to JPG
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Every location below is derived from the directory the notebook is run from.
_cwd = os.getcwd()

# Input data: a "data" folder that sits one level above the working directory.
parent_directory = os.path.abspath(os.path.join(_cwd, os.pardir))
data_directory = os.path.join(parent_directory, 'data')
punganur_directory = os.path.join(parent_directory, 'data', 'punganur')
punganur_rem_directory = os.path.join(parent_directory, 'data', 'punganur_rem')

# Scratch space and final outputs live next to the notebook itself.
temp_path = os.path.join(_cwd, 'temp')  # probably needs to be deleted
boxes_directory = os.path.join(_cwd, 'temp', 'boxes')
results_directory = os.path.join(_cwd, 'results')

### Extracting text from images (OCR) with the Microsoft TrOCR base printed model

In [4]:
def ocr_print_image(img):

    """
    Perform Optical Character Recognition (OCR) on an image and return the extracted text.

    Args:
        img (PIL.Image): The input image to perform OCR on.

    Returns:
        str: The text decoded from the first generated sequence, with special
            tokens skipped.

    Note:
        This function reads the module-level ``print_processor`` and
        ``print_model`` objects, so both must have been created before it is
        called. The image is converted to RGB, turned into pixel values by the
        processor and passed to the model's ``generate`` method.

    Raises:
        RuntimeError: If the input image cannot be converted to RGB format.
    """
    try:
        # Convert image to RGB format
        pil_image = img.convert('RGB')
    except Exception as e:
        raise RuntimeError("Failed to convert image to RGB format.") from e

    # Generate pixel values using a print processor
    pixel_values = print_processor(images=pil_image, return_tensors="pt").pixel_values

    # Keep the input on the same device as the model weights; this is a no-op
    # while the model is on the CPU and avoids a device-mismatch error once
    # the model is moved to a GPU.
    pixel_values = pixel_values.to(print_model.device)

    # Generate text using a print model
    generated_ids = print_model.generate(pixel_values)

    # Decode the generated text, skipping special tokens
    return print_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [5]:
def get_details(image_path):

    """
    Extract details from an image.

    Args:
        image_path (str): The path to the image file.

    Returns:
        str: A newline-terminated string containing the extracted details in the format:
            "Name: [name], Husband's Name: [husband_name], House Number: [house_number], Age and Gender: [age_gender]\\n"

    Note:
        This function assumes specific regions of interest (ROIs) within the image for extraction,
        given as (left, upper, right, lower) pixel boxes:
        - Name: (0, 100, 75% of image width, 170)
        - Husband's Name: (0, 160, 75% of image width, 220)
        - House Number: (0, 210, 75% of image width, 280)
        - Age and Gender: (0, 270, 70% of image width, 370)
        It uses Optical Character Recognition (OCR) to extract text from these regions.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as all regions have been read.
    with Image.open(image_path) as image:
        image_width = image.size[0]
        field_right = int(image_width * 0.75)
        age_gender_right = int(image_width * 0.70)

        # Define regions of interest (ROIs)
        name_cropped = image.crop((0, 100, field_right, 170))
        husband_name_cropped = image.crop((0, 160, field_right, 220))
        house_number_cropped = image.crop((0, 210, field_right, 280))
        age_gender_cropped = image.crop((0, 270, age_gender_right, 370))

        # Perform OCR on each region
        husband_name = ocr_print_image(husband_name_cropped)
        name = ocr_print_image(name_cropped)
        house_number = ocr_print_image(house_number_cropped)
        age_gender = ocr_print_image(age_gender_cropped)

    # Construct and return the details string
    details_string = f"Name: {name}, Husband's Name: {husband_name}, House Number: {house_number}, Age and Gender: {age_gender}\n"
    return details_string

In [6]:
def cut_the_box_pil(image, output_path, coordinates, box):

    """
    Crop a rectangular region out of an image array and save it to disk.

    Args:
        image: Array accepted by ``Image.fromarray``. It is passed through
            unchanged, so the caller is responsible for its channel order.
        output_path (str): File path the cropped region is written to.
        coordinates: Exactly four values ``(left, upper, right, lower)`` in pixels.
        box: Not used by this function; kept as part of the signature.

    Returns:
        None

    Raises:
        ValueError: If ``coordinates`` does not unpack into exactly four values.
    """
    x_min, y_min, x_max, y_max = coordinates
    region = (x_min, y_min, x_max, y_max)

    # Wrap the array as a PIL image, cut out the region and write it in one go
    Image.fromarray(image).crop(region).save(output_path)

def create_imgs_from_pdf(filepath, image_folder_path):

    """
    Render every page of a PDF to a JPEG and cut each page into a fixed grid of boxes.

    Each page is rasterised at 500 DPI, sharpened and saved as ``page<i>.jpg``
    inside ``image_folder_path``. The page is then divided into a grid of
    10 rows x 3 columns of 1290 x 520 pixel boxes, and every box is saved as
    ``box-<n>.jpg`` in the module-level ``boxes_directory``. ``n`` keeps
    counting across pages.

    Args:
        filepath (str): Path of the PDF to render.
        image_folder_path (str): Existing folder that receives the page images.

    Returns:
        None

    Note:
        Boxes are cropped straight from the rendered page in RGB order. The
        earlier implementation re-read the saved JPEG with OpenCV (BGR) and
        handed it to ``Image.fromarray`` (RGB), which swapped the red and blue
        channels of every box, and then rewrote the page JPEG unchanged.
    """
    # Grid geometry, in pixels of the 500 DPI render.
    x0, y0 = 95, 190        # top-left corner of the first box
    w, h = 1290, 520        # size of one box
    gap = 30                # spacing between neighbouring boxes
    rows, cols = 10, 3

    pages = convert_from_path(filepath, 500)

    box = 0
    for page_number, page in enumerate(pages):

        # Save the sharpened page as an image
        page = page.filter(ImageFilter.SHARPEN)
        image_name = os.path.join(image_folder_path, 'page' + str(page_number) + '.jpg')
        page.save(image_name, 'JPEG')

        # RGB pixel array of the page, matching what Image.fromarray expects
        pixels = np.asarray(page.convert('RGB'))

        for row in range(rows):

            # Rows after the sixth use a vertical pitch that is 2 px larger
            row_pitch = h + gap + 2 if row > 5 else h + gap
            y = y0 + row_pitch * row

            for col in range(cols):

                x = x0 + (w + gap) * col

                file_name = 'box-' + str(box) + '.jpg'
                output_file_path = os.path.join(boxes_directory, file_name)
                cut_the_box_pil(pixels, output_file_path, [x, y, x + w, y + h], box)
                box += 1

In [7]:
def remove_pages(input_path, output_path):

    """
    Remove specific pages from a PDF file and save the modified PDF to a new file.
    In this scenario, these are the first two pages and the last page of the PDF.

    Args:
        input_path (str): The file path of the input PDF. If no file exists at
            this path, a file with the same name inside the module-level
            ``data_directory`` is used instead (the previous behaviour).
        output_path (str): The folder path where the modified PDF will be saved.

    Returns:
        str: The file path of the saved modified PDF, named
            ``<input name without extension>-rem.pdf``.

    Raises:
        FileNotFoundError: If the input PDF file does not exist.
        Exception: If there is an error during the PDF processing or page removal process.
    """
    try:
        # Derive the output file name from the input file name
        input_name = os.path.basename(input_path)
        output_pdf = os.path.splitext(input_name)[0] + '-rem.pdf'

        # Honour the path the caller passed; fall back to the data directory
        # only when that path does not point at a file.
        input_pdf = input_path
        if not os.path.isfile(input_pdf):
            input_pdf = os.path.join(data_directory, input_name)

        # Read input PDF and create PDF writer object
        pdf_reader = PdfReader(input_pdf)
        pdf_writer = PyPDF2.PdfWriter()

        # Get total number of pages in the input PDF
        total_pages = len(pdf_reader.pages)

        # Define pages to be removed (here: first, second, and last page)
        pages_to_remove = {0, 1, total_pages - 1}

        # Copy every page that is not marked for removal
        for page in range(total_pages):
            if page not in pages_to_remove:
                pdf_writer.add_page(pdf_reader.pages[page])

        # Construct full output PDF file path
        output_pdf = os.path.join(output_path, output_pdf)

        # Write modified PDF to output file
        with open(output_pdf, 'wb') as output_pdf_file:
            pdf_writer.write(output_pdf_file)

        print(output_pdf)

        return output_pdf

    except FileNotFoundError as e:
        # If input PDF file does not exist, raise FileNotFoundError
        raise FileNotFoundError(f"Input PDF file '{input_path}' not found.") from e

    except Exception as e:
        # If any other error occurs during the process, raise an exception
        raise Exception("Error occurred during PDF processing or page removal.") from e

In [8]:
def remove_directory(directory_path):

    """
    Delete a directory tree if it is present.

    Args:
        directory_path (str): The path to the directory to be removed.

    Returns:
        None

    Note:
        When ``directory_path`` is missing or is not a directory, nothing is
        removed and a "does not exist" message is printed instead of raising.

    Raises:
        FileNotFoundError: If the tree disappears while it is being removed.
        NotADirectoryError: If the removal hits a path that is not a directory.
        PermissionError: If the user does not have permission to remove the directory.
        OSError: If any other OS-level error occurs during the removal.
    """
    # Checked in this order so the most specific error type is matched first
    specific_errors = (
        (FileNotFoundError, f"Directory '{directory_path}' not found."),
        (NotADirectoryError, f"'{directory_path}' is not a directory."),
        (PermissionError, f"Permission denied to remove directory '{directory_path}'."),
    )

    try:
        # isdir() is already False for a path that does not exist
        if not os.path.isdir(directory_path):
            print(f"Directory '{directory_path}' does not exist.")
            return

        # Remove the directory and all its contents
        shutil.rmtree(directory_path)

    except OSError as err:
        # Re-raise with a message that names the directory, keeping the error type
        for error_type, message in specific_errors:
            if isinstance(err, error_type):
                raise error_type(message) from err
        raise OSError(f"Error occurred while removing directory '{directory_path}'.") from err

In [9]:
if __name__ == "__main__":

    # Start time tracking
    print("Process started at:", time.strftime("%Y-%m-%d %H:%M:%S"))

    # Start from an empty results directory
    remove_directory(results_directory)
    os.mkdir(results_directory)

    # 'cuda' (not 'gpu') is the torch device name for a GPU.
    # NOTE(review): the device is only detected here; the model below is not
    # moved to it, so inference still runs wherever from_pretrained loads it.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Initialize TrOCR processor and model. These stay module-level names
    # because ocr_print_image reads them as globals.
    print_processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
    print_model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')

    # Configure logging once to save error messages to a file
    logging.basicConfig(filename='error_log.txt', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

    # Get paths of input PDF files in a stable order
    input_files_paths = sorted(glob.glob(data_directory + '/*.pdf'))

    # Loop through input PDF files (limited to the first 5 files while testing)
    for input_file_path in input_files_paths[:5]:

        # Remove and recreate temporary directory
        remove_directory(temp_path)
        os.mkdir(temp_path)

        # Remove specific pages from the PDF and save the modified PDF in temporary directory
        input_file_name = os.path.splitext(os.path.basename(input_file_path))[0]
        print(input_file_name, "is being processed.")
        rem_file_path = remove_pages(input_file_path, temp_path)

        # Create a folder for storing images extracted from PDF
        image_folder_path = os.path.join(temp_path, input_file_name)
        os.mkdir(image_folder_path)

        # Create a directory for storing extracted boxes from images
        os.mkdir(boxes_directory)

        # Convert each page of the modified PDF into images and extract boxes
        create_imgs_from_pdf(rem_file_path, image_folder_path)

        # Get paths of extracted box images, ordered by the number in
        # 'box-<n>.jpg' so results are written in a reproducible order
        images = sorted(
            glob.glob(boxes_directory + '/*jpg'),
            key=lambda path: int(os.path.splitext(os.path.basename(path))[0].rsplit('-', 1)[-1]),
        )

        # All boxes of one PDF are appended to the same results file
        text_file_path = os.path.join(results_directory, f'{input_file_name}.txt')

        # Loop through extracted box images (limited to 15 images while testing)
        for image in images[:15]:
            try:
                # Start time tracking for processing an image
                start_time = time.time()

                # Get details from the image
                details = get_details(image)
                print(details, end=" ")

                # Write details to a text file
                with open(text_file_path, 'a', encoding='utf-8') as file:
                    file.write(details)

                # End time tracking for processing an image
                end_time = time.time()
                print(end_time - start_time)

            except Exception as e:
                # Best effort: log the failure (with traceback) and move on to the next image
                logging.exception(f"An error occurred while processing image '{image}': {str(e)}. Skipping this image...")

    # End time tracking
    print("Process terminated at:", time.strftime("%Y-%m-%d %H:%M:%S"))

Process started at: 2024-02-19 15:16:25


  return torch._C._cuda_getDeviceCount() > 0
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


10 - SUGALIMITTA HO PALEMPALLI is being processed.
/home/shaik/code/electoral/election_data/Scripts/temp/10 - SUGALIMITTA HO PALEMPALLI-rem.pdf




Name: NAME : B LAKSHMI DEVI, Husband's Name: FATHERS NAME: BHANAVATH BALAJI NAYAK, House Number: HOUSE NUMBER : 4-58, Age and Gender: AGE : 20 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/10 - SUGALIMITTA HO PALEMPALLI.txt
9.071650981903076
Name: NAME : YANMAL NAGARATNA, Husband's Name: HUSBANDS NAME: CHANDRA, House Number: HOUSE NUMBER : 4-2/2, Age and Gender: AGE : 41 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/10 - SUGALIMITTA HO PALEMPALLI.txt
7.664220333099365
Name: NAME : BUKYA MUNNA NAIK, Husband's Name: FATMENTS NAME: DANJA NAIK, House Number: HOUSE NUMBER : 2-38, Age and Gender: AGE : 58 GENDER : MALE :
 writing to /home/shaik/code/electoral/election_data/Scripts/results/10 - SUGALIMITTA HO PALEMPALLI.txt
6.77746844291687
Name: NAME : B PADMA, Husband's Name: HUSBANDS NAME:B NANIKA NAIK, House Number: HOUSE NUMBER : 2-11, Age and Gender: AGE : 38 GENDER : FEMALE
 writing to /home/shaik/code/electo



Name: NAME : DHAMODHARS, Husband's Name: FATHERS NAME: VENKATRAMANA'S, House Number: HOUSE NUMBER : 1-113, Age and Gender: AGE : 32 GENDER : MALE :
 writing to /home/shaik/code/electoral/election_data/Scripts/results/11 - PALEMPALLI part 1.txt
7.815220832824707
Name: NAME : P REDDEMMA, Husband's Name: FATHERS NAME:P VENKATESHU, House Number: HOUSE NUMBER : 5-13, Age and Gender: AGE : 33 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/11 - PALEMPALLI part 1.txt
7.44286847114563
Name: NAME : APARNABLE, Husband's Name: MOTHERS NAME: PADMAWATHAMMA B, House Number: HOUSE NUMBER : 2-6, Age and Gender: AGE :36 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/11 - PALEMPALLI part 1.txt
6.843819856643677
Name: NAME : B CHINNA GANGULAMMA, Husband's Name: FATHERS NAME: B RAMASWAMY, House Number: HOUSE NUMBER : 7-16, Age and Gender: AGE : 28 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/



Name: NAME : A PADMAVATHAMMA VI, Husband's Name: HUSBANDS NAME: GANGARAJU VI, House Number: HOUSE NUMBER : 8-11, Age and Gender: AGE : 40 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/12 - PALEMPALLI part 2.txt
8.060759544372559
Name: NAME : CHINNA PAPIREDDY GARI ANUSHA, Husband's Name: HUSBANDS NAME:N PRAVEEN KUMAR, House Number: HOUSE NUMBER : 4-70, Age and Gender: AGE : 23 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/12 - PALEMPALLI part 2.txt
9.126441240310669
Name: NAME : SHITAPAGARI VENKATESU, Husband's Name: FATHERS NAME: MUNASWAMI, House Number: HOUSE NUMBER : 8-12, Age and Gender: AGE :83 GENDER : MALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/12 - PALEMPALLI part 2.txt
7.271390676498413
Name: NAME : DURGAM BHAGYAMMA, Husband's Name: HUSBANDS NAME: NAGARAJU, House Number: HOUSE NUMBER : 8-4, Age and Gender: AGE : 39 GENDER : FEMALE
 writing to /home/shaik/code/electoral/el



Name: NAME : CHAKALA MANGAMMA, Husband's Name: HUSBANDS NAME: RAMAKRISHAN, House Number: HOUSE NUMBER : 9-56/B, Age and Gender: AGE : 45 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/2 - NETHIGUTTAPALLI HO MCPALLI.txt
7.855467796325684
Name: NAME:M.THERISHA, Husband's Name: HUSBANDS NAME:VENKATARAMANA, House Number: HOUSE NUMBER : 9-13, Age and Gender: AGE : 45 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/2 - NETHIGUTTAPALLI HO MCPALLI.txt
7.334768056869507
Name: NAME : CHAKALA REDDEMMA, Husband's Name: HUSBANDS NAME:MUNIRAAJ, House Number: HOUSE NUMBER : 9-56A, Age and Gender: AGE : 46 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/2 - NETHIGUTTAPALLI HO MCPALLI.txt
7.387057781219482
Name: NAME : A SAROJAMMA, Husband's Name: HUSBANDS NAME: BUDDNNA, House Number: HOUSE NUMBER : 9-89A, Age and Gender: AGE : 49 GANDER : FEMALE
 writing to /home/shaik/code/electoral/electio



Name: NAME : VEERAPPA, Husband's Name: FATHERS NAME: MUNASWAMI, House Number: HOUSE NUMBER : 2-24C, Age and Gender: AGE : 58 GENDER : MALE :
 writing to /home/shaik/code/electoral/election_data/Scripts/results/3 - PATRAPALLI part 1.txt
7.287741184234619
Name: NAME : Y VARALAKSHMI, Husband's Name: HUSBANDS NAME: Y BUDDANNA, House Number: HOUSE NUMBER : 1-20, Age and Gender: AGE : 36 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/3 - PATRAPALLI part 1.txt
7.600168943405151
Name: NAME : ALDANET LAK3FMANIMA, Husband's Name: HUSBANDS NAME:KWISHNAPPPA, House Number: HOUSE NUMBER 12.25, Age and Gender: AGE : 58 GEGUER,FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/3 - PATRAPALLI part 1.txt
7.592540502548218
Name: NAME : MODE RATHNAMMA, Husband's Name: HUSHANDS NAME: PEDDAREDDEPPA, House Number: HOUSE NUMBER : 3-34, Age and Gender: AGE : 73 GENDER : FEMALE
 writing to /home/shaik/code/electoral/election_data/Scripts/results/3 -

#Random snippet tests

