In [1]:
import os
import shutil
from pdf2image import convert_from_path
import boto3
from tqdm.notebook import tqdm
import cv2
import re

import io

In [2]:
class process_pdf_csv():
    """Render PDF pages to images, run Textract table analysis on them and emit CSV text.

    Intended call order: ``create_pdf_image`` -> ``extract_info`` -> optionally
    ``mark_tables``.

    Attributes:
        images_list: page image paths produced by ``create_pdf_image``.
        responses: raw ``analyze_document`` responses keyed by image path.
        tables_info: per image path, the CSV text of all tables found, or the
            marker string ``"<b> NO Table FOUND </b>"``.
        only_tables: per image path, the TABLE blocks collected by ``mark_tables``.
    """

    def __init__(self):
        # Start from empty state so a method called out of order sees empty
        # containers instead of a missing attribute.
        self.images_list = []
        self.responses = {}
        self.tables_info = {}
        self.only_tables = {}

    def create_pdf_image(self, output_path, **kwargs):
        """Convert a PDF into page images stored in a freshly emptied ``output_path``.

        Args:
            output_path: directory for the page images; existing content is removed.
            **kwargs: forwarded to ``pdf2image.convert_from_path`` (e.g. ``pdf_path``,
                ``fmt``, ``first_page``, ``last_page``). ``paths_only`` defaults to
                ``True`` here because the renaming step below works on file paths.

        Returns:
            The list of image paths (also stored in ``self.images_list``). Files
            ending in ``.jpg`` are renamed to ``<page number>.jpg``.
        """
        if os.path.exists(output_path):
            shutil.rmtree(output_path, ignore_errors=True)
        # exist_ok: rmtree above ignores errors, so the directory may still exist.
        os.makedirs(output_path, exist_ok=True)

        # Without paths_only the converter returns image objects, which the
        # os.path calls below cannot handle.
        kwargs.setdefault('paths_only', True)
        self.images_list = convert_from_path(**kwargs, output_folder = output_path)

        for idx, file_path in enumerate(self.images_list):
            directory, filename = os.path.split(file_path)
            if filename.endswith('.jpg'):
                # Keep only the trailing "-<number>" part of the generated name.
                page_number = filename.split('-')[-1].split('.')[0]
                new_filepath = os.path.join(directory, page_number + '.jpg')
                os.rename(file_path, new_filepath)
                self.images_list[idx] = new_filepath

        return self.images_list

    def get_rows_columns_map(self, table_result, blocks_map):
        """Build ``{row_index: {column_index: text}}`` from a TABLE block's CELL children.

        A table block without a ``Relationships`` entry yields an empty dict.
        """
        rows = {}
        for relationship in table_result.get('Relationships', []):
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    cell_text = self.get_text(cell, blocks_map)
                    rows.setdefault(cell['RowIndex'], {})[cell['ColumnIndex']] = cell_text
        return rows

    def get_text(self, result, blocks_map):
        """Concatenate the text of a block's children.

        Each WORD child contributes its text and each selected SELECTION_ELEMENT
        contributes ``X``; every piece is followed by a single space.
        """
        pieces = []
        for relationship in result.get('Relationships', []):
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                word = blocks_map[child_id]
                if word['BlockType'] == 'WORD':
                    pieces.append(word['Text'])
                elif word['BlockType'] == 'SELECTION_ELEMENT' and word['SelectionStatus'] == 'SELECTED':
                    pieces.append('X')
        return ''.join(piece + ' ' for piece in pieces)

    @staticmethod
    def _csv_escape(text):
        """Quote a cell value when it contains a comma, double quote or line break."""
        if any(ch in text for ch in ',"\r\n'):
            return '"' + text.replace('"', '""') + '"'
        return text

    def generate_table_csv(self, table_result, blocks_map, table_index):
        """Render one table as CSV text preceded by a ``Table: Table_<n>`` header.

        Rows and columns are written in ascending index order regardless of the
        order in which the cells were listed. Every cell is followed by a comma,
        and cells containing CSV metacharacters are quoted so they stay in one
        column.
        """
        rows = self.get_rows_columns_map(table_result, blocks_map)

        chunks = ['Table: {0}\n\n'.format('Table_' + str(table_index))]
        for row_index in sorted(rows):
            cols = rows[row_index]
            chunks.append(''.join(self._csv_escape(cols[col_index]) + ',' for col_index in sorted(cols)))
            chunks.append('\n')
        chunks.append('\n\n\n')
        return ''.join(chunks)

    def get_table_csv_results(self, response):
        """Return the CSV text of every TABLE block in ``response``.

        Returns the string ``"<b> NO Table FOUND </b>"`` when the response has no
        TABLE block.
        """
        blocks = response['Blocks']
        blocks_map = {block['Id']: block for block in blocks}
        table_blocks = [block for block in blocks if block['BlockType'] == "TABLE"]

        if not table_blocks:
            return "<b> NO Table FOUND </b>"

        return ''.join(
            self.generate_table_csv(table, blocks_map, number) + '\n\n'
            for number, table in enumerate(table_blocks, start=1)
        )

    def extract_info(self, access_key, secret_key, region):
        """Send every image in ``self.images_list`` to Textract table analysis.

        Args:
            access_key, secret_key, region: exported as ``AWS_ACCESS_KEY_ID``,
                ``AWS_SECRET_ACCESS_KEY`` and ``AWS_DEFAULT_REGION`` when non-empty.

        Returns:
            ``(responses, tables_info)``, both keyed by image path.
        """
        # Blank values are skipped so that placeholders do not overwrite settings
        # already present in the environment.
        for variable, value in (
            ('AWS_ACCESS_KEY_ID', access_key),
            ('AWS_SECRET_ACCESS_KEY', secret_key),
            ('AWS_DEFAULT_REGION', region),
        ):
            if value:
                os.environ[variable] = value

        client = boto3.client('textract')
        self.responses = {}
        self.tables_info = {}
        for img_path in self.images_list:
            with open(img_path, 'rb') as file:
                image_bytes = file.read()
            response = client.analyze_document(
                Document={
                    'Bytes': image_bytes,
                },
                FeatureTypes=['TABLES']
            )
            self.responses[img_path] = response
            self.tables_info[img_path] = self.get_table_csv_results(response)

        return self.responses, self.tables_info

    @staticmethod
    def _read_image(key):
        """Load the image stored at ``key``; ``None`` when it is not a readable path."""
        if not isinstance(key, str):
            return None
        return cv2.imread(key)

    def mark_tables(self, alpha = 0.3):
        """Collect TABLE blocks per response and shade them green on the page image.

        The image at each response key is overwritten in place with the shaded
        version. If the image cannot be read, the tables are still collected and a
        warning is issued instead of failing.

        Args:
            alpha: weight of the green overlay when blended with the page image.

        Returns:
            ``self.only_tables``: ``{key: [TABLE block, ...]}``.
        """
        import warnings  # local import keeps this class independent of the import cell

        self.only_tables = {}
        for key in tqdm(self.responses, desc = "FILES", total = len(self.responses)):
            self.only_tables[key] = []
            blocks = self.responses[key]['Blocks']

            image = self._read_image(key)
            if image is None:
                warnings.warn("mark_tables: no readable image for {!r}; tables collected without overlay".format(key))
            else:
                # First two axes are height and width for both colour and grayscale arrays.
                page_height, page_width = image.shape[:2]

            for obj in tqdm(blocks, total = len(blocks), desc = "BLOCKS", leave = False):
                if obj['BlockType'] != 'TABLE':
                    continue
                self.only_tables[key].append(obj)
                if image is None:
                    continue

                # Read the box by key instead of relying on the dict's value order.
                box = obj['Geometry']['BoundingBox']
                left = int(box['Left'] * page_width)
                top = int(box['Top'] * page_height)
                width = int(box['Width'] * page_width)
                height = int(box['Height'] * page_height)

                overlay = image.copy()
                cv2.rectangle(overlay, (left, top), (left + width, top + height), (0, 255, 0), thickness = cv2.FILLED)
                image = cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0)

            if image is not None:
                cv2.imwrite(key, image)
        return self.only_tables

In [3]:
class process_pdf_json():
    """Run Textract table analysis on PDF pages and return tables as JSON-style dicts.

    ``extract_info`` renders the PDF in memory and keys its results by page index
    (starting at 0). ``create_pdf_image`` is an optional on-disk rendering step.

    Attributes:
        images_list: page image paths produced by ``create_pdf_image``.
        responses: raw ``analyze_document`` responses keyed by page index.
        tables_info: per page index, a list of table dicts, or the marker string
            ``"<b> NO Table FOUND </b>"``.
        only_tables: per response key, the TABLE blocks collected by ``mark_tables``.
    """

    def __init__(self):
        # Start from empty state so a method called out of order sees empty
        # containers instead of a missing attribute.
        self.images_list = []
        self.responses = {}
        self.tables_info = {}
        self.only_tables = {}

    def create_pdf_image(self, output_path, **kwargs):
        """Convert a PDF into page images stored in a freshly emptied ``output_path``.

        Args:
            output_path: directory for the page images; existing content is removed.
            **kwargs: forwarded to ``pdf2image.convert_from_path`` (e.g. ``pdf_path``,
                ``fmt``, ``first_page``, ``last_page``). ``paths_only`` defaults to
                ``True`` here because the renaming step below works on file paths.

        Returns:
            The list of image paths (also stored in ``self.images_list``). Files
            ending in ``.jpg`` are renamed to ``<page number>.jpg``.
        """
        if os.path.exists(output_path):
            shutil.rmtree(output_path, ignore_errors=True)
        # exist_ok: rmtree above ignores errors, so the directory may still exist.
        os.makedirs(output_path, exist_ok=True)

        # Without paths_only the converter returns image objects, which the
        # os.path calls below cannot handle.
        kwargs.setdefault('paths_only', True)
        self.images_list = convert_from_path(**kwargs, output_folder = output_path)

        for idx, file_path in enumerate(self.images_list):
            directory, filename = os.path.split(file_path)
            if filename.endswith('.jpg'):
                # Keep only the trailing "-<number>" part of the generated name.
                page_number = filename.split('-')[-1].split('.')[0]
                new_filepath = os.path.join(directory, page_number + '.jpg')
                os.rename(file_path, new_filepath)
                self.images_list[idx] = new_filepath

        return self.images_list

    def get_rows_columns_map(self, table_result, blocks_map):
        """Build ``{row_index: {column_index: text}}`` from a TABLE block's CELL children.

        A table block without a ``Relationships`` entry yields an empty dict.
        """
        rows = {}
        for relationship in table_result.get('Relationships', []):
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    cell_text = self.get_text(cell, blocks_map)
                    rows.setdefault(cell['RowIndex'], {})[cell['ColumnIndex']] = cell_text
        return rows

    def get_text(self, result, blocks_map):
        """Concatenate the text of a block's children.

        Each WORD child contributes its text and each selected SELECTION_ELEMENT
        contributes ``X``; every piece is followed by a single space.
        """
        pieces = []
        for relationship in result.get('Relationships', []):
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                word = blocks_map[child_id]
                if word['BlockType'] == 'WORD':
                    pieces.append(word['Text'])
                elif word['BlockType'] == 'SELECTION_ELEMENT' and word['SelectionStatus'] == 'SELECTED':
                    pieces.append('X')
        return ''.join(piece + ' ' for piece in pieces)

    def generate_table_json(self, table_result, blocks_map, table_index):
        """Render one table as ``{'table_id': ..., 'rows': [...]}``.

        Rows and columns are listed in ascending index order regardless of the
        order in which the cells were listed.
        """
        rows = self.get_rows_columns_map(table_result, blocks_map)

        return {
            'table_id': 'Table_' + str(table_index),
            'rows': [
                {
                    'row_index': row_index,
                    'columns': [
                        {'column_index': col_index, 'text': rows[row_index][col_index]}
                        for col_index in sorted(rows[row_index])
                    ],
                }
                for row_index in sorted(rows)
            ],
        }

    def get_table_json_results(self, response):
        """Return a list with one table dict per TABLE block in ``response``.

        Returns the string ``"<b> NO Table FOUND </b>"`` when the response has no
        TABLE block.
        """
        blocks = response['Blocks']
        blocks_map = {block['Id']: block for block in blocks}
        table_blocks = [block for block in blocks if block['BlockType'] == "TABLE"]

        if not table_blocks:
            return "<b> NO Table FOUND </b>"

        return [
            self.generate_table_json(table, blocks_map, number)
            for number, table in enumerate(table_blocks, start=1)
        ]

    def extract_info(self, access_key, secret_key, region, **kwargs):
        """Render the PDF in memory and send each page to Textract table analysis.

        Args:
            access_key, secret_key, region: exported as ``AWS_ACCESS_KEY_ID``,
                ``AWS_SECRET_ACCESS_KEY`` and ``AWS_DEFAULT_REGION`` when non-empty.
            **kwargs: forwarded to ``pdf2image.convert_from_path`` (e.g. ``pdf_path``).

        Returns:
            ``(responses, tables_info)``, both keyed by page index starting at 0.
        """
        # Blank values are skipped so that placeholders do not overwrite settings
        # already present in the environment.
        for variable, value in (
            ('AWS_ACCESS_KEY_ID', access_key),
            ('AWS_SECRET_ACCESS_KEY', secret_key),
            ('AWS_DEFAULT_REGION', region),
        ):
            if value:
                os.environ[variable] = value

        client = boto3.client('textract')
        self.responses = {}
        self.tables_info = {}
        pages = convert_from_path(**kwargs)
        for page_index, page in enumerate(pages):
            # Encode the rendered page as JPEG bytes for the request payload.
            with io.BytesIO() as buf:
                page.save(buf, format='JPEG')
                jpeg_bytes = buf.getvalue()
            response = client.analyze_document(
                Document={
                    'Bytes': jpeg_bytes,
                },
                FeatureTypes=['TABLES']
            )
            self.responses[page_index] = response
            self.tables_info[page_index] = self.get_table_json_results(response)

        return self.responses, self.tables_info

    @staticmethod
    def _read_image(key):
        """Load the image stored at ``key``; ``None`` when it is not a readable path."""
        if not isinstance(key, str):
            return None
        return cv2.imread(key)

    def mark_tables(self, alpha = 0.3):
        """Collect TABLE blocks per response and, where possible, shade them on the image.

        Shading happens only for response keys that are readable image paths; that
        image is then overwritten in place. Keys produced by ``extract_info`` in
        this class are page indices with no image on disk, so for those the tables
        are collected and a warning is issued instead of failing.

        Args:
            alpha: weight of the green overlay when blended with the page image.

        Returns:
            ``self.only_tables``: ``{key: [TABLE block, ...]}``.
        """
        import warnings  # local import keeps this class independent of the import cell

        self.only_tables = {}
        for key in tqdm(self.responses, desc = "FILES", total = len(self.responses)):
            self.only_tables[key] = []
            blocks = self.responses[key]['Blocks']

            image = self._read_image(key)
            if image is None:
                warnings.warn("mark_tables: no readable image for {!r}; tables collected without overlay".format(key))
            else:
                # First two axes are height and width for both colour and grayscale arrays.
                page_height, page_width = image.shape[:2]

            for obj in tqdm(blocks, total = len(blocks), desc = "BLOCKS", leave = False):
                if obj['BlockType'] != 'TABLE':
                    continue
                self.only_tables[key].append(obj)
                if image is None:
                    continue

                # Read the box by key instead of relying on the dict's value order.
                box = obj['Geometry']['BoundingBox']
                left = int(box['Left'] * page_width)
                top = int(box['Top'] * page_height)
                width = int(box['Width'] * page_width)
                height = int(box['Height'] * page_height)

                overlay = image.copy()
                cv2.rectangle(overlay, (left, top), (left + width, top + height), (0, 255, 0), thickness = cv2.FILLED)
                image = cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0)

            if image is not None:
                cv2.imwrite(key, image)
        return self.only_tables

    def extract_text(self,filepath):
        """Return ``{page_index: text}`` for every page of the document at ``filepath``."""
        # PyMuPDF is imported here because the notebook's import cell does not
        # bring `fitz` into scope.
        import fitz

        text = {}
        # Context manager closes the document once all pages are read.
        with fitz.open(filepath) as doc:
            for i in tqdm(range(len(doc)), desc="pages"):
                text[i] = doc.load_page(i).get_text()
        return text

In [4]:
# Input document. NOTE: this is a machine-specific absolute path; point it at your own PDF.
#output_path = '/Users/sushilbagate/Downloads/EMAlpha/Notebooks/output_dir'
pdf_path = '/Users/sushilbagate/Downloads/Article_IV_2017.pdf'

pp = process_pdf_json()
# Optional on-disk rendering of selected pages (requires output_path above):
#imgs = pp.create_pdf_image(output_path = output_path, pdf_path = pdf_path,
#                        fmt = "jpeg", first_page = 10, last_page = 12, paths_only = True)

## Credentials are read from the environment so that secrets are never saved in the notebook.
## Set AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_DEFAULT_REGION before starting the kernel.

informationes, tables = pp.extract_info(
    pdf_path = pdf_path,
    access_key = os.environ.get('AWS_ACCESS_KEY_ID', ''),
    secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', ''),
    region = os.environ.get('AWS_DEFAULT_REGION', ''),
)
#marked_tables = pp.mark_tables()

In [5]:
# Number of per-page entries returned by extract_info (one key per rendered page).
len(tables)

58

In [6]:
# Table extraction result for the page stored under key 3 (page keys start at 0).
tables[3]

[{'table_id': 'Table_1',
  'rows': [{'row_index': 1,
    'columns': [{'column_index': 1, 'text': 'Russian Federation: Selected '},
     {'column_index': 2, 'text': 'Macroeconomic '},
     {'column_index': 3, 'text': ''},
     {'column_index': 4, 'text': 'Indicators, '},
     {'column_index': 5, 'text': '2014-18 '},
     {'column_index': 6, 'text': ''}]},
   {'row_index': 2,
    'columns': [{'column_index': 1, 'text': ''},
     {'column_index': 2, 'text': '2014 '},
     {'column_index': 3, 'text': '2015 '},
     {'column_index': 4, 'text': '2016 '},
     {'column_index': 5, 'text': '2017 '},
     {'column_index': 6, 'text': '2018 '}]},
   {'row_index': 3,
    'columns': [{'column_index': 1, 'text': ''},
     {'column_index': 2, 'text': ''},
     {'column_index': 3, 'text': ''},
     {'column_index': 4, 'text': ''},
     {'column_index': 5, 'text': 'Projections '},
     {'column_index': 6, 'text': ''}]},
   {'row_index': 4,
    'columns': [{'column_index': 1, 'text': 'Production and pric

In [7]:
## Tables per page
# Full mapping: page key -> list of table dicts, or the '<b> NO Table FOUND </b>' marker string.
tables

{0: '<b> NO Table FOUND </b>',
 1: '<b> NO Table FOUND </b>',
 2: '<b> NO Table FOUND </b>',
 3: [{'table_id': 'Table_1',
   'rows': [{'row_index': 1,
     'columns': [{'column_index': 1, 'text': 'Russian Federation: Selected '},
      {'column_index': 2, 'text': 'Macroeconomic '},
      {'column_index': 3, 'text': ''},
      {'column_index': 4, 'text': 'Indicators, '},
      {'column_index': 5, 'text': '2014-18 '},
      {'column_index': 6, 'text': ''}]},
    {'row_index': 2,
     'columns': [{'column_index': 1, 'text': ''},
      {'column_index': 2, 'text': '2014 '},
      {'column_index': 3, 'text': '2015 '},
      {'column_index': 4, 'text': '2016 '},
      {'column_index': 5, 'text': '2017 '},
      {'column_index': 6, 'text': '2018 '}]},
    {'row_index': 3,
     'columns': [{'column_index': 1, 'text': ''},
      {'column_index': 2, 'text': ''},
      {'column_index': 3, 'text': ''},
      {'column_index': 4, 'text': ''},
      {'column_index': 5, 'text': 'Projections '},
      {

## Misc Experiments (Ignore)

In [5]:
# Scratch cell: one direct Textract analyze_document call on a single image payload.
# NOTE(review): `byte_im` is not assigned at notebook scope in any visible cell (it only
# appears as a local inside process_pdf_json.extract_info), so this cell presumably
# relied on leftover kernel state - confirm before running on a fresh kernel.
# NOTE(review): the three values below are not passed to boto3.client in this cell.
access_key = ""
secret_key = ""
region = ''

client = boto3.client('textract')


# Earlier attempts at turning a rendered page into bytes, kept for reference.
#ppm_img_file = image_to_byte_array(images_list[10])

#imgByteArr = io.BytesIO()
#ppm_img_file.save(imgByteArr, format=ppm_img_file.format)
#imgByteArr = imgByteArr.getvalue()

#stream = io.BytesIO(ppm_img_file)
#image=Image.open(stream)

# Analyze the document
#image_binary = ppm_img_file.tobytes()

# Request table detection only (FeatureTypes=['TABLES']) for the image bytes.
response = client.analyze_document(
    Document={
        'Bytes': byte_im,
    },
    FeatureTypes=['TABLES']
)