In [1]:
import json
import re
from enum import Enum
import copy

In [2]:
# Punctuation-only texts treated as noise: a block whose 'Text' is one of these
# is skipped when blocks are grouped into a line.
NOISES = {'-', '.', ';', ':', ','}
class Positions(Enum):
    """Edges of a block's bounding box.

    The values of ``left`` and ``top`` are also used as keys into a block's
    ``BoundingBox`` mapping; ``right`` and ``bottom`` are derived from
    Left + Width and Top + Height by ``get_positions_value``.
    """
    left = "Left"
    right = "Right"
    top = "Top"
    bottom = "Bottom"

In [3]:
def get_positions_value(line_block, position: Positions):
    """
    Return the coordinate of one edge of a block's bounding box.

    :param line_block: block holding ``['Geometry']['BoundingBox']`` with
        'Left', 'Top', 'Width' and 'Height' entries
    :param position: which edge to read
    :return: Left or Top as stored; Top + Height for bottom; Left + Width for
        right; None for any other ``position``
    """
    box = line_block['Geometry']['BoundingBox']
    if position in (Positions.left, Positions.top):
        # These two edges are stored directly under the enum's value.
        return box[position.value]
    if position == Positions.bottom:
        return box[Positions.top.value] + box['Height']
    if position == Positions.right:
        return box[Positions.left.value] + box['Width']
    return None

In [4]:
def get_line_containing_the_block(all_blocks, pivot_block, delta=0.005):
    """
    Collect the blocks that sit on the same horizontal line as ``pivot_block``.

    :param all_blocks: blocks to search
    :param pivot_block: block whose Top position defines the line; a falsy
        value yields an empty result
    :param delta: blocks whose Top differs from the pivot's by at most this
        much are still considered to be on the same line
    :return: matching blocks in their original order; blocks whose text is in
        ``NOISES`` are left out
    """
    if not pivot_block:
        return []

    pivot_top = get_positions_value(pivot_block, Positions.top)
    # Both bounds and every candidate are rounded to 3 decimals before comparing.
    lowest_top = round(pivot_top - delta, 3)
    highest_top = round(pivot_top + delta, 3)
    return [
        candidate
        for candidate in all_blocks
        if candidate['Text'] not in NOISES
        and lowest_top <= round(get_positions_value(candidate, Positions.top), 3) <= highest_top
    ]

In [5]:
def get_blocks_under_with_same_start_pos(blocks, anchor_block, *, limit=None, delta=0.1):
    """
    Return the blocks listed after ``anchor_block`` that start at (almost) the
    same Left position.

    :param blocks: list of blocks to search
    :param anchor_block: block whose Left position is the reference; when it is
        found in ``blocks`` only the blocks after it are examined, otherwise
        the whole list is examined
    :param limit: optional maximum number of blocks to return; ``None`` means
        no limit, and a value of 0 or less returns an empty list
    :param delta: a block qualifies when its Left differs from the anchor's by
        strictly less than this
    :return: list of qualifying blocks in their original order
    """
    if limit is not None and limit <= 0:
        return []

    anchor_left = get_positions_value(anchor_block, Positions.left)
    try:
        candidates = blocks[blocks.index(anchor_block) + 1:]
    except ValueError:
        # The anchor is not part of `blocks`: fall back to scanning all of them.
        candidates = blocks

    blocks_under = []
    for block in candidates:
        if abs(anchor_left - get_positions_value(block, Positions.left)) < delta:
            blocks_under.append(block)
            if limit is not None and len(blocks_under) >= limit:
                break
    return blocks_under

In [6]:
def column_name_above(blocks,table_blocks):
    """
    Make sure a table starts with a header line, looking upwards if needed.

    A line counts as a header when every text on it starts with an ASCII
    letter. If the first line of ``table_blocks`` is already a header the table
    is returned unchanged. Otherwise the lines above the table in ``blocks``
    are examined one by one, nearest first, and the first header line found is
    prepended to the table.

    The upward search stops, returning the table unchanged, when the start of
    ``blocks`` is reached, when the table's first block is not in ``blocks``,
    or once blocks from more than two distinct 'Page' values have been visited.
    Lines that contain only noise blocks are skipped.

    :param blocks: all blocks, in the order the search walks backwards through
    :param table_blocks: blocks of the detected table
    :return: ``table_blocks`` itself, or a new list ``header_line + table_blocks``
    """
    def _is_header_line(texts):
        # Missing texts (None) can never start with a letter.
        return all(
            isinstance(text, str) and re.match(r'[a-z]', text, flags=re.IGNORECASE)
            for text in texts
        )

    if not table_blocks:
        return table_blocks

    first_line = get_line_containing_the_block(table_blocks, table_blocks[0])
    if not first_line:
        return table_blocks

    first_line_text = get_text_line_blocks(first_line)
    print("first line")
    if _is_header_line(first_line_text):
        print("yes first")
        print(first_line_text)
        return table_blocks

    try:
        current_index = blocks.index(table_blocks[0])
    except ValueError:
        # The table does not come from `blocks`; nothing above it to inspect.
        return table_blocks

    total_pages = set()
    # `current_index > 0` keeps `current_index - 1` from wrapping around to the
    # last block, which previously allowed the search to cycle forever.
    while current_index > 0:
        total_pages.add(blocks[current_index].get('Page'))
        if len(total_pages) > 2:
            break
        previous_index = current_index - 1
        previous_line = get_line_containing_the_block(blocks, blocks[previous_index])
        previous_line_text = get_text_line_blocks(previous_line)
        print("second line")
        if previous_line and _is_header_line(previous_line_text):
            print(previous_line_text)
            return previous_line + table_blocks
        print("nah")
        # Continue from the start of the line just examined, but always move
        # at least one block upwards so the loop is guaranteed to terminate.
        next_index = previous_index
        if previous_line:
            try:
                next_index = min(previous_index, blocks.index(previous_line[0]))
            except ValueError:
                pass
        current_index = next_index
    return table_blocks
                

In [7]:
def get_blocks_in_region(blocks, reference_block=None, min_left_pos=-0.0001, max_left_pos=0.9999, min_top_pos=-0.0001,
                         max_top_pos=0.9999, min_width=0.0, max_width=0.0, min_height=0.0, max_height=0.0,page=None):
    """
    Return the blocks whose bounding box lies in a given region.

    :param blocks: the whole list of blocks
    :param reference_block: optional block; when given, the four position
        limits below are offsets added to this block's Left / Top instead of
        absolute positions
    ### region demarcation parameters ###
    :param min_left_pos: float: a block's Left must be greater than or equal to this
    :param max_left_pos: float: a block's Left must be less than or equal to this
    :param min_top_pos: float: a block's Top must be greater than or equal to this
    :param max_top_pos: float: a block's Top must be less than or equal to this
    ### size parameters ###
    A size limit of 0 (the default) disables that check.
    :param min_width: optional: float: minimum Width a block must have
    :param max_width: optional: float: maximum Width a block may have
    :param min_height: optional: float: minimum Height a block must have
    :param max_height: optional: float: maximum Height a block may have
    :param page: optional: when not None, only blocks whose 'Page' equals this
        value are kept
    :return: list of qualifying blocks, in their original order
    """
    reference_block = reference_block or {}
    anchor_bounds = reference_block.get('Geometry', {}).get('BoundingBox', {})
    anchor_left = anchor_bounds.get('Left', 0)
    anchor_top = anchor_bounds.get('Top', 0)
    min_abs_left_pos = anchor_left + min_left_pos
    max_abs_left_pos = anchor_left + max_left_pos
    min_abs_top_pos = anchor_top + min_top_pos
    max_abs_top_pos = anchor_top + max_top_pos

    qualified_blocks = []
    for block in blocks:
        block_bounds = block['Geometry']['BoundingBox']
        if not min_abs_left_pos <= block_bounds['Left'] <= max_abs_left_pos:
            continue
        if not min_abs_top_pos <= block_bounds['Top'] <= max_abs_top_pos:
            continue
        if max_width and block_bounds['Width'] > max_width:
            continue
        if min_width and block_bounds['Width'] < min_width:
            continue
        if max_height and block_bounds['Height'] > max_height:
            continue
        if min_height and block_bounds['Height'] < min_height:
            continue
        if page is not None and block['Page'] != page:
            continue
        qualified_blocks.append(block)
    return qualified_blocks

In [8]:
def get_lines_with_words(line_blocks, match_list, limit=None, *, left_margin=None, right_margin=None,
                         bottom_margin=None, top_margin=None):
    """
    Find the blocks whose text matches any of the regexes in ``match_list``.

    Matching is case-insensitive. A margin is applied only when it is not
    None, so a margin of 0.0 is honoured rather than ignored.

    :param line_blocks: blocks to look for pattern
    :param match_list: list of regex to look for
    :param limit: max number of matched blocks needed; None means no limit and
        a value of 0 or less returns an empty list
    :param left_margin: skip blocks whose Left is smaller than this
    :param right_margin: skip blocks whose Left is greater than this
    :param top_margin: skip blocks whose Top is smaller than this
    :param bottom_margin: skip blocks whose Top is greater than this
    :return: blocks matching the match_list regex, in their original order
    """
    if limit is not None and limit <= 0:
        return []

    regexes = [re.compile(pattern, re.IGNORECASE) for pattern in match_list]
    matched_lines = []
    for block in line_blocks:
        block_text = block['Text'].lower()
        if not any(regex.search(block_text) for regex in regexes):
            continue
        # Positions are only looked up for the margins that were supplied.
        if bottom_margin is not None and get_positions_value(block, Positions.top) > bottom_margin:
            continue
        if top_margin is not None and get_positions_value(block, Positions.top) < top_margin:
            continue
        if left_margin is not None and get_positions_value(block, Positions.left) < left_margin:
            continue
        if right_margin is not None and get_positions_value(block, Positions.left) > right_margin:
            continue
        matched_lines.append(block)
        if limit is not None and len(matched_lines) >= limit:
            break
    return matched_lines

def get_positions_value(line_block, position: Positions):
    """
    Return the coordinate of one edge of a block's bounding box.

    NOTE(review): this redefines the function of the same name from an earlier
    cell with the same logic; one of the two definitions is redundant.

    :param line_block: block holding ``['Geometry']['BoundingBox']``
    :param position: which edge to read
    :return: the stored Left/Top, Top + Height for bottom, Left + Width for
        right, or None for any other ``position``
    """
    bounds = line_block['Geometry']['BoundingBox']
    if position == Positions.right:
        return bounds[Positions.left.value] + bounds['Width']
    if position == Positions.bottom:
        return bounds[Positions.top.value] + bounds['Height']
    if position == Positions.left or position == Positions.top:
        return bounds[position.value]
    return None

In [9]:
def get_max_bottom(under_blocks_with_delta,c,table_flag,maxlength,max_bottom):
    """
    Update a running "lowest table row" position from one column of blocks.

    The blocks are walked top to bottom while the vertical gap between
    consecutive blocks (rounded to 2 decimals) stays the same. Once that single
    gap has been seen more than 10 times, the Top of each further block raises
    ``max_bottom``. The walk stops at the first block that lies above its
    predecessor or at the first differing gap.

    The previous implementation computed this value but returned nothing; the
    updated ``max_bottom`` is now returned.

    :param under_blocks_with_delta: blocks of one column, in reading order
    :param c: accepted for call compatibility; not used for the result
    :param table_flag: accepted for call compatibility; not used for the result
    :param maxlength: accepted for call compatibility; not used for the result
    :param max_bottom: running maximum Top found so far
    :return: ``max_bottom``, raised to the largest qualifying Top if any
    """
    spacing_counts = {}
    for prev_block, cur_block in zip(under_blocks_with_delta, under_blocks_with_delta[1:]):
        prev_top = prev_block['Geometry']['BoundingBox']['Top']
        cur_top = cur_block['Geometry']['BoundingBox']['Top']
        if cur_top < prev_top:
            # The column wrapped back upwards: end of this run of rows.
            break
        # Rounded so that tiny OCR jitter does not count as a different gap.
        spacing = round(cur_top - prev_top, 2)
        spacing_counts[spacing] = spacing_counts.get(spacing, 0) + 1
        if len(spacing_counts) > 1:
            # A second distinct gap can never disappear again, so nothing
            # further down can qualify.
            break
        if spacing_counts[spacing] > 10 and cur_top > max_bottom:
            max_bottom = cur_top
    return max_bottom

In [10]:
def get_text_line_blocks(line_blocks):
    """
    Extract the text of each block.

    :param line_blocks: iterable of blocks (falsy entries are allowed)
    :return: list with each block's 'Text', or None where the block is falsy or
        has no 'Text' entry
    """
    return [block.get('Text') if block else None for block in line_blocks]

In [11]:
def get_right_margin(word_lines):
    """
    Find the block with the largest Left position among the columns that hang
    below the given words.

    For every word, the blocks below it with (almost) the same Left position
    are fetched from the module-level ``blocks`` list. Columns with fewer than
    three blocks are ignored. Each column is walked from its second block
    onwards; the walk stops when a block lies above its predecessor, or when
    the count of evenly spaced rows in this column drops below half of the
    longest such count seen so far.

    The previous implementation read ``max_right_global``, ``maxlength`` and
    ``max_right_block`` before assigning them and therefore always raised
    UnboundLocalError; they are now initialised here and the result is
    returned as well as printed.

    NOTE(review): the "right" coordinate is read from the block's 'Left'
    entry, as in the original code - confirm that Left (not Left + Width) is
    what is wanted.

    :param word_lines: anchor blocks to look below
    :return: the block with the largest Left seen, or None if no column qualified
    """
    maxlength = 0
    max_right_global = float('-inf')
    max_right_block = None

    for word in word_lines:
        under_blocks_with_delta = get_blocks_under_with_same_start_pos(blocks, word, delta=0.001)
        if len(under_blocks_with_delta) < 3:
            continue

        dictmap = {}
        c = 0
        for idx in range(1, len(under_blocks_with_delta)):
            prev_bounds = under_blocks_with_delta[idx - 1]['Geometry']['BoundingBox']
            cur_block = under_blocks_with_delta[idx]
            cur_bounds = cur_block['Geometry']['BoundingBox']

            if cur_bounds['Left'] > max_right_global:
                max_right_global = cur_bounds['Left']
                max_right_block = cur_block

            if cur_bounds['Top'] < prev_bounds['Top']:
                break

            # Rounded so that tiny OCR jitter does not count as a different gap.
            diff = round(cur_bounds['Top'] - prev_bounds['Top'], 2)
            dictmap[diff] = dictmap.get(diff, 0) + 1

            if len(dictmap) == 1:
                c += 1
                maxlength = max(maxlength, c)
                if dictmap[diff] > 10:
                    print(dictmap)
            if c < maxlength // 2:
                break

    print('max_right_block_below')
    print(max_right_block)
    return max_right_block


In [12]:
def check_similiar_lengths(under_blocks_without_delta):
    """
    Check whether a column of blocks holds values of similar length and type.

    Each text is classified as "Number" (``isnumeric``), "String"
    (``isalpha``), "String_Number" (other ``isalnum``) or "None". While fewer
    than three distinct text lengths and exactly one type have been seen, and
    some length has occurred at least three times, the current block's Top
    raises ``max_bottom``.

    Purely alphabetic text is now classified as "String"; previously the
    ``isalnum`` test ran first, which made the "String" branch unreachable.

    NOTE(review): every block that fails the similarity condition overwrites
    ``breaking_block``, so the *last* such block is returned, not the first -
    confirm that this is intended.

    :param under_blocks_without_delta: blocks of one column, each with 'Text'
        and a bounding-box Top
    :return: ``(breaking_block, max_bottom)``; ``breaking_block`` is a deep
        copy of the last block that failed the condition (of the first block
        if none failed), ``max_bottom`` is 0 if nothing qualified. An empty
        input returns ``(None, 0)``.
    """
    if not under_blocks_without_delta:
        return None, 0

    max_bottom = 0
    length_counts = {}
    seen_types = set()
    breaking_block = under_blocks_without_delta[0]

    for block in under_blocks_without_delta:
        text = block["Text"]
        if text.isnumeric():
            var_type = "Number"
        elif text.isalpha():
            var_type = "String"
        elif text.isalnum():
            var_type = "String_Number"
        else:
            var_type = "None"

        length_counts[len(text)] = length_counts.get(len(text), 0) + 1
        seen_types.add(var_type)

        if len(length_counts) < 3 and len(seen_types) == 1:
            if any(count >= 3 for count in length_counts.values()):
                max_bottom = max(max_bottom, block['Geometry']['BoundingBox']['Top'])
        else:
            breaking_block = block

    # Copy only the block handed back, so callers can modify it freely.
    return copy.deepcopy(breaking_block), max_bottom
        


In [13]:
def check_characters(under_blocks_without_delta):
    """
    Count how many leading blocks start with a digit or one of the listed symbols.

    The blocks are scanned in order; scanning stops at the first block whose
    text does not start with a character from the class below.

    NOTE(review): the character class contains a literal ``s`` (matched
    case-insensitively); ``\\s`` may have been intended - confirm.

    :param under_blocks_without_delta: non-empty list of blocks with 'Text'
    :return: ``(block, count)`` - a deep copy of the first non-matching block
        (of the first block when every block matches) and the number of
        matching blocks before it
    """
    leading_char_pattern = r'[\d!@#$%&*s\-\\/]'
    character_break_block = copy.deepcopy(under_blocks_without_delta[0])
    matched = 0
    for block in under_blocks_without_delta:
        if not re.match(leading_char_pattern, block["Text"], re.IGNORECASE):
            character_break_block = copy.deepcopy(block)
            break
        matched += 1
    return character_break_block, matched

In [33]:
# Load the OCR response used by the cells below. The context manager closes
# the file handle as soon as the JSON has been parsed.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
with open(r'/home/lenovo/Documents/RealPage-Jupyter/tableidentified2.png.json',) as f:
    data = json.load(f)

In [34]:
#blocks = data['Blocks']

In [35]:
import csv
import pandas as pd
from copy import deepcopy
# Join every block's text into one string so the raw input can be inspected.
data_text1 = data["Blocks"]
data_text2 = " ".join(list(map(lambda x: x['Text'], data_text1)))
print("Raw Text")
print(data_text2)
# Independent copies of the block list, taken before data["Blocks"] is replaced below.
blocks = copy.deepcopy(data['Blocks'])
data1 = copy.deepcopy(data['Blocks'])
# Drop blocks whose whole text is one or two letters, or one or two of the
# punctuation characters ; ' , - / .
data2 = [x for x in data1 if not re.match(r'^[a-z]{1,2}$', x["Text"], re.IGNORECASE) and not re.match(r'^[;\',\-\/\.]{1,2}$', x["Text"], re.IGNORECASE)]
print("Text after Cleaning")
print(get_text_line_blocks(data2))
# NOTE(review): this replaces data["Blocks"] in place, so any later read of
# data["Blocks"] (including a re-run of this cell) sees the cleaned list.
data["Blocks"] = data2
response=data
# columns: one {'left', 'right'} dict per detected column.
# lines: [column_index, block] pairs.
columns = []
lines = []
all_columns=[]
# Assign every LINE block to a column. A block joins the first existing column
# it overlaps horizontally; otherwise it opens a new column spanning its own
# left and right edges.
for item in response["Blocks"]:
    if item["BlockType"] == "LINE":
        bbox_left = item["Geometry"]["BoundingBox"]["Left"]
        bbox_right = bbox_left + item["Geometry"]["BoundingBox"]["Width"]
        # Horizontal midpoint of the block. The previous formula,
        # (Left + Width) / 2, is half of the right edge rather than the
        # midpoint and could place a block in a column far to its left.
        bbox_centre = (bbox_left + bbox_right) / 2
        column_found = False
        for index, column in enumerate(columns):
            column_centre = (column['left'] + column['right']) / 2
            if (
                column['left'] < bbox_centre < column['right']
                or bbox_left < column_centre < bbox_right
                # Small tolerance on the column's right edge.
                or column['left'] < bbox_left < column['right'] + 0.001
            ):
                # Bbox appears inside the column
                lines.append([index, item])
                column_found = True
                break
        if not column_found:
            columns.append({'left': bbox_left, 'right': bbox_right})
            lines.append([len(columns) - 1, item])

# Order the (column index, block) pairs by column; the sort is stable, so the
# reading order inside each column is kept.
lines.sort(key=lambda x: x[0])
for line in lines:
    print ((line[0],line[1]["Text"]))
for column in columns:
    print(column)
    
    
# Group the blocks by the column index they were assigned to and derive
# `maxcolumn`, which is later used as the width of the result matrix.
dictmap = {}
maxcolumn = 0

column_headers = []
for line in lines:
    # NOTE(review): line[1] is the block itself, so len() here is the number
    # of entries in the block, not a column count. Kept as-is so the matrix
    # width does not change - confirm what was intended.
    if len(line[1]) > maxcolumn:
        maxcolumn = len(line[1])
    # setdefault replaces the former bare `except`, which would also have
    # hidden unrelated errors.
    dictmap.setdefault(line[0], []).append(line[1])

for keys in dictmap:
    if len(dictmap[keys]) > maxcolumn:
        maxcolumn = len(dictmap[keys])


# Split the blocks into rows: walk them in order and start a new row whenever
# a block's Left is not greater than the previous block's Left.
table=[]
all_blocks_region=copy.deepcopy(data['Blocks'])
# NOTE(review): len() of a block is its number of entries (presumably dict
# keys); `maxrow` is not read anywhere else in this cell.
maxrow=len(all_blocks_region[0])
row=[]
row.append(all_blocks_region[0])
for idx in range(1,len(all_blocks_region)):
    curblock=all_blocks_region[idx]
    prev_block=all_blocks_region[idx-1]
    
    if len(curblock)>maxrow:
        maxrow=len(curblock)
        

    xprev=all_blocks_region[idx-1]['Geometry']['BoundingBox']['Left']
    xcur=all_blocks_region[idx]['Geometry']['BoundingBox']['Left']
    if xcur>xprev:
        # Still moving to the right: same row.
        row.append(all_blocks_region[idx])
    else:

        # Moved back to the left (or stayed put): close the row and start a new one.
        table.append(row)
        row=[]
        row.append(all_blocks_region[idx])
# The last row is still open when the loop ends.
table.append(row)
    #print(row)
    
# print(" all rows line by line")
# print(table)
# total_words=0
# for row in table:
#     for sentence in row:
#         words=sentence.split()
#         total_words=total_words+len(words)

#     print(row)
# print('total words below')
# print(total_words)



# table_name='Table1'
# with open(table_name+'.csv', mode='w') as employee_file:
#     employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#     for rows in table:
#         employee_writer.writerow(row)

# print("All Tables below")
# print(table)

                
            
            
            
# Result grid: one row per entry of `table`. The width is at least the number
# of detected columns, so indexing by column number below cannot run past the
# end of a row when `maxcolumn` happens to be smaller.
matrix = [[None] * max(maxcolumn, len(columns)) for _ in range(len(table))]

# Place each block's text at (row, column): the row is every `table` row that
# holds a block with the same text at (almost) the same Left position.
for column in lines:
    column_number, item = column
    row_number = None
    flag = True
    bbox_left = item["Geometry"]["BoundingBox"]["Left"]
    bbox_right = bbox_left + item["Geometry"]["BoundingBox"]["Width"]
    # Horizontal midpoint of the block (previously half of the right edge).
    bbox_centre = (bbox_left + bbox_right) / 2
    for idx in range(0, len(table)):
        cur_row = table[idx]
        for block in cur_row:
            if block["Text"] == item["Text"]:
                block_left = block["Geometry"]["BoundingBox"]["Left"]
                diff = abs(block_left - bbox_left)
                if diff < 0.01:
                    matrix[idx][column_number] = item["Text"]
print("Matrix before checking")
print(matrix)

for row in matrix:
    print(row)
                      

# Checking if it is a valid table
check_matrix = copy.deepcopy(matrix)
column_names, header_length = [], 0
index = 0
valid_table1 = False
# The header is looked for among the first five rows: the row with the most
# purely alphabetic cells (letters, spaces and dots only) wins.
possible_column_names = check_matrix[:5]
max_col, i = 0, 0
for i, colm in enumerate(possible_column_names):
    cols = [x for x in colm if (x is not None and not re.match(r'[;\',\-\/]', x, re.IGNORECASE)) and (re.match(r'^[a-z .]+$', x, re.IGNORECASE))]
    if max_col < len(cols):
        max_col = len(cols)
        # Only a row that actually has header-like cells validates the table;
        # `index` now records which row that was (it used to stay at 0).
        valid_table1 = True
        index = i
        column_names = cols
        header_length = len(cols)

if valid_table1:
    print(f"Header: {column_names}")
    print(f"Header length: {header_length}")
    # Count rows, starting at the header row, until one has more non-noise
    # cells than the header.
    valid_row_count = 0
    for row in matrix[index:]:
        row_name = [x for x in row if x is not None and not re.match(r'[.;\',\-\/]', x, re.IGNORECASE)]
        print("This")
        print(row_name)
        column_val_count = len(row_name)
        if column_val_count > header_length:
            break
        else:
            valid_row_count += 1

    valid_table2 = False
    if valid_row_count >= 3 and header_length > 1:
        valid_table2 = True
        print(f"Valid rows: {valid_row_count}")

    if valid_table2:
        print("Valid Matrix below")
        for row in matrix:
            print(row)
        # Discard any rows above the detected header, then split the header
        # row off; `matrix` is left holding only the data rows.
        del matrix[:index]
        headers = matrix.pop(0)
        df = pd.DataFrame(matrix, columns=headers)
        df.to_csv('tablefound1.csv')
        print("Final")
        print(df)

    else:
        print("Not a valid table")
else:
    print("Not a valid table")
                        
        










    
   
        
                        
                        
                        
                        


                    
            
                    
        
                    
                    
                    


        
        
    
    
    

Raw Text
Description Date Ticket Quantity Amount Ticket Total 66.29 Overage service yards - recycle materials 1595792636 02/24/20 838059 1.00 66.29 Ticket Total 66.29 96 Gallon cart service - organics 03/01/20 10.00 0.00 Monthly recycle per unit 03/01/20 188.00 1,432.56 6 - 3 Yard dumpster recycle 2 times per week 03/01/20 6.00 2,201.52 6 - 3 Yard dumpster 2 times per week 03/01/20 6.00 4,403.04 Total Current Charges 8,501.15
Text after Cleaning
['Description', 'Date', 'Ticket', 'Quantity', 'Amount', 'Ticket Total', '66.29', 'Overage service yards - recycle materials 1595792636', '02/24/20', '838059', '1.00', '66.29', 'Ticket Total', '66.29', '96 Gallon cart service - organics', '03/01/20', '10.00', '0.00', 'Monthly recycle per unit', '03/01/20', '188.00', '1,432.56', '6 - 3 Yard dumpster recycle 2 times per week', '03/01/20', '6.00', '2,201.52', '6 - 3 Yard dumpster 2 times per week', '03/01/20', '6.00', '4,403.04', 'Total Current Charges', '8,501.15']
(0, 'Description')
(0, 'Ticket T

In [36]:
import csv
import pandas as pd
from copy import deepcopy

# Join every block's text into one string so the input can be inspected.
# NOTE(review): if the previous cell has run, data["Blocks"] already holds the
# cleaned list it stored there, not the blocks as loaded from the file.
data_text1 = data["Blocks"]
data_text2 = " ".join(list(map(lambda x: x['Text'], data_text1)))
print("Raw Text")
print(data_text2)
# Independent copies of the block list, taken before data["Blocks"] is replaced below.
blocks = copy.deepcopy(data['Blocks'])
data1 = copy.deepcopy(data['Blocks'])
# Drop blocks whose whole text is one or two letters, or one or two of the
# punctuation characters ; ' , - / .
data2 = [x for x in data1 if not re.match(r'^[a-z]{1,2}$', x["Text"], re.IGNORECASE) and not re.match(r'^[;\',\-\/\.]{1,2}$', x["Text"], re.IGNORECASE)]
print("Text after Cleaning")
print(get_text_line_blocks(data2))
# Replaces data["Blocks"] in place with the cleaned list.
data["Blocks"] = data2
response=data
# columns: one {'left', 'right'} dict per detected column.
# lines: [column_index, block] pairs.
columns = []
lines = []



# NOTE(review): first_block is not read again in this cell.
first_block=blocks[0]

# first_line=get_lines_with_words(first_block)








# Assign every LINE block to a column. A block joins the first existing column
# it overlaps horizontally; otherwise it opens a new column spanning its own
# left and right edges.
for item in response["Blocks"]:
    if item["BlockType"] == "LINE":
        bbox_left = item["Geometry"]["BoundingBox"]["Left"]
        bbox_right = bbox_left + item["Geometry"]["BoundingBox"]["Width"]
        # Horizontal midpoint of the block. The previous formula,
        # (Left + Width) / 2, is half of the right edge rather than the
        # midpoint and could place a block in a column far to its left.
        bbox_centre = (bbox_left + bbox_right) / 2
        column_found = False
        for index, column in enumerate(columns):
            column_centre = (column['left'] + column['right']) / 2
            if (
                column['left'] < bbox_centre < column['right']
                or bbox_left < column_centre < bbox_right
                # Small tolerance on the column's right edge.
                or column['left'] < bbox_left < column['right'] + 0.001
            ):
                # Bbox appears inside the column
                lines.append([index, item])
                column_found = True
                break
        if not column_found:
            columns.append({'left': bbox_left, 'right': bbox_right})
            lines.append([len(columns) - 1, item])

# Order the (column index, block) pairs by column; the sort is stable, so the
# reading order inside each column is kept.
lines.sort(key=lambda x: x[0])
for line in lines:
    print ((line[0],line[1]["Text"]))
for column in columns:
    print(column)
    
    
# Group the blocks by the column index they were assigned to and derive
# `maxcolumn`, which is later used as the width of the result matrix.
dictmap = {}
maxcolumn = 0

column_headers = []
for line in lines:
    # NOTE(review): line[1] is the block itself, so len() here is the number
    # of entries in the block, not a column count. Kept as-is so the matrix
    # width does not change - confirm what was intended.
    if len(line[1]) > maxcolumn:
        maxcolumn = len(line[1])
    # setdefault replaces the former bare `except`, which would also have
    # hidden unrelated errors.
    dictmap.setdefault(line[0], []).append(line[1])

for keys in dictmap:
    if len(dictmap[keys]) > maxcolumn:
        maxcolumn = len(dictmap[keys])


# Split the blocks into rows: walk them in order and start a new row whenever
# a block's Left is not greater than the previous block's Left.
table=[]
all_blocks_region=copy.deepcopy(data['Blocks'])
# NOTE(review): len() of a block is its number of entries (presumably dict
# keys); `maxrow` is not read anywhere else in this cell.
maxrow=len(all_blocks_region[0])
row=[]
row.append(all_blocks_region[0])
for idx in range(1,len(all_blocks_region)):
    curblock=all_blocks_region[idx]
    prev_block=all_blocks_region[idx-1]
    
    if len(curblock)>maxrow:
        maxrow=len(curblock)
        

    xprev=all_blocks_region[idx-1]['Geometry']['BoundingBox']['Left']
    xcur=all_blocks_region[idx]['Geometry']['BoundingBox']['Left']
    if xcur>xprev:
        # Still moving to the right: same row.
        row.append(all_blocks_region[idx])
    else:

        # Moved back to the left (or stayed put): close the row and start a new one.
        table.append(row)
        row=[]
        row.append(all_blocks_region[idx])
# The last row is still open when the loop ends.
table.append(row)
    #print(row)

#print(table)
# The first row is treated as the header row: one column is built per header block.
first_line=table[0]

all_columns=[]

for header in first_line:
    # Each column is the header followed by the blocks after it in `blocks`
    # whose Left is within 0.1 of the header's Left.
    column=[header]
    all_column=get_blocks_under_with_same_start_pos(blocks, header, delta=0.1)
    
    
    for items in all_column:
        column.append(items)
    all_columns.append(column)

# table_name='Table1'
# with open(table_name+'.csv', mode='w') as employee_file:
#     employee_writer = csv.writer(employee_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#     for rows in table:
#         employee_writer.writerow(row)

# print("All Tables below")
# print(table)

                
            
            
            
# Result grid: one row per entry of `table`. The width is at least the number
# of header-derived columns, so indexing by column number below cannot run
# past the end of a row when `maxcolumn` happens to be smaller.
matrix = [[None] * max(maxcolumn, len(all_columns)) for _ in range(len(table))]

# Place each column's texts: the row is every `table` row that holds a block
# with the same text at (almost) the same Left position.
for i in range(0, len(all_columns)):
    column = all_columns[i]
    column_number = i
    for item in column:
        row_number = None
        flag = True
        print(item["Text"])
        bbox_left = item["Geometry"]["BoundingBox"]["Left"]
        bbox_right = bbox_left + item["Geometry"]["BoundingBox"]["Width"]
        # Horizontal midpoint of the block (previously half of the right edge).
        bbox_centre = (bbox_left + bbox_right) / 2
        for idx in range(0, len(table)):
            cur_row = table[idx]
            for block in cur_row:
                if block["Text"] == item["Text"]:
                    block_left = block["Geometry"]["BoundingBox"]["Left"]
                    diff = abs(block_left - bbox_left)
                    if diff < 0.01:
                        matrix[idx][column_number] = item["Text"]
print("Matrix before checking")
print(matrix)

for row in matrix:
    print(row)
                      
# Checking if it is a valid table
check_matrix = copy.deepcopy(matrix)
column_names, header_length = [], 0
index = 0
valid_table1 = False
# The header is looked for among the first five rows: the row with the most
# purely alphabetic cells (letters, spaces and dots only) wins.
possible_column_names = check_matrix[:5]
max_col, i = 0, 0
for i, colm in enumerate(possible_column_names):
    cols = [x for x in colm if (x is not None and not re.match(r'[;\',\-\/]', x, re.IGNORECASE)) and (re.match(r'^[a-z .]+$', x, re.IGNORECASE))]
    if max_col < len(cols):
        max_col = len(cols)
        # Only a row that actually has header-like cells validates the table;
        # `index` now records which row that was (it used to stay at 0).
        valid_table1 = True
        index = i
        column_names = cols
        header_length = len(cols)

if valid_table1:
    print(f"Header: {column_names}")
    print(f"Header length: {header_length}")
    # Count rows, starting at the header row, until one has more non-noise
    # cells than the header.
    valid_row_count = 0
    for row in matrix[index:]:
        row_name = [x for x in row if x is not None and not re.match(r'[.;\',\-\/]', x, re.IGNORECASE)]
        print("This")
        print(row_name)
        column_val_count = len(row_name)
        if column_val_count > header_length:
            break
        else:
            valid_row_count += 1

    valid_table2 = False
    if valid_row_count >= 3 and header_length > 1:
        valid_table2 = True
        print(f"Valid rows: {valid_row_count}")

    if valid_table2:
        print("Valid Matrix below")
        for row in matrix:
            print(row)
        # Discard any rows above the detected header, then split the header
        # row off; `matrix` is left holding only the data rows.
        del matrix[:index]
        headers = matrix.pop(0)
        df = pd.DataFrame(matrix, columns=headers)
        df.to_csv('tablefound2.csv')
        print("Final")
        print(df)

    else:
        print("Not a valid table")
else:
    print("Not a valid table")
                        

                        

Raw Text
Description Date Ticket Quantity Amount Ticket Total 66.29 Overage service yards - recycle materials 1595792636 02/24/20 838059 1.00 66.29 Ticket Total 66.29 96 Gallon cart service - organics 03/01/20 10.00 0.00 Monthly recycle per unit 03/01/20 188.00 1,432.56 6 - 3 Yard dumpster recycle 2 times per week 03/01/20 6.00 2,201.52 6 - 3 Yard dumpster 2 times per week 03/01/20 6.00 4,403.04 Total Current Charges 8,501.15
Text after Cleaning
['Description', 'Date', 'Ticket', 'Quantity', 'Amount', 'Ticket Total', '66.29', 'Overage service yards - recycle materials 1595792636', '02/24/20', '838059', '1.00', '66.29', 'Ticket Total', '66.29', '96 Gallon cart service - organics', '03/01/20', '10.00', '0.00', 'Monthly recycle per unit', '03/01/20', '188.00', '1,432.56', '6 - 3 Yard dumpster recycle 2 times per week', '03/01/20', '6.00', '2,201.52', '6 - 3 Yard dumpster 2 times per week', '03/01/20', '6.00', '4,403.04', 'Total Current Charges', '8,501.15']
(0, 'Description')
(0, 'Ticket T