@@ -316,6 +316,7 @@ def to_markdown(
316316 filename = None ,
317317 force_text = True ,
318318 page_chunks = False ,
319+ page_separators = False ,
319320 margins = 0 ,
320321 dpi = 150 ,
321322 page_width = 612 ,
@@ -341,6 +342,7 @@ def to_markdown(
341342 image_format: (str) use this image format. Choose a supported one.
342343 force_text: (bool) output text despite of image background.
343344 page_chunks: (bool) whether to segment output by page.
345+ page_separators: (bool) whether to include page separators in output.
344346 margins: omit content overlapping margin areas.
345347 dpi: (int) desired resolution for generated images.
346348 page_width: (float) assumption if page layout is variable.
@@ -381,7 +383,7 @@ def to_markdown(
381383 IGNORE_IMAGES = ignore_images
382384 IGNORE_GRAPHICS = ignore_graphics
383385 DETECT_BG_COLOR = detect_bg_color
384- if doc .is_form_pdf or doc .has_annots ():
386+ if doc .is_form_pdf or ( doc .is_pdf and doc . has_annots () ):
385387 doc .bake ()
386388
387389 # for reflowable documents allow making 1 page for the whole document
@@ -560,6 +562,7 @@ def write_text(
560562 )
561563 parms .line_rects .extend (cells )
562564 parms .written_tables .append (i )
565+ prev_hdr_string = None
563566
564567 # ------------------------------------------------------------
565568 # Pick up images / graphics ABOVE this text block
@@ -592,6 +595,7 @@ def write_text(
592595 if not is_white (img_txt ):
593596 out_string += img_txt
594597 parms .written_images .append (i )
598+ prev_hdr_string = None
595599
596600 parms .line_rects .append (lrect )
597601 # if line rect is far away from the previous one, add a line break
@@ -751,7 +755,7 @@ def output_tables(parms, text_rect):
751755 ):
752756 if i in parms .written_tables :
753757 continue
754- this_md += parms .tabs [i ].to_markdown (clean = False )
758+ this_md += parms .tabs [i ].to_markdown (clean = False ) + " \n "
755759 if EXTRACT_WORDS :
756760 # for "words" extraction, add table cells as line rects
757761 cells = sorted (
@@ -772,7 +776,7 @@ def output_tables(parms, text_rect):
772776 for i , trect in parms .tab_rects .items ():
773777 if i in parms .written_tables :
774778 continue
775- this_md += parms .tabs [i ].to_markdown (clean = False )
779+ this_md += parms .tabs [i ].to_markdown (clean = False ) + " \n "
776780 if EXTRACT_WORDS :
777781 # for "words" extraction, add table cells as line rects
778782 cells = sorted (
@@ -954,7 +958,7 @@ def get_page_output(
954958 ) # accept invisible text
955959
956960 # determine background color
957- parms .bg_color = get_bg_color ( page ) if DETECT_BG_COLOR else None
961+ parms .bg_color = None if not DETECT_BG_COLOR else get_bg_color ( page )
958962
959963 left , top , right , bottom = margins
960964 parms .clip = page .rect + (left , top , - right , - bottom )
@@ -994,12 +998,12 @@ def get_page_output(
994998 if img_info :
995999 img_max_size = abs (parms .clip ) * 0.9
9961000 sane = [i for i in img_info if abs (i ["bbox" ] & parms .clip ) < img_max_size ]
997- if len (sane ) < len (img_info ): # found some
998- img_info = sane # use those images instead
999- # output full page image
1000- name = save_image (parms , parms .clip , "full" )
1001- if name :
1002- parms .md_string += GRAPHICS_TEXT % name
1001+ if len (sane ) < len (img_info ): # found some
1002+ img_info = sane # use those images instead
1003+ # output full page image
1004+ name = save_image (parms , parms .clip , "full" )
1005+ if name :
1006+ parms .md_string += GRAPHICS_TEXT % name
10031007
10041008 img_info = img_info [:30 ] # only accept the largest up to 30 images
10051009 # run from back to front (= small to large)
@@ -1024,31 +1028,31 @@ def get_page_output(
10241028 # Locate all tables on page
10251029 parms .written_tables = [] # stores already written tables
10261030 omitted_table_rects = []
1031+ parms .tabs = []
10271032 if IGNORE_GRAPHICS or not table_strategy :
10281033 # do not try to extract tables
1029- parms . tabs = None
1034+ pass
10301035 else :
1031- parms .tabs = page .find_tables (clip = parms .clip , strategy = table_strategy )
1032- # remove tables with too few rows or columns
1033- for i in range (len (parms .tabs .tables ) - 1 , - 1 , - 1 ):
1034- t = parms .tabs .tables [i ]
1036+ tabs = page .find_tables (clip = parms .clip , strategy = table_strategy )
1037+ for t in tabs .tables :
1038+ # remove tables with too few rows or columns
10351039 if t .row_count < 2 or t .col_count < 2 :
10361040 omitted_table_rects .append (pymupdf .Rect (t .bbox ))
1037- del parms .tabs .tables [i ]
1038- parms .tabs .tables .sort (key = lambda t : (t .bbox [0 ], t .bbox [1 ]))
1041+ continue
1042+ parms .tabs .append (t )
1043+ parms .tabs .sort (key = lambda t : (t .bbox [0 ], t .bbox [1 ]))
10391044
10401045 # Make a list of table boundary boxes.
10411046 # Must include the header bbox (which may exist outside tab.bbox)
10421047 tab_rects = {}
1043- if parms .tabs is not None :
1044- for i , t in enumerate (parms .tabs .tables ):
1045- tab_rects [i ] = pymupdf .Rect (t .bbox ) | pymupdf .Rect (t .header .bbox )
1046- tab_dict = {
1047- "bbox" : tuple (tab_rects [i ]),
1048- "rows" : t .row_count ,
1049- "columns" : t .col_count ,
1050- }
1051- parms .tables .append (tab_dict )
1048+ for i , t in enumerate (parms .tabs ):
1049+ tab_rects [i ] = pymupdf .Rect (t .bbox ) | pymupdf .Rect (t .header .bbox )
1050+ tab_dict = {
1051+ "bbox" : tuple (tab_rects [i ]),
1052+ "rows" : t .row_count ,
1053+ "columns" : t .col_count ,
1054+ }
1055+ parms .tables .append (tab_dict )
10521056 parms .tab_rects = tab_rects
10531057 # list of table rectangles
10541058 parms .tab_rects0 = list (tab_rects .values ())
@@ -1064,15 +1068,12 @@ def get_page_output(
10641068 and p ["rect" ].width < parms .clip .width
10651069 and p ["rect" ].height < parms .clip .height
10661070 and (p ["rect" ].width > 3 or p ["rect" ].height > 3 )
1067- and not (p ["fill" ] == parms .bg_color and p ["fill" ] != None )
1068- and not intersects_rects (
1069- p ["rect" ], parms .tab_rects0 + omitted_table_rects
1070- )
1071+ and not (p ["type" ] == "f" and p ["fill" ] == parms .bg_color )
1072+ and not intersects_rects (p ["rect" ], parms .tab_rects0 )
10711073 and not intersects_rects (p ["rect" ], parms .annot_rects )
10721074 ]
10731075 else :
10741076 paths = []
1075-
10761077 # catch too-many-graphics situation
10771078 if GRAPHICS_LIMIT and len (paths ) > GRAPHICS_LIMIT :
10781079 paths = []
@@ -1168,6 +1169,9 @@ def get_page_output(
11681169 else :
11691170 words = []
11701171 parms .words = words
1172+ if page_separators :
1173+ # add page separators to output
1174+ parms .md_string += f"\n \n --- end of page={ parms .page .number } ---\n \n "
11711175 return parms
11721176
11731177 if page_chunks is False :
0 commit comments