In [2]:
import fitz

In [6]:
import os


def capture_page_images(pdf_path, output_folder, zoom_factor=2):
    """Render every page of a PDF to its own PNG file.

    Args:
        pdf_path: Path of the PDF document to open with ``fitz.open``.
        output_folder: Directory that receives the PNG files; it is created
            (including parents) when it does not exist yet.
        zoom_factor: Scale factor passed to ``fitz.Matrix`` for both the
            horizontal and the vertical axis when rendering each page.

    Returns:
        A list with the path of every image written, in page order. Files are
        named ``captured_page_<n>.png`` where ``n`` starts at 1.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The scaling matrix does not depend on the page, so build it only once.
    mat = fitz.Matrix(zoom_factor, zoom_factor)

    saved_paths = []
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=mat)

            # Page indices are 0-based; file names are 1-based.
            image_save_path = os.path.join(output_folder, f"captured_page_{page_num+1}.png")
            pix.save(image_save_path)
            saved_paths.append(image_save_path)

            print(f"Captured and saved page image to: {image_save_path}")
    finally:
        # Release the document handle even if rendering or saving fails.
        doc.close()

    return saved_paths

if __name__ == "__main__":
    # Destination directory and source PDF for the page capture; adjust both
    # to match the local machine before running this cell.
    output_folder = "pdf_pages2"
    pdf_path = "D:/Desktop/PROJECT/AI-Powered Visual Report Generator/AI-Powered-Visual-Report-Generator/DASHBOARD.pdf"
    capture_page_images(pdf_path=pdf_path, output_folder=output_folder)



Captured and saved page image to: pdf_pages2\captured_page_1.png
Captured and saved page image to: pdf_pages2\captured_page_2.png
Captured and saved page image to: pdf_pages2\captured_page_3.png


In [7]:
pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl (39.5 MB)
     ---------------------------------------- 0.0/39.5 MB ? eta -:--:--
     ---------------------------------------- 0.4/39.5 MB 8.3 MB/s eta 0:00:05
      --------------------------------------- 0.7/39.5 MB 8.9 MB/s eta 0:00:05
      --------------------------------------- 0.9/39.5 MB 7.0 MB/s eta 0:00:06
     - -------------------------------------- 1.0/39.5 MB 6.6 MB/s eta 0:00:06
     - -------------------------------------- 1.0/39.5 MB 6.6 MB/s eta 0:00:06
     - -------------------------------------- 1.0/39.5 MB 6.6 MB/s eta 0:00:06
     - -------------------------------------- 1.0/39.5 MB 6.6 MB/s eta 0:00:06
     - -------------------------------------- 1.0/39.5 MB 6.6 MB/s eta 0:00:06
     - -------------------------------------- 1.3/39.5 MB 3.4 MB/s eta 0:00:12
     -- ------------------------------------- 2.0/39.5 MB 4.4 MB/s eta 0:00:09
     -- ---------------------------------


[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import cv2
import numpy as np
import os

def extract_graphs(image_folder, min_area=10000):
    """Crop large contour regions out of every image in a folder.

    Each readable image is converted to grayscale, blurred, run through Canny
    edge detection, and its external contours are located. For every contour
    whose bounding rectangle covers more than ``min_area`` pixels, the
    corresponding rectangle is cropped from the original colour image.

    Args:
        image_folder: Directory whose entries are processed. Sub-directories
            and files that OpenCV cannot decode are skipped.
        min_area: Bounding-rectangle area (width * height, in pixels) a
            contour must exceed to be kept. May need tuning per image set.

    Returns:
        A dict mapping a page key to the list of cropped image arrays found
        for it. The key is the text between the last ``_`` of the file name
        and the first ``.`` that follows it (``"1"`` for
        ``captured_page_1.png``). Files that yield no crop have no key.
    """
    graphs = {}
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(image_folder)):
        image_path = os.path.join(image_folder, filename)

        # Sub-directories (and other non-file entries) cannot be decoded.
        if not os.path.isfile(image_path):
            continue

        image = cv2.imread(image_path)
        # cv2.imread reports failure by returning None instead of raising;
        # passing None on to cvtColor would crash the whole run.
        if image is None:
            print(f"Skipping unreadable image file: {image_path}")
            continue

        # The page key depends only on the file name, so derive it once per file.
        page_number = filename.split('_')[-1].split('.')[0]

        # Grayscale -> blur -> Canny: smoothing first reduces spurious edges.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        edged = cv2.Canny(blurred, 10, 100)

        # Only outermost contours are requested (RETR_EXTERNAL).
        contours, _ = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        for contour in contours:
            # Simplify the contour, then take the bounding box of the result.
            peri = cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, 0.02 * peri, True)
            x, y, w, h = cv2.boundingRect(approx)

            # Small boxes are treated as noise and dropped.
            if w * h > min_area:
                graphs.setdefault(page_number, []).append(image[y:y+h, x:x+w])

    return graphs

# Path to the folder containing the images
image_folder = 'D:/Desktop/PROJECT/AI-Powered Visual Report Generator/AI-Powered-Visual-Report-Generator/Notebooks/pdf_pages2'

# Extract graphs
graphs = extract_graphs(image_folder)

# Ensure the existence of the "All_Graphs" directory.
# exist_ok=True avoids the check-then-create race of a separate exists() test.
output_folder = "All_Graphs"
os.makedirs(output_folder, exist_ok=True)

# Save the extracted graphs, one sub-folder per page key
for page_number, page_graphs in graphs.items():
    page_folder = os.path.join(output_folder, f'Page_{page_number}')
    os.makedirs(page_folder, exist_ok=True)
    for i, graph in enumerate(page_graphs):
        output_path = os.path.join(page_folder, f'graph_{i}.png')
        # cv2.imwrite returns False on failure rather than raising, so report
        # it explicitly instead of silently losing the image.
        if not cv2.imwrite(output_path, graph):
            print(f"Failed to write graph image: {output_path}")
