In [1]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import pandas as pd
import os
import cv2
import numpy as np

## Extract and Track Figure Pages

In [2]:
def isolate_figure(img, output_path=None, page_num=0, save_full_page=False):
    # Convert PIL image to OpenCV format
    open_cv_image = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    
    # Convert image to grayscale
    gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
    
    # Apply Gaussian Blur to reduce noise
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    
    # Apply adaptive thresholding
    thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)
    
    # Find contours
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    # Filter contours based on area size to avoid small noise
    filtered_contours = [cnt for cnt in contours if cv2.contourArea(cnt) > 5000]  # Adjust the area threshold as needed

    if filtered_contours:
        # Get the bounding box for the largest contour
        largest_contour = max(filtered_contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(largest_contour)
        figure_img = open_cv_image[y:y+h, x:x+w]

        # Draw bounding box on the image for visualization
        cv2.rectangle(open_cv_image, (x, y), (x+w, y+h), (0, 255, 0), 2)

        # Save the image with bounding boxes if output_path is provided
        if save_full_page and output_path:
            print("SAVING CONTOURS")
            cv2.imwrite(output_path, open_cv_image)

        return Image.fromarray(cv2.cvtColor(figure_img, cv2.COLOR_BGR2RGB))
    
    # Return None if no significant figure is found
    return None



In [3]:

# This is a placeholder function. Replace it with actual logic to process figures.
def extract_figures_with_ocr(pdf_path, extracted_figures_path, year):
    # Use OCR to extract text
    ocr_config = r'--oem 3 --psm 6'
    doc = fitz.open(pdf_path)
    saved_figures = []
    for page_num, page in enumerate(doc):

        if(page_num == 14):
            # Increase DPI of the image
            zoom_x = 2.0  # horizontal zoom
            zoom_y = 2.0  # vertical zoom
            mat = fitz.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension

            # Render the page into an image using the zoom matrix
            pix = page.get_pixmap(matrix=mat)

            # Open the image with PIL
            img_0 = Image.open(io.BytesIO(pix.tobytes("ppm")))
            # Rotate the image 90 degrees clockwise
            img_m_90 = img_0.rotate(-90, expand=True)  # Use expand=True to resize the image to fit the new orientation
            img_p_90 = img_0.rotate(90, expand=True)  # Use expand=True to resize the image to fit the new orientation

            # Get text from images
            text_0 = pytesseract.image_to_string(img_0, config=ocr_config)
            text_m_90 = pytesseract.image_to_string(img_m_90, config=ocr_config)
            text_p_90 = pytesseract.image_to_string(img_p_90, config=ocr_config)
            
            for i in range(1, 6):
                figure_keywords = [f'Figure {i}']
                saved = False
                figure_name = f"F{i}_{os.path.splitext(os.path.basename(pdf_path))[0]}.png"
                rotation = 0
                if(any(substring in text_0 for substring in figure_keywords)):
                    isolated_figure = isolate_figure(img_0, output_path=extracted_figures_path +"/" + str(year) + "/BB" + figure_name, page_num=page_num, save_full_page=True)
                    if(isolated_figure):
                        isolated_figure.save(extracted_figures_path +"/" + str(year) + "/" + figure_name)
                        img_0.save(extracted_figures_path +"/" + str(year) + "/P" + figure_name)
                        saved = True
                elif(any(substring in text_m_90 for substring in figure_keywords)):
                    img_m_90.save(extracted_figures_path +"/" + str(year) + "/" + figure_name)
                    saved = True
                    rotation = -90
                elif (any(substring in text_p_90 for substring in figure_keywords)):
                    img_p_90.save(extracted_figures_path +"/" + str(year) + "/" + figure_name)
                    saved = True
                    rotation = 90 
                if(saved):
                    print(f"Saved {figure_name} from {pdf_path}")
                    saved_figures.append({
                        "figure_name" : figure_name, 
                        "figure_number": i,
                        "page_number": page_num + 1,
                        "image_rotation": rotation
                    })
                    
    return saved_figures



In [4]:
# Define the directory to process
extracted_figures_dir = '../extracted_figures'

# List to hold data for the DataFrame
figure_data = []
papers_with_no_identified_figures = []

for year in range(2010,2011): 
    paper_year_dir = f'../papers_by_year/{year}'
    figure_year_dir = f'{extracted_figures_dir}/{year}'
    # Loop through all files in the directory
    test_files =  [
            "../papers_by_year/2010/Azoulay et al._2010_ASQ_Nasty Brutish and Short - Embeddedness Failure in the Pharma Industry.pdf", 
            # "../papers_by_year/2010/Bresman_2010_OrgSci_Explaining Employee Engagement with Strategic Change.pdf"
            ]
    
    for paper_file in os.listdir(paper_year_dir):
        paper_path = os.path.join(paper_year_dir, paper_file)

        ## make year directory in extracted figures folder if needed
        os.makedirs(figure_year_dir, exist_ok=True)

        if os.path.isfile(paper_path) and paper_path in test_files:
            print(f'processing {paper_path}')
            # Extract figures from the paper
            figures =  extract_figures_with_ocr(paper_path, extracted_figures_dir, year)
            # Process each figure
            for figure_info in figures:
                figure_data.append({
                    'original paper': paper_file,
                    'figure name': figure_info["figure_name"],
                    'figure number': figure_info["figure_number"], 
                    'year': year, 
                    'page number': figure_info["page_number"],
                    'image rotation': figure_info["image_rotation"]
                })

            if(len(figures) == 0):
                papers_with_no_identified_figures.append({
                    'original paper': paper_file,
                    'year': year
                })

# Create DataFrames
figures_df = pd.DataFrame(figure_data, columns=['original paper', 'figure name', 'figure number', 'year', 'page number', 'image rotation'])
no_figures_df = pd.DataFrame(figure_data, columns=['original paper', 'year'])


processing ../papers_by_year/2010/Azoulay et al._2010_ASQ_Nasty Brutish and Short - Embeddedness Failure in the Pharma Industry.pdf


In [5]:
figures_df

Unnamed: 0,original paper,figure name,figure number,year,page number,image rotation


In [6]:
no_figures_df

Unnamed: 0,original paper,year
