In [1]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import pandas as pd
import os
import cv2
import numpy as np

## Extract and Track Figure Pages

In [3]:
def extract_figure_pages_with_ocr(pdf_path, extracted_figures_path, year):
    # Use OCR to extract text
    ocr_config = r'--oem 3 --psm 6'
    doc = fitz.open(pdf_path)
    saved_figures = []
    for page_num, page in enumerate(doc):
        # Increase DPI of the image
        zoom_x = 2 # horizontal zoom
        zoom_y = 2  # vertical zoom
        mat = fitz.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension

        # Render the page into an image using the zoom matrix
        pix = page.get_pixmap(matrix=mat)

        # Open the image with PIL
        img_0 = Image.open(io.BytesIO(pix.tobytes("ppm")))
        # Rotate the image 90 degrees clockwise

        # Get text from images
        text_0 = pytesseract.image_to_string(img_0, config=ocr_config)

        saved = False
        rotation = 0

        for i in range(1, 10):
            figure_keywords = [f'Figure {i}'] ## keep as array in case needs to expand
            figure_name = f"F{i}_{os.path.splitext(os.path.basename(pdf_path))[0]}.png"
            if(any(substring in text_0 for substring in figure_keywords)):
                img_0.save(extracted_figures_path +"/" + str(year) + "/" + figure_name)
                saved = True
                print(f"Saved {figure_name} from {pdf_path}")
                saved_figures.append({
                    "figure_name" : figure_name, 
                    "figure_number": i,
                    "page_number": page_num + 1,
                    "image_rotation": rotation
                })

        if not saved and (len(text_0) < 2000): 
            img_m_90 = img_0.rotate(-90, expand=True)  # Use expand=True to resize the image to fit the new orientation
            text_m_90 = pytesseract.image_to_string(img_m_90, config=ocr_config)
            for i in range(1, 10):
                figure_keywords = [f'Figure {i}'] ## keep as array in case needs to expand
                figure_name = f"F{i}_{os.path.splitext(os.path.basename(pdf_path))[0]}.png"
                rotation = 0
                if(any(substring in text_m_90 for substring in figure_keywords)):
                    img_m_90.save(extracted_figures_path +"/" + str(year) + "/" + figure_name)
                    saved = True
                    rotation = -90
                    print(f"Saved {figure_name} from {pdf_path}")
                    saved_figures.append({
                        "figure_name" : figure_name, 
                        "figure_number": i,
                        "page_number": page_num + 1,
                        "image_rotation": rotation
                    })

            if (len(text_m_90) < 2000): 
                img_p_90 = img_0.rotate(90, expand=True)  # Use expand=True to resize the image to fit the new orientation
                text_p_90 = pytesseract.image_to_string(img_p_90, config=ocr_config)
                for i in range(1, 10):
                    figure_keywords = [f'Figure {i}'] ## keep as array in case needs to expand
                    figure_name = f"F{i}_{os.path.splitext(os.path.basename(pdf_path))[0]}.png"
                    rotation = 0
                    if(any(substring in text_p_90 for substring in figure_keywords)):
                        img_p_90.save(extracted_figures_path +"/" + str(year) + "/" + figure_name)
                        saved = True
                        rotation = 90 
                        print(f"Saved {figure_name} from {pdf_path}")
                        saved_figures.append({
                            "figure_name" : figure_name, 
                            "figure_number": i,
                            "page_number": page_num + 1,
                            "image_rotation": rotation
                        })
        
    return saved_figures


In [4]:
# Define the directory to process
extracted_figures_dir = '../extracted_figure_pages'

# List to hold data for the DataFrame
figure_data = []
papers_with_no_identified_figures = []

count = 0
for year in range(2010,2015): 
    paper_year_dir = f'../papers_by_year/{year}'
    figure_year_dir = f'{extracted_figures_dir}/{year}'
    # Loop through all files in the directory
    test_files =  [
            # "../papers_by_year/2010/Azoulay et al._2010_ASQ_Nasty Brutish and Short - Embeddedness Failure in the Pharma Industry.pdf", 
            # "../papers_by_year/2010/Bresman_2010_OrgSci_Explaining Employee Engagement with Strategic Change.pdf"
            ]
    
    for paper_file in os.listdir(paper_year_dir):
        paper_path = os.path.join(paper_year_dir, paper_file)

        ## make year directory in extracted figures folder if needed
        os.makedirs(figure_year_dir, exist_ok=True)

        if os.path.isfile(paper_path) and paper_path not in test_files:
            print(f'processing {paper_path}')
            count = count + 1
            # Extract figures from the paper
            if(count <= 10):
                figures =  extract_figure_pages_with_ocr(paper_path, extracted_figures_dir, year)
                # Process each figure
                for figure_info in figures:
                    figure_data.append({
                        'original paper': paper_file,
                        'figure name': figure_info["figure_name"],
                        'figure number': figure_info["figure_number"], 
                        'year': year, 
                        'page number': figure_info["page_number"],
                        'image rotation': figure_info["image_rotation"]
                    })

                if(len(figures) == 0):
                    papers_with_no_identified_figures.append({
                        'original paper': paper_file,
                        'year': year
                    })

# Create DataFrames
figures_df = pd.DataFrame(figure_data, columns=['original paper', 'figure name', 'figure number', 'year', 'page number', 'image rotation'])
no_figures_df = pd.DataFrame(papers_with_no_identified_figures, columns=['original paper', 'year'])


processing ../papers_by_year/2010/Nielson, Randall & Christensen_2010_HR_Does training managers enhance the effects of implementing team working.pdf
1306
3127
2733
1654
2983
2373
242
281
2994
1774
1308
2907
2091
2373
3514
2788
3435
3296
2679
2045
2197
1826
261
processing ../papers_by_year/2010/MacLean & Behnam_2010_AMJ_The Dangers of Decoupling.pdf
4004
4505


KeyboardInterrupt: 

In [None]:
figures_df.head()

Unnamed: 0,original paper,figure name,figure number,year,page number,image rotation
0,MacLean & Behnam_2010_AMJ_The Dangers of Decou...,F1_MacLean & Behnam_2010_AMJ_The Dangers of De...,1,2010,8,0
1,Azoulay et al._2010_ASQ_Nasty Brutish and Shor...,F2_Azoulay et al._2010_ASQ_Nasty Brutish and S...,2,2010,19,0
2,Azoulay et al._2010_ASQ_Nasty Brutish and Shor...,F4_Azoulay et al._2010_ASQ_Nasty Brutish and S...,4,2010,23,0
3,Gioia et al_2010_ASQ_Forging an Identity.pdf,F1_Gioia et al_2010_ASQ_Forging an Identity.png,1,2010,12,0
4,Gioia et al_2010_ASQ_Forging an Identity.pdf,F2_Gioia et al_2010_ASQ_Forging an Identity.png,2,2010,30,0
5,Tilcsik_2010_AMJ_From Ritual to Reality.pdf,F1_Tilcsik_2010_AMJ_From Ritual to Reality.png,1,2010,14,0
6,Tilcsik_2010_AMJ_From Ritual to Reality.pdf,F2_Tilcsik_2010_AMJ_From Ritual to Reality.png,2,2010,20,0
7,Clark et al._2010_ASQ_Transitional Identity as...,F1_Clark et al._2010_ASQ_Transitional Identity...,1,2010,12,0
8,Clark et al._2010_ASQ_Transitional Identity as...,F2_Clark et al._2010_ASQ_Transitional Identity...,2,2010,30,0
9,Clark et al._2010_ASQ_Transitional Identity as...,F2_Clark et al._2010_ASQ_Transitional Identity...,2,2010,31,0


In [None]:
# Define the path to the Excel file
figure_tracking_file_path = "../extracted_figure_pages.xlsx"

# Check if the file exists
if os.path.exists(figure_tracking_file_path):
    # Read the existing Excel file
    existing_df = pd.read_excel(figure_tracking_file_path)
    # Concatenate existing and new data
    combined_df = pd.concat([existing_df, figures_df], ignore_index=True)
    # Remove duplicates. Adjust the subset according to what uniquely identifies a row
    combined_df.drop_duplicates(subset=['original paper', 'figure name', 'figure number', 'year', 'page number' 'image rotation'], inplace=True)
else:
    combined_df = figures_df

# Sort by 'year'
combined_df.sort_values(by='year', inplace=True)

# Save to Excel
combined_df.to_excel(figure_tracking_file_path, index=False)

In [None]:
no_figures_df

Unnamed: 0,original paper,year
0,"Nielson, Randall & Christensen_2010_HR_Does tr...",2010
1,"Van Den Brink, Benschop & Jansen_2010_OrgStudi...",2010
2,"Hardy & Maguire_2010_AMJ_Discourse, Field-Conf...",2010
3,Rumens_2010__HR_Workplace friendships between ...,2010


In [None]:
# Define the path to the Excel file for no-figures papers
no_figures_file_path = "../no_extracted_figure_papers.xlsx"

# Check if the file exists
if os.path.exists(no_figures_file_path):
    # Read the existing Excel file
    existing_no_figures_df = pd.read_excel(no_figures_file_path)
    # Concatenate existing and new data
    combined_no_figures_df = pd.concat([existing_no_figures_df, no_figures_df], ignore_index=True)
    # Remove duplicates. Adjust the subset according to what uniquely identifies a row
    combined_no_figures_df.drop_duplicates(subset=['original paper', 'year'], inplace=True)
else:
    combined_no_figures_df = no_figures_df

# Sort by 'year'
combined_no_figures_df.sort_values(by='year', inplace=True)

# Save to Excel
combined_no_figures_df.to_excel(no_figures_file_path, index=False)

print("No-figure paper data saved successfully to Excel.")

No-figure data saved successfully to Excel.
