In [1]:
import io
import os
import re

import cv2
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image
from streamlit_drawable_canvas import st_canvas


## Extract and Track Figure Pages

In [2]:
def _text_mentions_figure(text, figure_number):
    """Return True when ``text`` contains a label for exactly ``figure_number``.

    A plain substring test treats 'Figure 1' as present in 'Figure 10'; the
    negative lookahead rejects a label that is immediately followed by
    another digit, so each figure number only matches its own label.
    """
    # Kept as a tuple so further label spellings can be added later.
    label_templates = ('Figure {}',)
    return any(
        re.search(re.escape(template.format(figure_number)) + r'(?!\d)', text)
        for template in label_templates
    )


def _save_figure_hits(image, text, figure_numbers, rotation, page_num, pdf_path, output_dir):
    """Save ``image`` once per figure number mentioned in ``text`` and return the records."""
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    records = []
    for figure_number in figure_numbers:
        if not _text_mentions_figure(text, figure_number):
            continue
        # The file name carries the 0-based page index, while the record
        # stores the 1-based page number.
        figure_name = f"F{figure_number}_P{page_num}_{pdf_stem}.png"
        image.save(os.path.join(output_dir, figure_name))
        print(f"Saved {figure_name} from {pdf_path}")
        records.append({
            "figure_name": figure_name,
            "figure_number": figure_number,
            "page_number": page_num + 1,
            "image_rotation": rotation
        })
    return records


def extract_figure_pages_with_ocr(pdf_path, extracted_figures_path, year, max_figure_number=19):
    """Render every page of a PDF, OCR it, and save pages that mention a figure label.

    Each page is rendered at 2x zoom and OCR'd upright. If nothing is found and
    the OCR text is short, the page is retried rotated by -90 degrees and then
    by +90 degrees. A page is saved once for every figure number whose label
    appears in the OCR text of the first orientation that produces a match.

    Args:
        pdf_path: Path of the PDF to scan.
        extracted_figures_path: Root output folder; images are written to
            ``<extracted_figures_path>/<year>/``, which must already exist.
        year: Name of the sub-folder (converted with ``str``) to write into.
        max_figure_number: Highest figure number to look for (inclusive).

    Returns:
        A list of dicts with keys ``figure_name``, ``figure_number``,
        ``page_number`` (1-based) and ``image_rotation`` (0, -90 or 90).
    """
    # Use OCR to extract text
    ocr_config = r'--oem 3 --psm 6'
    # Rotated retries only happen when the OCR text is shorter than this;
    # presumably a page with more text than that is ordinary body text
    # rather than a sideways figure -- TODO confirm the threshold.
    rotation_text_limit = 2000
    output_dir = os.path.join(extracted_figures_path, str(year))
    # zoom factor 2 in each dimension to increase the rendered resolution
    render_matrix = fitz.Matrix(2, 2)

    saved_figures = []
    figure_1_detected = False
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            # Render the page into an image and open it with PIL
            pix = page.get_pixmap(matrix=render_matrix)
            upright_image = Image.open(io.BytesIO(pix.tobytes("ppm")))

            # Once Figure 1 has been found, later pages no longer look for it.
            # The upper bound stays the same so the last figure number is
            # still searched for after that point.
            first_number = 2 if figure_1_detected else 1
            figure_numbers = range(first_number, max_figure_number + 1)

            for rotation in (0, -90, 90):
                if rotation == 0:
                    image = upright_image
                else:
                    # expand=True resizes the canvas to fit the new orientation
                    image = upright_image.rotate(rotation, expand=True)
                text = pytesseract.image_to_string(image, config=ocr_config)

                hits = _save_figure_hits(
                    image, text, figure_numbers, rotation, page_num, pdf_path, output_dir
                )
                saved_figures.extend(hits)
                if any(hit["figure_number"] == 1 for hit in hits):
                    figure_1_detected = True

                # Stop at the first orientation that matched, or when this
                # orientation already produced a long text.
                if hits or len(text) >= rotation_text_limit:
                    break
    finally:
        # Release the file handle even if rendering or OCR fails.
        doc.close()

    return saved_figures


In [3]:

# Define the directory to process
extracted_figures_dir = '../extracted_figure_pages'

# Lists of row dicts; each entry becomes one DataFrame row in a later cell
figure_data = []
papers_with_no_identified_figures = []

# Paper paths listed here are skipped by the loop below. Defined once,
# outside the loops, because it does not depend on the year.
test_files = [
    # "../papers_by_year/2010/Azoulay et al._2010_ASQ_Nasty Brutish and Short - Embeddedness Failure in the Pharma Industry.pdf",
    # "../papers_by_year/2010/Bresman_2010_OrgSci_Explaining Employee Engagement with Strategic Change.pdf"
]

for year in range(2000, 2023):
    paper_year_dir = f'../papers_by_year/{year}'
    figure_year_dir = f'{extracted_figures_dir}/{year}'

    # A missing year folder would make os.listdir raise and abort the whole
    # run, so skip it and keep processing the remaining years.
    if not os.path.isdir(paper_year_dir):
        print(f'skipping {paper_year_dir}: directory not found')
        continue

    ## make year directory in extracted figures folder if needed
    os.makedirs(figure_year_dir, exist_ok=True)

    # Loop through all files in the directory
    for paper_file in os.listdir(paper_year_dir):
        paper_path = os.path.join(paper_year_dir, paper_file)
        if not os.path.isfile(paper_path) or paper_path in test_files:
            continue

        print(f'processing {paper_path}')
        figures = extract_figure_pages_with_ocr(paper_path, extracted_figures_dir, year)

        # One tracking row per saved figure page
        figure_data.extend(
            {
                'original paper': paper_file,
                'figure name': figure_info["figure_name"],
                'figure number': figure_info["figure_number"],
                'year': year,
                'page number': figure_info["page_number"],
                'image rotation': figure_info["image_rotation"]
            }
            for figure_info in figures
        )

        # Remember papers in which no figure page was found
        if not figures:
            papers_with_no_identified_figures.append({
                'original paper': paper_file,
                'year': year
            })




processing ../papers_by_year/2000/Human et al_2000_ASQ_Legitimacy Building in the Evolution of Small Firm Multilateral Networks.pdf
processing ../papers_by_year/2000/Glynn_2000_OrgSci_When Cymbals become Symbols.pdf
Saved F1_P9_Glynn_2000_OrgSci_When Cymbals become Symbols.png from ../papers_by_year/2000/Glynn_2000_OrgSci_When Cymbals become Symbols.pdf
processing ../papers_by_year/2000/James_2000_OrgSci_Race-Related Differences_Quant.pdf
Saved F1_P6_James_2000_OrgSci_Race-Related Differences_Quant.png from ../papers_by_year/2000/James_2000_OrgSci_Race-Related Differences_Quant.pdf
Saved F2_P10_James_2000_OrgSci_Race-Related Differences_Quant.png from ../papers_by_year/2000/James_2000_OrgSci_Race-Related Differences_Quant.pdf
processing ../papers_by_year/2000/Bansal & Roth_2000_AMJ_Why Companies Go Green.pdf
Saved F1_P1_Bansal & Roth_2000_AMJ_Why Companies Go Green.png from ../papers_by_year/2000/Bansal & Roth_2000_AMJ_Why Companies Go Green.pdf
Saved F2_P11_Bansal & Roth_2000_AMJ_Why 

In [None]:
# Create DataFrames
# Build one DataFrame per result list, fixing the column order explicitly
figure_columns = [
    'original paper',
    'figure name',
    'figure number',
    'year',
    'page number',
    'image rotation',
]
no_figure_columns = ['original paper', 'year']

figures_df = pd.DataFrame(data=figure_data, columns=figure_columns)
no_figures_df = pd.DataFrame(data=papers_with_no_identified_figures, columns=no_figure_columns)

# Preview the first rows of the figure table
figures_df.head()

Unnamed: 0,original paper,figure name,figure number,year,page number,image rotation
0,"Jacobs, Kreutzer & Vaara_2021_AMJ_Political Dy...","F1_Jacobs, Kreutzer & Vaara_2021_AMJ_Political...",1,2021,7,0
1,"Jacobs, Kreutzer & Vaara_2021_AMJ_Political Dy...","F2_Jacobs, Kreutzer & Vaara_2021_AMJ_Political...",2,2021,10,0
2,"Jacobs, Kreutzer & Vaara_2021_AMJ_Political Dy...","F3_Jacobs, Kreutzer & Vaara_2021_AMJ_Political...",3,2021,12,0
3,"Jacobs, Kreutzer & Vaara_2021_AMJ_Political Dy...","F3_Jacobs, Kreutzer & Vaara_2021_AMJ_Political...",3,2021,27,0
4,"Jacobs, Kreutzer & Vaara_2021_AMJ_Political Dy...","F3_Jacobs, Kreutzer & Vaara_2021_AMJ_Political...",3,2021,28,0


In [None]:
# Define the path to the Excel file
# Path of the Excel file that accumulates figure rows across runs
figure_tracking_file_path = "../extracted_figure_pages.xlsx"

if os.path.exists(figure_tracking_file_path):
    # Merge the rows already on disk with the rows from this run
    existing_df = pd.read_excel(figure_tracking_file_path)
    combined_df = pd.concat([existing_df, figures_df], ignore_index=True)
    # Remove duplicates. Adjust the subset according to what uniquely identifies a row
    combined_df = combined_df.drop_duplicates(
        subset=['original paper', 'figure name', 'figure number', 'year', 'page number', 'image rotation']
    )
else:
    combined_df = figures_df

# Sort by 'year'. sort_values returns a new frame, so when combined_df is just
# an alias of figures_df (no existing file) figures_df is not reordered.
combined_df = combined_df.sort_values(by='year')

# Save to Excel
combined_df.to_excel(figure_tracking_file_path, index=False)

In [None]:
# Show the (rows, columns) shape of the combined figure-tracking table.
combined_df.shape

(1928, 6)

In [None]:
# Display the papers for which no figure page was identified in this run.
no_figures_df

Unnamed: 0,original paper,year
0,Dawson & Bencherki_2021_HR_Federal Employees o...,2021
1,Burchiellaro_2021_OrgStudies_Queering Control ...,2021
2,"Brooks, Grgulis & Cook_2021_HR_Unlearning and ...",2021
3,Morlacchi_2021_OrgStudies_The Performative Pow...,2021
4,"Garcia-Lorenzo, Sell-Trujillo & Donnelly_2021_...",2021
5,Kozhevnikov_2021_HR_Career Capital in Global V...,2021
6,"Ji, Huang & Li_2021_AMJ_Guilt and Corporate Ph...",2021
7,Zhu & Delbridge_2021_HR_The Management of Seco...,2021
8,"Weller, Brown & Clarke_2021_HR_Questing for me...",2021
9,Jiang_2021_J Management Studies _The Sharing E...,2021


In [None]:
# Define the path to the Excel file for no-figures papers
# Path of the Excel file that accumulates papers without detected figures
no_figures_file_path = "../no_extracted_figure_papers.xlsx"

if os.path.exists(no_figures_file_path):
    # Merge the rows already on disk with the rows from this run
    existing_no_figures_df = pd.read_excel(no_figures_file_path)
    combined_no_figures_df = pd.concat([existing_no_figures_df, no_figures_df], ignore_index=True)
    # Remove duplicates. Adjust the subset according to what uniquely identifies a row
    combined_no_figures_df = combined_no_figures_df.drop_duplicates(subset=['original paper', 'year'])
else:
    combined_no_figures_df = no_figures_df

# Sort by 'year'. sort_values returns a new frame, so when combined_no_figures_df
# is just an alias of no_figures_df (no existing file) no_figures_df is not reordered.
combined_no_figures_df = combined_no_figures_df.sort_values(by='year')

# Save to Excel
combined_no_figures_df.to_excel(no_figures_file_path, index=False)

print("No-figure paper data saved successfully to Excel.")

No-figure paper data saved successfully to Excel.
