In [1]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import pandas as pd
import os
import cv2
import numpy as np
from streamlit_drawable_canvas import st_canvas


## Extract and Track Figure Pages

In [2]:
def extract_figure_pages_with_ocr(pdf_path, extracted_figures_path, year):
    # Use OCR to extract text
    ocr_config = r'--oem 3 --psm 6'
    doc = fitz.open(pdf_path)
    saved_figures = []
    figure_1_detected = False
    for page_num, page in enumerate(doc):
        if(page_num == 13):
            # Increase DPI of the image
            zoom_x = 2 # horizontal zoom
            zoom_y = 2  # vertical zoom
            mat = fitz.Matrix(zoom_x, zoom_y)  # zoom factor 2 in each dimension

            # Render the page into an image using the zoom matrix
            pix = page.get_pixmap(matrix=mat)

            # Open the image with PIL
            img_0 = Image.open(io.BytesIO(pix.tobytes("ppm")))
            # Rotate the image 90 degrees clockwise

            # Get text from images
            text_0 = pytesseract.image_to_string(img_0, config=ocr_config)

            saved = False
            rotation = 0
            ## to avoid overwriting figure 1 with figure 10, etc. 
            figure_range = range(1, 20)
            if(figure_1_detected):
                figure_range = range(2,19)


            def getFIgureKeywords(i):
                # account for possible OCR errors: 
                # "1" being read as "I" - As you've noticed, the numeral "1" can often be confused with the uppercase letter "I".
                # "0" being read as "O" - The numeral "0" (zero) can be confused with the uppercase letter "O".
                # "5" being read as "S" - Sometimes, the numeral "5" can be misread as the letter "S".
                # "2" being read as "Z" - Less common, but possible if the image quality is poor, or the font makes these two characters look similar.
                # "8" being read as "B" - The numeral "8" can sometimes be confused with the uppercase letter "B".
                # "6" being read as "G" - Depending on the font, a "6" might be misread as a "G".
                figure_keywords = [f'Figure {i}', f'Figure  {i}'] 
                if(i == 10):
                    figure_keywords = figure_keywords + [f'Figure 1O', f'Figure I0', f'Figure IO', f'Figure  1O', f'Figure  I0', f'Figure  IO']
                elif(i == 1):
                    figure_keywords = figure_keywords + [f'Figure I', f'Figure  I']
                elif(i == 5):
                    figure_keywords = figure_keywords + [f'Figure S', f'Figure  S']
                elif(i == 2):
                    figure_keywords = figure_keywords + [f'Figure Z', f'Figure  Z']
                elif(i == 8):
                    figure_keywords = figure_keywords + [f'Figure B', f'Figure  B']
                elif(i == 6):
                    figure_keywords = figure_keywords + [f'Figure G', f'Figure  G']


            for i in figure_range:
                figure_keywords = [f'Figure {i}'] 
                
                ## keep as array in case needs to expand
                figure_name = f"F{i}_P{str(page_num)}_{os.path.splitext(os.path.basename(pdf_path))[0]}.png"
                if(any(substring in text_0 for substring in figure_keywords)):
                    img_0.save(extracted_figures_path +"/" + str(year) + "/" + figure_name)
                    saved = True
                    if(i == 1):
                        figure_1_detected = True
                    print(f"Saved {figure_name} from {pdf_path}")
                    saved_figures.append({
                        "figure_name" : figure_name, 
                        "figure_number": i,
                        "page_number": page_num + 1,
                        "image_rotation": rotation
                    })

            if not saved and (len(text_0) < 2000): 
                img_m_90 = img_0.rotate(-90, expand=True)  # Use expand=True to resize the image to fit the new orientation
                text_m_90 = pytesseract.image_to_string(img_m_90, config=ocr_config)
                for i in figure_range:
                    figure_keywords = [f'Figure {i}'] ## keep as array in case needs to expand
                    figure_name = f"F{i}_P{str(page_num)}_{os.path.splitext(os.path.basename(pdf_path))[0]}.png"
                    rotation = 0
                    if(any(substring in text_m_90 for substring in figure_keywords)):
                        img_m_90.save(extracted_figures_path +"/" + str(year) + "/" + figure_name)
                        saved = True
                        rotation = -90
                        if(i == 1):
                            figure_1_detected = True
                        print(f"Saved {figure_name} from {pdf_path}")
                        saved_figures.append({
                            "figure_name" : figure_name, 
                            "figure_number": i,
                            "page_number": page_num + 1,
                            "image_rotation": rotation
                        })

                if not saved and (len(text_m_90) < 2000): 
                    img_p_90 = img_0.rotate(90, expand=True)  # Use expand=True to resize the image to fit the new orientation
                    text_p_90 = pytesseract.image_to_string(img_p_90, config=ocr_config)
                    for i in figure_range:
                        figure_keywords = [f'Figure {i}'] ## keep as array in case needs to expand
                        figure_name = f"F{i}_P{str(page_num)}_{os.path.splitext(os.path.basename(pdf_path))[0]}.png"
                        rotation = 0
                        if(any(substring in text_p_90 for substring in figure_keywords)):
                            img_p_90.save(extracted_figures_path +"/" + str(year) + "/" + figure_name)
                            saved = True
                            rotation = 90 
                            if(i == 1):
                                figure_1_detected = True
                            print(f"Saved {figure_name} from {pdf_path}")
                            saved_figures.append({
                                "figure_name" : figure_name, 
                                "figure_number": i,
                                "page_number": page_num + 1,
                                "image_rotation": rotation
                            })
        
    return saved_figures


In [3]:

# Define the directory to process
extracted_figures_dir = '../extracted_figure_pages'

# List to hold data for the DataFrame
figure_data = []
papers_with_no_identified_figures = []


for year in range(2000,2023): 
    paper_year_dir = f'../papers_by_year/{year}'
    figure_year_dir = f'{extracted_figures_dir}/{year}'
    # Loop through all files in the directory
    test_files =  [
            # "../papers_by_year/2010/Azoulay et al._2010_ASQ_Nasty Brutish and Short - Embeddedness Failure in the Pharma Industry.pdf", 
            # "../papers_by_year/2010/Bresman_2010_OrgSci_Explaining Employee Engagement with Strategic Change.pdf"
            "../papers_by_year/2022/Creary &amp; Locke_2022_OrgSci_Breaking hte Cycle of Overwork and Recuperation.pdf""
            ]
    
    for paper_file in os.listdir(paper_year_dir):
        paper_path = os.path.join(paper_year_dir, paper_file)

        ## make year directory in extracted figures folder if needed
        os.makedirs(figure_year_dir, exist_ok=True)

        if os.path.isfile(paper_path) and paper_path in test_files:
            print(f'processing {paper_path}')
            figures =  extract_figure_pages_with_ocr(paper_path, extracted_figures_dir, year)
            # Process each figure
            for figure_info in figures:
                figure_data.append({
                    'original paper': paper_file,
                    'figure name': figure_info["figure_name"],
                    'figure number': figure_info["figure_number"], 
                    'year': year, 
                    'page number': figure_info["page_number"],
                    'image rotation': figure_info["image_rotation"]
                })

            if(len(figures) == 0):
                papers_with_no_identified_figures.append({
                    'original paper': paper_file,
                    'year': year
                })




processing ../papers_by_year/2000/Human et al_2000_ASQ_Legitimacy Building in the Evolution of Small Firm Multilateral Networks.pdf
processing ../papers_by_year/2000/Glynn_2000_OrgSci_When Cymbals become Symbols.pdf
Saved F1_P9_Glynn_2000_OrgSci_When Cymbals become Symbols.png from ../papers_by_year/2000/Glynn_2000_OrgSci_When Cymbals become Symbols.pdf
processing ../papers_by_year/2000/James_2000_OrgSci_Race-Related Differences_Quant.pdf
Saved F1_P6_James_2000_OrgSci_Race-Related Differences_Quant.png from ../papers_by_year/2000/James_2000_OrgSci_Race-Related Differences_Quant.pdf
Saved F2_P10_James_2000_OrgSci_Race-Related Differences_Quant.png from ../papers_by_year/2000/James_2000_OrgSci_Race-Related Differences_Quant.pdf
processing ../papers_by_year/2000/Bansal & Roth_2000_AMJ_Why Companies Go Green.pdf
Saved F1_P1_Bansal & Roth_2000_AMJ_Why Companies Go Green.png from ../papers_by_year/2000/Bansal & Roth_2000_AMJ_Why Companies Go Green.pdf
Saved F2_P11_Bansal & Roth_2000_AMJ_Why 

In [4]:
# Create DataFrames
figures_df = pd.DataFrame(figure_data, columns=['original paper', 'figure name', 'figure number', 'year', 'page number', 'image rotation'])
no_figures_df = pd.DataFrame(papers_with_no_identified_figures, columns=['original paper', 'year'])
figures_df.head()

Unnamed: 0,original paper,figure name,figure number,year,page number,image rotation
0,Glynn_2000_OrgSci_When Cymbals become Symbols.pdf,F1_P9_Glynn_2000_OrgSci_When Cymbals become Sy...,1,2000,10,0
1,James_2000_OrgSci_Race-Related Differences_Qua...,F1_P6_James_2000_OrgSci_Race-Related Differenc...,1,2000,7,0
2,James_2000_OrgSci_Race-Related Differences_Qua...,F2_P10_James_2000_OrgSci_Race-Related Differen...,2,2000,11,0
3,Bansal & Roth_2000_AMJ_Why Companies Go Green.pdf,F1_P1_Bansal & Roth_2000_AMJ_Why Companies Go ...,1,2000,2,0
4,Bansal & Roth_2000_AMJ_Why Companies Go Green.pdf,F2_P11_Bansal & Roth_2000_AMJ_Why Companies Go...,2,2000,12,0


In [5]:
# Define the path to the Excel file
figure_tracking_file_path = "../extracted_figure_pages.xlsx"

# Check if the file exists
if os.path.exists(figure_tracking_file_path):
    # Read the existing Excel file
    existing_df = pd.read_excel(figure_tracking_file_path)
    # Concatenate existing and new data
    combined_df = pd.concat([existing_df, figures_df], ignore_index=True)
#     # Remove duplicates. Adjust the subset according to what uniquely identifies a row
    combined_df.drop_duplicates(subset=['original paper', 'figure name', 'figure number', 'year', 'page number', 'image rotation'], inplace=True)
else:
    combined_df = figures_df

# Sort by 'year'
combined_df.sort_values(by='year', inplace=True)

# Save to Excel
combined_df.to_excel(figure_tracking_file_path, index=False)

In [6]:
combined_df.shape

(1893, 6)

In [7]:
no_figures_df

Unnamed: 0,original paper,year
0,Human et al_2000_ASQ_Legitimacy Building in th...,2000
1,Prasad & Prasad_2000_OrgSci_Stretching the Iro...,2000
2,Earley & Mosakowski_2000_AMJ_Creating Hybrid T...,2000
3,Labianca et al._2001_OrgSci_Emulation in Acade...,2001
4,Amabile et al._AMJ_2001_Academic Practitioner ...,2001
...,...,...
210,Mikkelsen_2022_HR_Looking over your shoulder.pdf,2022
211,"Baikovich, Wasserman & Pfefferman_2022_OrgStud...",2022
212,"Bharatan, Swan & Oborn_2022_HR_NAvigating Turb...",2022
213,"Villeseche, Meloui & Jha_2022_HR_Feminism in W...",2022


In [8]:
# Define the path to the Excel file for no-figures papers
no_figures_file_path = "../no_extracted_figure_papers.xlsx"

# Check if the file exists
if os.path.exists(no_figures_file_path):
    # Read the existing Excel file
    existing_no_figures_df = pd.read_excel(no_figures_file_path)
    # Concatenate existing and new data
    combined_no_figures_df = pd.concat([existing_no_figures_df, no_figures_df], ignore_index=True)
    # Remove duplicates. Adjust the subset according to what uniquely identifies a row
    combined_no_figures_df.drop_duplicates(subset=['original paper', 'year'], inplace=True)
else:
    combined_no_figures_df = no_figures_df

# Sort by 'year'
combined_no_figures_df.sort_values(by='year', inplace=True)

# Save to Excel
combined_no_figures_df.to_excel(no_figures_file_path, index=False)

print("No-figure paper data saved successfully to Excel.")

No-figure paper data saved successfully to Excel.
