In [6]:
import cv2
import numpy as np
import pytesseract
import re
from PIL import Image
import os
import pandas as pd


## Functions for extracting text from images

In [34]:
def extract_text_length_from_image(image_path, show_preview=False):
    """OCR an image and return how many alphanumeric characters were found.

    The image is converted to grayscale, binarised with an adaptive Gaussian
    threshold, and passed to Tesseract. Every character other than ASCII
    letters and digits (including whitespace and punctuation) is then removed
    from the OCR output.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    show_preview : bool, optional
        When True, display the thresholded image in an OpenCV window and block
        until a key is pressed. Off by default: the blocking window would pause
        a batch run on every single image and cannot be used without a display.

    Returns
    -------
    tuple of (int, str)
        The length of the filtered text, and the filtered text itself.

    Raises
    ------
    ValueError
        If the file at ``image_path`` cannot be read or decoded as an image.
    """
    # cv2.imread does not raise on failure; it returns None. Check for that
    # here so the error names the offending file instead of surfacing later
    # as an opaque OpenCV assertion inside cvtColor.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read or decode image: {image_path}")

    # Convert to grayscale (imread loads colour images in BGR channel order)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Adaptive thresholding to make text stand out:
    # maxValue=255, Gaussian-weighted neighbourhood, blockSize=11, C=2
    adaptive_thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                            cv2.THRESH_BINARY, 11, 2)

    if show_preview:
        # Debugging aid only: blocks until a key is pressed in the window
        cv2.imshow('Thresholded Image', adaptive_thresh)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    # Convert the image back to PIL format to use with pytesseract
    img_final = Image.fromarray(adaptive_thresh)

    # --oem 3: default OCR engine; --psm 12: sparse text with orientation and
    # script detection (finds scattered text in no particular order)
    ocr_config = '--oem 3 --psm 12'

    # Run OCR, then keep only ASCII letters and digits
    extracted_text = pytesseract.image_to_string(img_final, config=ocr_config)
    filtered_text = re.sub('[^a-zA-Z0-9]', '', extracted_text)

    return len(filtered_text), filtered_text




In [35]:
# extract_text_length_from_image("../final_figures/2017/F1_P8_Obodaru_AMJ_2017_Forgone but not Forgotten Toward a Theory of Forgone Professional Identities.png")

(61, 'NSSNNNENxpSModelofigfandHoldingUREENNYERENNBB8feNNNNSNalIdeNN')

## Run on all images

In [17]:

# Root folder to scan for figure images
base_dir = "../final_figures"

# One dict per processed image is collected here
results = []

# Visit base_dir and every folder beneath it
for folder, _subfolders, filenames in os.walk(base_dir):
    # The name of the folder holding the image is recorded as its year
    folder_year = os.path.basename(folder)

    for filename in filenames:
        # Skip anything that is not a PNG/JPEG (extension check is case-insensitive)
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        # OCR the image: returns (number of characters kept, the kept text)
        n_chars, ocr_text = extract_text_length_from_image(os.path.join(folder, filename))

        results.append({
            "image_name": filename,
            "year": folder_year,
            "extracted_text_length": n_chars,
            "extracted_text": ocr_text,
        })



## Create the DataFrame and save it to Excel

In [20]:
# Build the table from the collected records (fixed column order) and
# order the rows by year, earliest first
column_order = ["image_name", "year", "extracted_text_length", "extracted_text"]
df = pd.DataFrame(results, columns=column_order).sort_values('year', ascending=True)

# Preview the first rows
df.head()

Unnamed: 0,image_name,year,extracted_text_length,extracted_text
447,F1_P7_Orlikowski_2000_OrgSci_Using Technology ...,2000,270,Figure1EnactmentofStructuresInPracticereeereee...
442,F5_P13_Orlikowski_2000_OrgSci_Using Technology...,2000,265,Figure5ExampleofSkepticismTowardsTechnologyRow...
441,F3_P15_Feldman_2000_OrgSci_Organizational Rout...,2000,131,Figure3APerfarmativeModelofLearninginRoutinesP...
440,F3_P17_Kochan & Rubinstein_2000_OrgSci_Toward ...,2000,967,Figure3PropositionsforaGeneralStakeholderTheor...
439,F1_P5_Voss et al._2000_OrgSci_Linking Organiza...,2000,343,Figure1NonprofitProfessionalTheatresRelational...


In [21]:
# The workbook is written one level above the current working directory
current_directory = os.getcwd()
parent_directory = os.path.split(current_directory)[0]
file_path = os.path.join(parent_directory, "text_extracted_from_figures.xlsx")

# Write the table to disk, leaving the DataFrame index out of the sheet
df.to_excel(file_path, index=False)

# Report where the file ended up
print(f"DataFrame saved as Excel file at: {file_path}")

DataFrame saved as Excel file at: /Users/paulgaudin/Desktop/figure_averaging/text_extracted_from_figures.xlsx
