# ECTA Tesseract Extraction

This notebook extracts references and affiliations using Tesseract.

## Load Libraries

In [17]:
# standard library
import json
import os
import time

# third-party
import fitz
import matplotlib.pyplot as plt         # displaying output images
import numpy as np
import pandas as pd
import regex
from PIL import Image

# for OCR using PyTesseract
import cv2                              # pre-processing images
import pytesseract                      # extracting text from images

In [2]:
# Root data folder; the paths used by the cells below are built from it.
base_path = "/Users/sijiawu/Work/Refs Danae/Thesis/Data"
# ECTA PDF folder. The trailing slash is kept because later cells concatenate onto it.
temp = f"{base_path}/PDFs/ECTA/"

In [3]:
# Read the combined spreadsheet into `Merged`, then relabel every row whose
# journal is "Econometrica" as "ECTA".
Merged = pd.read_excel(f"{base_path}/Combined/ECTA_M_sco_du.xlsx")
Merged.loc[Merged['journal'].eq("Econometrica"), 'journal'] = 'ECTA'

In [10]:
# Rows to inspect by eye: year 1950-1960 (both ends included), leaving out
# rows whose content_type is "MISC" or "Review".
investigate = Merged[
    Merged.year.between(1950, 1960)
    & ~Merged.content_type.isin(["MISC", "Review"])
]

In [11]:
# Maps a year to the issue number (stored as a string) that is skipped for that year.
exclude = {
    **dict.fromkeys((1971, 1970), "4"),
    **dict.fromkeys((1968, 1967, 1966), "5"),
}

* 1971 issue 4 
* 1970 issue 4
* 1968 issue 5
* 1967 issue 5
* 1966 issue 5

These are special issues with no author names. Each one is actually a list of abstracts in which the title of each "article" is the title of the article set or discussion panel. Some have references, but if these are abstracts then the papers are likely published elsewhere. I chose to exclude these issues for that reason, and because they are too inconsistent with the rest of ECTA to process with Tesseract. It may be possible to ask ChatGPT to parse each one as a full issue, but that is left for last.

In [None]:
# Show the first and last page image of every article in `investigate`.
# The assumption is that these PNGs already exist from the previous stage.
# To show every page instead, loop over all shard numbers rather than the
# two fixed ones used below.

def show_gray_page(png_path):
    """Read the image at `png_path` with OpenCV and display it in grayscale."""
    original_image = cv2.imread(png_path)
    if original_image is None:
        # cv2.imread reports an unreadable file by returning None, not by raising.
        print("could not read image: " + png_path)
        return
    gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
    plt.figure(figsize=(15, 8))
    plt.imshow(gray_image, cmap='gray')
    plt.show()


# `number` and `count` are not used by the loop; they are kept only so that
# the names this cell used to define stay defined.
number = None
count = 0
pgs = 0  # running total of PDF pages over the articles found on disk

# iterrows() yields (index label, row), so rows are looked up by label rather
# than by feeding labels to positional .iloc.
for i, row in investigate.iterrows():
    article_id = row['URL'].split('/')[-1]
    filepath = base_path+'/PDFs/ECTA/'+article_id+'.pdf'
    if not os.path.exists(filepath):
        print("sorry, but this "+filepath+" was not sharded")
        continue

    # Only the page count is needed, so the document is closed straight away.
    with fitz.open(filepath) as doc:
        page_count = doc.page_count

    print(row['year'])
    print(row['number'])
    print(row['volume'])
    print(row['author'])
    print(row['title'])
    pgs = pgs + page_count

    png_prefix = base_path+"/PDFs/ECTA/png/" + article_id.split('.')[0] + '_wo_cover_page-'
    png = png_prefix + '0.png'
    # NOTE(review): presumably this PDF still has its cover page, so the
    # cover-less shards run 0..page_count-2 -- confirm against the sharding stage.
    png2 = png_prefix + str(page_count-2) + '.png'
    print(png)
    if os.path.exists(png) and os.path.exists(png2):
        show_gray_page(png)
        show_gray_page(png2)

### Extracting References

In [6]:
def generate_refs2(SCANNED_FILE, path, keyword, custom_config = r'--oem 1 --psm 3'):
    """OCR a PDF's page images from the last page backwards until `keyword` matches.

    This version ignores contouring and sectioning out paragraphs: each page
    image is passed directly to tesseract. (Author's note: this seems to avoid
    the resolution loss seen when going through the OpenCV library.)

    Parameters
    ----------
    SCANNED_FILE : str
        Full path to the original PDF; it is opened only to get the number of
        pages. The assumption is that the JSTOR (or other) cover page has
        already been removed, so the name is expected to contain "wo_cover".
        A warning is printed otherwise and processing continues.
    path : str
        Folder containing the pre-generated PNGs. Each page image is expected
        at `<path>/<PDF name up to its first '.'>_page-<page number>.png`.
    keyword : str
        Regex pattern searched with `regex.search` in the upper-cased OCR text
        of each page. Fuzzy-match patterns of the `regex` library can be used.
    custom_config : str
        Tesseract configuration string. Per the author's notes, psm 3 is
        automatic page segmentation (better for two-column PDFs) and psm 6
        assumes a single column read top to bottom and preserves line endings
        better.

    Returns
    -------
    dict
        `{'found': parsed, 'pages': n}` as soon as a page matches, otherwise
        `{'raw': parsed, 'pages': n}`. `parsed` maps each visited page number
        to a list holding that page's OCR text; the list is empty when the
        page image is missing. Pages before the matching one are not visited
        and so do not appear in `parsed`.

    Raises
    ------
    Exception
        If the PDF cannot be opened.
    """
    try:
        # Only the page count is needed, so the document is closed right away.
        with fitz.open(SCANNED_FILE) as doc:
            page_count = doc.page_count
    except Exception as err:
        print("could not open: "+SCANNED_FILE)
        # Same exception type and message as before; the cause is now chained.
        raise Exception("this file is corrupt") from err

    file_name = SCANNED_FILE.split('/')[-1]
    stem = file_name.split('.')[0]
    if "wo_cover" not in SCANNED_FILE:
        print("warning, the file: "+file_name+" does not have it's coverpage removed.\nThis function will continue. Assumed image file name convention is: "+stem + '_page-{number}.png')

    parsed = {}
    # Walk the pages last-to-first and stop at the first keyword match.
    for page_number in range(page_count - 1, -1, -1):
        png = path+"/" + stem + '_page-%i.png' % page_number
        parsed[page_number] = []
        if not os.path.exists(png):
            print("error: this image does not exist, please generate png shards at 300 dpi in path: "+path)
            continue
        text = pytesseract.image_to_string(png, config=custom_config)
        parsed[page_number].append(text)
        if regex.search(keyword, text.upper()) is not None:
            print('found')
            return {'found': parsed, "pages": page_count}

    print("the keyword: "+keyword + " was not found. But this is the full tesseract output nonetheless.")
    return {'raw': parsed, "pages": page_count}

In [None]:
# Run tesseract over every eligible article and write one JSON file per article.

def _issue_label(value):
    """Render an issue number as text so that 4, 4.0 and "4" all become "4"."""
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return str(value).strip()
    if as_float.is_integer():
        return str(int(as_float))
    return str(value).strip()


t0=time.time()
custom_config = r'--oem 1 --psm 6'

# Articles whose PDF was not on disk: id -> note with the download URL.
dict_ref = {}
# Defined up front so the final print(i) works even if nothing is selected.
i = None

to_process = Merged[
    (Merged['year']<=2020) & (Merged['year']>=1940)
    & (Merged['content_type']!='MISC') & (Merged['content_type']!='Review')
]

# iterrows() yields (index label, row), so rows are looked up by label rather
# than by feeding labels to positional .iloc.
for i, row in to_process.iterrows():
    # `exclude` stores issue numbers as strings, while the spreadsheet value
    # may be numeric, so the issue is normalised before comparing.
    if exclude.get(row['year']) == _issue_label(row['number']):
        #this is the special issue we are excluding
        continue
    #construct file name
    article_id = row['URL'].split('/')[-1]
    filepath = temp+'wo_cover/'+article_id+'_wo_cover.pdf'
    print(filepath)
    if os.path.exists(filepath):
        references = generate_refs2(filepath, temp+"png", '(^|\n)R(EFERENCES){e<=3}(\n| )', custom_config)
        o_file = base_path+'/'+article_id+"_tesseract.json"
        with open(o_file,'w') as f:
            json.dump({article_id: {'references': references, 'URL': row['URL']}}, f, indent=3)
    else:
        dict_ref[article_id] = 'PDF not available, download at '+ row['URL']
        print("filepath not valid, file "+article_id+'_wo_cover.pdf'+ " did not get sharded")
t1=time.time()
total=t1-t0
print(total)
print(i)