# Splitting PDFs

The following code takes a PDF downloaded from JSTOR and checks whether its first page is a JSTOR cover page. If a cover page is found, a copy of the PDF without it is saved; otherwise the whole document is copied. In both cases the output is named `<id>_wo_cover.pdf`. The check is needed because documents that were taken from Sci-Hub do not have a JSTOR cover page.

Next, each PDF is split into single-page PDFs. The page suffix is zero-based, so page 1 is saved with the suffix `_page-0`.

In [9]:
import pandas as pd
import numpy as np
import os
import re
from unidecode import unidecode
import fitz #for opening pdfs

# for OCR using PyTesseract
import cv2                              # pre-processing images
import pytesseract                      # extracting text from images
import numpy as np
import matplotlib.pyplot as plt         # displaying output images
from PIL import Image
import regex

In [4]:
JOURNALS = ['AER', 'JPE', 'ECTA', 'RES', 'QJE']

# Read every processed master list. Each newly read journal is placed in
# front of the rows accumulated so far.
All = pd.DataFrame()
for journal_code in JOURNALS:
    masterlist_file = 'C:\\Users\\sjwu1\\Journal_Data\\datadumps\\combined\\' + journal_code + '_M_sco_du.xlsx'
    journal_frame = pd.read_excel(masterlist_file)
    All = pd.concat([journal_frame, All], ignore_index=True)

# Keep only the first occurrence of fully duplicated rows and renumber the index.
All = All[~All.duplicated()].reset_index(drop=True)

In [5]:
# Display the (rows, columns) size of the combined, de-duplicated master list.
All.shape

(60354, 28)

##  Small correction
Rename the journal label `ECONOMETRICA` to `ECTA` so that it matches the short codes used for the other journals.

In [6]:
# The article id is the last path segment of the stable URL.
url_segments = All['stable_url'].str.split('/')
All['id'] = url_segments.str[-1]

# Store Econometrica under its short code.
is_econometrica = All['Jstor_journal'] == "ECONOMETRICA"
All.loc[is_econometrica, 'Jstor_journal'] = 'ECTA'

## Reduce the set to exclude reviews and miscellaneous content

In [7]:
# Display the column labels of the master list.
All.columns

Index([     'stable_url',   'Jstor_authors',     'Jstor_title',
        'Jstor_abstract',    'content_type',       'issue_url',
                 'pages',            'year',          'volume',
                 'issue',   'Jstor_journal',            'type',
                       0,  'scopus_authors',    'scopus_title',
        'scopus_journal',             'DOI',    'affiliations',
       'scopus_abstract',       'citations',   'document type',
        'index keywords', 'author keywords',      'Unnamed: 0',
         'document_type',       'footnotes',             'raw',
            'references',              'id'],
      dtype='object')

In [11]:
# Display the distinct values of the content_type column.
pd.unique(All.content_type)

array(['Article', 'Comment', 'Reply', 'MISC', 'Rejoinder', 'Review',
       'Discussion', 'Review2'], dtype=object)

In [14]:
# Narrow the master list to the columns used by the rest of the notebook.
Reduced = All.loc[:, [
    'Jstor_authors',
    'year',
    'Jstor_journal',
    'Jstor_title',
    'volume',
    'issue',
    'content_type',
    'stable_url',
]]

In [15]:
# Keep rows that have JSTOR authors and were published in 1940 or later.
has_authors = Reduced['Jstor_authors'].notna()
from_1940 = Reduced['year'] >= 1940
R_1940 = Reduced[has_authors & from_1940].reset_index(drop=True)

# Upper-cased version of the author names.
R_1940['authors_caps'] = R_1940['Jstor_authors'].str.upper().to_numpy()

# Drop miscellaneous content first, then reviews, renumbering the index each time.
R_1940_NM = R_1940[R_1940['content_type'] != 'MISC'].reset_index(drop=True)
R_1940_NMR = R_1940_NM[R_1940_NM['content_type'] != 'Review'].reset_index(drop=True)

## Set path

In [19]:
# Root data folder; the per-journal '<journal>_data' folders are built from it.
path = 'D:\\docs\\Masters\\Data\\'

# Work on the 1940+ records with miscellaneous items and reviews removed.
Merged = R_1940_NMR
print(pd.unique(Merged['content_type']))

['Article' 'Comment' 'Reply' 'Rejoinder' 'Discussion']
['QJE' 'RES' 'ECTA' 'JPE' 'AER']


In [13]:
# Non-review records published up to and including 2010.
not_review = R_1940_NM['content_type'].ne('Review')
up_to_2010 = R_1940_NM['year'].le(2010)
redsets = R_1940_NM[not_review & up_to_2010]

In [None]:
# Scaling matrix handed to get_pixmap when a page is rendered for OCR:
# factor 2 horizontally and vertically.
zoom_x = zoom_y = 2.0
mat = fitz.Matrix(zoom_x, zoom_y)

## Set functions

In [None]:
def make_new_pdf(doc1, doc2_name, from_pg):
    """Save the pages of doc1 from from_pg (inclusive) to the end as a new PDF.

    Args:
        doc1: open fitz document to copy pages from.
        doc2_name: file path the new PDF is saved to.
        from_pg: zero-based index of the first page to copy.

    Returns:
        The number of pages in the newly written PDF.
    """
    doc2 = fitz.open()  # new empty PDF
    try:
        doc2.insert_pdf(doc1, from_page=from_pg)
        pg_count = doc2.page_count
        doc2.save(doc2_name)
    finally:
        # Release the new document even when copying or saving fails.
        doc2.close()
    return pg_count

In [None]:
def make_new_pdf2(doc1, doc2_name, from_pg, to_pg):
    """Save the pages from_pg..to_pg (both inclusive) of doc1 as a new PDF.

    Args:
        doc1: open fitz document to copy pages from.
        doc2_name: file path the new PDF is saved to.
        from_pg: zero-based index of the first page to copy.
        to_pg: zero-based index of the last page to copy.

    Returns:
        The number of pages in the newly written PDF.
    """
    doc2 = fitz.open()  # new empty PDF
    try:
        doc2.insert_pdf(doc1, from_page=from_pg, to_page=to_pg)
        pg_count = doc2.page_count
        doc2.save(doc2_name)
    finally:
        # Release the new document even when copying or saving fails.
        doc2.close()
    return pg_count

In [20]:
# cover page removal
def coverpage_removal(SCANNED_FILE, path):
    """Write '<stem>_wo_cover.pdf' for SCANNED_FILE, dropping a JSTOR cover page if present.

    The first page is rendered to '<stem>_page-0.png' inside `path` (unless that
    image already exists) and run through OCR. If the recognised text contains
    'AUTHOR(S)' or 'PUBLISHED BY:' the first page is treated as a cover page and
    the copy starts at the second page; otherwise the whole document is copied.
    Nothing is written when the '_wo_cover.pdf' file already exists.

    Uses the module-level matrix `mat` to render the page.

    Args:
        SCANNED_FILE: backslash-separated path of the PDF to process.
        path: folder prefix (including trailing separator) for the PNG and the
            cover-free PDF.

    Returns:
        Always 1.

    Raises:
        Exception: "this file is corrupt" when the PDF cannot be opened or its
            first page cannot be loaded.
    """
    doc = None
    try:
        doc = fitz.open(SCANNED_FILE)
        page = doc[0]
    except Exception as err:
        # fitz.open may fail before doc is bound, so only close a real document.
        if doc is not None:
            doc.close()
        raise Exception("this file is corrupt") from err

    try:
        # File name without folders and without everything after the first dot.
        stem = SCANNED_FILE.split('\\')[-1].split('.')[0]

        png = path + stem + '_page-%i.png' % page.number
        if not os.path.exists(png):
            pix = page.get_pixmap(matrix=mat)
            pix.save(png)

        doc2_name = path + stem + '_wo_cover.pdf'
        if not os.path.exists(doc2_name):
            original_image = cv2.imread(png)

            # NOTE(review): 'lat' is Tesseract's Latin language code -- confirm
            # this is intended rather than 'eng'.
            text = pytesseract.image_to_string(original_image, lang='lat', config='--oem 3 --psm 6')
            text_upper = text.upper()

            # Either marker on the first page identifies it as a cover page.
            has_cover = (re.search(r'AUTHOR\(S\)', text_upper) is not None) or \
                        (re.search(r'PUBLISHED BY:', text_upper) is not None)
            if has_cover:
                make_new_pdf(doc, doc2_name, 1)
                print('found')
            else:
                make_new_pdf(doc, doc2_name, 0)
                print('not found')
    finally:
        # Close the source document even if rendering, OCR or saving fails.
        doc.close()

    return 1

In [None]:
def shard(SCANNED_FILE, path, year):
    """Split a PDF into single-page PDFs and list the URL of each page.

    Every page is saved as '<stem>_page-<n>.pdf' inside `path`, where <n> is the
    zero-based page number; pages whose file already exists are not rewritten.

    Reads the module-level variable `bucket` (URL prefix of the shard files);
    it must be assigned before this function is called.

    Args:
        SCANNED_FILE: backslash-separated path of the PDF to split.
        path: folder prefix (including trailing separator) for the page PDFs.
        year: value stored in the 'year' column of every returned row.

    Returns:
        A DataFrame with one row per page and the columns 'pdf_url' (the bucket
        URL of the page file wrapped in single quotes) and 'year'. The frame is
        empty when the document has no pages.

    Raises:
        Exception: "this file is corrupt" when the PDF cannot be opened.
    """
    try:
        doc = fitz.open(SCANNED_FILE)
    except Exception as err:
        # Nothing was opened, so there is nothing to close here.
        raise Exception("this file is corrupt") from err

    # File name without folders and without everything after the first dot.
    stem = SCANNED_FILE.split('\\')[-1].split('.')[0]
    rows = []
    try:
        for page in doc:
            page_file = stem + '_page-%i.pdf' % page.number
            doc2_name = path + page_file
            if not os.path.exists(doc2_name):
                make_new_pdf2(doc, doc2_name, page.number, page.number)
            rows.append({
                'pdf_url': '\'' + bucket + page_file + '\'',
                'year': year,
            })
    finally:
        # Close the source document even if writing a page fails.
        doc.close()

    # Build the frame once instead of concatenating per page.
    return pd.DataFrame(rows)

## Going through the full data set

In [None]:
# make pandas objects to store the list of new pdf paths
JPE_refs=pd.DataFrame()
ECTA_refs=pd.DataFrame()
QJE_refs=pd.DataFrame()
AER_refs=pd.DataFrame()
RES_refs=pd.DataFrame()

# journal -> [first year, last year (both inclusive), frame collecting the shard URLs]
ranges={
    #"AER": [1940, 2010, AER_refs], # I usually run these one at a time
   # "ECTA": [1940, 2010, ECTA_refs],
    #"JPE": [1940, 2010, JPE_refs],
   # "QJE": [1940, 2010, QJE_refs],
    "RES": [1940, 1950, RES_refs]
}
for journal in ranges:
    # shard() reads this module-level variable when it builds the page URLs
    bucket='https://myawsbucket-1231.s3.eu-west-3.amazonaws.com/'+journal+'_shards/'
    ret_frame=None

    first_year, last_year = ranges[journal][0], ranges[journal][1]
    subset=Merged[(Merged['year']<=last_year) & (Merged['year']>=first_year) & (Merged['Jstor_journal']==journal)]
    # Iterate the selected rows themselves instead of feeding index labels to iloc.
    for _, row in subset.iterrows():
        # make the file paths
        path2=path+row['Jstor_journal']+'_data\\'
        # The cover-free PDF is expected in the 'wo_cover' subfolder of the journal folder.
        filepath2=path2+'wo_cover\\'+row['stable_url'].split('/')[-1]+'_wo_cover.pdf'

        print(filepath2)
        if os.path.exists(filepath2):
            ret_frame=shard(filepath2, path2, row['year']) #shard
            # Concatenate right away so results gathered so far survive a later failure.
            ranges[journal][2]=pd.concat([ranges[journal][2], ret_frame], ignore_index=True)
        else:
            print(filepath2+' missing')

## Save the lists

In [None]:
# Save the AER shard list; ranges['AER'] exists only if the AER entry was enabled in `ranges`.
ranges['AER'][2].to_csv(path+"\\aer_refs_all.csv",index=False)

In [None]:
# Save the ECTA shard list; ranges['ECTA'] exists only if the ECTA entry was enabled in `ranges`.
ranges['ECTA'][2].to_csv(path+"\\ecta_refs_all.csv",index=False)

In [None]:
# Save the JPE shard list; ranges['JPE'] exists only if the JPE entry was enabled in `ranges`.
ranges['JPE'][2].to_csv(path+"\\jpe_refs_all.csv",index=False)

In [None]:
# Save the QJE shard list; ranges['QJE'] exists only if the QJE entry was enabled in `ranges`.
ranges['QJE'][2].to_csv(path+"\\qje_refs_all.csv",index=False)

In [None]:
# Save the RES shard list collected by the sharding loop.
ranges['RES'][2].to_csv(path+"\\res_refs_all.csv",index=False)