# Approach 1

In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from nltk import ngrams
import os
import string

'''
self-defined pdf2text function: The "parse" function needs to be adapted according to the specific pdf format of 
a certain journal
ref source: https://www.reddit.com/r/Python/comments/50r1cs/text_extraction_from_pdf_published_scientific/
'''

def convert_pdf_to_txt_and_metadata(path):
    """Extract the text and the document-info metadata of one PDF with pdfminer.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A ``(text, metadata)`` tuple: ``text`` is everything the
        ``TextConverter`` wrote for all pages, ``metadata`` is the ``info``
        attribute of the parsed ``PDFDocument``.

    Any exception raised by ``open`` or by pdfminer propagates to the caller;
    the file handle, the converter and the text buffer are released either way.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # The context manager closes the file even when a page fails to parse.
        with open(path, 'rb') as fp:
            metadata = PDFDocument(PDFParser(fp)).info
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set with maxpages=0 is passed to request
            # every page (same arguments as before).
            all_pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
            for page in all_pages:
                interpreter.process_page(page)
        # Read the buffer before it is closed in the finally clause.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text, metadata

def findWord(word, pdftext):
    """Print each period-delimited piece of ``pdftext`` that contains ``word``.

    Newlines inside a printed piece are replaced by spaces. Nothing is returned.
    """
    hits = (piece for piece in pdftext.split(".") if word in piece)
    for piece in hits:
        print(piece.replace("\n", " "))
            
def parse(text, content):
    """Clean raw extracted PDF text page by page and join it into one string.

    The text is split into pages on the form-feed character, each page is split
    into blocks on blank lines, noisy blocks (mostly digits/punctuation, journal
    headers, short author footnotes) are dropped, and in-block line breaks are
    collapsed.

    Args:
        text: Raw text of (part of) a PDF, pages separated by ``'\\x0c'``.
        content: ``1`` joins the cleaned blocks of a page with a single space;
            any other value joins them with ``' @# '`` so that meta-data
            fields stay separable.

    Returns:
        The cleaned text of all pages joined by spaces, stripped.
    """
    remove_lst = ['', ' ', '  ', '   ']
    # Vertical watermark spelling "...Downloaded", one letter per line.
    watermark = 'l\n\nD\no\nw\nn\no\na\nd\ne\nd\n\n'
    # Vertical "2022"; presumably the year that ends the watermark -- adapt
    # for PDFs downloaded in another year.
    watermark_end = '\n2\n0\n2\n2'
    logo_items = ['Journal of Mechanical Design', "Transactions of the ASME", 'Vol.', 'Copyright © ',
                  'Journal of Mechanisms, Transmissions, and Automation in Design']
    footnote_items = ['Contributed', 'Corresponding author']
    # Built once instead of on every block. The slice compared against it is two
    # characters long, so only the single-digit entries ("1A", "2B", ...) can match.
    footnote_prefixes = {str(a) + b for a in range(20) for b in string.ascii_uppercase}
    sentence_ends = ('.', '?', '!')

    pages = [item for item in text.split('\x0c') if item not in remove_lst]
    full = []
    ##clean each page
    for page in pages:
        txt = page
        if watermark in page:
            before, after = page.split(watermark)[:2]
            tail = after.split(watermark_end)
            # Without the closing year marker the end of the watermark is unknown:
            # keep the page as is and let the noise filter below deal with it
            # (previously this raised IndexError).
            if len(tail) > 1:
                txt = before + tail[1]

        txt = txt.replace('-\n\n', '').split('\n\n')
        txt = [item for item in txt if item not in remove_lst]
        remove = set()
        for i in range(len(txt)):
            length = len(txt[i])

            ##remove noisy items: count the characters that are letters (alnum but not digits)
            str_len = sum(1 for e in txt[i] if e.isalnum() and not e.isdigit())
            if str_len <= 2 or (str_len < 0.6 * length and length < 40):
                remove.add(txt[i])
                continue

            ##letter-poor blocks are glued without spaces, normal text gets a space per line break
            if str_len < 0.6 * length:
                txt[i] = txt[i].replace('-\n', '').replace('\n', '').strip()
            else:
                txt[i] = txt[i].replace('-\n', '').replace('\n', ' ').strip()

            ##remove short blocks that are journal logos or author footnotes
            if len(txt[i]) < 300 and any(item in txt[i] for item in logo_items + footnote_items):
                remove.add(txt[i])

            ## remove journal logos contained in text: keep only what precedes the logo
            for item in logo_items:
                if item in txt[i] and len(txt[i]) >= 300:
                    txt[i] = txt[i].split(item)[0]

            ## remove footnote regarding author info from text
            for item in footnote_items:
                if item in txt[i] and len(txt[i]) >= 300 and len(txt[i].split(item)[1]) < 300:
                    txt[i] = txt[i].split(item)[0]

            ## detect paragraphs that are split into two parts by footnotes:
            ## a block starting with digit+capital and ending with '.' sits between an
            ## unfinished sentence and a continuation -> move it behind the continuation
            if 0 < i < len(txt) - 1 and len(txt[i]) > 2:
                if txt[i][0:2] in footnote_prefixes and txt[i][-1] == '.':
                    # txt[i-1] may have been truncated to '' by the logo stripping above;
                    # guard the index access (previously IndexError).
                    if (txt[i - 1] and txt[i - 1][-1] not in sentence_ends
                            and txt[i + 1][0] not in string.ascii_uppercase):
                        txt[i], txt[i + 1] = txt[i + 1], txt[i]

        ##drop every occurrence of a block marked as noise (list.remove only dropped
        ##the first one, so repeated page numbers/headers used to survive)
        txt = [item for item in txt if item not in remove]
        ##insert "@#" between meta data for easy parsing
        separator = ' ' if content == 1 else ' @# '
        full.append(separator.join(txt))

    return ' '.join(full).strip()


In [2]:
##load the spreadsheet in which the parsed info will be stored
import pandas as pd  # needed here: the only other pandas import sits in a later cell

data = pd.read_excel('jmd_all_updated.xlsx')

##define the dictionary for storing the extracted info (one slot per spreadsheet row)
info = {key: [''] * len(data) for key in ('ref', 'ack', 'meta', 'full_text')}

for no in range(len(data)):
    ##row number `no` corresponds to the file "<no>.pdf"
    pdf, metadata = convert_pdf_to_txt_and_metadata(os.path.join("folder_with_pdfs", "%d.pdf" % no))
    ##replace the "(cid:N)" placeholders for N < 20: odd N becomes '(' and even N becomes ')'
    for i in range(20):
        if i % 2 == 1:
            pdf = pdf.replace('(cid:%d)' % i, '(')
        else:
            pdf = pdf.replace('(cid:%d)' % i, ')')

    ##split pdf into different sections containing different info and extract the desired info using "parse"
    ##the sections are cut off back to front: references, acknowledgment, then the leading meta data
    ref = ''
    for item in ['References\n', 'References \n', 'R\ne\nf\ne\nr\ne\nn\nc\ne\ns', 'Reference\n', 'Reference \n',
                 'R\ne\nf\ne\nr\ne\nn\nc\ne']:
        if item in pdf:
            ##split on last occurrence
            pdf, REF = pdf.rsplit(item, 1)
            ##parsing of the reference list is disabled, so `ref` stays ''
            # ref = parse(REF,2)
            break

    ack = ''
    for item in ['Acknowledgments\n', 'Acknowledgments \n', 'A\nc\nk\nn\no\nw\nl\ne\nd\ng\nm\ne\nn\nt\ns',
                 'Acknowledgment\n', 'Acknowledgment \n', 'A\nc\nk\nn\no\nw\nl\ne\nd\ng\nm\ne\nn\nt']:
        if item in pdf:
            ##split on last occurrence
            pdf, ACK = pdf.rsplit(item, 1)
            ack = parse(ACK, 1)
            break

    meta = ''
    for item in ['DOI: ', "Introduction \n", 'Introduction\n', 'I\nn\nt\nr\no\nd\nu\nc\nt\ni\no\nn\n',
                 'I\nn\nt\nr\no\nd\nu\nc\nt\ni\no\nn \n']:
        if item in pdf:
            ##split on first occurrence
            META, rest = pdf.split(item, 1)
            meta = parse(META, 2)
            if item == 'DOI: ':
                ##skip the 20 characters after "DOI: " (presumably the DOI itself -- verify)
                pdf = rest[20:]
            else:
                ##put a normalised heading back in front of the body
                pdf = 'Introduction\n' + rest
            break

    text = parse(pdf, 1)

    info['ref'][no] = ref
    info['ack'][no] = ack
    info['meta'][no] = meta
    info['full_text'][no] = text

In [3]:
##first-page text that was wrongly assigned to the meta data is recovered here:
##whatever follows the last 8 characters of the abstract (plus the ' @# ' separator) in meta
n = 0
data = data.astype({"abstract": str})
abstract = list(data['abstract'])
meta1 = [''] * len(data)
for idx, meta_text in enumerate(info['meta']):
    abstract_tail = abstract[idx][-8:] + ' @# '
    if len(abstract[idx]) > 10 and abstract_tail in meta_text:
        recovered = meta_text.split(abstract_tail)[-1]
        meta1[idx] = recovered.replace(' @# ', ' ')

##correct main body text by putting the recovered text in front of it
corrected = [lead.strip() + ' ' + main.strip() for lead, main in zip(meta1, info['full_text'])]

In [4]:
## An excel cell holds at most 32767 characters, so each text is spread over
## the columns text_0, text_1, ... in slices of that size
body = {}
for m, item in enumerate(info['full_text']):
    n = len(item) // 32767 + 1
    for i in range(n):
        column = body.setdefault('text_' + str(i), [''] * len(data))
        column[m] = item[i * 32767:(i + 1) * 32767]

In [5]:
##attach the extracted columns to the spreadsheet (in this order) and save it
extracted_columns = [
    ('ref from pdf', info['ref']),
    ('acknowledgment', info['ack']),
    ('main body', info['full_text']),
    ('meta data', info['meta']),
    ('meta to main body', meta1),
    ('correccted main body', corrected),
]
for column_name, values in extracted_columns:
    data[column_name] = values
##then the 32767-character slices of the main body
for key, chunks in body.items():
    data[key] = chunks

data.to_excel('jmd_all_updated.xlsx')

# Approach 2

In [8]:
'''
Approach 1: not generic, less effective for pdfs of earlier-year papers
Approach 2: generic to all journals, but can't be applied to all pdfs successfully. A little time-consuming 
to install the packages. Run well on Linux (Ubuntu), but not on Windows.
For low-quality pdfs, approach 1 captures paper content more comprehensively, but it's less accurate at the word level.
Many words have extra spaces between letters.
'''

In [9]:
'''
ref source 1: https://github.com/titipata/scipdf_parser
    pip install git+https://github.com/titipata/scipdf_parser
    ##The following command needs to be run every time the script is used.
    ##The following command needs to run on Linux; the file needs to be revised according to ref 2.
    ##Go to the folder containing the bash file in terminal, then run:
    bash serve_grobid.sh
ref source 2: https://github.com/kermitt2/grobid
'''
import scipdf

In [53]:
##test on a single paper
sample_pdf = 'folder_with_pdfs/131.pdf'
article_dict = scipdf.parse_pdf_to_dict(sample_pdf)  # the parsed article as a dictionary

## a URL to a PDF can be passed as well; with as_list=True the 'text' of each parsed section is a list of paragraphs
# article_dict = scipdf.parse_pdf_to_dict('https://www.biorxiv.org/content/biorxiv/early/2018/11/20/463760.full.pdf', as_list=False)

xml = scipdf.parse_pdf(sample_pdf, soup=True)  # the full XML from GROBID instead of the dictionary

In [301]:
import pandas as pd
import os

data = pd.read_excel('jmd_all_updated 1.xlsx')

##one slot per spreadsheet row; these initialisations were commented out, which left
##`acknow` undefined and `body` bound to whatever an earlier cell stored in it
body = [''] * len(data)
acknow = [''] * len(data)
path = 'folder_with_pdfs'
##NOTE: the number of papers is hard-coded; the spreadsheet needs at least 5591 rows
for i in range(0, 5591):
    text = []
    ack = ''
    try:
        article_dict = scipdf.parse_pdf_to_dict(os.path.join(path, str(i) + '.pdf'))
        for sec in article_dict['sections']:
            head = sec['heading'].replace('\n', '').strip()
            if "Acknowledgment" not in head and "acknowledgment" not in head:
                ##regular section: keep heading and text together
                txt = sec['text'].replace('\n', '').strip()
                text += [head + ' ' + txt]
            else:
                ##acknowledgment section: stored separately, without its heading
                ack = sec['text'].replace('\n', '').strip()
    except Exception:
        ##best effort: report the paper and keep whatever was collected so far.
        ##`Exception` instead of a bare except so that Ctrl-C still stops the loop.
        print("Doesn't work for paper " + str(i))
    body_text = ' '.join(text)

    body[i] = body_text
    acknow[i] = ack



Doesn't work for paper 2467
Doesn't work for paper 2470
Doesn't work for paper 2476
Doesn't work for paper 2478
Doesn't work for paper 2479
Doesn't work for paper 2482
Doesn't work for paper 2492
Doesn't work for paper 2494
Doesn't work for paper 2495
Doesn't work for paper 2496
Doesn't work for paper 2497
Doesn't work for paper 2507
Doesn't work for paper 2508
Doesn't work for paper 2512
Doesn't work for paper 2515
Doesn't work for paper 2516
Doesn't work for paper 2517
Doesn't work for paper 2519
Doesn't work for paper 2521
Doesn't work for paper 2527
Doesn't work for paper 2528
Doesn't work for paper 2531
Doesn't work for paper 2532
Doesn't work for paper 2537
Doesn't work for paper 2541
Doesn't work for paper 2550
Doesn't work for paper 2554
Doesn't work for paper 2556
Doesn't work for paper 2558
Doesn't work for paper 2570
Doesn't work for paper 2575
Doesn't work for paper 2577
Doesn't work for paper 2578
Doesn't work for paper 2579
Doesn't work for paper 2581
Doesn't work for pap

Doesn't work for paper 4420
Doesn't work for paper 4427
Doesn't work for paper 4432
Doesn't work for paper 4433
Doesn't work for paper 4442
Doesn't work for paper 4444
Doesn't work for paper 4445
Doesn't work for paper 4449
Doesn't work for paper 4451
Doesn't work for paper 4454
Doesn't work for paper 4458
Doesn't work for paper 4465
Doesn't work for paper 4471
Doesn't work for paper 4473
Doesn't work for paper 4479
Doesn't work for paper 4482
Doesn't work for paper 4486
Doesn't work for paper 4488
Doesn't work for paper 4492
Doesn't work for paper 4496
Doesn't work for paper 4504
Doesn't work for paper 4506
Doesn't work for paper 4508
Doesn't work for paper 4510
Doesn't work for paper 4513
Doesn't work for paper 4530
Doesn't work for paper 4545
Doesn't work for paper 4546
Doesn't work for paper 4550
Doesn't work for paper 4569
Doesn't work for paper 4586
Doesn't work for paper 4590
Doesn't work for paper 4594
Doesn't work for paper 4606
Doesn't work for paper 4614
Doesn't work for pap

Doesn't work for paper 5543
Doesn't work for paper 5544
Doesn't work for paper 5550
Doesn't work for paper 5551
Doesn't work for paper 5552
Doesn't work for paper 5559
Doesn't work for paper 5564
Doesn't work for paper 5566
Doesn't work for paper 5567
Doesn't work for paper 5568
Doesn't work for paper 5572
Doesn't work for paper 5574
Doesn't work for paper 5578
Doesn't work for paper 5581
Doesn't work for paper 5584
Doesn't work for paper 5586


In [316]:
##spread each body text over the columns text_0, text_1, ... in slices of 32767 characters
main_body = {}
for m, item in enumerate(body):
    n = len(item) // 32767 + 1
    for i in range(n):
        column = main_body.setdefault('text_' + str(i), [''] * len(data))
        column[m] = item[i * 32767:(i + 1) * 32767]

In [322]:
##add the text slices and the acknowledgments as spreadsheet columns
for key, chunks in main_body.items():
    data[key] = chunks
data['Ack'] = acknow

In [323]:
##write the spreadsheet including the new columns to 'jmd_all_updated.xlsx'
data.to_excel('jmd_all_updated.xlsx')

'''
Manual work is needed to check, revise, and combine the extracted texts.
'''

In [193]:
##extract figures from pdf files
##issue: can only run for 20s (~6 papers). Need to split papers into multiple folders with a smaller number of pdfs

##folder containing pdfs: folder should contain only PDF files
path_in = "folder_with_pdfs"
##folder containing figures
path_out = "figures"
scipdf.parse_figures(path_in, output_folder=path_out)
##NOTE: a second call, scipdf.parse_figures(input_folder, output_folder=path), was removed:
##`input_folder` is not defined anywhere in this file (NameError) and `path` is the pdf folder,
##so it would have written the figures next to the source pdfs.