In [None]:
import glob, io, os, re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from multiprocessing import Manager, Pool
from tqdm import tqdm

# pdf_paths = glob.glob('data/2018/*.pdf')
def find_paper(input_path):
    """Return the PDFs matching a glob whose text mentions preregistration.

    Each matching file is converted to text with pdfminer, lower-cased, and
    searched for a fixed list of preregistration-related keywords.

    Args:
        input_path: Glob pattern selecting the PDF files to scan,
            e.g. ``'data/2018/*.pdf'``.

    Returns:
        A list of the file paths whose extracted text contains at least one
        keyword, in the order they were scanned (ascending modification
        time).

    Files that cannot be opened or parsed are reported on stdout and skipped;
    they never abort the scan.
    """
    # Substrings looked for in the lower-cased document text.
    # NOTE(review): 'osf' is a very short substring and may produce false
    # positives inside unrelated words -- confirm this is acceptable.
    keywords = (
        'preregister',
        'pre-register',
        'pre-registration',
        'preregistration',
        'open science framework',
        'osf',
        'aspredict',
    )

    pdf_paths = sorted(glob.glob(input_path), key=os.path.getmtime)
    preregis_paths = []

    for path in tqdm(pdf_paths):
        try:
            rsrcmgr = PDFResourceManager()
            # Context managers guarantee the file and the text buffer are
            # released even when pdfminer fails part-way through a document.
            with open(path, 'rb') as pdf_file, io.StringIO() as retstr:
                device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                       laparams=LAParams())
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(pdf_file):
                        interpreter.process_page(page)
                    # Read the buffer before it is closed on leaving `with`.
                    text = retstr.getvalue().lower()
                finally:
                    device.close()
        except Exception as exc:
            # Best effort: a broken or unreadable PDF must not stop the scan.
            # `Exception` (not a bare except) lets Ctrl-C still interrupt.
            print("Alert: %s is not available. (%s)" % (path, exc))
            continue

        # Every keyword is tested against the text. The previous expression
        # contained a bare string literal, which is always truthy, so every
        # readable PDF was reported as a match.
        if any(keyword in text for keyword in keywords):
            print("Found experiment ...", path)
            preregis_paths.append(path)

    return preregis_paths

    

In [None]:
def _scan_year(banner, pattern):
    """Print the section banner, then scan the PDFs matching `pattern`."""
    print(banner)
    return find_paper(pattern)


# Scan each year's folder in turn, newest first; every result keeps its own
# module-level name.
paper_2021 = _scan_year("############# 2021 Papers #################",
                        'data/2021/*.pdf')

paper_2020 = _scan_year("############# 2020 Papers #################",
                        'data/2020/*.pdf')

paper_2019 = _scan_year("############# 2019 Papers #################",
                        'data/2019/*.pdf')

paper_2018 = _scan_year("############# 2018 Papers #################",
                        'data/2018/*.pdf')