# Get files from Zotero

In [3]:
import os
from os.path import join, basename, splitext
import subprocess
from glob import glob
from shutil import copy
from random import shuffle, seed

from pyzotero import zotero

from secrets import *

In [2]:
output_dir = join('data', 'pdf')

This is the function that does the actual download of the PDFs using Zotero's API.

First, we need to get all of the collections in the Zotero Library. Collections are like sub-folders in the library. We will be looking for a collection with the given name.

Next, we will get all of the items in a collection with a given tag. We have been tagging items with a "Rel-Yes" or "Rel-No" when we determine if the item is relevant to the study or not.

Finally, we can get the PDF attachment associated with the item. An item may have more than one attachment (PDF, HTML, etc.) underneath it. However, for our current purpose we are only concerned with the PDF.

In [3]:
def get_pdfs(output_dir, collection_name, tag):

    # Creat the output directory
    path = join(output_dir, collection_name, tag)
    os.makedirs(path, exist_ok=True)

    # Connect to Zotero
    zot = zotero.Zotero(secrets, 'group', user_key)

    # Get the collection of interest and it's key
    collections = {c['data']['name']: c for c in zot.collections()}
    collection = collections[collection_name]
    key = collection['key']

    # Now get the items in the collection that have the given tag
    items = [d for d in zot.everything(zot.collection_items(key, tag=tag))]
    # items = [d for d in zot.collection_items(key, tag=tag, limit=3)]

    # Get the PDF attachment for each item and save it to the class directory
    for item in items:
        # An item's attachments
        children = [c for c in zot.children(item['key'])]

        # Just get the PDFs
        pdfs = [c for c in children
                if c['data'].get('contentType') == 'application/pdf']

        # Handle when there are no attachments
        if not children:
            print('\nMISSING DOCUMENTS {}\n'.format(item['key']))
        # Handle when there are no PDF attachments
        elif not pdfs:
            print('\nNO PDFs {}\n'.format(item['key']))
        # Handle when there is more than one PDF attachment
        elif len(pdfs) != 1:
            print('\nTOO MANY PDFs {}\n'.format(item['key']))
        # Save the PDF to the class directory
        else:
            doc = pdfs[0]
            print(doc['data']['filename'])
            zot.dump(doc['key'], '{}.pdf'.format(doc['key']), path)

In [4]:
get_pdfs(output_dir, 'RSet_N1', 'Rel-Yes', group_id, user_key)
get_pdfs(output_dir, 'RSet_N1', 'Rel-No', group_id, user_key)
get_pdfs(output_dir, 'RSet_N2', 'Rel-Yes', group_id, user_key)
get_pdfs(output_dir, 'RSet_N2', 'Rel-No', group_id, user_key)

Collevatti et al. - 2013 - Stability of Brazilian seasonally dry forests unde.pdf
Limitations-in-global-information-on-species-occurrences.pdf
Hernandez-Triana et al. - 2015 - DNA barcoding as an aid for species identification.pdf
Woodruff and Fasulo - Banana Root Borer, Cosmopolites sordidus (Germar)(.pdf
Barnes - 2010 - A remarkable case of fiddler crab, Uca spp., alpha.pdf
Burton et al. - 2012 - Hierarchical multi-species modeling of carnivore r.pdf
TWEET et al. - NAME-BEARING FOSSIL TYPE SPECIMENS AND TAXA NAMED .pdf
Rome et al. - 2015 - Caste differentiation and seasonal changes in Vesp.pdf
Mateo et al. - 2010 - Effects of the number of presences on reliability .pdf
FERRAZ et al. - 2012 - How species distribution mo-dels can improve cat c.pdf

NO PDFs 6URCJSNP

Brooks et al. - 2015 - Harnessing biodiversity and conservation knowledge.pdf


# Handle duplicate files

It turns out that some files have both labels (Rel-Yes and Rel-No). We need to remove these files from the data set.

Get all PDF file names for a particular class.

In [5]:
def file_names(root, cls):
    pattern = join('data', 'pdf', root, cls, '*.pdf')
    paths = glob(pattern)
    return [basename(p) for p in paths]

We move one copy of the file out of the way and delete the extra copy.

In [6]:
def move_duplicates(root):

    rel_yes = set(file_names(root, 'Rel-Yes'))
    rel_no = set(file_names(root, 'Rel-No'))
    duplicates = rel_yes & rel_no

    dup_root = join('data', 'pdf', 'duplicates')
    os.makedirs(dup_root, exist_ok=True)

    for duplicate in duplicates:
        print(duplicate)
        src = join('data', 'pdf', root, 'Rel-Yes', duplicate)
        dst = join(dup_root, duplicate)
        move(src, dst)
        src = join('data', 'pdf', root, 'Rel-No', duplicate)
        os.remove(src)

In [7]:
move_duplicates('RSet_N1')
move_duplicates('RSet_N2')

# Convert PDF files to text

Convert the PDF files to text. They will be placed into the given output directory. This utility depends on the external program "xpdf" specifically "pdftotext".

Extract the text from the PDF ad write it to a file.

In [8]:
def pdf_to_text(output_dir, pdf_path):
    txt_name = basename(pdf_path)
    txt_name = splitext(txt_name)[0] + '.txt'
    txt_path = join(output_dir, txt_name)
    cmd = "pdftotext '{}' '{}'".format(pdf_path, txt_path)
    try:
        subprocess.check_call(cmd, shell=True)
    except Exception:
        pass

Loop through all of the PDFs and convert them

In [11]:
def convert_pdfs(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    pattern = join(input_dir, '*.pdf')
    pdf_paths = glob(pattern)

    for i, pdf_path in enumerate(pdf_paths, 1):
        print('Converting:', pdf_path)
        pdf_to_text(output_dir, pdf_path)

In [12]:
convert_pdfs('data/pdf/RSet_N1/Rel-Yes', 'data/text/RSet_N1/Rel-Yes')
convert_pdfs('data/pdf/RSet_N1/Rel-No', 'data/text/RSet_N1/Rel-No')

convert_pdfs('data/pdf/RSet_N2/Rel-Yes', 'data/text/RSet_N2/Rel-Yes')
convert_pdfs('data/pdf/RSet_N2/Rel-No', 'data/text/RSet_N2/Rel-No')

Converting: data/pdf/RSet_N1/Rel-Yes/NFBFRJE3.pdf
Converting: data/pdf/RSet_N1/Rel-Yes/7H3FB5AR.pdf
Converting: data/pdf/RSet_N1/Rel-Yes/9FWZX3P8.pdf
Converting: data/pdf/RSet_N1/Rel-No/HUK6N8SE.pdf
Converting: data/pdf/RSet_N1/Rel-No/TDIW72GZ.pdf
Converting: data/pdf/RSet_N1/Rel-No/PQ8MRSVV.pdf
Converting: data/pdf/RSet_N2/Rel-Yes/U2BPDHGA.pdf
Converting: data/pdf/RSet_N2/Rel-Yes/DT5FH8G5.pdf
Converting: data/pdf/RSet_N2/Rel-Yes/DHW5ACU8.pdf
Converting: data/pdf/RSet_N2/Rel-No/VR7BXTHD.pdf
Converting: data/pdf/RSet_N2/Rel-No/IFJXWSER.pdf


# Split data into training, validation, and test sets

In [20]:
init_seed = 23578 # Fix the random seed for testing
test_split = 0.2  # How much of the data to use for testing
val_split = 0.2   # How much of the data to use for validation

Copy files into the appropriate directory.

In [21]:
def copy_files(paths, cls):
    """Move files into the correct dirctories."""

    shuffle(paths)

    val_idx = int(len(paths) * val_split)
    test_idx = val_split + int(len(paths) * test_split)

    for i, src in enumerate(paths):
        if i < val_idx:
            dir_name = 'val'
        elif i < test_idx:
            dir_name = 'test'
        else:
            dir_name = 'train'

        dst = join('data', dir_name, cls, basename(src))
        print(dst)
        copy(src, dst)

In [22]:
def split_files():
    """Split into training, validation, and test datasets."""

    os.makedirs('data/train/Rel-Yes', exist_ok=True)
    os.makedirs('data/train/Rel-No', exist_ok=True)
    os.makedirs('data/test/Rel-Yes', exist_ok=True)
    os.makedirs('data/test/Rel-No', exist_ok=True)
    os.makedirs('data/val/Rel-Yes', exist_ok=True)
    os.makedirs('data/val/Rel-No', exist_ok=True)

    rel_yes = glob('data/text/*/Rel-Yes/*.txt')
    rel_no = glob('data/text/*/Rel-No/*.txt')

    copy_files(rel_yes, 'Rel-Yes')
    copy_files(rel_no, 'Rel-No')

In [23]:
seed(init_seed)
split_files()

data/val/Rel-Yes/DHW5ACU8.txt
data/test/Rel-Yes/NFBFRJE3.txt
data/train/Rel-Yes/9FWZX3P8.txt
data/train/Rel-Yes/7H3FB5AR.txt
data/train/Rel-Yes/U2BPDHGA.txt
data/train/Rel-Yes/DT5FH8G5.txt
data/val/Rel-No/VR7BXTHD.txt
data/test/Rel-No/TDIW72GZ.txt
data/train/Rel-No/HUK6N8SE.txt
data/train/Rel-No/PQ8MRSVV.txt
data/train/Rel-No/IFJXWSER.txt
