# Fetching annotations from Zotero

In [1]:
import yaml
from pyzotero import zotero
from pathlib import Path
from pyzotero.zotero_errors import ResourceNotFound
from dataclasses import asdict
import fitz 
from pdf_annot.extract import annotations

In [2]:
# Load Credentials, which, BTW, are not included in the GIT repo ... 
# You'll have to get your own. 

with open('credentials.yaml') as f:
    cred = yaml.safe_load(f)
    
# Get all of the items from the group

zot = zotero.Zotero(cred['userId'], 'group', cred['key'])

items = { item['data']['key']:item for item in zot.everything(zot.items()) }
attach = { k:v for k,v in items.items() if v['data'].get('contentType') }

assert zot.count_items() == len(items)

In [3]:
# Fetch each of the PDFS, but only if they aren't already cached 
# Extract annotations and text and pages

fitz.TOOLS.mupdf_display_errors(False) # Turn off "mupdf: invalid page object" warnings

for i, (key, item) in enumerate(items.items()):
    try:
        key = item['data']['key']
        typ = item['data']['itemType']
        ct = item['data'].get('contentType')
    except Exception as e:
        print(key, e)
        continue 
        
    if  ct and typ == 'attachment':
        fn = item['data']['filename']
        p = Path(f'pdf/{fn}')
        
        if ct == 'text/html':
            p = Path(f'pdf/{fn}.zip')
        else:
            p = Path(f'pdf/{fn}')
        
        if not p.exists():
            print(f"Fetch: {fn}")
            try:
                zot.dump(key, path='pdf')
            except ResourceNotFound as e:
                print(e)
            
        item['_path'] = str(p)
        
        try:
            item['_annotations'] = [asdict(e) for e in annotations(p)]
        except TypeError as e:
            pass # annotations didn't return dataclass to asdict
        except Exception as e:
            print(key, type(e), e)
        
        if(p.exists()):
            doc = fitz.open(p) 

            item['_pages'] = [{'pageno': i,  'text': page.get_text("text"), 'html': None }
                          for i, page in enumerate(doc)]
        else:
            item['_pages'] =  None

        
    else:
        item['_pages'] =  None
        item['_path'] = None
   

Fetch: special-economic-zones-progress-emerging-challenges-and-future-directions.html

Code: 404
URL: https://api.zotero.org/groups/3865474/items/AXKHW636/file
Method: GET
Response: Not found
AXKHW636 <class 'RuntimeError'> cannot open pdf/special-economic-zones-progress-emerging-challenges-and-future-directions.html.zip: No such file or directory


In [5]:
# Write the data as JSON to a ZIP file
import json 
import gzip

with open('zotero_records.json', 'w') as f:
    f.write(json.dumps(items))    


In [6]:
# Store the file into S3
import boto3
import zlib

session = boto3.Session(profile_name=cred['s3profile'])
client = session.client('s3')
  
k = 'sandiegodata.org/sez_zotero_papers/zotero_records-1.0.3.json.gzip'
    
client.put_object(Body=zlib.compress(json.dumps(items).encode('utf-8')), 
                  Bucket='ds.civicknowledge.org', 
                  Key=k, 
                  ACL='public-read')
    



{'ResponseMetadata': {'RequestId': 'CATZXAXBKFWVCA5Z',
  'HostId': 'BITsdywmLc+BThRwvy90Aau2IZA18pr9osMN2EoptBCw51iQioEWg8686X3etTpRva/1NbYsc+s=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'BITsdywmLc+BThRwvy90Aau2IZA18pr9osMN2EoptBCw51iQioEWg8686X3etTpRva/1NbYsc+s=',
   'x-amz-request-id': 'CATZXAXBKFWVCA5Z',
   'date': 'Mon, 16 Aug 2021 22:30:44 GMT',
   'etag': '"b4e84bf5a4581fecece895c3eca61f95"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"b4e84bf5a4581fecece895c3eca61f95"'}

In [7]:
# Saving this for later, incase we discover that the text is garbled. 

from operator import itemgetter 
from itertools import groupby 

def recover(words, rect):
    """ Word recovery.

    Notes:
        Method 'get_textWords()' does not try to recover words, if their single
        letters do not appear in correct lexical order. This function steps in
        here and creates a new list of recovered words.
    Args:
        words: list of words as created by 'get_textWords()'
        rect: rectangle to consider (usually the full page)
    Returns:
        List of recovered words. Same format as 'get_text_words', but left out
        block, line and word number - a list of items of the following format:
        [x0, y0, x1, y1, "word"]
    """
    # build my sublist of words contained in given rectangle
    mywords = [w for w in words if fitz.Rect(w[:4]) in rect]

    # sort the words by lower line, then by word start coordinate
    mywords.sort(key=itemgetter(3, 0))  # sort by y1, x0 of word rectangle

    # build word groups on same line
    grouped_lines = groupby(mywords, key=itemgetter(3))

    words_out = []  # we will return this

    # iterate through the grouped lines
    # for each line coordinate ("_"), the list of words is given
    for _, words_in_line in grouped_lines:
        for i, w in enumerate(words_in_line):
            if i == 0:  # store first word
                x0, y0, x1, y1, word = w[:5]
                continue

            r = fitz.Rect(w[:4])  # word rect

            # Compute word distance threshold as 20% of width of 1 letter.
            # So we should be safe joining text pieces into one word if they
            # have a distance shorter than that.
            threshold = r.width / len(w[4]) / 5
            if r.x0 <= x1 + threshold:  # join with previous word
                word += w[4]  # add string
                x1 = r.x1  # new end-of-word coordinate
                y0 = max(y0, r.y0)  # extend word rect upper bound
                continue

            # now have a new word, output previous one
            words_out.append([x0, y0, x1, y1, word])

            # store the new word
            x0, y0, x1, y1, word = w[:5]

        # output word waiting for completion
        words_out.append([x0, y0, x1, y1, word])

    return words_out

