# Read and Use the Zotero Export

In [1]:
import requests
import zlib
import json

k = 'zotero_records-1.0.2.json.gzip'
      
r = requests.get("http://ds.civicknowledge.org.s3.amazonaws.com/sandiegodata.org/sez_zotero_papers/"+k)
r.raise_for_status()

d = json.loads(zlib.decompress(r.content))

In [11]:
from itertools import islice 
for k, v in islice(d.items(),15):
    pages = v.get('_pages') or []
  
    # Get full text of one record
    txt = '\n'.join(e['text'] for e in pages)
    
    print(k, len(pages), len(txt), v['data'].get('title'))

PYPXWAIV 34 2271 Moberg and Tarko - 2014 - Why No Chinese Miracle in Africa Special Economic.pdf
ZNLN5LP7 0 0 Why No Chinese Miracle in Africa? Special Economic Zones and Liberalization Avalanches
YSJZIFHR 0 0 Does development zone have spillover effect in China?
BGFIKR9I 13 69166 Valerio Mendoza - 2016 - Preferential policies and income inequality Evide.pdf
MEQK439C 0 0 Preferential policies and income inequality: Evidence from Special Economic Zones and Open Cities in China
98DRB7TG 40 60785 Gebremariam and Feyisa - ASSESSING THE PERFORMANCE OF INDUSTRIAL PARKS (IPS.pdf
83BKNBA7 0 0 ASSESSING THE PERFORMANCE OF INDUSTRIAL PARKS (IPS) IN ETHIOPIA: THE CASE OF BOLE LEMI 1, EASTERN INDUSTRY ZONE AND HAWASSA INDUSTRIAL PARKS
KF8F2MLY 8 62808 Giannecchini and Taylor - 2018 - The eastern industrial zone in Ethiopia Catalyst .pdf
EXPWL9HX 0 0 The eastern industrial zone in Ethiopia: Catalyst for development?
KZWBXW3L 23 62465 Kuznetsov and Kuznetsova - 2019 - The success and failure of Russ

In [15]:
%%time
# Use a spacy model for English to extract entities. 

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import re

import en_core_web_lg
# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_lg.load()

# Add the component to the pipeline
nlp.add_pipe('sentencizer')

entities = []
for k, v in d.items():
    pages = v.get('_pages') or []
    
    if len(pages) > 0:
        text = '\n'.join(e['text'] for e in pages)

        doc = nlp(text)

        # Create list of word tokens
        token_list = [tok for tok in doc if tok.text not in STOP_WORDS]

        entities.extend([(k, i, i.label_, i.label) for i in doc.ents if i.label_ not in ('CARDINAL','DATE') ])
      


In [37]:
import re, string; 
pat = re.compile('[\W_]+')
  
ents = set([ (pat.sub(' ',str(e[1]).lower()).strip(),str(e[2])) for e in entities])

set([ e[1] for e in ents ])

{'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [43]:
# Geopolitical Entities
list(set([ e[0] for e in ents if e[1] == 'GPE' ]))[:20]


['',
 'k v',
 'v y',
 'mi',
 'dominican republic n equal',
 'argentina',
 'subic bay',
 'washington',
 'cheesman',
 'brazil',
 'burundi',
 'the united kingdom',
 'eu',
 'hawassa ip',
 'lanka',
 'the soviet union',
 'australia',
 'bangladesh',
 'tatarstan',
 'ningbo']