In [325]:
import pandas as pd
import numpy as np
from xml.etree.ElementTree import parse

import spacy

In [60]:
document = parse('./traveladvisory.xml') 

In [61]:
print(document)

<xml.etree.ElementTree.ElementTree object at 0x10b0559d0>


In [62]:
print(dir(document))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_root', '_setroot', 'find', 'findall', 'findtext', 'getiterator', 'getroot', 'iter', 'iterfind', 'parse', 'write', 'write_c14n']


In [63]:
root = document.getroot()

In [64]:
root.attrib

{}

In [495]:
[child.tag for child in root[7].iter()]

['{http://www.w3.org/2005/Atom}entry',
 '{http://www.w3.org/2005/Atom}title',
 '{http://www.w3.org/2005/Atom}author',
 '{http://www.w3.org/2005/Atom}email',
 '{http://www.w3.org/2005/Atom}name',
 '{http://www.w3.org/2005/Atom}uri',
 '{http://www.w3.org/2005/Atom}link',
 '{http://www.w3.org/2005/Atom}category',
 '{http://www.w3.org/2005/Atom}category',
 '{http://www.w3.org/2005/Atom}category',
 '{http://www.w3.org/2005/Atom}summary',
 '{http://www.w3.org/2005/Atom}id',
 '{http://www.w3.org/2005/Atom}published',
 '{http://www.w3.org/2005/Atom}updated']

In [None]:
for child in root:
    print(child.tag, child.attrib)

# Extract text
## Inspect structure

In [None]:
# advisory path
for item in document.iterfind('http://www.w3.org/2005/Atom}entry'):
    print(item)

In [None]:
for item in document.iterfind('{http://www.w3.org/2005/Atom}entry'):
    print(item.findtext('{http://www.w3.org/2005/Atom}title'))

In [None]:
for item in document.iterfind('{http://www.w3.org/2005/Atom}entry'):
    print(item.findtext('{http://www.w3.org/2005/Atom}summary'))

In [None]:
for entry in document.iterfind('{http://www.w3.org/2005/Atom}entry'):
    for author in entry.findall('{http://www.w3.org/2005/Atom}author'):
        print(author.findtext('{http://www.w3.org/2005/Atom}name'))

In [None]:
for entry in document.iterfind('{http://www.w3.org/2005/Atom}entry'):
    for category in entry.iter('{http://www.w3.org/2005/Atom}category'):
        print(category.attrib)

In [None]:
for entry in document.iterfind('{http://www.w3.org/2005/Atom}entry'):
    for category in entry.iter('{http://www.w3.org/2005/Atom}category'):
        name = category.get('label')
        term = category.get('term')
        print(name, term)
        

In [None]:
for item in document.iterfind('{http://www.w3.org/2005/Atom}entry'):
    for link in (item.iter('{http://www.w3.org/2005/Atom}link')):
        print(link.get('href'))

## Create dataframe

In [202]:
title = []
link = []
summary = []
date_published = []
date_updated = []


In [203]:
for item in document.iterfind('{http://www.w3.org/2005/Atom}entry'):
    title.append(item.findtext('{http://www.w3.org/2005/Atom}title'))
    summary.append(item.findtext('{http://www.w3.org/2005/Atom}summary'))
    date_published.append(item.findtext('{http://www.w3.org/2005/Atom}published'))
    date_updated.append(item.findtext('{http://www.w3.org/2005/Atom}updated'))
    
    for links in (item.iter('{http://www.w3.org/2005/Atom}link')):
        link.append(links.get('href'))



In [231]:
df = pd.DataFrame({'title': title, 'link': link, 'summary': summary, 
                   'date_published': date_published, "date_updated":date_updated})

In [232]:
df.head(3)

Unnamed: 0,title,link,summary,date_published,date_updated
0,Liechtenstein - Level 1: Exercise Normal Preca...,https://travel.state.gov/content/travel/en/tra...,<p><b>Global Health Advisory: Do Not Travel. A...,2019-02-19T15:59:21.369Z,2020-03-24T19:27:00.135Z
1,North Macedonia - Level 1: Exercise Normal Pre...,https://travel.state.gov/content/travel/en/tra...,<p><b>Global Health Advisory: Do Not Travel. A...,2019-02-19T15:59:21.369Z,2020-03-24T20:17:25.924Z
2,Nauru - Level 1: Exercise Normal Precautions,https://travel.state.gov/content/travel/en/tra...,<p><b>Global Health Advisory: Do Not Travel. A...,2019-02-19T15:59:21.369Z,2020-03-24T20:06:39.217Z


In [233]:
# ran into incorrect lengths because published and updated were in the summary text as well
print(len(date_published))
print(len(title))
print(len(date_updated))

209
209
209


In [319]:

terms = []
for entry in document.iterfind('{http://www.w3.org/2005/Atom}entry'):
    for category in entry.iter('{http://www.w3.org/2005/Atom}category'):
        label = category.get('label')
        term = category.get('term')
        tuples = (label, term)
        terms.append(tuples)

        


In [293]:
zipObj = zip(terms)
dictionary = dict(zipObj)


In [320]:
dictionary['Country-Tag'] = 

{'Country-Tag': 'CF',
 'Keyword': 'advisory',
 'Threat-Level': 'Level 2: Exercise Increased Caution'}

# Text Cleaning

In [337]:
df['summary'][3]

'<p><b>Global Health Advisory: Do Not Travel. Avoid all international travel due to the global impact of&nbsp;<a href="https://travel.state.gov/content/travel/en/traveladvisories/ea/travel-advisory-alert-global-level-4-health-advisory-issue.html">COVID-19</a>.</b></p>\n<p>Exercise normal precautions in Palau.</p>\n<p>Read the Safety and Security section on the <a href="https://travel.state.gov/content/travel/en/international-travel/International-Travel-Country-Information-Pages/Palau.html">country information page</a>.</p>\n<p>If you decide to travel to Palau:</p>\n<ul>\n<li>Enroll in the <a href="https://step.state.gov/step/">Smart Traveler Enrollment Program</a> (<a href="https://step.state.gov/step/">STEP</a>) to receive security messages and make it easier to locate you in an emergency.</li>\n<li>Follow the Department of State on <a href="https://www.facebook.com/travelgov/">Facebook</a> and <a href="https://twitter.com/TravelGov">Twitter</a>.</li>\n<li>U.S. citizens who travel abr

In [356]:
import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [357]:
test = remove_tags(df['summary'][3])

In [360]:
test = test.replace('\n', '').replace('&nbsp;', ' ')

test

'Global Health Advisory: Do Not Travel. Avoid all international travel due to the global impact ofCOVID-19.Exercise normal precautions in Palau.Read the Safety and Security section on the country information page.If you decide to travel to Palau:Enroll in the Smart Traveler Enrollment Program (STEP) to receive security messages and make it easier to locate you in an emergency.Follow the Department of State on Facebook and Twitter.U.S. citizens who travel abroad should always have a contingency plan for emergency situations. Review the Traveler’s Checklist.Last Updated: Reissued after periodic review without changes.'

In [363]:
# function to remove html tags and to clean up script

def clean_text(texts):
    clean_texts = []
    for text in texts:
        text = TAG_RE.sub('', text) #includes url links as well
        text = text.replace('\n', '').replace('&nbsp;', ' ')
        clean_texts.append(text)
    return clean_texts
        
    
    

In [364]:
df['summary'] = clean_text(df['summary'] )

In [369]:
#inspect
df['summary'][5]



## Alternative parsing method

In [373]:
!pip install xmltodict

Collecting xmltodict
  Downloading https://files.pythonhosted.org/packages/28/fd/30d5c1d3ac29ce229f6bdc40bbc20b28f716e8b363140c26eff19122d8a5/xmltodict-0.12.0-py2.py3-none-any.whl
Installing collected packages: xmltodict
Successfully installed xmltodict-0.12.0
Note: you may need to restart the kernel to use updated packages.


In [374]:
import xmltodict

In [375]:
with open('./traveladvisory.xml') as fd:
    advisory_dictionary = xmltodict.parse(fd.read())
    

In [384]:
advisory_dictionary['feed']['entry'][2]

OrderedDict([('title',
              OrderedDict([('@type', 'html'),
                           ('#text',
                            'Nauru - Level 1: Exercise Normal Precautions')])),
             ('author',
              OrderedDict([('email', 'none'),
                           ('name', 'TSG'),
                           ('uri', 'https://travel.state.gov')])),
             ('link',
              OrderedDict([('@href',
                            'https://travel.state.gov/content/travel/en/traveladvisories/traveladvisories/nauru-travel-advisory.html')])),
             ('category',
              [OrderedDict([('@label', 'Country-Tag'), ('@term', 'NR')]),
               OrderedDict([('@label', 'Keyword'), ('@term', 'advisory')]),
               OrderedDict([('@label', 'Threat-Level'),
                            ('@term',
                             'Level 1: Exercise Normal Precautions')])]),
             ('summary',
              OrderedDict([('@type', 'html'),
                    

# Reflections
4 hours - 

This was my first time working with XML. I found that the most time consuming part was the inspection of tree structure and in particular getting the information within a nested subtree. I'm sure there are faster approaches out there, but it definitely helps to view/know the structure.

Other - text cleaning, paragraph structure lost. Cleaning scripts may depend on what type of analysis - word emebedding or paragraph/document embedding.

Useful resources:
- <a href="https://towardsdatascience.com/parsing-xml-data-in-python-da26288280e1">basic XML parsing</a>
- <a href="https://docs.python.org/2/library/xml.etree.elementtree.html">documentation </a>
- <a href="https://tutorialedge.net/python/removing-html-from-string/">html script cleaning</a>
- <a href="https://docs.python-guide.org/scenarios/xml/">xmltodict</a>

# JSON /API

Veteran Affairs 
https://www.va.gov/webservices/press/documentation/releases.cfm
- Documentation page not maintained, 500 internal errors when looking at demos (Perhaps a common problem)

OpenFEC 
- API for Federal Election Commission filings
- https://api.open.fec.gov/developers/

In [387]:
import requests

In [388]:
response = requests.get("https://developer.nrel.gov/api/alt-fuel-stations/v1/nearest.json?api_key=Z9SNbTyUefPgAdhjZfqPIrCUSqTtfTYlVMeliTN2&location=Denver+CO")
print(response.status_code)

200


In [389]:
import json

In [432]:
# committee id fundings
ted_cruz = requests.get("https://api.open.fec.gov/v1/candidate/S2TX00312/filings/?party=REP&most_recent=true&max_receipt_date=01%2F01%2F2020&per_page=20&sort=-receipt_date&sort_hide_null=false&api_key=Z9SNbTyUefPgAdhjZfqPIrCUSqTtfTYlVMeliTN2&sort_nulls_last=false&sort_null_only=false&page=1")


td_data = ted_cruz.json()

In [433]:
print(td_data)

{'api_version': '1.0', 'results': [{'committee_name': None, 'senate_personal_funds': None, 'cash_on_hand_beginning_period': None, 'candidate_id': 'S2TX00312', 'sub_id': '4120720181620534585', 'debts_owed_by_committee': None, 'cycle': 2018, 'net_donations': None, 'house_personal_funds': None, 'total_receipts': None, 'total_independent_expenditures': None, 'document_type_full': None, 'treasurer_name': None, 'update_date': '2018-12-07T21:00:34', 'debts_owed_to_committee': None, 'total_communication_cost': None, 'request_type': None, 'beginning_image_number': '201812069134971099', 'total_individual_contributions': None, 'ending_image_number': '201812069134971100', 'html_url': 'http://docquery.fec.gov/cgi-bin/forms/S2TX00312/1299312/', 'is_amended': False, 'state': 'TX', 'fec_url': 'http://docquery.fec.gov/dcdev/posted/1299312.fec', 'document_description': 'Statement of candidacy 2018', 'cash_on_hand_end_period': None, 'receipt_date': '2018-12-06T00:00:00', 'report_type': None, 'csv_url': '

In [428]:
## No filings with the query

td_df = pd.DataFrame(td_data["results"])

In [429]:
td_df

Unnamed: 0,total_communication_cost,pdf_url,debts_owed_to_committee,amendment_version,office,committee_type,beginning_image_number,cash_on_hand_beginning_period,house_personal_funds,form_type,...,most_recent_file_number,senate_personal_funds,party,committee_name,previous_file_number,report_type,treasurer_name,candidate_name,net_donations,total_receipts
0,,http://docquery.fec.gov/pdf/099/20181206913497...,,6,S,,201812069134971099,,,F2,...,1299312,,REP,,1272203,,,"CRUZ, RAFAEL EDWARD TED",,


### committee id fundings

resource: https://github.com/NickyThreeNames/openFEC/blob/master/FECOpenAPI.ipynb

In [414]:
url = "https://api.open.fec.gov/v1/committee/C00492785/filings/"

params = {'party': "REP",
         "api_key": key,
         }

In [418]:
data = requests.get("https://api.open.fec.gov/v1/committee/C00492785/filings/?per_page=20&sort=-receipt_date&sort_hide_null=false&api_key=Z9SNbTyUefPgAdhjZfqPIrCUSqTtfTYlVMeliTN2&sort_nulls_last=false&sort_null_only=false&page=1").json()

data['pagination']['count']

190

In [422]:
data

{'api_version': '1.0',
 'pagination': {'count': 190, 'pages': 10, 'per_page': 20, 'page': 1},
 'results': [{'treasurer_name': None,
   'net_donations': None,
   'candidate_name': None,
   'total_communication_cost': None,
   'cycle': 2020,
   'csv_url': 'http://docquery.fec.gov/csv/454/1399454.csv',
   'file_number': 1399454,
   'beginning_image_number': '202004159219170261',
   'committee_name': 'TED CRUZ FOR SENATE',
   'html_url': 'http://docquery.fec.gov/cgi-bin/forms/C00492785/1399454/',
   'debts_owed_to_committee': 0.0,
   'total_individual_contributions': None,
   'party': None,
   'report_type': 'Q1',
   'house_personal_funds': None,
   'most_recent_file_number': 1399454,
   'committee_type': 'S',
   'document_type_full': None,
   'election_year': None,
   'sub_id': '4041520201736791057',
   'fec_url': 'http://docquery.fec.gov/dcdev/posted/1399454.fec',
   'office': 'S',
   'committee_id': 'C00492785',
   'pages': 1983,
   'most_recent': True,
   'amendment_chain': [1399454.0]

In [424]:
df = pd.DataFrame(data["results"])

In [426]:
df.head()

Unnamed: 0,treasurer_name,net_donations,candidate_name,total_communication_cost,cycle,csv_url,file_number,beginning_image_number,committee_name,html_url,...,cash_on_hand_beginning_period,total_disbursements,amendment_indicator,form_type,receipt_date,total_independent_expenditures,coverage_end_date,pdf_url,primary_general_indicator,cash_on_hand_end_period
0,,,,,2020,http://docquery.fec.gov/csv/454/1399454.csv,1399454.0,202004159219170261,TED CRUZ FOR SENATE,http://docquery.fec.gov/cgi-bin/forms/C0049278...,...,1667690.8,1161990.26,N,F3,2020-04-15T00:00:00,,2020-03-31T00:00:00,http://docquery.fec.gov/pdf/261/20200415921917...,P,2115100.83
1,,,,,2020,http://docquery.fec.gov/csv/702/1378702.csv,1378702.0,202001319185070820,TED CRUZ FOR SENATE,http://docquery.fec.gov/cgi-bin/forms/C0049278...,...,1434045.74,1046803.88,N,F3,2020-01-31T00:00:00,,2019-12-31T00:00:00,http://docquery.fec.gov/pdf/820/20200131918507...,P,1667690.8
2,,,,,2020,http://docquery.fec.gov/csv/191/1358191.csv,1358191.0,201910159164218683,TED CRUZ FOR SENATE,http://docquery.fec.gov/cgi-bin/forms/C0049278...,...,804524.07,627114.18,N,F3,2019-10-15T00:00:00,,2019-09-30T00:00:00,http://docquery.fec.gov/pdf/683/20191015916421...,P,1434045.74
3,,,,,2018,http://docquery.fec.gov/csv/493/1349493.csv,1349493.0,201908239163101526,TED CRUZ FOR SENATE,http://docquery.fec.gov/cgi-bin/forms/C0049278...,...,262799.92,485636.43,A,F3,2019-08-23T00:00:00,,2018-12-31T00:00:00,http://docquery.fec.gov/pdf/526/20190823916310...,P,157958.64
4,,,,,2020,http://docquery.fec.gov/csv/495/1349495.csv,1349495.0,201908239163102654,TED CRUZ FOR SENATE,http://docquery.fec.gov/cgi-bin/forms/C0049278...,...,,,,F99,2019-08-23T00:00:00,,,http://docquery.fec.gov/pdf/654/20190823916310...,,


# Reflections

Pretty clean, no need for additional text cleaning. Straight forward and well maintained documentation for openFEC with website and GitHub, as opposed to Veteran's Affairs. When converting json dict to dataframe, lost heading metadata. Any way to maintain that?

Would be helpful to have:
- initial view of results, like a counter to make sure the API made the call or there are results that match
- error handling in general, used DEM instead of REP for Ted Cruz search param, call completed but there was no data to match
- how are keys to be maintained? By user or by software?

## PDF from url

In [439]:
## from openFEC API - list of pdfs
# pip install textract & PyPDF2

# import packages
import textract
from PyPDF2 import PdfFileWriter, PdfFileReader

In [447]:
sample_url = df["pdf_url"][1]

In [473]:
sample_url

'http://docquery.fec.gov/pdf/820/202001319185070820/202001319185070820.pdf'

In [462]:
## initial error - needed to use urllib, pypdf2 is looking for local files
# https://stackoverflow.com/questions/9751197/opening-pdf-urls-with-pypdf


In [None]:
import urllib.request

pdf_data = urllib.request.urlopen(sample_url).read()

# another error "Remote end closed connection without response"
# same error with requests.get(sample_url, stream = True) 

In [477]:
## try another pdf link

nato_url = "https://www.nato.int/nato_static_fl2014/assets/pdf/2020/3/pdf_publications/sgar19-en.pdf"

In [483]:
remoteFile = urllib.request.urlopen(nato_url).read()

In [486]:
import requests
response = requests.get(nato_url)
print(response.text)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Notes - messy foray into pdf from url

List of pdf urls in FEC dataset, but unable to access via requests or urllib. Seems to be too large (over 1000 pages per doc). Tried with a link to a different pdf, a NATO report with over 100 pages, similar findings. 

Of course in a tool, the pdf is most likely to be uploaded. Then again, it might be that the user finds all the links in a documents and tries to extract text from that link. 

Maybe worth understanding the upload process a bit. It should be easy enough to text clean if the user can identify what kind of analysis is of interest. Other thoughts, user:

- may want to remove html script, but would lose links
- may want to keep paragraph structure or sentences
- how to do text cleaning that keeps all types of structure? If a dataframe, a column = tokenized_sent, tokenized_words -- user then has the option to use both as necessary. So two different tracks for text cleaning.
