# Scrape Press Briefings

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import json

I found this wonderful website, [The American Presidency Project](http://www.presidency.ucsb.edu/index.php), with loads of information on American presidents, including White House press briefing transcripts starting from the year 1979.

According to the site, *"The American Presidency Project (APP), non-profit and non-partisan, is the leading source of presidential documents on the internet... hosted at the University of California, Santa Barbara, has been a collaboration between John T. Woolley (UCSB) and Gerhard Peters (Citrus College) since 1999."*

I will be scraping press briefings from the site, starting from the year 1992. The press briefings area of the site is structured so that each year has a summary page listing all the press briefings held, with the President, the date and the title of each briefing. Each title is a link to the detailed transcript of the briefing.

I will scrape each yearly summary page to obtain the date, the president and the link to the transcript for every briefing, then drill down into the transcripts themselves. Finally, I will store the contents as one JSON file per year.


In [1]:
def scrape_urls(summaryUrl, timeout=30):
    """Scrape a summary page and return one metadata dict per link it lists.

    The links are read from the element that immediately follows the form
    named 'signingstatements' on the page.

    Args:
        summaryUrl: URL of the summary page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A list of dicts with the keys 'url' (absolute link built from the
        anchor's href), 'title' (anchor text), 'date' (text of the element
        just before the anchor's parent) and 'heldBy' (text of the element
        before that one).

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        ValueError: If the page does not contain the expected listing.
    """
    baseUrl = 'http://www.presidency.ucsb.edu/'
    response = requests.get(summaryUrl, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    form = soup.find('form', {'name': 'signingstatements'})
    listing = form.find_next_sibling() if form is not None else None
    if listing is None:
        raise ValueError('No press briefing listing found at ' + summaryUrl)

    dictList = []
    # href=True skips anchors that carry no link target (they would
    # otherwise raise KeyError on a['href']).
    for a in listing.find_all('a', href=True):
        linkParent = a.find_parent()
        dateNode = linkParent.find_previous_sibling()
        heldByNode = dateNode.find_previous_sibling()
        dictList.append({
            # hrefs on the page are relative ('../...'); make them absolute.
            'url': baseUrl + a['href'].replace('../', ''),
            'title': a.text,
            'heldBy': heldByNode.text,
            'date': dateNode.text,
        })
    return dictList



def scrape_pressBriefing(summaryDict, timeout=30):
    """Fetch the page at summaryDict['url'] and attach its paragraphs.

    The dict is modified in place: a 'briefing' key is added whose value is a
    list of {'paragraph': text} dicts, one per <p> element found on the page,
    in document order. Nothing is returned.

    Args:
        summaryDict: Dict with at least a 'url' key.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    response = requests.get(summaryDict['url'], timeout=timeout)
    # Fail loudly on 4xx/5xx rather than storing an error page as a briefing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    summaryDict['briefing'] = [{'paragraph': p.text} for p in soup.find_all('p')]

# This method calls scrape_urls for a given year to parse the press briefing summary page for that year, 
# then for each press briefing on that page calls
# scrape_pressBriefing to scrape the content of the briefing

def scrape_briefingForYear(year, outputDir='/Users/cooldude/DataScience/Metis/classProjects/Project_4/whiteHousePressBriefings_Data'):
    """Scrape every press briefing listed for `year` and save them as JSON.

    Calls scrape_urls on the summary page for the year, then
    scrape_pressBriefing on each entry it returns, and finally writes the
    whole list to 'whiteHousePressBriefings.<year>.json' inside `outputDir`.
    Progress is printed as each briefing is scraped.

    Args:
        year: Year to scrape; converted with str() when building the URL and
            the file name.
        outputDir: Directory the JSON file is written to. Defaults to the
            original hard-coded location; pass another path to run the
            scrape on a different machine.
    """
    import os

    # Create the target folder up front so a missing directory is detected
    # before the scraping work is done, not when the result is written.
    os.makedirs(outputDir, exist_ok=True)

    print('Scraping summary links for year:'+str(year))
    summaryDicts = scrape_urls('http://www.presidency.ucsb.edu/press_briefings.php?year='+str(year)+'&Submit=DISPLAY')

    for summaryDict in summaryDicts:
        print('Scraping briefing for '+ summaryDict['date'])
        scrape_pressBriefing(summaryDict)

    outputPath = os.path.join(outputDir, 'whiteHousePressBriefings.'+str(year)+'.json')
    with open(outputPath, 'w') as outfile:
        # Serialize straight into the file instead of building the whole
        # JSON string in memory first.
        json.dump(summaryDicts, outfile)

    print('Done with ',year)
        
    

        

In [173]:
# Scrape newest to oldest. range() excludes its stop value, so the stop must
# be 1991 for the year 1992 to be included.
for year in range(2017, 1991, -1):
    scrape_briefingForYear(year)

Scraping summary links for year:2017
Scraping briefing for January 3, 2017
Scraping briefing for January 4, 2017
Scraping briefing for January 5, 2017
Scraping briefing for January 6, 2017
Scraping briefing for January 9, 2017
Scraping briefing for January 10, 2017
Scraping briefing for January 11, 2017
Scraping briefing for January 12, 2017
Scraping briefing for January 12, 2017
Scraping briefing for January 13, 2017
Scraping briefing for January 17, 2017
Scraping briefing for January 21, 2017
Scraping briefing for January 31, 2017
Scraping briefing for February 1, 2017
Scraping briefing for February 2, 2017
Scraping briefing for February 3, 2017
Scraping briefing for February 6, 2017
Scraping briefing for February 7, 2017
Scraping briefing for February 8, 2017
Scraping briefing for February 9, 2017
Scraping briefing for February 9, 2017
Scraping briefing for February 14, 2017
Scraping briefing for February 17, 2017
Scraping briefing for February 19, 2017
Scraping briefing for Februar