In [4]:
#Import relevant libraries
import matplotlib.pyplot as plt
import numpy as np, pandas as pd
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import sqlalchemy
import re
import urllib.request

## Importing ARPA-E project information

For my research, I wanted to look at all the projects funded by ARPA-E. To do this, I scraped their website to build a dataframe of all the projects along with their project information. From reading ARPA-E's funding reports, I also noted that the "Year" listed in the project information differs from the year in which the project was actually funded; I call the latter the "sponsoring year". I assumed that the sponsoring year is the year in which the specific project showed up on the ARPA-E website.

In [8]:
def _labelled_text(page, label):
  """Return the text of the element wrapping the ``label`` span, or None if the span is absent."""
  marker = page.find("span", string=label)
  return marker.parent.text if marker is not None else None


def _value_after_label(page, label):
  """Return the text between the first and second colon of a labelled field, or None if absent."""
  text = _labelled_text(page, label)
  return text.strip().split(':')[1] if text is not None else None


def _section_body(page, heading):
  """Return the text of the first <div> beside the <h3> ``heading``, or None if either is absent."""
  title = page.find("h3", string=heading)
  if title is None:
    return None
  body = title.parent.find('div')
  return body.text if body is not None else None


def _parse_project(page):
  """Turn one parsed project page into a row for the projects table.

  Returns the row as a list ordered like the dataframe columns built in
  ``scrape``, or None when the page shows no dollar award (such projects
  are skipped). Optional fields that are missing from the page are stored
  as an empty list, which is the placeholder this notebook uses throughout.
  """
  #Get company name for the project
  company = page.find('h1', attrs={"class": "inner-page-title"}).text.strip()
  print(company)

  tag_box = page.find('div', attrs={"class": "tag"})
  tags = tag_box.text.strip().split('\n') if tag_box is not None else []

  #Get critical need that the project addresses (stored unstripped)
  need = _section_body(page, "Critical Need:")
  if need is None:
    need = []

  #Get the advantage that the project promises; guarded the same way as
  #the need so a page without this heading no longer aborts the scrape.
  #The trailing space in the heading text is part of the page markup.
  advantage = _section_body(page, "Project Innovation + Advantages: ")
  advantage = advantage.strip() if advantage is not None else []

  #Projects without a dollar award are skipped entirely
  award_text = _labelled_text(page, "Award:")
  if award_text is None or '$' not in award_text:
    return None
  award = award_text.strip().split('$')[1].strip()

  #Location is "City, State"; both stay empty when there is no comma
  location = _value_after_label(page, "Location:")
  if location is not None and ',' in location:
    pieces = location.split(',')
    location_city = pieces[0].strip()
    location_state = pieces[1].strip()
  else:
    location_city = []
    location_state = []

  #Project year is the last four characters of the start of the project term
  term = _value_after_label(page, "Project Term:")
  project_year = term.split('-')[0].strip()[-4:] if term is not None else []

  status = _value_after_label(page, "Status:")
  status = status.strip() if status is not None else []

  #The bottom section carries the release date (used as the funding year)
  #and the partner organizations
  funding_year = []
  partners = []
  bottom = page.find('div', attrs={"class": "project-bottom"})
  if bottom is not None:
    bottom_text = bottom.text.strip()
    if 'Release Date' in bottom_text:
      funding_year = bottom_text[-4:]
    partner_box = bottom.find('div', attrs={"class": "col"})
    if partner_box:
      partners = partner_box.text.strip().split('\n')

  return [company, project_year, funding_year, float(award.replace(',', '')), status,
          location_city, location_state, tags, need, advantage, partners]


def scrape(save, timeout=30):
  """Scrape every ARPA-E project listing page and return the projects as a dataframe.

  Walks the paginated listing at arpa-e.energy.gov, opens each linked
  project page, and collects one row per project that shows a dollar award.

  Args:
    save: If truthy, mount Google Drive (Colab only) and write the
      dataframe to the hard-coded CSV path there.
    timeout: Seconds to wait on each HTTP request before giving up, so a
      stalled connection cannot hang the scrape indefinitely.

  Returns:
    A pandas DataFrame with the columns Company, ProjectYear, FundingYear,
    Award, Status, City, State, Tags, Need, Advantage and Partners.

  Raises:
    requests.HTTPError: If a listing or project page returns an error status.
  """
  url = 'https://arpa-e.energy.gov/technologies/projects'
  page = 1
  projects = []

  #One session for the whole crawl so connections to the site are reused
  with requests.Session() as session:
    while url:

      #Print Page number
      print("Page: ", page)

      #Ping URL; fail loudly instead of parsing an error page as an empty listing
      response = session.get(url, timeout=timeout)
      response.raise_for_status()

      #Create BeautifulSoup object
      soup = BeautifulSoup(response.text, 'html.parser')

      #Get all of the projects on a given page
      for link in soup.find_all('span', attrs={"class": "bold-link"}):

        #For each project, find and go to the link of its respective project page
        project_url = "https://arpa-e.energy.gov" + link.find('a')['href']
        project_response = session.get(project_url, timeout=timeout)
        project_response.raise_for_status()

        row = _parse_project(BeautifulSoup(project_response.text, 'html.parser'))

        #Append it to the list of all projects (None means no award was listed)
        if row is not None:
          projects.append(row)

      #Update URL to point to next page; stop when there is no "next" pager item
      next_item = soup.find('li', attrs={"class": "pager__item pager__item--next"})
      if next_item:
        url = "https://arpa-e.energy.gov/technologies/projects" + next_item.find('a')['href']
      else:
        url = None
      page += 1

  #Create dataframe with all projects and their information
  data = pd.DataFrame(projects, columns=['Company', 'ProjectYear', 'FundingYear', 'Award', 'Status', 'City', 'State', 'Tags', 'Need', 'Advantage', 'Partners'])

  #If save is True then save the CSV to the location
  if save:
    from google.colab import drive
    drive.mount('/content/drive')
    data.to_csv('/content/drive/MyDrive/Grad School-Files/U-I partnerships: Energy/Data/arpa_new.csv')
  return data
# Run the scrape; passing True also mounts Google Drive and writes the result to CSV there.
data = scrape(True)

<a href="https://open.gsa.gov/api/dap/" rel="noopener" target="_blank">API</a>
<a href="https://analytics.usa.gov/data/live/all-pages-realtime.csv">Download the full dataset.</a>
<a href="https://analytics.usa.gov/data/live/all-domains-30-days.csv">Download the full dataset.</a>
<a class="external-link" href="https://digital.gov/services/dap/">Digital Analytics Program</a>
<a class="external-link" href="https://digital.gov/services/dap/common-questions-about-dap-faq/#part-4">does not track individuals</a>
<a class="external-link" href="https://support.google.com/analytics/answer/2763052?hl=en">anonymizes the IP addresses</a>
<a class="external-link" href="https://analytics.usa.gov/data/live/second-level-domains.csv">400 executive branch government domains</a>
<a class="external-link" href="https://analytics.usa.gov/data/live/sites.csv">about 5,700 total websites</a>
<a href="https://open.gsa.gov/api/dap/" rel="noopener" target="_blank">API</a>
<a class="usa-button usa-button-secondary-