# Webscraping lists of climate change accepting institutions

Webscrape the list from the Desmoblog websites. 
http://www.aag.org/cs/programs/interdisciplinary/climatechange/clearinghouse/organizations
and
https://climatestore.com/take-action/get-involved/non-profit-organizations-working-on-climate-change

The goal is to then use google news search ' "[institute name]" "climate change" ' and get the first few pages.
    
This will allow us to get a dataset of climate change accepting articles with the body of text, year, institution

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd

## Climatestore

In [2]:
url = "https://climatestore.com/take-action/get-involved/non-profit-organizations-working-on-climate-change"
page = requests.get(url)
soup = bs(page.content, 'html.parser')

In [7]:
results = soup.find_all(class_='grid-nx3-1fr')

In [8]:
category_names = []
insts = {}
grid_elements = results[0].find_all('div')#[0].find_all('a')[1]
for category in grid_elements:
    # Separate each category
    cat_name = category.find('h3').text
    category_names.append(cat_name)
    # Get names of institutions for each category
    inst_names = category.find_all('a')
    institutions = []
    for inst in inst_names:
        name = inst.contents[0]
        # Some have an additional '- Podcast' or other media to the name. Remove this
        name = name.split('-')[0]
        institutions.append(name)
    institutions.remove('\n')
    #Some have duplicates after removing the additional media (podcast, radio etc)
    institutions = list(np.unique(institutions))
    insts[cat_name] = institutions

    


In [9]:
insts

{'Climate Change Action': ['350.org',
  'Citizens Climate Lobby',
  'Climate Reality Project',
  'Climate Solutions',
  'ConservAmerica',
  'Earth Justice',
  'GreenPeace USA',
  'Moms Clean Air Force',
  'Mothers Out Front',
  'NextGen Climate',
  'Union of Concerned Scientists',
  'iMatter'],
 'Climate Change Education': ['ACE ',
  'CAMEL Climate Adaption Mitigation E',
  'CCEP ',
  'Climate Reality Project ',
  'I See Change '],
 'Wildlife/Ecosytem Conservation': ['Audubon Society',
  'Conservation International',
  'Conservation Land Foundation',
  'Environmental Defense Fund (EDF)',
  'Land Trust Alliance',
  'National Wildlife Federation (NWF)',
  'Natural Resources Defense Council (NRDC)',
  'Nature Conservancy',
  'Polar Bears International',
  'Rainforest Action Network (RAN)',
  'Sierra Club',
  'The Wilderness Society',
  'World Wildlife Federation (WWF)'],
 'Ocean Conservation': ['Marine Conservation Institute',
  'Ocean Conservancy',
  'Oceana',
  'The Safina Center'],
 'F

In [224]:
climate_trusting_climatestore =pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in insts.items() ]))
climate_trusting_climatestore.to_excel('climate_trusting_climatestore.xlsx')

## American Association of Geographers (AAG)

In [12]:
url = "http://www.aag.org/cs/programs/interdisciplinary/climatechange/clearinghouse/organizations#content"
page = requests.get(url)
soup = bs(page.content, 'html.parser')

In [13]:
results = soup.find_all(class_='bcms-searchable')

In [14]:
# Extract the names of the recommended organisations
names = []
res = results[0].find_all('p')
for r in res:
    name = r.find_all('a')[0].contents
    names.append(name[0])

In [15]:
names

['U.S. Environmental Protection Agency (EPA) - Climate Change Science',
 'NOAA Education - Climate Change and Our Planet',
 'Intergovernmental Panel on Climate Change (IPCC)',
 'National Center for Atmospheric Research (NCAR)',
 'Center for Remote Sensing of Ice Sheets (CReSIS)',
 'National Climate Data Center (NCDC)',
 'World Meteorological Organization',
 'United Nations Environment Programme (UNEP), Climate Change',
 'United Nations Framework Convention on Climate Change (UNFCCC)',
 'Pew Center on Global Climate Change',
 'Food and Agriculture Organization (FAO) of the United Nations – Climate Change',
 'National Snow and Ice Data Center (NSIDC)',
 'International Geosphere-Biosphere Programme (IGBP)']

In [16]:
# Clean up the names 


# remove acronyms using regex
import re
names = [re.sub(' \([^()]*\)', '', n) for n in names]

# remove parts after "-" and ","
names = [n.split(' - ')[0] for n in names]
names = [n.split(' , ')[0] for n in names]

In [17]:
names

['U.S. Environmental Protection Agency',
 'NOAA Education',
 'Intergovernmental Panel on Climate Change',
 'National Center for Atmospheric Research',
 'Center for Remote Sensing of Ice Sheets',
 'National Climate Data Center',
 'World Meteorological Organization',
 'United Nations Environment Programme, Climate Change',
 'United Nations Framework Convention on Climate Change',
 'Pew Center on Global Climate Change',
 'Food and Agriculture Organization of the United Nations – Climate Change',
 'National Snow and Ice Data Center',
 'International Geosphere-Biosphere Programme']

In [18]:
climate_trusting_aag = pd.DataFrame({'American Association of Geographers recommendations': names})
climate_trusting_aag.to_excel('climate_trusting_aag.xlsx')

In [19]:
climate_trusting_aag

Unnamed: 0,American Association of Geographers recommendations
0,U.S. Environmental Protection Agency
1,NOAA Education
2,Intergovernmental Panel on Climate Change
3,National Center for Atmospheric Research
4,Center for Remote Sensing of Ice Sheets
5,National Climate Data Center
6,World Meteorological Organization
7,"United Nations Environment Programme, Climate ..."
8,United Nations Framework Convention on Climate...
9,Pew Center on Global Climate Change
