In [7]:
#import libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from requests import get
import json

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import asyncio
import concurrent.futures
import re
import os
import nltk
from textblob import Word
from nltk.tag import pos_tag
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk import download as nltk_download
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
""
stemmer = SnowballStemmer("english")
stop_words=stopwords.words('english')

session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)




## IEA Data

In [8]:
#get all countries with entiries in the IEA database
get_countries = "https://www.iea.org/policiesandmeasures/climatechange/"
g_c = get(get_countries)

soup_c = BeautifulSoup(g_c.text, 'html.parser')
c_list = soup_c.find("ul", class_ ="listexpander")
c_loop = c_list.find_all("label")

country_store = []
for c in c_loop:
    country_store.append(c.text)
    
full_list = country_store[4:]
print(full_list)

['Albania', 'Algeria', 'Australia', 'Austria', 'Belgium', 'Bosnia and Herzegovina', 'Brazil', 'Canada', 'China', 'Croatia', 'Czech Republic', 'Denmark', 'Estonia', 'European Union', 'Finland', 'France', 'Germany', 'Ghana', 'Greece', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Ireland', 'Israel', 'Italy', 'Japan', 'Korea', 'Latvia', 'Luxembourg', 'Malaysia', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Netherlands', 'New Zealand', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'Slovak Republic', 'South Africa', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'United Kingdom', 'United States']


In [9]:
#get each countries directory of database entries for climate policies and measure, renewable energy, 
#and energy efficiency

def get_countries(link_store, url):
    for country in full_list:
        if " " in country:
            amend = country.replace(" ", "%20")
            policy_link = url + amend 
            link_store.append(policy_link)
        else:
            policy_link = url + country
            link_store.append(policy_link)
        
#for climate policies and measure database
policy_link_store = []
get_countries(policy_link_store, "https://www.iea.org/policiesandmeasures/climatechange/?country=")
print(len(policy_link_store))

51


In [10]:
#for renewable energy
renewables_store = []
get_countries(renewables_store, "https://www.iea.org/policiesandmeasures/renewableenergy/?country=")
print(len(renewables_store))

51


In [11]:
#energy efficiency
efficiency_store = []
get_countries(efficiency_store, "https://www.iea.org/policiesandmeasures/energyefficiency/?country=")
print(len(efficiency_store))


51


In [12]:
#get individual links for each entry in db

def store_pages(store, ptype):
    for p in ptype:
        g_p = session.get(p)
        soup_p = BeautifulSoup(g_p.text, 'html.parser')
        entries = soup_p.find_all("tr")[1:]
        for entry in entries:
            ext = entry.find("a", href = True)
            link = "https://www.iea.org/" + ext._attr_value_as_string("href")
            store.append(link)



In [13]:
#get individual links for each policy and measure page for each country          
page_link_store = []
store_pages(page_link_store, policy_link_store) 
print(len(page_link_store))

2790


In [None]:
#get individual links for each renewable energy policy for each country 
renewpage_link_store = []
store_pages(renewpage_link_store,renewables_store)
print(len(renewpage_link_store))

In [None]:
#get individual links for each energy efficiency policy for each country 
eff_page_link_store = []
store_pages(eff_page_link_store, efficiency_store)
print(len(eff_page_link_store))

In [None]:
#Scrape each climate policy webpage in db
full_set = []

async def main():
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:

        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor, 
                session.get, 
                link
            )
            for link in page_link_store
        ]
        
        for response in await asyncio.gather(*futures):
            soup_e = BeautifulSoup(response.text, 'html.parser')

            table = soup_e.find("tbody").find_all("td")

            entry = dict()
            for i in range(len(table)):

                
                if "Policy Type" in table[i].text:
                    entry[table[i].text.lower()] = table[i+1].text.lower()

                elif "Description" in table[i].text:
                    entry[table[i].text.lower()] = table[i+1].text.lower()

            full_set.append(entry)



loop = asyncio.get_event_loop()
loop.run_until_complete(main())

In [None]:
len(full_set)

In [None]:
#Scrape each renewable energy policy webpage in db


async def main():
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:

        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor, 
                session.get, 
                link
            )
            for link in renewpage_link_store
        ]
        
        for response in await asyncio.gather(*futures):
            soup_e = BeautifulSoup(response.text, 'html.parser')

            table = soup_e.find("tbody").find_all("td")

            entry = dict()
            for i in range(len(table)):

    
                if "Policy Type" in table[i].text:
                    entry[table[i].text.lower()] = table[i+1].text.lower()

                elif "Description" in table[i].text:
                    entry[table[i].text.lower()] = table[i+1].text.lower()

            full_set.append(entry)


loop = asyncio.get_event_loop()
loop.run_until_complete(main())

In [None]:
len(full_set)

In [None]:
#Scrape each energy efficiency policy webpage in db


async def main():
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:

        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor, 
                session.get, 
                link
            )
            for link in eff_page_link_store
        ]
        
        for response in await asyncio.gather(*futures):
            soup_e = BeautifulSoup(response.text, 'html.parser')

            table = soup_e.find("tbody").find_all("td")

            entry = dict()
            for i in range(len(table)):

     
                if "Policy Type" in table[i].text:
                    entry[table[i].text.lower()] = table[i+1].text.lower()

                elif "Description" in table[i].text:
                    entry[table[i].text.lower()] = table[i+1].text.lower()

            full_set.append(entry)


loop = asyncio.get_event_loop()
loop.run_until_complete(main())

In [None]:
len(full_set)

In [None]:
full_set

In [None]:
print('climate change policy db:', len(page_link_store))
print('renewable energy policy db:', len(renewpage_link_store))
print('energy efficiency db:', len(eff_page_link_store))

print('total:', len(full_set))

In [None]:
with open('iea_separate.json', 'w') as outfile:
    json.dump(full_set, outfile)

In [None]:
load = pd.read_json('iea_separate.json', 'column')
fixed = load.drop(load.columns[2:4], axis=1)
fixed['label'] = 1
fixed

## EU Climate Change Mitigation Policies and Measures Dataset

In [5]:
import pandas as pd

#load data
eu_db = pd.read_csv('../Data/EU_climate_change_mitigation_policies_and_measures.csv')

eu_db.head()

links = eu_db['ReportID:text'].tolist()
print(len(links))
print(links[0])

description = eu_db[['Description:text']]
description['label'] = 1
description.head()

eu_db.head()

links = eu_db['ReportID:text'].tolist()
print(len(links))
print(links[0])

description = eu_db[['Description:text']]
description['label'] = 1
description.shape

1323
http://cdr.eionet.europa.eu/Converters/run_conversion?file=/ec/mmr/art04-13-14_lcds_pams_projections/colvzkuna/envvzkvxq/CZ_MMR_PAM__20150626.xml&conv=524&source=remote#pam21
1323
http://cdr.eionet.europa.eu/Converters/run_conversion?file=/ec/mmr/art04-13-14_lcds_pams_projections/colvzkuna/envvzkvxq/CZ_MMR_PAM__20150626.xml&conv=524&source=remote#pam21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(1323, 2)