In [62]:
#import libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from requests import get
import json

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import asyncio
import concurrent.futures
import re
import os
import nltk
from textblob import Word
from nltk.tag import pos_tag
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk import download as nltk_download
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups

stemmer = SnowballStemmer("english")
stop_words=stopwords.words('english')

session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)




## IEA Data

In [68]:
#get all countries with entiries in the IEA database
get_countries = "https://www.iea.org/policiesandmeasures/climatechange/"
g_c = get(get_countries)

soup_c = BeautifulSoup(g_c.text, 'html.parser')
c_list = soup_c.find("ul", class_ ="listexpander")
c_loop = c_list.find_all("label")

country_store = []
for c in c_loop:
    country_store.append(c.text)
    
full_list = country_store[4:]
print(full_list)

['Albania', 'Algeria', 'Australia', 'Austria', 'Belgium', 'Bosnia and Herzegovina', 'Brazil', 'Canada', 'China', 'Croatia', 'Czech Republic', 'Denmark', 'Estonia', 'European Union', 'Finland', 'France', 'Germany', 'Ghana', 'Greece', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Ireland', 'Israel', 'Italy', 'Japan', 'Korea', 'Latvia', 'Luxembourg', 'Malaysia', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Netherlands', 'New Zealand', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'Slovak Republic', 'South Africa', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'United Kingdom', 'United States']


In [70]:
#get each countries directory of database entries for climate policies and measure, renewable energy, 
#and energy efficiency

def get_countries(link_store, url):
    for country in full_list:
        if " " in country:
            amend = country.replace(" ", "%20")
            policy_link = url + amend 
            link_store.append(policy_link)
        else:
            policy_link = url + country
            link_store.append(policy_link)
        
#for climate policies and measure database
policy_link_store = []
get_countries(policy_link_store, "https://www.iea.org/policiesandmeasures/climatechange/?country=")
print(len(policy_link_store))

51


In [72]:
#for renewable energy
renewables_store = []
get_countries(renewables_store, "https://www.iea.org/policiesandmeasures/renewableenergy/?country=")
print(len(renewables_store))

51


In [73]:
#energy efficiency
efficiency_store = []
get_countries(efficiency_store, "https://www.iea.org/policiesandmeasures/energyefficiency/?country=")
print(len(efficiency_store))


51


In [74]:
#get individual links for each entry in db

def store_pages(store, ptype):
    for p in ptype:
        g_p = session.get(p)
        soup_p = BeautifulSoup(g_p.text, 'html.parser')
        entries = soup_p.find_all("tr")[1:]
        for entry in entries:
            ext = entry.find("a", href = True)
            link = "https://www.iea.org/" + ext._attr_value_as_string("href")
            store.append(link)

#get individual links for each policy and measure page for each country          
page_link_store = []
store_pages(page_link_store, policy_link_store) 
print(len(page_link_store))

2790


In [75]:
#get individual links for each renewable energy policy for each country 
renewpage_link_store = []
store_pages(renewpage_link_store,renewables_store)
print(len(renewpage_link_store))

1492


In [41]:
#get individual links for each energy efficiency policy for each country 
eff_page_link_store = []
store_pages(eff_page_link_store, efficiency_store)
print(len(renewpage_link_store))

In [51]:
#Scrape each climate policy webpage in db
full_set = []

async def main():
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:

        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor, 
                session.get, 
                link
            )
            for link in page_link_store
        ]
        
        for response in await asyncio.gather(*futures):
            #entry = dict()
            soup_e = BeautifulSoup(response.text, 'html.parser')

            for script in soup_e(["script", "style"]):
                script.extract()

            text = soup_e.get_text()
            
                #entry["url"] = link
                #entry["text"] = text
            
            full_set.append(text)


loop = asyncio.get_event_loop()
loop.run_until_complete(main())

In [54]:
#Scrape each renewable energy policy webpage in db
full_renew_set = []

async def main():
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:

        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor, 
                session.get, 
                link
            )
            for link in renewpage_link_store
        ]
        
        for response in await asyncio.gather(*futures):
            entry = dict()
            soup_e = BeautifulSoup(response.text, 'html.parser')
            
            for script in soup_e(["script", "style"]):
                script.extract()

            text = soup_e.get_text()
            
                #entry["url"] = link
                #entry["text"] = text
            
            full_renew_set.append(text)


loop = asyncio.get_event_loop()
loop.run_until_complete(main())

In [56]:
#Scrape each energy efficiency policy webpage in db
full_eff_set = []

async def main():
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:

        loop = asyncio.get_event_loop()
        futures = [
            loop.run_in_executor(
                executor, 
                session.get, 
                link
            )
            for link in eff_page_link_store
        ]
        
        for response in await asyncio.gather(*futures):
            entry = dict()
            soup_e = BeautifulSoup(response.text, 'html.parser')

            for script in soup_e(["script", "style"]):
                script.extract()

            text = soup_e.get_text()
            
                #entry["url"] = link
                #entry["text"] = text
            
            full_eff_set.append(text)


loop = asyncio.get_event_loop()
loop.run_until_complete(main())

In [65]:
print('climate change policy db:', len(full_set))
print('renewable energy policy db:', len(full_renew_set))
print('energy efficiency db:', len(full_eff_set))
complete = full_set + full_renew_set + full_eff_set
print('total:', len(complete))

climate change policy db: 2790
renewable energy policy db: 1492
energy efficiency db: 1995
total: 6277


In [66]:
print(complete[1])






IEA - Albania







International Energy Agency

Pусский中文网页



AboutDelegatesOur MissionLeadershipJobsTechnology PlatformTraining
NewsroomEventsNewsNewslettersPresentationsWorkshops

TopicsCarbon capture and storageClean energy technologies Climate changeCoalElectricityEnergy accessEnergy and GenderEnergy efficiencyEnergy security InvestmentNatural gasNuclearOilRenewablesTransportEngagement worldwide
Countries
Statistics
Data & Publications








Home » Policies and Measures » PAMS » Albania
Albanian National Strategy of Energy

Country:Albania
Year:2003
Policy status:In Force
Jurisdiction:National
Date Effective:2003
Policy Type:Policy Support>Strategic planning
Policy Target:Framework/ Multi-sectoral Policy
Agency:Ministry of Industry and Energy, National Agency of Energy
URL:http://unfccc.int/files/meetings/seminar/application/pdf/sem_albania_sup1.pdf
Description:The main goal of the Albanian National Strategy of Energy is to develop an effective energy sector that guarante

In [63]:
ieadb = {'text': complete}
with open('ieapandm.json', 'w') as outfile:
    json.dump(ieadb, outfile)

In [78]:
load = pd.read_json('ieapandm.json', 'column')
load['label'] = 1
positive_df = load[['text', 'label']]
positive_df.head()

Unnamed: 0,text,label
0,\n\n\n\n\nIEA - Albania\n\n\n\n\n\n\n\nInterna...,1
1,\n\n\n\n\nIEA - Albania\n\n\n\n\n\n\n\nInterna...,1
2,\n\n\n\n\nIEA - Australia\n\n\n\n\n\n\n\nInter...,1
3,\n\n\n\n\nIEA - Australia\n\n\n\n\n\n\n\nInter...,1
4,\n\n\n\n\nIEA - Australia\n\n\n\n\n\n\n\nInter...,1


## EU Climate Change Mitigation Policies and Measures Dataset

In [81]:
#load data
eu_db = pd.read_csv('EU_climate_change_mitigation_policies_and_measures.csv')

In [90]:
eu_db.head()

Unnamed: 0,_id:text,Country:text,ID_of_policy_or_measure:text,Name_of_policy_or_measure:text,Single_policy_or_measure__or_group_of_measures:text,Policies_or_measures_included_in_the_group:text,Type_of_policy_instrument:text,Status_of_implementation:text,Impact_of_policy_or_measure___EU_ETS__outside_the_ETS___ESD___or_LULUCF_:text,Sector_s__affected:text,...,Reference_for_realised_costs_and_benefits:text,Web_link_for_realised_costs_and_benefits:text,ReportID:text,Status_of_implementation_clean:text,Implementation_period_start_clean:text,Projection_scenario_in_which_the_policy_or_measure_is_included_clean:text,Is_the_policy_or_measure_related_to_a_Union_policy__clean:text,Union_policy_lookup_only4facets_clean:text,Projected_net_cost_per_year__EUR_:text,Realised_net_cost_per_year__EUR_:text
0,Czech Republic_21,Czech Republic,21,Combined transportation support,Single,Single PaM,Planning,Implemented,ESD,Transport,...,,,http://cdr.eionet.europa.eu/Converters/run_con...,Implemented,2000,With existing measures,Yes,Other (Union policy not listed above or additi...,121204.0,
1,Czech Republic_24,Czech Republic,24,Eco-labelling in transport,Single,Single PaM,Information,Implemented,ESD,Transport,...,,,http://cdr.eionet.europa.eu/Converters/run_con...,Implemented,2000,With existing measures,Yes,Other (Union policy not listed above or additi...,80803.0,
2,Romania_67,Romania,67,WEM Scenario for Agriculture sector,Group,"1, 5, 45, 46, 47, 48, 49, 50","Regulatory, Economic",See individual PaMs,"EU ETS, ESD","Cross-cutting, Agriculture",...,,,http://cdr.eionet.europa.eu/Converters/run_con...,See individual PaMs,See individual PaMs,See individual PaMs,Yes,Effort Sharing Decision 406/2009/EC; Industria...,,
3,Romania_38,Romania,38,Modernization of the residential sector,Single,Single PaM,"Planning, Regulatory",Planned,ESD,Energy Consumption,...,,,http://cdr.eionet.europa.eu/Converters/run_con...,Planned,2015,With additional measures,Yes,Energy Efficiency Directive 2012/27/EU; Recast...,,
4,Belgium_115,Belgium,115,Create economic conditions favourable to RES d...,Group,"2, 3",Fiscal,See individual PaMs,EU ETS,Energy Supply,...,,,http://cdr.eionet.europa.eu/Converters/run_con...,See individual PaMs,See individual PaMs,See individual PaMs,Yes,Cogeneration Directive 2004/8/EC; Energy Taxat...,,


In [101]:
links = eu_db['ReportID:text'].tolist()
print(len(links))
print(links[0])

1323
http://cdr.eionet.europa.eu/Converters/run_conversion?file=/ec/mmr/art04-13-14_lcds_pams_projections/colvzkuna/envvzkvxq/CZ_MMR_PAM__20150626.xml&conv=524&source=remote#pam21


In [134]:
description = eu_db[['Description:text']]
description['label'] = 1
description.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Description:text,label
0,"Introduction of ""Park and Ride"" systém, combi...",1
1,To provide with information about CO2 emission...,1
2,Reduction of GHG emissions in Agriculture sector,1
3,Reduction of the energy consumption by the fol...,1
4,Support RES financially and tax fossil fuels,1
