In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from data.data_preparation import ELECTRIFICATION_OPTIONS, RISE_SUB_INDICATOR_STRUCTURE

In [None]:
# Load the NDC-OG raw data and keep the World Bank spelling of every country name
raw_data = pd.read_csv(filepath_or_buffer='data/raw_data.csv')
ndc_countries = raw_data['country_wb'].tolist()

*Source*: https://rise.worldbank.org/country

In [None]:
# The website rejects requests sent without headers; the Network Monitor of a
# browser shows which ones to use.
HDR = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'}

def assign_value_to_answer(answer, indicator, idx):
    """Return the portion of the RISE score earned by an answer.

    A 'Yes' answer earns ``RISE_SUB_INDICATOR_STRUCTURE[indicator][idx]``;
    every other answer earns 0.
    """
    return RISE_SUB_INDICATOR_STRUCTURE[indicator][idx] if answer == 'Yes' else 0


def get_country_info(country, country_iso=None):
    """Extract the RISE sub-indicators of a country from the World Bank website.

    The page ``https://rise.worldbank.org/country/<country>`` is downloaded and, for
    each electrification option, every question row of the option's panel becomes one
    record whose value is computed by ``assign_value_to_answer``.
    Questions which are not in ``list_q`` are printed and skipped.

    :param country: name of the country as used in the URL of the RISE website
    :param country_iso: optional ISO code, stored in a ``country_iso`` column when given
    :return: DataFrame with the columns ``indicator``, ``sub_indicator_group``,
        ``sub_indicator_text``, ``value``, ``country`` (and ``country_iso`` when given)
    :raises requests.HTTPError: if the website answers with an error status
    :raises ValueError: if the panel of an electrification option is not in the page
    """

    page = requests.get(
        'https://rise.worldbank.org/country/{}'.format(country),
        headers=HDR,
        timeout=30,
    )
    # raise on an HTTP error status instead of parsing the error page
    page.raise_for_status()

    soup = BeautifulSoup(page.text, 'html.parser')

    results = []

    for opt_name, opt_num in zip(ELECTRIFICATION_OPTIONS, [3, 4, 5]):
        # find the panel for each electrification option
        opt = soup.find('div', class_=f'indicator-{opt_num}')
        if opt is None:
            raise ValueError(
                f"No 'indicator-{opt_num}' panel found on the RISE page of '{country}'"
            )
        rows = opt.find_all('div', class_='row')

        # idx is the position of the current subgroup within the option; it is
        # incremented each time a subtitle row is met
        idx = -1
        # reset for each option so a question row never reuses the subgroup of the
        # previous panel (or hits an unbound name on the very first row)
        subtitle = None
        # the first row of the panel is skipped (presumably a header — TODO confirm)
        for row in rows[1:]:
            is_subtitle = row.find('span', class_='ans-hyphen') is not None
            if is_subtitle:
                # find the subgroup of the RISE sub-indicator
                subtitle = row.find('div', class_='title-wrap').get_text()
                idx = idx + 1
            else:
                # find the question of the RISE sub-indicator
                question = row.find('div', class_='title-wrap').get_text().replace('\n', ' ')
                # NOTE(review): `list_q` is not defined in this function; it has to
                # exist in the notebook namespace before the first call — TODO confirm
                # where it is built
                if question not in list_q:
                    print(question)
                else:
                    # find the answer to the RISE sub-indicator's question
                    answer = row.find('span', class_='ans').get_text()
                    value = assign_value_to_answer(answer, opt_name, idx)
                    results.append(['rise_{}'.format(opt_name), subtitle, question, value])

    column_names = ['indicator', 'sub_indicator_group', 'sub_indicator_text', 'value']
    # build the frame straight from the records: no numpy needed, `value` keeps its
    # own type instead of being turned into text, and an empty list is accepted
    results = pd.DataFrame(results, columns=column_names)
    results['country'] = country
    if country_iso is not None:
        results['country_iso'] = country_iso

    return results

Get the list of countries available in the World Bank RISE database

In [None]:
# Fetch the page listing the countries of the RISE database
page = requests.get('https://rise.worldbank.org/countries', headers=HDR, timeout=30)
# raise on an HTTP error status instead of parsing the error page
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

countries = soup.find_all('div', class_='country-name')
# the href of the first child of each div is split on '/' and its third piece
# is kept as the country name
countries_names = [
    list(country.children)[0]['href'].split('/')[2]
    for country in countries
]

Prepare the list of the NDC-OG countries that match the World Bank (WB) database

In [None]:
raw_data = pd.read_csv('data/raw_data.csv')
ndc_countries = raw_data['country_wb'].to_list()
raw_data = raw_data.set_index('country_wb')

# Sort every NDC-OG country, as a (name, iso) pair, into the ones listed on the
# World Bank website and the ones which are not
missing, matching = [], []
for name in ndc_countries:
    entry = (name, raw_data.loc[name].country_iso)
    bucket = matching if name in countries_names else missing
    bucket.append(entry)

# Countries for which the World Bank does not provide RISE answers: the missing ones
# plus peru, which apparently has a rise score of 100 and no "yes/no" answers.
# peru is taken out of `matching` at the same time.
NAN_COUNTRIES = [*missing, matching.pop(matching.index(('peru', 'PER')))]

Download the RISE subindicators for the NDC-OG countries

In [None]:
# Download the info of the matching countries.
# The per-country frames are collected first and concatenated once:
# `DataFrame.append` was removed in pandas 2.0 and copies the whole accumulated
# frame at every call.
df = pd.concat(
    [get_country_info(*c) for c in matching],
    ignore_index=True,
)

Fill in the non-matching countries with RISE scores of 0, i.e. with the answers to all questions set to "No"

In [None]:
RISE_SUB_INDICATORS = pd.read_csv('data/RISE_indicators.csv')

# One frame per missing country: the sub-indicator table with the `score_count_no`
# column used as value.
# NOTE(review): only `missing` is filled here; NAN_COUNTRIES (which also holds peru)
# is not used, so peru gets no rows at all — confirm which list is intended.
nan_frames = []
for c in missing:
    nan_df = RISE_SUB_INDICATORS.copy()
    nan_df = nan_df.drop(['score_count_yes'], axis=1)
    nan_df['country'] = c[0]
    nan_df['country_iso'] = c[1]
    nan_df = nan_df.rename(columns={"score_count_no": "value"})
    nan_frames.append(nan_df)

# Concatenate once, after the loop: assigning `save_df` from `df` inside the loop kept
# only the last missing country, and `DataFrame.append` was removed in pandas 2.0.
save_df = pd.concat([df] + nan_frames, ignore_index=True)


In [None]:
# Read the RISE sub-indicators per country from the CSV file
test_df = pd.read_csv(filepath_or_buffer='data/RISE_subindicators_country.csv')

In [None]:
#save_df.to_csv('data/RISE_subindicators_country.csv')

In [None]:
sub_group = 'Funding support for grid electrification'
# RISE_SUB_INDICATORS is read with the default integer index of `pd.read_csv`, so the
# label lookup `.loc['rise_grid']` raises a KeyError; select the rows with a boolean
# mask instead.
# NOTE(review): assumes the CSV has an `indicator` column holding values such as
# 'rise_grid' — confirm against data/RISE_indicators.csv
sub_group_df = RISE_SUB_INDICATORS.loc[RISE_SUB_INDICATORS.indicator == 'rise_grid']
sub_group_df = sub_group_df.loc[sub_group_df.sub_indicator_group == sub_group]
sub_group_df.count()