In [1]:
import os, time, pickle
import numpy as np
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from collections import defaultdict

In [2]:
SELENIUM_PATH = 'C:\\GIT\\SELENIUM_DRIVERS\\chromedriver_win32\\chromedriver'

In [3]:
def load_driver(chrome_driver_filepath):
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(chrome_driver_filepath, chrome_options=options)
    driver.implicitly_wait(3)
    return driver

def read_options(driver):
    institution_option_values = []
    driver.get('https://www.bis.org/cbspeeches/')
    select = Select(driver.find_element_by_id('cbspeeches_institutions_filter'))
    options = driver.find_element_by_id('cbspeeches_institutions_filter').find_elements_by_tag_name("option")

    for option in options:
        content = option.get_attribute('outerHTML')
        content_name = option.get_attribute('innerHTML').strip()

        left_txt = '<option value="'
        right_txt = '">'
        option_value = content[content.index(left_txt)+len(left_txt):content.index(right_txt)]
        
        institution_option_values.append((option_value, content_name))
        
    country_option_values = []
    driver.get('https://www.bis.org/cbspeeches/')
    select = Select(driver.find_element_by_id('cbspeeches_countries_filter'))
    options = driver.find_element_by_id('cbspeeches_countries_filter').find_elements_by_tag_name("option")

    for option in options:
        content = option.get_attribute('outerHTML')
        content_name = option.get_attribute('innerHTML').strip()

        left_txt = '<option value="'
        right_txt = '">'
        option_value = content[content.index(left_txt)+len(left_txt):content.index(right_txt)]
        
        country_option_values.append((option_value, content_name))
    
    return institution_option_values, country_option_values

def get_soup(driver):
    try: soup = BeautifulSoup(driver.page_source, "html.parser")
    except: soup = BeautifulSoup(driver.page_source, "lxml")
    return soup

def pick_txt(left_text, right_text, content):
    return content[content.index(left_txt)+len(left_txt):content.index(right_txt)].strip()

def sleep_(config_sleep=5, random=True):
    if random:
        sleeptime = config_sleep+np.random.randint(1,10)*0.3
    else:
        sleeptime = config_sleep
    print('sleep ', sleeptime)
    time.sleep(sleeptime)

In [4]:
driver = load_driver(SELENIUM_PATH)
institution_option_values, country_option_values = read_options(driver)

  driver = webdriver.Chrome(chrome_driver_filepath, chrome_options=options)


In [7]:
item_dict = dict()
visited_dict = defaultdict(int)
unscraped_institutions = []

# institution
for option_value, institution_name in institution_option_values:
    print('Processing', institution_name)

    driver.get('https://www.bis.org/cbspeeches/')
    select = Select(driver.find_element_by_id('cbspeeches_institutions_filter'))
    select.select_by_value(option_value)

    try:
        driver.find_element_by_css_selector('#cbspeeches_list > div > div.navigation.page_nav_bottom > nav > div > div:nth-child(2) > div > div:nth-child(3) > div > a').click()
        driver.find_element_by_css_selector('#cbspeeches_list > div > div.navigation.page_nav_bottom > nav > div > div:nth-child(2) > div > div:nth-child(1) > div > a').click()
    except: 
        unscraped_institutions.append(institution_name)
        continue
        
    while True:    
        sleep_()
        pageof = driver.find_element_by_css_selector('div.pageof').get_attribute('innerHTML')
        print(pageof, end=' ')
        visited_dict[pageof] += 1
        if visited_dict[pageof] >= 3: break
            
        soup = get_soup(driver)
        a_list = soup.select("table.documentList tr a.dark")
        for a_element in a_list:
            id_value = a_element['href'].split('/')[-1][:-4]
            item_dict[id_value] = [institution_name]

        print('[{}] '.format(len(item_dict)))

        try: driver.find_element_by_css_selector('#cbspeeches_list > div > div.navigation.page_nav_bottom > nav > div > div:nth-child(2) > div > div:nth-child(3) > div > a').click()
        except:
            try:
                navs = soup.select('a.navcarot')
                if len(navs) == 2:
                    next_url = "https://www.bis.org/cbspeeches/" + navs[-1]['href']
                    driver.get(next_url)
            except: break
        
# country
# unscraped_countries = []
# for option_value, country_name in country_option_values:
#     print('Processing', country_name)

#     driver.get('https://www.bis.org/cbspeeches/')
#     select = Select(driver.find_element_by_id('cbspeeches_countries_filter'))
#     select.select_by_value(option_value)

#     try:
#         driver.find_element_by_css_selector('#cbspeeches_list > div > div.navigation.page_nav_bottom > nav > div > div:nth-child(2) > div > div:nth-child(3) > div > a').click()
#         driver.find_element_by_css_selector('#cbspeeches_list > div > div.navigation.page_nav_bottom > nav > div > div:nth-child(2) > div > div:nth-child(1) > div > a').click()
#     except: 
#         unscraped_countries.append(country_name)
#         continue
        
#     while True:    
#         sleep_()
#         pageof = driver.find_element_by_css_selector('div.pageof').get_attribute('innerHTML')
#         print(pageof, end=' ')
#         visited_dict[pageof] += 1
#         if visited_dict[pageof] >= 3: break
            
#         soup = get_soup(driver)
#         a_list = soup.select("table.documentList tr a.dark")
#         for a_element in a_list:
#             id_value = a_element['href'].split('/')[-1][:-4]
            
#             if item_dict.get(id_value) is None:
#                 item_dict[id_value] = ['', country_name]
#             else:
#                 item_dict[id_value].append(country_name)

#         print('[{}] '.format(len(item_dict)))

#         try: driver.find_element_by_css_selector('#cbspeeches_list > div > div.navigation.page_nav_bottom > nav > div > div:nth-child(2) > div > div:nth-child(3) > div > a').click()
#         except:
#             try:
#                 navs = soup.select('a.navcarot')
#                 if len(navs) == 2:
#                     next_url = "https://www.bis.org/cbspeeches/" + navs[-1]['href']
#                     driver.get(next_url)
#             except: break

# print('unscraped_institutions', unscraped_institutions)
# print('unscraped_countries', unscraped_countries)

# with open('item_dict.pkl', 'wb') as f:
#     pickle.dump(item_dict, f)
    
# print(len(item_dict))

Processing The People's Bank of China
sleep  5.3
Page <span style="font-weight:600;">1 of 4</span> [10] 
sleep  5.3
Page <span style="font-weight:600;">2 of 4</span> [20] 
sleep  5.9
Page <span style="font-weight:600;">3 of 4</span> [30] 
sleep  5.3
Page <span style="font-weight:600;">4 of 4</span> [40] 
sleep  7.1
Page <span style="font-weight:600;">4 of 4</span> [40] 
sleep  7.4
Page <span style="font-weight:600;">4 of 4</span> Processing Banco de Portugal
sleep  6.5
Page <span style="font-weight:600;">1 of 7</span> [50] 
sleep  6.2
Page <span style="font-weight:600;">2 of 7</span> [60] 
sleep  5.6
Page <span style="font-weight:600;">3 of 7</span> [70] 
sleep  5.6
Page <span style="font-weight:600;">4 of 7</span> [80] 
sleep  7.1
Page <span style="font-weight:600;">5 of 7</span> [90] 
sleep  6.8
Page <span style="font-weight:600;">6 of 7</span> [100] 
sleep  6.2
Page <span style="font-weight:600;">7 of 7</span> [104] 
sleep  6.5
Page <span style="font-weight:600;">7 of 7</span> [104]

Page <span style="font-weight:600;">30 of 49</span> [1001] 
sleep  5.9
Page <span style="font-weight:600;">31 of 49</span> [1011] 
sleep  5.9
Page <span style="font-weight:600;">32 of 49</span> [1021] 
sleep  7.699999999999999
Page <span style="font-weight:600;">33 of 49</span> [1031] 
sleep  7.1
Page <span style="font-weight:600;">34 of 49</span> [1041] 
sleep  7.699999999999999
Page <span style="font-weight:600;">35 of 49</span> [1051] 
sleep  6.8
Page <span style="font-weight:600;">36 of 49</span> [1061] 
sleep  7.1
Page <span style="font-weight:600;">37 of 49</span> [1071] 
sleep  6.8
Page <span style="font-weight:600;">38 of 49</span> [1081] 
sleep  7.699999999999999
Page <span style="font-weight:600;">39 of 49</span> [1091] 
sleep  5.9
Page <span style="font-weight:600;">40 of 49</span> [1101] 
sleep  5.3
Page <span style="font-weight:600;">41 of 49</span> [1111] 
sleep  6.5
Page <span style="font-weight:600;">42 of 49</span> [1121] 
sleep  6.2
Page <span style="font-weight:600;"

Page <span style="font-weight:600;">20 of 25</span> [1959] 
sleep  6.2
Page <span style="font-weight:600;">21 of 25</span> [1969] 
sleep  6.2
Page <span style="font-weight:600;">22 of 25</span> [1979] 
sleep  6.8
Page <span style="font-weight:600;">23 of 25</span> [1989] 
sleep  7.1
Page <span style="font-weight:600;">24 of 25</span> [1999] 
sleep  5.3
Page <span style="font-weight:600;">25 of 25</span> [2005] 
sleep  5.9
Page <span style="font-weight:600;">25 of 25</span> [2005] 
sleep  5.9
Page <span style="font-weight:600;">25 of 25</span> Processing Bank of Jamaica
sleep  6.2
Page <span style="font-weight:600;">1 of 2</span> [2015] 
sleep  7.4
Page <span style="font-weight:600;">2 of 2</span> Processing Bank of Japan
sleep  7.699999999999999
Page <span style="font-weight:600;">1 of 46</span> [2025] 
sleep  7.1
Page <span style="font-weight:600;">2 of 46</span> [2035] 
sleep  7.699999999999999
Page <span style="font-weight:600;">3 of 46</span> [2045] 
sleep  5.6
Page <span style="fo

Page <span style="font-weight:600;">14 of 22</span> [2898] 
sleep  6.2
Page <span style="font-weight:600;">15 of 22</span> [2908] 
sleep  5.9
Page <span style="font-weight:600;">16 of 22</span> [2918] 
sleep  5.3
Page <span style="font-weight:600;">17 of 22</span> [2928] 
sleep  6.5
Page <span style="font-weight:600;">18 of 22</span> [2938] 
sleep  5.9
Page <span style="font-weight:600;">19 of 22</span> [2948] 
sleep  5.9
Page <span style="font-weight:600;">20 of 22</span> [2958] 
sleep  6.2
Page <span style="font-weight:600;">21 of 22</span> [2968] 
sleep  5.3
Page <span style="font-weight:600;">22 of 22</span> Processing Bank of Tanzania
Processing Bank of Thailand
sleep  7.4
Page <span style="font-weight:600;">1 of 13</span> [2978] 
sleep  6.2
Page <span style="font-weight:600;">2 of 13</span> [2988] 
sleep  6.8
Page <span style="font-weight:600;">3 of 13</span> [2998] 
sleep  6.8
Page <span style="font-weight:600;">4 of 13</span> [3008] 
sleep  5.6
Page <span style="font-weight:600

Page <span style="font-weight:600;">67 of 73</span> [3962] 
sleep  5.3
Page <span style="font-weight:600;">68 of 73</span> [3972] 
sleep  5.3
Page <span style="font-weight:600;">69 of 73</span> [3982] 
sleep  6.5
Page <span style="font-weight:600;">70 of 73</span> [3992] 
sleep  5.3
Page <span style="font-weight:600;">71 of 73</span> [4002] 
sleep  5.9
Page <span style="font-weight:600;">72 of 73</span> [4012] 
sleep  5.3
Page <span style="font-weight:600;">73 of 73</span> [4015] 
sleep  6.2
Page <span style="font-weight:600;">73 of 73</span> [4015] 
sleep  7.4
Page <span style="font-weight:600;">73 of 73</span> Processing Bulgarian National Bank
sleep  6.8
Page <span style="font-weight:600;">1 of 4</span> Processing Central Bank of Argentina
Processing Central Bank of Aruba
Processing Central Bank of Bahrain
sleep  7.1
Page <span style="font-weight:600;">1 of 3</span> Processing Central Bank of Barbados
sleep  7.1
Page <span style="font-weight:600;">1 of 8</span> [4025] 
sleep  6.8
Pa

Page <span style="font-weight:600;">22 of 34</span> [4836] 
sleep  7.699999999999999
Page <span style="font-weight:600;">23 of 34</span> [4846] 
sleep  5.3
Page <span style="font-weight:600;">24 of 34</span> [4856] 
sleep  6.8
Page <span style="font-weight:600;">25 of 34</span> [4866] 
sleep  6.5
Page <span style="font-weight:600;">26 of 34</span> [4876] 
sleep  6.8
Page <span style="font-weight:600;">27 of 34</span> [4886] 
sleep  7.4
Page <span style="font-weight:600;">28 of 34</span> [4896] 
sleep  6.5
Page <span style="font-weight:600;">29 of 34</span> [4906] 
sleep  6.5
Page <span style="font-weight:600;">30 of 34</span> [4916] 
sleep  7.4
Page <span style="font-weight:600;">31 of 34</span> [4926] 
sleep  6.5
Page <span style="font-weight:600;">32 of 34</span> [4936] 
sleep  5.9
Page <span style="font-weight:600;">33 of 34</span> [4946] 
sleep  6.8
Page <span style="font-weight:600;">34 of 34</span> [4952] 
sleep  5.6
Page <span style="font-weight:600;">34 of 34</span> [4952] 
sle

Page <span style="font-weight:600;">42 of 54</span> [5702] 
sleep  7.1
Page <span style="font-weight:600;">43 of 54</span> [5712] 
sleep  7.1
Page <span style="font-weight:600;">44 of 54</span> [5722] 
sleep  5.6
Page <span style="font-weight:600;">45 of 54</span> [5732] 
sleep  7.1
Page <span style="font-weight:600;">46 of 54</span> [5742] 
sleep  6.5
Page <span style="font-weight:600;">47 of 54</span> [5752] 
sleep  5.3
Page <span style="font-weight:600;">48 of 54</span> [5762] 
sleep  6.2
Page <span style="font-weight:600;">49 of 54</span> [5772] 
sleep  6.8
Page <span style="font-weight:600;">50 of 54</span> [5782] 
sleep  7.4
Page <span style="font-weight:600;">51 of 54</span> [5792] 
sleep  7.4
Page <span style="font-weight:600;">52 of 54</span> [5802] 
sleep  7.4
Page <span style="font-weight:600;">53 of 54</span> [5812] 
sleep  6.8
Page <span style="font-weight:600;">54 of 54</span> [5815] 
sleep  6.5
Page <span style="font-weight:600;">54 of 54</span> [5815] 
sleep  5.6
Page <

Page <span style="font-weight:600;">98 of 164</span> [6795] 
sleep  5.6
Page <span style="font-weight:600;">99 of 164</span> [6805] 
sleep  6.2
Page <span style="font-weight:600;">100 of 164</span> [6815] 
sleep  6.2
Page <span style="font-weight:600;">101 of 164</span> [6825] 
sleep  7.4
Page <span style="font-weight:600;">102 of 164</span> [6835] 
sleep  5.3
Page <span style="font-weight:600;">103 of 164</span> [6845] 
sleep  5.3
Page <span style="font-weight:600;">104 of 164</span> [6855] 
sleep  5.3
Page <span style="font-weight:600;">105 of 164</span> [6865] 
sleep  7.1
Page <span style="font-weight:600;">106 of 164</span> [6874] 
sleep  6.5
Page <span style="font-weight:600;">107 of 164</span> [6884] 
sleep  6.5
Page <span style="font-weight:600;">108 of 164</span> [6894] 
sleep  6.8
Page <span style="font-weight:600;">109 of 164</span> [6904] 
sleep  6.5
Page <span style="font-weight:600;">110 of 164</span> [6914] 
sleep  5.6
Page <span style="font-weight:600;">111 of 164</span>

Page <span style="font-weight:600;">3 of 12</span> [7780] 
sleep  6.8
Page <span style="font-weight:600;">4 of 12</span> [7790] 
sleep  5.3
Page <span style="font-weight:600;">5 of 12</span> [7800] 
sleep  7.699999999999999
Page <span style="font-weight:600;">6 of 12</span> [7810] 
sleep  7.1
Page <span style="font-weight:600;">7 of 12</span> [7820] 
sleep  5.9
Page <span style="font-weight:600;">8 of 12</span> [7830] 
sleep  6.8
Page <span style="font-weight:600;">9 of 12</span> [7840] 
sleep  6.2
Page <span style="font-weight:600;">10 of 12</span> [7850] 
sleep  7.1
Page <span style="font-weight:600;">11 of 12</span> [7860] 
sleep  7.1
Page <span style="font-weight:600;">12 of 12</span> Processing Magyar Nemzeti Bank
Processing Maldives Monetary Authority
Processing Monetary Authority of Macao
sleep  5.9
Page <span style="font-weight:600;">1 of 2</span> Processing Monetary Authority of Singapore
sleep  5.9
Page <span style="font-weight:600;">1 of 19</span> [7870] 
sleep  7.6999999999

Page <span style="font-weight:600;">15 of 59</span> [8713] 
sleep  5.9
Page <span style="font-weight:600;">16 of 59</span> [8723] 
sleep  7.4
Page <span style="font-weight:600;">17 of 59</span> [8733] 
sleep  7.1
Page <span style="font-weight:600;">18 of 59</span> [8743] 
sleep  6.5
Page <span style="font-weight:600;">19 of 59</span> [8753] 
sleep  6.2
Page <span style="font-weight:600;">20 of 59</span> [8763] 
sleep  5.9
Page <span style="font-weight:600;">21 of 59</span> [8773] 
sleep  6.2
Page <span style="font-weight:600;">22 of 59</span> [8783] 
sleep  5.9
Page <span style="font-weight:600;">23 of 59</span> [8793] 
sleep  6.5
Page <span style="font-weight:600;">24 of 59</span> [8803] 
sleep  7.1
Page <span style="font-weight:600;">25 of 59</span> [8813] 
sleep  5.6
Page <span style="font-weight:600;">26 of 59</span> [8823] 
sleep  5.6
Page <span style="font-weight:600;">27 of 59</span> [8833] 
sleep  6.5
Page <span style="font-weight:600;">28 of 59</span> [8843] 
sleep  5.6
Page <

Page <span style="font-weight:600;">15 of 20</span> [9702] 
sleep  6.2
Page <span style="font-weight:600;">16 of 20</span> [9712] 
sleep  5.6
Page <span style="font-weight:600;">17 of 20</span> [9722] 
sleep  6.2
Page <span style="font-weight:600;">18 of 20</span> [9732] 
sleep  6.8
Page <span style="font-weight:600;">19 of 20</span> [9742] 
sleep  6.2
Page <span style="font-weight:600;">20 of 20</span> [9745] 
sleep  6.5
Page <span style="font-weight:600;">20 of 20</span> [9745] 
sleep  5.9
Page <span style="font-weight:600;">20 of 20</span> Processing United Nations Secretay-General's Special Advocate for Inclusive Finance for Development
Processing Universidad de Chile


In [9]:
print('unscraped_institutions', unscraped_institutions)
# print('unscraped_countries', unscraped_countries)

with open('item_dict_all_institutions.pkl', 'wb') as f:
    pickle.dump(item_dict, f)
    
print(len(item_dict))

unscraped_institutions ['Bank Al-Maghrib (Central Bank of Morocco)', 'Bank for International Settlements', 'Bank of Algeria', 'Bank of Guatemala', 'Bank of Guyana', 'Bank of Latvia', 'Bank of Mozambique', 'Bank of Sierra Leone', 'Bank of Slovenia', 'Bank of Tanzania', 'Central Bank of Argentina', 'Central Bank of Aruba', 'Central Bank of Belize', 'Central Bank of Bolivia', 'Central Bank of Bosnia and Herzegovina', 'Central Bank of Brazil', 'Central Bank of Colombia', 'Central Bank of Cyprus', 'Central Bank of Jordan', 'Central Bank of Kuwait', 'Central Bank of Samoa', 'Central Bank of Uruguay', 'Central Bank of the United Arab Emirates', 'Croatian National Bank', 'Eastern Caribbean Central Bank', 'Federal Reserve Bank of Atlanta', 'Federal Reserve Bank of Boston', 'Federal Reserve Bank of Chicago', 'Federal Reserve Bank of Richmond', 'Federal Reserve Bank of San Francisco', 'Magyar Nemzeti Bank', 'Maldives Monetary Authority', 'National Bank of Cambodia', 'National Bank of Slovakia', '