In [1]:
import os, time, pickle
import numpy as np
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from collections import defaultdict

In [2]:
# Location of the chromedriver executable. The default is the original
# machine-specific path; set the CHROMEDRIVER_PATH environment variable to
# point at a different driver without editing the notebook.
SELENIUM_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:\\GIT\\SELENIUM_DRIVERS\\chromedriver_win32\\chromedriver',
)

In [3]:
def load_driver(chrome_driver_filepath):
    """Start a headless Chrome session using the given chromedriver.

    Args:
        chrome_driver_filepath: Path to the chromedriver executable.

    Returns:
        The ``webdriver.Chrome`` instance, configured with a 3-second
        implicit wait.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # ``chrome_options=`` is the deprecated spelling of this keyword and makes
    # Selenium emit a deprecation warning; ``options=`` is the supported one.
    driver = webdriver.Chrome(chrome_driver_filepath, options=options)
    # Element lookups poll for up to 3 seconds before failing.
    driver.implicitly_wait(3)
    return driver

def _read_select_options(driver, select_id):
    """Return ``(value, label)`` pairs for each <option> under the element with id ``select_id``."""
    select_element = driver.find_element_by_id(select_id)
    # Read the value through the element's ``value`` attribute instead of
    # slicing ``outerHTML``: string slicing returns garbage as soon as an
    # <option> carries another attribute after ``value`` (e.g. ``selected``).
    return [
        (option.get_attribute('value'), option.get_attribute('innerHTML').strip())
        for option in select_element.find_elements_by_tag_name("option")
    ]


def read_options(driver):
    """Read the institution and country filter options from the BIS speeches page.

    Args:
        driver: Selenium WebDriver used to load the page.

    Returns:
        A tuple ``(institution_option_values, country_option_values)``; each is
        a list of ``(option_value, option_label)`` pairs in page order.
    """
    # Both <select> filters are read from the same URL, so one load is enough.
    driver.get('https://www.bis.org/cbspeeches/')
    institution_option_values = _read_select_options(driver, 'cbspeeches_institutions_filter')
    country_option_values = _read_select_options(driver, 'cbspeeches_countries_filter')
    return institution_option_values, country_option_values

def get_soup(driver):
    """Parse the driver's current page source into a BeautifulSoup tree.

    Tries the ``html.parser`` backend first and falls back to ``lxml`` if
    building the tree raises.

    Args:
        driver: Object exposing the page HTML as ``page_source``.

    Returns:
        The parsed ``BeautifulSoup`` object.
    """
    try:
        soup = BeautifulSoup(driver.page_source, "html.parser")
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be interrupted.
        soup = BeautifulSoup(driver.page_source, "lxml")
    return soup

def pick_txt(left_text, right_text, content):
    """Return the stripped text found between two markers in ``content``.

    Args:
        left_text: Marker that precedes the wanted text.
        right_text: Marker that follows the wanted text.
        content: String to search.

    Returns:
        The substring between the first ``left_text`` and the next
        ``right_text`` after it, with surrounding whitespace removed.

    Raises:
        ValueError: If either marker cannot be found.
    """
    start = content.index(left_text) + len(left_text)
    # Look for the closing marker only after the opening one, so an earlier
    # occurrence of ``right_text`` cannot produce an empty or reversed slice.
    end = content.index(right_text, start)
    return content[start:end].strip()

def sleep_(config_sleep=3, random=True):
    """Pause execution, optionally for a randomised duration.

    Args:
        config_sleep: Base duration in seconds.
        random: When true, the pause is ``k * 0.3 * config_sleep`` with ``k``
            drawn from ``np.random.randint(1, 10)``; otherwise the pause is
            exactly ``config_sleep``.
    """
    sleeptime = config_sleep
    if random:
        jitter = np.random.randint(1,10)*0.3
        sleeptime = jitter*config_sleep
    print('sleep ', sleeptime)
    time.sleep(sleeptime)

In [4]:
# Start the headless Chrome session, then read the institution and country
# filter options from the BIS central-bank speeches page.
driver = load_driver(SELENIUM_PATH)
institution_option_values, country_option_values = read_options(driver)

  driver = webdriver.Chrome(chrome_driver_filepath, chrome_options=options)


In [8]:
# Keep only the institutions whose display name contains 'Federal', then
# show the filtered list as the cell output.
institution_option_values = [
    (value, name)
    for value, name in institution_option_values
    if 'Federal' in name
]
institution_option_values

[('24', 'Board of Governors of the Federal Reserve System'),
 ('95', 'Federal Reserve Bank of Atlanta'),
 ('88', 'Federal Reserve Bank of Boston'),
 ('96', 'Federal Reserve Bank of Chicago'),
 ('263', 'Federal Reserve Bank of Dallas'),
 ('85', 'Federal Reserve Bank of Kansas City'),
 ('101', 'Federal Reserve Bank of Minneapolis'),
 ('22', 'Federal Reserve Bank of New York'),
 ('99', 'Federal Reserve Bank of Philadelphia'),
 ('100', 'Federal Reserve Bank of Richmond'),
 ('105', 'Federal Reserve Bank of San Francisco')]

In [9]:
# speech id -> [institution name]
item_dict = dict()
# Visit counter keyed by (institution option value, page label). The page
# label alone ("N of M") repeats for every institution with the same page
# count, so a counter shared on the label would hit the stop threshold early
# for later institutions and skip their pages.
visited_dict = defaultdict(int)

# Pagination link clicked to advance through the result pages.
NEXT_PAGE_SELECTOR = '#cbspeeches_list > div > div.navigation.page_nav_bottom > nav > div > div:nth-child(2) > div > div:nth-child(3) > div > a'
# Sibling pagination link; presumably "previous page" - TODO confirm against the page markup.
PREV_PAGE_SELECTOR = '#cbspeeches_list > div > div.navigation.page_nav_bottom > nav > div > div:nth-child(2) > div > div:nth-child(1) > div > a'


def _collect_speech_ids(soup, institution_name):
    """Record every speech link found in ``soup`` in ``item_dict`` under ``institution_name``."""
    for a_element in soup.select("table.documentList tr a.dark"):
        # Last path segment of the href minus its final 4 characters
        # (presumably a ".htm"-style extension - TODO confirm).
        id_value = a_element['href'].split('/')[-1][:-4]
        item_dict[id_value] = [institution_name]


# institution
for option_value, institution_name in institution_option_values:
    print('Processing', institution_name)

    driver.get('https://www.bis.org/cbspeeches/')
    select = Select(driver.find_element_by_id('cbspeeches_institutions_filter'))
    select.select_by_value(option_value)

    try:
        # Probe the pagination controls: go forward one page and back again.
        driver.find_element_by_css_selector(NEXT_PAGE_SELECTOR).click()
        driver.find_element_by_css_selector(PREV_PAGE_SELECTOR).click()
    except Exception:
        # The pagination links could not be clicked (presumably a single-page
        # result list): reload the filtered page and harvest it once.
        # ``Exception`` rather than a bare ``except`` so Ctrl-C still stops the cell.
        driver.get('https://www.bis.org/cbspeeches/')
        select = Select(driver.find_element_by_id('cbspeeches_institutions_filter'))
        select.select_by_value(option_value)

        _collect_speech_ids(get_soup(driver), institution_name)

        print('[{}] '.format(len(item_dict)))
        continue

    while True:
        sleep_(config_sleep=5)
        pageof = driver.find_element_by_css_selector('div.pageof').get_attribute('innerHTML')
        print(pageof, end=' ')
        visit_key = (option_value, pageof)
        visited_dict[visit_key] += 1
        # Seeing the same page label a third time for this institution means
        # pagination is no longer advancing, so stop.
        if visited_dict[visit_key] >= 3: break

        soup = get_soup(driver)
        _collect_speech_ids(soup, institution_name)

        print('[{}] '.format(len(item_dict)))

        try:
            driver.find_element_by_css_selector(NEXT_PAGE_SELECTOR).click()
        except Exception:
            try:
                # Fallback: follow the last 'a.navcarot' link's href directly
                # when exactly two such links are present.
                navs = soup.select('a.navcarot')
                if len(navs) == 2:
                    next_url = "https://www.bis.org/cbspeeches/" + navs[-1]['href']
                    driver.get(next_url)
            except Exception: break

Processing Board of Governors of the Federal Reserve System
sleep  7.5
Page <span style="font-weight:600;">1 of 73</span> [10] 
sleep  3.0
Page <span style="font-weight:600;">2 of 73</span> [20] 
sleep  4.5
Page <span style="font-weight:600;">3 of 73</span> [30] 
sleep  7.5
Page <span style="font-weight:600;">4 of 73</span> [40] 
sleep  6.0
Page <span style="font-weight:600;">5 of 73</span> [50] 
sleep  1.5
Page <span style="font-weight:600;">6 of 73</span> [60] 
sleep  12.0
Page <span style="font-weight:600;">7 of 73</span> [70] 
sleep  3.0
Page <span style="font-weight:600;">8 of 73</span> [80] 
sleep  7.5
Page <span style="font-weight:600;">9 of 73</span> [90] 
sleep  3.0
Page <span style="font-weight:600;">10 of 73</span> [100] 
sleep  7.5
Page <span style="font-weight:600;">11 of 73</span> [110] 
sleep  1.5
Page <span style="font-weight:600;">12 of 73</span> [120] 
sleep  10.5
Page <span style="font-weight:600;">13 of 73</span> [130] 
sleep  9.0
Page <span style="font-weight:600;"

Page <span style="font-weight:600;">23 of 31</span> [1005] 
sleep  7.5
Page <span style="font-weight:600;">24 of 31</span> [1015] 
sleep  4.5
Page <span style="font-weight:600;">25 of 31</span> [1025] 
sleep  4.5
Page <span style="font-weight:600;">26 of 31</span> [1035] 
sleep  10.5
Page <span style="font-weight:600;">27 of 31</span> [1045] 
sleep  9.0
Page <span style="font-weight:600;">28 of 31</span> [1055] 
sleep  6.0
Page <span style="font-weight:600;">29 of 31</span> [1065] 
sleep  10.5
Page <span style="font-weight:600;">30 of 31</span> [1075] 
sleep  13.499999999999998
Page <span style="font-weight:600;">31 of 31</span> [1084] 
sleep  9.0
Page <span style="font-weight:600;">31 of 31</span> [1084] 
sleep  6.0
Page <span style="font-weight:600;">31 of 31</span> Processing Federal Reserve Bank of Philadelphia
sleep  6.0
Page <span style="font-weight:600;">1 of 4</span> [1094] 
sleep  4.5
Page <span style="font-weight:600;">2 of 4</span> [1104] 
sleep  3.0
Page <span style="font-w

In [10]:
# Persist the collected speech-id -> [institution] mapping for later use.
with open('item_dict_Federal.pkl', 'wb') as pkl_file:
    pickle.dump(item_dict, pkl_file)