In [1]:
import pickle
import pandas as pd

In [2]:
with open('item_dict_9850.pkl', 'rb') as f:
    filter_dict = pickle.load(f)
    
print(len(filter_dict))

records = []
for key, val in filter_dict.items():
    if len(val) == 1:
        institution, country = val[0], ''
    else:
        institution, country = val[0], val[1]
    records.append([key, institution, country])

9850


In [3]:
df = pd.DataFrame(records, columns=['id', 'institution', 'country'])
df.head()

Unnamed: 0,id,institution,country
0,r210416a,The People's Bank of China,
1,r210322a,The People's Bank of China,
2,r210127j,The People's Bank of China,
3,r210217b,The People's Bank of China,
4,r201223b,The People's Bank of China,


In [4]:
def pick_text(left_text, content):
    return content[content.index(left_text)+len(left_text):].strip()

def fill_country(institution_name, country_name):
    
    if institution_name == 'Board of Governors of the Federal Reserve System': return 'United States'
    elif institution_name == 'European Central Bank': return 'Euro area'
    
    elif institution_name == 'Banco de Portugal': return 'Portugal'
    elif institution_name == 'Bank Indonesia': return 'Indonesia'
    elif institution_name == 'Deutsche Bundesbank': return 'Germany'
    elif institution_name == 'Hong Kong Monetary Authority': return 'Hong Kong'
    elif institution_name == 'Monetary Authority of Singapore': return 'Singapore'
    elif institution_name == 'South African Reserve Bank': return 'Republic of South Africa'
    elif institution_name == 'Sveriges Riksbank': return 'Sweden'
    elif institution_name == 'Swiss National Bank': return 'Swiss'
    elif institution_name == 'Federal Reserve Bank of New York': return 'United States'
    
    elif 'Bank of' in institution_name and 'Federal' not in institution_name:
        return pick_text('Bank of', institution_name)
    
    elif country_name != '': 
        return country_name
    
    else:
        return ''

Fill country name based on institution name

In [5]:
df[df.country==""].institution.unique()

array(["The People's Bank of China", 'Banco de Portugal',
       'Bank Indonesia', 'Bank of Albania', 'Bank of Botswana',
       'Bank of Canada', 'Bank of England', 'Bank of Estonia',
       'Bank of Finland', 'Bank of France', 'Bank of Ghana',
       'Bank of Greece', 'Bank of Israel', 'Bank of Italy',
       'Bank of Jamaica', 'Bank of Japan', 'Bank of Korea',
       'Bank of Lithuania', 'Bank of Mauritius', 'Bank of Mexico',
       'Bank of Namibia', 'Bank of Spain', 'Bank of Thailand',
       'Bank of Uganda', 'Bank of Zambia',
       'Board of Governors of the Federal Reserve System',
       'Central Bank of Barbados', 'Central Bank of Chile',
       'Central Bank of Iceland', 'Central Bank of Ireland',
       'Central Bank of Kenya', 'Central Bank of Malaysia',
       'Central Bank of the Philippines (Bangko Sentral ng Pilipinas)',
       'Deutsche Bundesbank', 'European Central Bank',
       'Federal Reserve Bank of New York', 'Hong Kong Monetary Authority',
       'Monetary Au

In [6]:
df['country'] = df.apply(lambda x: fill_country(x['institution'], x['country']), axis=1)
df[df.country==""].institution.unique()

array([], dtype=object)

Fill institution name

In [7]:
df[df.institution==""].country.unique()

array(['United States'], dtype=object)

In [8]:
with open('item_dict_Federal.pkl', 'rb') as f:
    filter_dict = pickle.load(f)
    
len(filter_dict)

records = []
for key, val in filter_dict.items():
    institution = val[0]
    records.append([key, institution])
    
df_federal = pd.DataFrame(records, columns=['id', 'institution'])
df_federal

Unnamed: 0,id,institution
0,r210415a,Board of Governors of the Federal Reserve System
1,r210331b,Board of Governors of the Federal Reserve System
2,r210330a,Board of Governors of the Federal Reserve System
3,r210326f,Board of Governors of the Federal Reserve System
4,r210325a,Board of Governors of the Federal Reserve System
...,...,...
1111,r111110h,Federal Reserve Bank of Philadelphia
1112,r111014b,Federal Reserve Bank of Philadelphia
1113,r111004d,Federal Reserve Bank of Philadelphia
1114,r110622g,Federal Reserve Bank of Philadelphia


In [9]:
def fill_institution(id_value, institution_name):
    if institution_name != '': return institution_name
    else: 
        items = df_federal[df_federal.id==id_value]
        if len(items) > 0:
            return items.iloc[0]['institution'] 
    return ''
  
df['institution'] = df.apply(lambda x: fill_institution(x['id'], x['institution']), axis=1)

Additional data (All institutions)

In [10]:
with open('item_dict_all_institutions.pkl', 'rb') as f:
    filter_dict = pickle.load(f)
    
len(filter_dict)

records = []
for key, val in filter_dict.items():
    institution = val[0]
    records.append([key, institution])
    
df_inst_all = pd.DataFrame(records, columns=['id', 'institution'])
df_inst_all

Unnamed: 0,id,institution
0,r210416a,The People's Bank of China
1,r210322a,The People's Bank of China
2,r210127j,The People's Bank of China
3,r210217b,The People's Bank of China
4,r201223b,The People's Bank of China
...,...,...
9740,r090427e,Swiss National Bank
9741,r090422a,Swiss National Bank
9742,r090402e,Swiss National Bank
9743,r090320d,Swiss National Bank


In [11]:
from tqdm import tqdm

additional_rows = []
for _, row in tqdm(df_inst_all.iterrows()):
    if len(df[df.id==row.id])==0:
        additional_rows.append([row.id, row.institution])

9745it [00:31, 305.32it/s]


In [12]:
for i in range(len(additional_rows)):
    additional_rows[i].append('Euro area')
additional_df = pd.DataFrame(additional_rows, columns=['id', 'institution', 'country'])

In [13]:
df = pd.concat([df, additional_df])
df

Unnamed: 0,id,institution,country
0,r210416a,The People's Bank of China,China
1,r210322a,The People's Bank of China,China
2,r210127j,The People's Bank of China,China
3,r210217b,The People's Bank of China,China
4,r201223b,The People's Bank of China,China
...,...,...,...
5,r131125i,European Central Bank,Euro area
6,r131125h,European Central Bank,Euro area
7,r131122h,European Central Bank,Euro area
8,r131122g,European Central Bank,Euro area


check missing values

In [14]:
df[df.country==""]

Unnamed: 0,id,institution,country


We can observe that a few files are not mapped to the institution filter. Those unmapped files are in the 'Unmapped' directory (ex. 'Speech_texts/PDF/2020/Unmapped').

Let me explain an example of the unmapped files:   

https://www.bis.org/review/r180718c.htm is related to the "Federal Reserve Bank of New York".
But this file is not mapped to the "Federal Reserve Bank of New York" with respect to the institution filter. 

Please see the below link:   
https://www.bis.org/cbspeeches/?cbspeeches=ZnJvbT0mdGlsbD0maW5zdGl0dXRpb25zPTIyJm9iamlkPWNic3BlZWNoZXMmcGFnZT03JnBhZ2luZ19sZW5ndGg9MTAmc29ydF9saXN0PWRhdGVfZGVzYyZ0aGVtZT1jYnNwZWVjaGVzJm1sPWZhbHNlJm1sdXJsPSZlbXB0eWxpc3R0ZXh0PQ%253D%253D

You can check that the https://www.bis.org/review/r180718c.htm does not appear in the filtering results of "Federal Reserve Bank of New York".

In [15]:
df[df.institution==""]

Unnamed: 0,id,institution,country
9735,r180718c,,United States
9743,r141023m,,United States
9744,r141023o,,United States
9746,r141009a,,United States
9767,r140414a,,United States
9771,r140328e,,United States
9788,r120319a,,United States
9789,r120319b,,United States
9791,r120302b,,United States
9797,r120112a,,United States


SAVE

In [16]:
df.to_csv('filter_info.csv', index=False)