In [1]:
import re
import os
import pandas as pd
from datetime import date

In [2]:
# Both patterns are written in lower case; the text they are applied to is
# lower-cased before searching.

# Loan / credit-agreement reference.
#   group 1: optional "ibrd" marker
#   group 2: the numeric part (digits, whitespace and optional dashes)
#   group 3: the 2-4 letter suffix that is later read as the country code
ccmatch = re.compile(
    r"(?:loan|credit agreement)\s*(?:numbers?|no[.]?)\s*(ibrd)?\s*([0-9\s]{4,5}(?:-?[0-9]?)*)[-\s]*([a-z]{2,4})"
)

# Grant reference, used as a fallback when `ccmatch` finds nothing.
#   group 1: "tf" followed by digits
#   group 2: the 2-4 letter suffix that is later read as the country code
grantmatch = re.compile(
    r"grant\s*number\s*(tf[0-9]*)[-\s]*([a-z]{2,4})"
)

In [3]:
# Directory whose entries are listed and read as the agreement text files.
source_dir = "world_bank_loans_txt"

In [4]:
# Splits a file name of the form "<year>_<month>_<day>_<id>..." into its parts.
#   group 1: four-digit year
#   group 2: month token (any word characters)
#   group 3: one- or two-digit day
#   group 4: the run of digits that follows, used as the document id
id_date_regex = re.compile(
    r"([0-9]{4})\_(\w*)\_([0-9]{1,2})\_([0-9]*)"
)

In [5]:
# Month-name -> month-number lookup for the month token embedded in file names.
months = {'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6, 'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12}


def _parse_filename(filename):
    """Extract the document id and date encoded in an agreement file name.

    Parameters
    ----------
    filename : str
        Bare file name, expected to start with "<year>_<month>_<day>_<id>".

    Returns
    -------
    tuple of (str, datetime.date) or None
        ``(doc_id, doc_date)`` on success. ``None`` when the name does not
        match ``id_date_regex``, the month token is not a known month name,
        or the parts do not form a valid calendar date.
    """
    info = id_date_regex.match(filename)
    if info is None:
        return None
    year, month_name, day, doc_id = info.groups()
    month = months.get(month_name.lower())
    if month is None:
        return None
    try:
        return doc_id, date(int(year), month, int(day))
    except ValueError:
        # e.g. a day of 0 or 31 in a 30-day month
        return None


def _extract_country_code(text):
    """Return the country-code suffix found in an agreement's text, or None.

    The text is lower-cased once. The loan / credit-agreement pattern
    (``ccmatch``) is tried first and the grant pattern (``grantmatch``) is
    the fallback; the last capture group of whichever matches is returned.
    """
    lowered = text.lower()
    captures = ccmatch.search(lowered)
    if captures is not None:
        return captures.group(3)
    captures = grantmatch.search(lowered)
    if captures is not None:
        return captures.group(2)
    return None


# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the collected lists reproducible between runs and machines.
file_list = sorted(os.listdir(source_dir))
bad_filenames = []   # .txt files whose name could not be parsed (skipped)
bad_captures = []    # files in which neither pattern found a country code
ids = []
country_codes = []
dates = []
for f in file_list:
    if not f.endswith('txt'):
        continue
    parsed = _parse_filename(f)
    if parsed is None:
        # Record and skip instead of crashing on an unexpected file name.
        bad_filenames.append(f)
        continue
    doc_id, doc_date = parsed
    with open(os.path.join(source_dir, f), 'r') as fh:
        agreement = fh.read()
    code = _extract_country_code(agreement)
    if code is None:
        bad_captures.append(f)
    # Append to all three parallel lists together, only after the file has
    # been read, so they cannot get out of step with each other.
    ids.append(doc_id)
    dates.append(doc_date)
    country_codes.append(code)

In [6]:
#bad_captures

In [7]:
# Read a single, specific agreement and keep its lower-cased text in
# `agreement` (presumably for ad-hoc inspection of the patterns).
f = source_dir + "/2010_july_21_376051489609974334_ctf-loan-agreement-for-tf096291-mx-conformed.txt"
with open(f) as fh:
    agreement = fh.read().lower()

In [8]:
# Build a lookup from lower-cased alpha-2 and alpha-3 codes to country names.
#
# keep_default_na=False matters here: with pandas' default NA handling the
# literal string "NA" (Namibia's alpha-2 code) is parsed as a missing value,
# which would register the meaningless key 'nan' instead of 'na'.
iso_codes = pd.read_csv("wikipedia-iso-country-codes.csv", keep_default_na=False)
country_code_dict = dict()
for code2, code3, country_name in zip(
    iso_codes['Alpha-2 code'],
    iso_codes['Alpha-3 code'],
    iso_codes['English short name lower case'],
):
    # str() is kept as a guard in case a code column is not parsed as text.
    country_code_dict[str(code2).lower()] = country_name
    country_code_dict[str(code3).lower()] = country_name

In [9]:
# Hand-maintained additions to (and overrides of) the code -> name lookup.
# Entries marked "ocr error" map garbled captures to the intended country;
# entries mapped to None are captures that are not country codes at all.
country_code_dict.update({
    'yu': 'Yugoslavia',
    'cha': "China",
    'egt': "Egypt",
    'sw': "Swaziland",
    'mor': "Morocco",
    'waf': "West African Monetary Union (?)",
    'le': "Lebanon",
    'bul': "Bulgaria",
    'ko': "Korea",
    'tu': "Turkey",
    'yf': "Serbia",
    'cob': "People's Republic of the Congo",  #should just be Republic of the Congo, its modern name?
    'slu': "Saint Lucia",
    'ur': "Uruguay",
    'zim': "Zimbabwe",
    'fij': "Fiji",
    'uni': "Nigeria",
    'mas': "Mauritius",
    'rom': "Romania",
    'slo': "Slovenia",
    'ld': "Indonesia",  #ocr error
    'nd': "Indonesia",  #ocr error
    'mi': "Moldova",  #ocr error
    'he': "Mexico",  #ocr error
    'oro': "Romania",  #ocr error
    'bot': "Botswana",
    'na': "Namibia",
    'bar': "Barbados",
    'crg': "Caribbean Development Bank",
    'ivc': "Cote d'Ivoire",
    'sivc': "Cote d'Ivoire",  #ocr error
    'ho': "Honduras",
    'sey': "Seychelles",
    'cs': "Slovak Republic",
    'loan': None,  #ocr/regex error
    'dear': None,  #ocr/regex error
    'addi': None,  #ocr/regex error
    'cred': None,  #ocr/regex error
})


In [10]:
# One row per agreement, assembled from the parallel lists; country_name
# starts as an empty string for every row and is filled in afterwards.
countries_frame = pd.DataFrame(
    {
        "id": ids,
        "date": dates,
        "country_code": country_codes,
        "country_name": len(ids) * [''],
    }
)

In [11]:
# Resolve each row's country_code to a country name via country_code_dict.
#
# The names are collected in a plain list and written back with one column
# assignment. The chained form `frame.country_name.iloc[i] = value` assigns
# into an intermediate object and is not guaranteed to update the frame
# (under pandas Copy-on-Write it never does).
resolved_names = countries_frame["country_name"].tolist()
for i, code in enumerate(countries_frame["country_code"]):
    if code is None:
        # No code was captured for this agreement; keep the existing value.
        continue
    if code in country_code_dict:
        resolved_names[i] = country_code_dict[code]
    else:
        # Unknown code: show the row so a mapping can be added by hand.
        print(countries_frame.iloc[i])
countries_frame["country_name"] = resolved_names
            

In [12]:
# Display 10 randomly chosen rows; no seed is set, so the rows differ per run.
countries_frame.sample(10)

Unnamed: 0,id,date,country_code,country_name
2270,572511468296114374,1998-10-06,pol,Poland
1656,879391468223155847,1997-08-29,cha,China
2124,504431468202130944,2007-08-16,ar,Argentina
20,202591468248697830,2010-08-02,co,Colombia
991,126261574264006073,2019-10-24,ar,Argentina
1646,416371468193468439,2009-02-27,ar,Argentina
2783,493441468240861878,2008-06-19,co,Colombia
446,987261468303048645,2012-12-11,uy,Uruguay
979,756351531341849623,2018-06-29,tn,Tunisia
2436,801331468270320140,1997-01-24,kz,Kazakhstan


In [13]:
# Per-column summary (count / unique / top / freq) of the assembled frame.
countries_frame.describe()

Unnamed: 0,id,date,country_code,country_name
count,3201,3201,3116,3194
unique,3201,2100,139,107
top,756211468015006363,1994-02-14,in,India
freq,1,11,191,336


In [17]:
# Reorder the rows chronologically by agreement date.
countries_frame = countries_frame.sort_values(by="date", axis="index")

In [19]:
# Write the frame to disk without the index column.
countries_frame.to_csv(
    path_or_buf="id_date_country.csv",
    index=False,
)