In [1]:
import time
import requests
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

In [2]:
pd.set_option("display.max_columns", 2000)
pd.set_option("display.max_rows", 2000)

In [3]:
section_names = ['Africa', 'East Asia and the Pacific',
                 'Europe and Eurasia', 'Near East and North Africa',
                 'South Asia', 'Western Hemisphere']

In [4]:
def scrape_indexes(year, verbose=False):
    
    url = 'https://www.state.gov/j/drl/rls/hrrpt/{}/index.htm'.format(year)
    if verbose:
        print("Scraping Data from: ", url)
    
    r = requests.get(url)
    text = r.text
    soup = BeautifulSoup(text, "html5lib")

    section_links = []
    ignore_sections = ['Appendices', 'Appendixes', 'Front', 
                       'Material', 'Preface', 'Overview',
                       'Acknowledgements', 'Introduction']
    
    for link in soup.find('ul',{"class": "menu"}).find_all('a'):
        text = link.text
        url = link.get('href')
        if verbose:
            print("Found Link: ", text, url)
        section_links.append([text, url])

    keep_section_links = []
    for text, url in section_links:
        keep = True    
        for i in ignore_sections:
            if verbose:
                print(i, '>>>', text, '>>>', i in text)
            if i in text:
                keep = False
        if keep:
            keep_section_links.append([text, url])

    return keep_section_links

In [5]:
x = scrape_indexes(2005)
x

[['Africa', 'http://www.state.gov/j/drl/rls/hrrpt/2005/c17092.htm'],
 ['East Asia and the Pacific',
  'http://www.state.gov/j/drl/rls/hrrpt/2005/c17093.htm'],
 ['Europe and Eurasia',
  'http://www.state.gov/j/drl/rls/hrrpt/2005/c17094.htm'],
 ['Near East and North Africa',
  'http://www.state.gov/j/drl/rls/hrrpt/2005/c17095.htm'],
 ['South Asia', 'http://www.state.gov/j/drl/rls/hrrpt/2005/c17097.htm'],
 ['Western Hemisphere',
  'http://www.state.gov/j/drl/rls/hrrpt/2005/c17099.htm']]

In [6]:
def scrape_sections(year, section_url, 
                    base_url = 'https://www.state.gov', verbose=False,
                    delay=0.1):
    
    year = str(year)
    
    if verbose:
        print("Scraping Data from: ", section_url)
    time.sleep(delay)
    r = requests.get(section_url)
    text = r.text
    soup = BeautifulSoup(text, "html5lib")

    country_links = []
    
    for link in soup.find_all('a',{"target":"_self"}):
        text = link.text
        url = link.get('href')
        if verbose:
            print("Found Link: ", text, url)
        if url is not None:
            if year in url:
                country_links.append([year, text, base_url + url])

    return country_links

In [7]:
verbose=False

In [8]:
years = list(range(1999, 2019))

In [9]:
all_links = []
for year in years:
    section_urls = scrape_indexes(year=year, verbose=verbose)
    for text, url in section_urls:
        if verbose:
            print(text, '>>>', url)
        all_links = all_links + scrape_sections(year=year, 
                                                section_url=url, 
                                                verbose=verbose)

In [10]:
df = pd.DataFrame(all_links, columns=['year', 'country', 'url'])

In [11]:
df.head()

Unnamed: 0,year,country,url
0,1999,Angola,https://www.state.gov/j/drl/rls/hrrpt/1999/223...
1,1999,Benin,https://www.state.gov/j/drl/rls/hrrpt/1999/227...
2,1999,Botswana,https://www.state.gov/j/drl/rls/hrrpt/1999/228...
3,1999,Burkina Faso,https://www.state.gov/j/drl/rls/hrrpt/1999/229...
4,1999,Burundi,https://www.state.gov/j/drl/rls/hrrpt/1999/230...


In [12]:
def scrape_country(url):
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, "html5lib")
    return soup.find("div", {"id": "centerblock"}).text

In [13]:
all_links[0][1]

'Angola'

In [14]:
df['text'] = df.url.apply(scrape_country)

In [15]:
df.head()

Unnamed: 0,year,country,url,text
0,1999,Angola,https://www.state.gov/j/drl/rls/hrrpt/1999/223...,The Republic of Angola's transition from a sin...
1,1999,Benin,https://www.state.gov/j/drl/rls/hrrpt/1999/227...,The Republic of Benin is a constitutional demo...
2,1999,Botswana,https://www.state.gov/j/drl/rls/hrrpt/1999/228...,"Botswana is a longstanding, multiparty democra..."
3,1999,Burkina Faso,https://www.state.gov/j/drl/rls/hrrpt/1999/229...,President Blaise Compaore continued to dominat...
4,1999,Burundi,https://www.state.gov/j/drl/rls/hrrpt/1999/230...,Burundi is ruled by an authoritarian military ...


### Gapminder Names to UN Codes

In [16]:
fp = "../data/GapMinder_Raw_CSVs/UNCTRY_CODES_GapMinderNames.csv"
df_un = pd.read_csv(fp)

In [17]:
df_un.rename(columns={'CTRY':'country', 'UNCTRY':'code'}, inplace=True)

In [18]:
country_un_mapper = {k:v for k,v in zip(df_un.country, df_un.code)}

In [19]:
def apply_mapper(country_name):
    try:
        return country_un_mapper[country_name]
    except:
        return -1

### US State Department Country Names

In [20]:
fp_state = "../data/Custom_State_Dep_Reports/state_dept_country_names.csv"
df_state = pd.read_csv(fp_state)

In [21]:
df_state = df_state[['country', 'simple']].copy()
df_state.reset_index(inplace=True, drop=True)
df_state.rename(columns={'country': 'country_full', 'simple':'country'}, 
                inplace=True)

In [22]:
df_state['code_full'] = df_state.country_full.apply(apply_mapper)

In [23]:
df_state['code_short'] = df_state.country.apply(apply_mapper)

In [24]:
df_state['code'] = df_state[['code_full','code_short']].apply(max, axis=1)

In [25]:
# check that all country names are handled (in some way)
for cu in df.country.unique():
    cu = cu.strip()
    _df_cu = df_state[df_state.country == cu]
    _df_cu_f = df_state[df_state.country_full == cu]
    if (len(_df_cu) == 0) & (len(_df_cu_f) == 0):
        print(cu)

Make a name to code mapper

In [26]:
_df_state = df_state[["country_full","code"]].copy()
_df_state.rename(columns={"country_full":"country"}, inplace=True)

In [27]:
_df_state.append(df_state[["country", "code"]], ignore_index=True)
_df_state = _df_state.drop_duplicates()

In [28]:
state_codes1 = [(k,v) for k, v in zip(df_state.country, _df_state.code)]
state_codes2 = [(k,v) for k, v in zip(df_state.country_full, _df_state.code)]

In [29]:
mapper_state = {k:v for k,v in set(state_codes1 + state_codes2)}

Apply to State Department Sraping

In [30]:
df.country = df.country.str.strip()

In [31]:
df.code = df.country.map(mapper_state)

In [32]:
df.head()

Unnamed: 0,year,country,url,text
0,1999,Angola,https://www.state.gov/j/drl/rls/hrrpt/1999/223...,The Republic of Angola's transition from a sin...
1,1999,Benin,https://www.state.gov/j/drl/rls/hrrpt/1999/227...,The Republic of Benin is a constitutional demo...
2,1999,Botswana,https://www.state.gov/j/drl/rls/hrrpt/1999/228...,"Botswana is a longstanding, multiparty democra..."
3,1999,Burkina Faso,https://www.state.gov/j/drl/rls/hrrpt/1999/229...,President Blaise Compaore continued to dominat...
4,1999,Burundi,https://www.state.gov/j/drl/rls/hrrpt/1999/230...,Burundi is ruled by an authoritarian military ...


In [33]:
df.tail()

Unnamed: 0,year,country,url,text
3869,2018,Saint Vincent and the Grenadines,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF\n\n\n \n\n\nPermalink: http://www.s...
3870,2018,Suriname,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF\n\n\n \n\n\nPermalink: http://www.s...
3871,2018,Trinidad and Tobago,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF \nPermalink: http://www.state.gov/j...
3872,2018,Uruguay,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF \nPermalink: http://www.state.gov/j...
3873,2018,Venezuela,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF \nPermalink: http://www.state.gov/j...


In [34]:
df.to_csv('us_state_dept_reports_1999_2018.csv', index=False)

##### Missing / Error Codes

Code == 9000  
States Added, but UN / US State Department category does not exist

In [35]:
df_state[df_state.code  == 9000]

Unnamed: 0,country_full,country,code_full,code_short,code
55,Hong Kong,Hong Kong,9000,9000,9000
60,Macau,Macau,9000,9000,9000
76,Taiwan,Taiwan,9000,9000,9000
185,Western Sahara,Western Sahara,-1,9000,9000
217,"Yugoslavia, Federal Republic of",Yugoslavia,-1,9000,9000
226,Western Sahara,Western Sahara,9000,9000,9000
248,Kosovo,Kosovo,9000,9000,9000
252,Kosovo,Kosovo,9000,9000,9000
255,Western Sahara,Morocco,9000,504,9000
266,Cabo Verde,Cabo Verde,9000,9000,9000


Code == -1  
A parsing or labeling error from the scraping steps.  In this case, some yearly editions contained prefaces and introductions.

In [36]:
df_state[df_state.code  == -1]

Unnamed: 0,country_full,country,code_full,code_short,code
240,Preface,Preface,-1,-1,-1
241,Introduction,Introduction,-1,-1,-1
