In [1]:
from bs4 import BeautifulSoup
import os
import codecs
import re
import pandas as pd
import numpy as np
from pymongo import MongoClient

# Open a MongoDB connection using the driver's default connection settings.
client = MongoClient()
# Handle to the "world" database; later cells write to its `resources` collection.
db = client.world

The early years of the CIA website have simpler HTML and thus required a separate code path to parse. Below is the code used from 2001 - 2007 with minor tweaks from year to year. The code to parse the 2000 pages is at the bottom as that HTML required a different approach.

In [272]:
# Field names of interest (not referenced by the other visible cells).
needs = ['gdp_(purchasing_power_parity)', 'gdp_-_real_growth_rate', 'gdp_-_per_capita_(ppp)', 'natural_resources',
        'exports_-_commodities', 'literacy']

# natural resources
# Parse the factbook_01 "natural resources" field page into one record per country.
url = 'factbook_01/fields/natural_resources.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

resource_dicts = []
# The listing lives in the second <table> on the page.
table = soup.findAll('table')[1].findAll('tr')
for row in table:
    data = row.findAll('td')
    if len(data) < 2:
        # Not a "country / value" row (e.g. header or spacer) -- skip it explicitly
        # instead of relying on a blanket except.
        continue
    # Drop the last character of the name cell (presumably a trailing colon -- confirm).
    name = data[0].text.strip()[:-1]
    info = data[1].text.strip()
    resource_dicts.append({'country': name, 'resources': info})
resources_df = pd.DataFrame(resource_dicts)

In [187]:
# Preview the first rows to sanity-check the parsed country/resources pairs.
resources_df.head()

Unnamed: 0,country,resources
0,Afghanistan,"natural gas, petroleum, coal, copper, chromite..."
1,Albania,"petroleum, natural gas, coal, chromium, copper..."
2,Algeria,"petroleum, natural gas, iron ore, phosphates, ..."
3,American Samoa,"pumice, pumicite"
4,Andorra,"hydropower, mineral water, timber, iron ore, lead"


In [204]:
# exports
# Parse the factbook_01 "exports - commodities" field page into one record per country.

url = 'factbook_01/fields/exports_-_commodities.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

exports_dicts = []
# The listing lives in the second <table> on the page.
table = soup.findAll('table')[1].findAll('tr')
for row in table:
    data = row.findAll('td')
    if len(data) < 2:
        # Row without a name cell and a value cell -- nothing to record.
        continue
    # Drop the last character of the name cell (presumably a trailing colon -- confirm).
    name = data[0].text.strip()[:-1]
    info = data[1].text.strip()
    exports_dicts.append({'country': name, 'exports': info})
exports_df = pd.DataFrame(exports_dicts)

In [205]:
# Preview the first rows to sanity-check the parsed country/exports pairs.
exports_df.head()

Unnamed: 0,country,exports
0,Afghanistan,"opium, fruits and nuts, handwoven carpets, woo..."
1,Albania,"textiles and footwear; asphalt, metals and met..."
2,Algeria,"petroleum, natural gas, and petroleum products..."
3,American Samoa,canned tuna 93%
4,Andorra,"tobacco products, furniture"


In [206]:
# literacy
# Parse the factbook_01 "literacy" field page into total/male/female percentages per country.

url = 'factbook_01/fields/literacy.html'
# The context manager closes the file; the handle is no longer shadowed by a
# literacy value, which previously left the file open.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

lit_dicts = []
# The listing lives in the second <table> on the page.
table = soup.findAll('table')[1].findAll('tr')
for row in table:
    data = row.findAll('td')
    if len(data) < 2:
        # Row without a name cell and a value cell -- nothing to record.
        continue
    # Drop the last character of the name cell (presumably a trailing colon -- confirm).
    country = data[0].text.strip()[:-1]
    lits = data[1].text.strip()
    # Raw string avoids invalid-escape warnings; `\d*` accepts any number of
    # decimal digits, matching the pattern used by the factbook_00 literacy cell.
    nums = re.findall(r'\d+\.?\d*%', lits)
    if len(nums) < 3:
        # Entries lacking all three figures are skipped.
        continue
    # The first three percentages are taken as total, male, female, in that order.
    total, male, female = nums[:3]
    lit_dicts.append({'country': country, 'total_literacy': total,
                      'male_literacy': male, 'female_literacy': female})
lit_df = pd.DataFrame(lit_dicts)

In [207]:
# Preview the first rows; values are still percentage strings at this point.
lit_df.head()

Unnamed: 0,country,total_literacy,male_literacy,female_literacy
0,Afghanistan,31.5%,47.2%,15%
1,Algeria,61.6%,73.9%,49%
2,American Samoa,97%,98%,97%
3,Angola,42%,56%,28%
4,Anguilla,95%,95%,95%


In [208]:
# gdp
# Parse the factbook_01 "gdp" field page, keeping the dollar figure and its magnitude word.

url = 'factbook_01/fields/gdp.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

gdp_dicts = []
# The listing lives in the second <table> on the page.
table = soup.findAll('table')[1].findAll('tr')
for row in table:
    data = row.findAll('td')
    if len(data) < 2:
        # Row without a name cell and a value cell -- nothing to record.
        continue
    # Drop the last character of the name cell (presumably a trailing colon -- confirm).
    name = data[0].text.strip()[:-1]
    # First "$<number> <word>" occurrence, e.g. "$21 billion".
    match = re.search(r'\$\d+\.?\d* \w*', data[1].text.strip())
    if match is None:
        # No dollar figure reported for this entry.
        continue
    gdp_dicts.append({'country': name, 'gdp': match.group()})
gdp_df = pd.DataFrame(gdp_dicts)

In [209]:
# Preview the first rows; gdp is still a "$<number> <magnitude>" string here.
gdp_df.head()

Unnamed: 0,country,gdp
0,Afghanistan,$21 billion
1,Albania,$10.5 billion
2,Algeria,$171 billion
3,American Samoa,$500 million
4,Andorra,$1.2 billion


In [210]:
# ppp
# Parse the factbook_01 "gdp - per capita" field page, keeping the leading dollar amount.

url = 'factbook_01/fields/gdp_-_per_capita.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

ppp_dicts = []
# The listing lives in the second <table> on the page.
table = soup.findAll('table')[1].findAll('tr')
for row in table:
    data = row.findAll('td')
    if len(data) < 2:
        # Row without a name cell and a value cell -- nothing to record.
        continue
    # Drop the last character of the name cell (presumably a trailing colon -- confirm).
    name = data[0].text.strip()[:-1]
    # First dollar amount with at most one thousands separator, e.g. "$3,000".
    match = re.search(r'\$\d+\,?\d*', data[1].text.strip())
    if match is None:
        # No dollar figure reported for this entry.
        continue
    ppp_dicts.append({'country': name, 'ppp': match.group()})
ppp_df = pd.DataFrame(ppp_dicts)

In [211]:
# Preview the first rows; ppp is still a "$"-prefixed string here.
ppp_df.head()

Unnamed: 0,country,ppp
0,Afghanistan,$800
1,Albania,"$3,000"
2,Algeria,"$5,500"
3,American Samoa,"$8,000"
4,Andorra,"$18,000"


In [212]:
# growth
# Parse the factbook_01 "gdp - real growth rate" field page into a percentage string per country.

url = 'factbook_01/fields/gdp_-_real_growth_rate.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

growth_dicts = []
# The listing lives in the second <table> on the page.
table = soup.findAll('table')[1].findAll('tr')
for row in table:
    data = row.findAll('td')
    if len(data) < 2:
        # Row without a name cell and a value cell -- nothing to record.
        continue
    # Drop the last character of the name cell (presumably a trailing colon -- confirm).
    name = data[0].text.strip()[:-1]
    # Accept an optional leading minus so contractions keep their sign (the old
    # pattern turned "-3%" into "3%"). The lookbehind stops the dash of a range
    # such as "3-4%" from being read as a sign.
    match = re.search(r'(?<!\d)-?\d+\.?\d*%', data[1].text.strip())
    if match is None:
        # No percentage reported for this entry.
        continue
    growth_dicts.append({'country': name, 'growth': match.group()})
growth_df = pd.DataFrame(growth_dicts)

In [213]:
# Preview the first rows; growth is still a percentage string here.
growth_df.head()

Unnamed: 0,country,growth
0,Albania,7.5%
1,Algeria,5%
2,Angola,4.9%
3,Anguilla,7%
4,Antigua and Barbuda,4.6%


In [262]:
# Outer-join every per-field frame onto the resources frame by country name,
# so a country missing from one listing still keeps its other fields.
full_df = resources_df
for field_frame in (exports_df, lit_df, gdp_df, ppp_df, growth_df):
    full_df = full_df.merge(field_frame, on='country', how='outer')

In [263]:
# Inspect non-null counts per column to see how many countries lack each field.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 267 entries, 0 to 266
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   country          267 non-null    object
 1   resources        267 non-null    object
 2   exports          233 non-null    object
 3   total_literacy   179 non-null    object
 4   male_literacy    179 non-null    object
 5   female_literacy  179 non-null    object
 6   gdp              228 non-null    object
 7   ppp              227 non-null    object
 8   growth           202 non-null    object
dtypes: object(9)
memory usage: 20.9+ KB


In [264]:
# Keep only countries that have every field; rows with any missing value are
# removed from full_df in place, so re-running the cell is a no-op but the
# unfiltered merge is only recoverable by re-running the merge cell.
full_df.dropna(inplace=True)

In [265]:
def per_float(percent):
    """Turn a percentage string such as '31.5%' into a fraction rounded to 3 decimals."""
    digits = re.sub('%', '', percent)
    fraction = float(digits) / 100
    return round(fraction, 3)
def dollar_int(dollar):
    """Turn a dollar string such as '$3,000' into the integer 3000."""
    without_commas = re.sub(',', '', dollar)
    # The first character is discarded (the currency sign in the scraped values).
    amount_text = without_commas[1:]
    return int(amount_text)
def gdp_int(gdp):
    """Convert a GDP string such as '$10.5 billion' into an integer dollar amount.

    Parameters
    ----------
    gdp : str
        Text containing a number followed by 'million', 'billion' or 'trillion'.

    Returns
    -------
    int or None
        The amount in whole dollars. If the text has no number or no recognised
        magnitude word (or is not a string), the value is printed for inspection
        and None is returned.
    """
    multipliers = {'million': 10**6, 'billion': 10**9, 'trillion': 10**12}
    try:
        num = float(re.search(r'\d+\.?\d*', gdp).group())
        power = re.search(r'(million|billion|trillion)', gdp).group()
    except (AttributeError, TypeError):
        # AttributeError: a search found nothing; TypeError: gdp is not text (e.g. NaN).
        print(gdp)
        return None
    # round() rather than int(): binary floating point can land just below the
    # intended value (8.2 * 10**9 == 8199999999.999999), which int() would truncate.
    return round(num * multipliers[power])

In [266]:
# Tag every row with the factbook edition year.
# NOTE(review): this cell follows the factbook_01 parsing cells, but the recorded
# execution counts and the previewed values (e.g. Algeria's gdp/ppp/growth) line up
# with the factbook_00 cells at the bottom of the notebook. On a fresh top-to-bottom
# run full_df would hold factbook_01 data -- confirm the year before running.
full_df['year'] = 2000

In [267]:
# Percentage strings -> fractions.
for pct_col in ('total_literacy', 'male_literacy', 'female_literacy', 'growth'):
    full_df[pct_col] = full_df[pct_col].apply(per_float)

# "$1,234"-style strings -> whole dollars.
full_df['ppp'] = full_df['ppp'].apply(dollar_int)

# "$<number> <magnitude>" strings -> whole dollars.
full_df['gdp'] = full_df['gdp'].apply(gdp_int)

In [268]:
# Preview the converted frame: percentages as fractions, dollar fields as integers.
full_df.head()

Unnamed: 0,country,resources,exports,total_literacy,male_literacy,female_literacy,gdp,ppp,growth,year
2,Algeria,"petroleum, natural gas, iron ore, phosphates, ...","petroleum, natural gas, and petroleum products...",0.616,0.739,0.49,147600000000,4700,0.039,2000
5,Angola,"petroleum, diamonds, iron ore, phosphates, cop...","crude oil 90%, diamonds, refined petroleum pro...",0.42,0.56,0.28,11600000000,1030,0.04,2000
6,Anguilla,"salt, fish, lobster","lobster, fish, livestock, salt",0.95,0.95,0.95,88000000,7900,0.065,2000
8,Antigua and Barbuda,NEGL; pleasant climate fosters tourism,"petroleum products 48%, manufactures 23%, food...",0.89,0.9,0.88,524000000,8200,0.028,2000
10,Argentina,"fertile plains of the pampas, lead, zinc, tin,...","edible oils, fuels and energy, cereals, feed, ...",0.962,0.962,0.962,367000000000,10000,0.03,2000


In [269]:
# Confirm the numeric columns now carry numeric dtypes and no nulls remain.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 163 entries, 2 to 266
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          163 non-null    object 
 1   resources        163 non-null    object 
 2   exports          163 non-null    object 
 3   total_literacy   163 non-null    float64
 4   male_literacy    163 non-null    float64
 5   female_literacy  163 non-null    float64
 6   gdp              163 non-null    int64  
 7   ppp              163 non-null    int64  
 8   growth           163 non-null    float64
 9   year             163 non-null    int64  
dtypes: float64(4), int64(3), object(3)
memory usage: 14.0+ KB


In [270]:
# Write one document per country/year into the `resources` collection.
# Fields are read by column name rather than tuple position, so the mapping does
# not silently break if the column order of full_df changes.
for row in full_df.itertuples():
    country_dict = {'_id': row.country + str(row.year), 'country': row.country, 'year': row.year,
                    'gdp': row.gdp, 'ppp': row.ppp, 'growth': row.growth,
                    'resources': row.resources, 'exports': row.exports,
                    'literacy': row.total_literacy, 'male': row.male_literacy,
                    'female': row.female_literacy}
    # Upsert keyed on _id so re-running the cell refreshes existing documents
    # instead of failing with a duplicate-key error.
    db.resources.replace_one({'_id': country_dict['_id']}, country_dict, upsert=True)

In [271]:
# Total number of documents in the collection (the empty filter matches everything).
db.resources.count_documents({})

3446

In [232]:
# Parse the factbook_00 "natural resources" page, where each entry is read from
# a <p> element rather than a table row.
url = 'factbook_00/fields/natural_resources.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

resource_dicts = []
# The first two <p> elements are skipped; entries are read from the third onward.
for row in soup.findAll('p')[2:]:
    data = row.text.split('\n')
    if len(data) < 3:
        # Paragraph without a name line and a value line -- skip explicitly
        # instead of relying on a blanket except.
        continue
    # First line is the name minus its last character (presumably a colon -- confirm);
    # the third line carries the value.
    country = data[0][:-1]
    info = data[2]
    resource_dicts.append({'country': country, 'resources': info})
resources_df = pd.DataFrame(resource_dicts)

In [233]:
# Preview the first rows to sanity-check the parsed country/resources pairs.
resources_df.head()

Unnamed: 0,country,resources
0,Afghanistan,"natural gas, petroleum, coal, copper, chromite..."
1,Albania,"petroleum, natural gas, coal, chromium, copper..."
2,Algeria,"petroleum, natural gas, iron ore, phosphates, ..."
3,American Samoa,"pumice, pumicite"
4,Andorra,"hydropower, mineral water, timber, iron ore, lead"


In [236]:
# Parse the factbook_00 "exports - commodities" page (one <p> element per entry).
url = 'factbook_00/fields/exports_-_commodities.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

exports_dicts = []
# The first two <p> elements are skipped; entries are read from the third onward.
for row in soup.findAll('p')[2:]:
    data = row.text.split('\n')
    if len(data) < 3:
        # Paragraph without a name line and a value line -- nothing to record.
        continue
    # First line is the name minus its last character (presumably a colon -- confirm);
    # the third line carries the value.
    country = data[0][:-1]
    info = data[2]
    exports_dicts.append({'country': country, 'exports': info})
exports_df = pd.DataFrame(exports_dicts)

In [237]:
# Preview the first rows to sanity-check the parsed country/exports pairs.
exports_df.head()

Unnamed: 0,country,exports
0,Afghanistan,"opium, fruits and nuts, handwoven carpets, woo..."
1,Albania,"textiles and footwear; asphalt, metals and met..."
2,Algeria,"petroleum, natural gas, and petroleum products..."
3,American Samoa,canned tuna 93%
4,Andorra,"tobacco products, furniture"


In [260]:
# Parse the factbook_00 "literacy" page (one <p> element per entry) into
# total/male/female percentages per country.
url = 'factbook_00/fields/literacy.html'
# The context manager closes the file; the handle is no longer shadowed by a
# literacy value, which previously left the file open.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

lit_dicts = []
# The first two <p> elements are skipped; entries are read from the third onward.
for row in soup.findAll('p')[2:]:
    data = row.text.split('\n')
    if len(data) < 9:
        # The three figures are read from lines 4, 6 and 8; shorter paragraphs
        # cannot hold all of them.
        continue
    country = data[0][:-1]
    # One percentage per line, in the order total, male, female.
    matches = [re.search(r'\d+\.?\d*%', data[idx]) for idx in (4, 6, 8)]
    if any(found is None for found in matches):
        # At least one figure is missing for this entry.
        continue
    total, male, female = (found.group() for found in matches)
    lit_dicts.append({'country': country, 'total_literacy': total,
                      'male_literacy': male, 'female_literacy': female})
lit_df = pd.DataFrame(lit_dicts)

In [261]:
# Inspect the last rows to check that entries at the end of the page parsed too.
lit_df.tail()

Unnamed: 0,country,total_literacy,male_literacy,female_literacy
174,Wallis and Futuna,50%,50%,50%
175,Yemen,38%,53%,26%
176,Zambia,78.2%,85.6%,71.3%
177,Zimbabwe,85%,90%,80%
178,Taiwan,86%,93%,79%


In [247]:
# Parse the factbook_00 "gdp" page (one <p> element per entry), keeping the
# dollar figure and its magnitude word.
url = 'factbook_00/fields/gdp.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

gdp_dicts = []
# The first two <p> elements are skipped; entries are read from the third onward.
for row in soup.findAll('p')[2:]:
    data = row.text.split('\n')
    if len(data) < 3:
        # Paragraph without a name line and a value line -- nothing to record.
        continue
    country = data[0][:-1]
    # First "$<number> <word>" occurrence, e.g. "$21 billion".
    match = re.search(r'\$\d+\.?\d* \w*', data[2])
    if match is None:
        # No dollar figure reported for this entry.
        continue
    gdp_dicts.append({'country': country, 'gdp': match.group()})
gdp_df = pd.DataFrame(gdp_dicts)

In [249]:
# Preview the first rows; gdp is still a "$<number> <magnitude>" string here.
gdp_df.head()

Unnamed: 0,country,gdp
0,Afghanistan,$21 billion
1,Albania,$5.6 billion
2,Algeria,$147.6 billion
3,American Samoa,$150 million
4,Andorra,$1.2 billion


In [250]:
# Parse the factbook_00 "gdp - per capita" page (one <p> element per entry),
# keeping the leading dollar amount.
url = 'factbook_00/fields/gdp_-_per_capita.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

ppp_dicts = []
# The first two <p> elements are skipped; entries are read from the third onward.
for row in soup.findAll('p')[2:]:
    data = row.text.split('\n')
    if len(data) < 3:
        # Paragraph without a name line and a value line -- nothing to record.
        continue
    country = data[0][:-1]
    # First dollar amount with at most one thousands separator, e.g. "$1,650".
    match = re.search(r'\$\d+\,?\d*', data[2])
    if match is None:
        # No dollar figure reported for this entry.
        continue
    ppp_dicts.append({'country': country, 'ppp': match.group()})
ppp_df = pd.DataFrame(ppp_dicts)

In [251]:
# Preview the first rows; ppp is still a "$"-prefixed string here.
ppp_df.head()

Unnamed: 0,country,ppp
0,Afghanistan,$800
1,Albania,"$1,650"
2,Algeria,"$4,700"
3,American Samoa,"$2,600"
4,Andorra,"$18,000"


In [252]:
# Parse the factbook_00 "gdp - real growth rate" page (one <p> element per entry)
# into a percentage string per country.
url = 'factbook_00/fields/gdp_-_real_growth_rate.html'
# The context manager guarantees the file handle is closed after reading.
with codecs.open(url, 'r') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

growth_dicts = []
# The first two <p> elements are skipped; entries are read from the third onward.
for row in soup.findAll('p')[2:]:
    data = row.text.split('\n')
    if len(data) < 3:
        # Paragraph without a name line and a value line -- nothing to record.
        continue
    country = data[0][:-1]
    # Accept an optional leading minus so contractions keep their sign (the old
    # pattern turned "-3%" into "3%"). The lookbehind stops the dash of a range
    # such as "3-4%" from being read as a sign.
    match = re.search(r'(?<!\d)-?\d+\.?\d*%', data[2])
    if match is None:
        # No percentage reported for this entry.
        continue
    growth_dicts.append({'country': country, 'growth': match.group()})
growth_df = pd.DataFrame(growth_dicts)

In [253]:
# Preview the first rows; growth is still a percentage string here.
growth_df.head()

Unnamed: 0,country,growth
0,Albania,8%
1,Algeria,3.9%
2,Angola,4%
3,Anguilla,6.5%
4,Antigua and Barbuda,2.8%
