In [3]:
from bs4 import BeautifulSoup
import os
import codecs
import re
import pandas as pd
import numpy as np
from pymongo import MongoClient

# Open a MongoDB client with the driver's default connection settings and
# select the `world` database used by the rest of the notebook.
client = MongoClient()
db = client['world']

This notebook builds the pipeline for the moderately older Factbook data (2008–2014), whose HTML differs significantly from the other years and therefore needs its own parsing code. Otherwise, it follows the same logic as the other notebooks.

In [181]:
# Field names this notebook is interested in.
needs = ['gdp_(purchasing_power_parity)', 'gdp_-_real_growth_rate', 'gdp_-_per_capita_(ppp)', 'natural_resources',
        'exports_-_commodities', 'literacy']

# Natural resources: country -> free-text list of resources.
# NOTE(review): this field is read from the factbook_08 folder, while the other
# fields come from factbook_09 and the rows are later labelled 2009 -- confirm
# that this is intentional.

url = 'factbook_08/fields/natural_resources.html'
# The context manager closes the file handle once the markup has been read.
with codecs.open(url, 'r') as page:
    soup = BeautifulSoup(page.read(), 'lxml')

resource_dicts = []
# The slice skips the first ten tables on the page, as the original offset did.
for table in soup.find_all('table')[10:]:
    data = table.find_all('td')
    # A usable table needs cells 1 (country) and 2 (value); shorter tables are
    # skipped explicitly instead of relying on a bare `except`, which would
    # also hide unrelated errors and keyboard interrupts.
    if len(data) < 3:
        continue
    resource_dicts.append({'country': data[1].text.strip(),
                           'resources': data[2].text.strip()})
resources_df = pd.DataFrame(resource_dicts)

In [182]:
# Preview the first five parsed natural-resources rows as a sanity check.
resources_df.head(n=5)

In [161]:
# Exports (commodities): country -> free-text list of export commodities.

url = 'factbook_09/fields/exports_-_commodities(%).html'
# The context manager closes the file handle once the markup has been read.
with codecs.open(url, 'r') as page:
    soup = BeautifulSoup(page.read(), 'lxml')

exports_dicts = []
# The slice skips the first three tables on the page, as the original offset did.
for table in soup.find_all('table')[3:]:
    data = table.find_all('td')
    # A usable table needs cells 1 (country) and 2 (value); shorter tables are
    # skipped explicitly instead of relying on a bare `except`, which would
    # also hide unrelated errors and keyboard interrupts.
    if len(data) < 3:
        continue
    exports_dicts.append({'country': data[1].text.strip(),
                          'exports': data[2].text.strip()})
exports_df = pd.DataFrame(exports_dicts)

In [162]:
# Preview the first five parsed export rows as a sanity check.
exports_df.head(n=5)

Unnamed: 0,country,exports
0,Afghanistan,"opium, fruits and nuts, handwoven carpets, woo..."
1,Albania,"textiles and footwear; asphalt, metals and met..."
2,Algeria,"petroleum, natural gas, and petroleum products..."
3,American Samoa,canned tuna 93% (2004 est.)
4,Andorra,"tobacco products, furniture"


In [163]:
# Literacy: total, male and female literacy rates per country.

url = 'factbook_09/fields/literacy(%).html'
# The context manager closes the file handle once the markup has been read.
with codecs.open(url, 'r') as page:
    soup = BeautifulSoup(page.read(), 'lxml')

# Percentages such as "97%" or "28.1%". Any number of decimals is accepted so
# that a value like "99.25%" is captured whole instead of being misread as "25%".
percent_re = re.compile(r'\d+\.?\d*%')

lit_dicts = []
# The slice skips the first three tables on the page, as the original offset did.
for table in soup.find_all('table')[3:]:
    data = table.find_all('td')
    # A usable table needs cells 1 (country) and 2 (value).
    if len(data) < 3:
        continue
    nums = percent_re.findall(data[2].text.strip())
    # The first three percentages are taken, in order, as total / male / female;
    # entries with fewer than three are skipped (as the bare `except` used to do).
    if len(nums) < 3:
        continue
    total_rate, male_rate, female_rate = nums[:3]
    lit_dicts.append({'country': data[1].text.strip(),
                      'total_literacy': total_rate,
                      'male_literacy': male_rate,
                      'female_literacy': female_rate})
lits_df = pd.DataFrame(lit_dicts)

In [164]:
# Preview the first five parsed literacy rows as a sanity check.
lits_df.head(n=5)

Unnamed: 0,country,total_literacy,male_literacy,female_literacy
0,Afghanistan,28.1%,43.1%,12.6%
1,Albania,98.7%,99.2%,98.3%
2,Algeria,69.9%,79.6%,60.1%
3,American Samoa,97%,98%,97%
4,Andorra,100%,100%,100%


In [165]:
# GDP (purchasing power parity): country -> amount string such as "$22.32 billion".

url = 'factbook_09/fields/gdp_(purchasing_power_parity).html'
# The context manager closes the file handle once the markup has been read.
with codecs.open(url, 'r') as page:
    soup = BeautifulSoup(page.read(), 'lxml')

# A dollar amount followed by a space and an optional word (the magnitude).
# Raw string avoids the invalid-escape warnings of the non-raw pattern.
amount_re = re.compile(r'\$\d+\.?\d* \w*')

gdp_dicts = []
# The slice skips the first four tables on the page, as the original offset did.
for table in soup.find_all('table')[4:]:
    data = table.find_all('td')
    # A usable table needs cells 1 (country) and 2 (value).
    if len(data) < 3:
        continue
    match = amount_re.search(data[2].text.strip())
    # Entries without a recognisable amount are skipped, as before, but
    # explicitly rather than through a bare `except`.
    if match is None:
        continue
    gdp_dicts.append({'country': data[1].text.strip(), 'gdp': match.group()})
gdp_df = pd.DataFrame(gdp_dicts)

In [166]:
# Preview the first five parsed GDP rows as a sanity check.
gdp_df.head(n=5)

Unnamed: 0,country,gdp
0,Afghanistan,$22.32 billion
1,Albania,$21.86 billion
2,Algeria,$233.5 billion
3,American Samoa,$575.3 million
4,Andorra,$3.66 billion


In [167]:
# GDP per capita (PPP): country -> amount string such as "$6,000".

url = 'factbook_09/fields/gdp_-_per_capita_(ppp).html'
# The context manager closes the file handle once the markup has been read.
with codecs.open(url, 'r') as page:
    soup = BeautifulSoup(page.read(), 'lxml')

# A dollar amount with at most one thousands separator (same matches as the
# original pattern, written as a raw string to avoid invalid-escape warnings).
per_capita_re = re.compile(r'\$\d+,?\d*')

ppp_dicts = []
# The slice skips the first four tables on the page, as the original offset did.
for table in soup.find_all('table')[4:]:
    data = table.find_all('td')
    # A usable table needs cells 1 (country) and 2 (value).
    if len(data) < 3:
        continue
    match = per_capita_re.search(data[2].text.strip())
    # Entries without a recognisable amount are skipped, as before, but
    # explicitly rather than through a bare `except`.
    if match is None:
        continue
    ppp_dicts.append({'country': data[1].text.strip(), 'ppp': match.group()})
ppp_df = pd.DataFrame(ppp_dicts)

In [168]:
# Preview the first five parsed per-capita rows as a sanity check.
ppp_df.head(n=5)

Unnamed: 0,country,ppp
0,Afghanistan,$800
1,Albania,"$6,000"
2,Algeria,"$6,900"
3,American Samoa,"$8,000"
4,Andorra,"$42,500"


In [169]:
# Real GDP growth rate: country -> percentage string such as "3.4%" or "-2.5%".

url = 'factbook_09/fields/gdp_-_real_growth_rate(%).html'
# The context manager closes the file handle once the markup has been read.
with codecs.open(url, 'r') as page:
    soup = BeautifulSoup(page.read(), 'lxml')

# The optional leading "-" keeps the sign of negative growth rates; without it
# a contraction such as "-2.5%" would be stored as "2.5%".
growth_re = re.compile(r'-?\d+\.?\d*%')

growth_dicts = []
# The slice skips the first four tables on the page, as the original offset did.
for table in soup.find_all('table')[4:]:
    data = table.find_all('td')
    # A usable table needs cells 1 (country) and 2 (value).
    if len(data) < 3:
        continue
    match = growth_re.search(data[2].text.strip())
    # Entries without a recognisable percentage are skipped, as before, but
    # explicitly rather than through a bare `except`.
    if match is None:
        continue
    growth_dicts.append({'country': data[1].text.strip(), 'growth': match.group()})
growth_df = pd.DataFrame(growth_dicts)

In [170]:
# Preview the first five parsed growth rows as a sanity check.
growth_df.head(n=5)

Unnamed: 0,country,growth
0,Afghanistan,3.4%
1,Albania,6.1%
2,Algeria,3.5%
3,American Samoa,3%
4,Andorra,2%


In [171]:
# Combine the per-field tables into one frame keyed on country. Outer joins
# keep every country that appears in any of the tables; fields a country lacks
# are left as NaN.
full_df = (
    resources_df
    .merge(exports_df, on='country', how='outer')
    .merge(lits_df, on='country', how='outer')
    .merge(gdp_df, on='country', how='outer')
    .merge(ppp_df, on='country', how='outer')
    .merge(growth_df, on='country', how='outer')
)

In [172]:
# Show per-column non-null counts of the merged frame, i.e. how many countries
# are missing each field.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 258 entries, 0 to 257
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   country          258 non-null    object
 1   resources        258 non-null    object
 2   exports          232 non-null    object
 3   total_literacy   205 non-null    object
 4   male_literacy    205 non-null    object
 5   female_literacy  205 non-null    object
 6   gdp              230 non-null    object
 7   ppp              230 non-null    object
 8   growth           218 non-null    object
dtypes: object(9)
memory usage: 20.2+ KB


In [173]:
# Keep only countries that have a value for every field: drop, in place, any
# row containing at least one missing value.
full_df.dropna(axis=0, how='any', inplace=True)

In [174]:
def per_float(percent):
    """Convert a percentage string such as '28.1%' to a fraction (0.281).

    Every '%' character is removed, the remainder is parsed as a float,
    divided by 100 and rounded to three decimal places.
    """
    digits = re.sub('%', '', percent)
    fraction = float(digits) / 100
    return round(fraction, 3)
def dollar_int(dollar):
    """Convert a dollar string such as '$6,000' to an int (6000).

    Thousands separators are removed. A leading non-digit character (the
    currency symbol) is dropped only when one is actually present, so an
    input without a symbol, e.g. '6,000', no longer loses its first digit.

    Raises ValueError if the remaining text is not a valid integer.
    """
    digits = re.sub(',', '', dollar)
    # Only strip the first character when it is not part of the number.
    if digits and not digits[0].isdigit():
        digits = digits[1:]
    return int(digits)
def gdp_int(gdp):
    """Convert a GDP string such as '$22.32 billion' to whole dollars.

    The first number in the string is scaled by the first magnitude word
    found ('million', 'billion' or 'trillion').

    Returns the amount as an int, or None when the input is not a string or
    lacks either a number or a magnitude word; in that case the offending
    value is printed so it can be inspected.
    """
    multipliers = {'million': 10**6, 'billion': 10**9, 'trillion': 10**12}
    try:
        number = re.search(r'\d+\.?\d*', gdp)
        magnitude = re.search(r'(million|billion|trillion)', gdp)
    except TypeError:
        # Non-string input (e.g. NaN) is reported like any other bad value.
        number = magnitude = None
    if number is None or magnitude is None:
        print(gdp)
        return None
    scaled = float(number.group()) * multipliers[magnitude.group()]
    # round() instead of plain int(): the float product can land just below the
    # intended integer (e.g. 4.1 * 10**9), and truncation would be off by one.
    return int(round(scaled))

In [175]:
# Label every row with the year 2009 (appended as the last column).
full_df = full_df.assign(year=2009)

In [176]:
# Percentage strings -> fractions.
for pct_col in ('total_literacy', 'male_literacy', 'female_literacy', 'growth'):
    full_df[pct_col] = full_df[pct_col].apply(per_float)

# Dollar strings -> whole-dollar integers.
full_df['ppp'] = full_df['ppp'].apply(dollar_int)
full_df['gdp'] = full_df['gdp'].apply(gdp_int)

In [177]:
# Preview the first five rows after the numeric conversions.
full_df.head(n=5)

Unnamed: 0,country,resources,exports,total_literacy,male_literacy,female_literacy,gdp,ppp,growth,year
0,Afghanistan,"natural gas, petroleum, coal, copper, chromite...","opium, fruits and nuts, handwoven carpets, woo...",0.281,0.431,0.126,22320000000,800,0.034,2009
1,Albania,"petroleum, natural gas, coal, bauxite, chromit...","textiles and footwear; asphalt, metals and met...",0.987,0.992,0.983,21860000000,6000,0.061,2009
2,Algeria,"petroleum, natural gas, iron ore, phosphates, ...","petroleum, natural gas, and petroleum products...",0.699,0.796,0.601,233500000000,6900,0.035,2009
3,American Samoa,"pumice, pumicite",canned tuna 93% (2004 est.),0.97,0.98,0.97,575300000,8000,0.03,2009
4,Andorra,"hydropower, mineral water, timber, iron ore, lead","tobacco products, furniture",1.0,1.0,1.0,3660000000,42500,0.02,2009


In [178]:
# Show the row count, non-null counts and dtypes of the converted frame.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197 entries, 0 to 257
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          197 non-null    object 
 1   resources        197 non-null    object 
 2   exports          197 non-null    object 
 3   total_literacy   197 non-null    float64
 4   male_literacy    197 non-null    float64
 5   female_literacy  197 non-null    float64
 6   gdp              197 non-null    int64  
 7   ppp              197 non-null    int64  
 8   growth           197 non-null    float64
 9   year             197 non-null    int64  
dtypes: float64(4), int64(3), object(3)
memory usage: 16.9+ KB


In [179]:
# Store one document per country/year in the `resources` collection.
# The `_id` is the country name concatenated with the year, so it is the same
# on every run; upserting (instead of `insert_one`) lets this cell be re-run
# without raising DuplicateKeyError on documents that already exist.
for row in full_df.itertuples():
    # Columns are read by name rather than by tuple position so the mapping
    # does not depend on the column order of `full_df`.
    country_dict = {
        '_id': row.country + str(row.year),
        'country': row.country,
        'year': row.year,
        'gdp': row.gdp,
        'ppp': row.ppp,
        'growth': row.growth,
        'resources': row.resources,
        'exports': row.exports,
        'literacy': row.total_literacy,
        'male': row.male_literacy,
        'female': row.female_literacy,
    }
    db.resources.replace_one({'_id': country_dict['_id']}, country_dict, upsert=True)