In [1]:
import requests
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime 
from bs4 import BeautifulSoup


In [37]:
# Pipeline configuration: data source, output column names and storage targets

# Source page (archived Wikipedia snapshot) and the columns of the extracted table
url = 'https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29'
table_attribs = ["Country", "GDP_USD_millions"]

# CSV target
csv_path = 'Countries_by_GDP.csv'

# SQLite target: database file and the table written into it
db_name = 'World_Economies.db'
table_name = 'Countries_by_GDP'
# `conn` and `sql_connection` are two names for the same open connection
sql_connection = conn = sqlite3.connect(db_name)


# Task 1: Extracting information

In [13]:
# Archived snapshot of the Wikipedia "List of countries by GDP (nominal)" page
url = 'https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29'
# read_html returns one DataFrame per HTML table found; position 3 is the GDP table
data = pd.read_html(io=url)[3]
data

Unnamed: 0_level_0,Country/Territory,UN region,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,UN region,Estimate,Year,Estimate,Year,Estimate,Year
0,World,—,105568776,2023,100562011,2022,96698005,2021
1,United States,Americas,26854599,2023,25462700,2022,23315081,2021
2,China,Asia,19373586,[n 1]2023,17963171,[n 3]2022,17734131,[n 1]2021
3,Japan,Asia,4409738,2023,4231141,2022,4940878,2021
4,Germany,Europe,4308854,2023,4072192,2022,4259935,2021
...,...,...,...,...,...,...,...,...
209,Anguilla,Americas,—,—,—,—,303,2021
210,Kiribati,Oceania,248,2023,223,2022,227,2021
211,Nauru,Oceania,151,2023,151,2022,155,2021
212,Montserrat,Americas,—,—,—,—,72,2021


In [22]:
def clean_data(data):
    """Reduce the two-level GDP table to a flat Country / GDP_USD_millions frame.

    Only the country name and the IMF estimate are kept. The row labelled 0
    (the 'World' total in the scraped table) and every row whose IMF estimate
    is the '—' placeholder are removed. The column header is then collapsed to
    its top level, the two remaining columns are renamed, and the rows are
    renumbered from 0. The frame passed in is not modified.
    """
    imf_estimate = ('IMF[1][13]', 'Estimate')
    discarded_columns = [
        ('UN region', 'UN region'),
        ('IMF[1][13]', 'Year'),
        ('World Bank[14]', 'Estimate'),
        ('World Bank[14]', 'Year'),
        ('United Nations[15]', 'Estimate'),
        ('United Nations[15]', 'Year'),
    ]

    # Prune the unused columns, then remove the row labelled 0
    trimmed = data.drop(columns=discarded_columns).drop(0)

    # Keep only rows that carry an actual IMF figure
    has_estimate = trimmed[imf_estimate].astype(str) != "—"
    kept = trimmed[has_estimate]

    # Collapse the header to its top level before applying the final names
    kept.columns = kept.columns.get_level_values(0)
    renamed = kept.rename(
        columns={
            'Country/Territory': 'Country',
            'IMF[1][13]': 'GDP_USD_millions',
        }
    )
    return renamed.reset_index(drop=True)

# Clean a copy so the raw `data` frame stays available for inspection
data_clean = clean_data(data=data.copy())
data_clean




Unnamed: 0,Country,GDP_USD_millions
0,United States,26854599
1,China,19373586
2,Japan,4409738
3,Germany,4308854
4,India,3736882
...,...,...
186,Marshall Islands,291
187,Palau,262
188,Kiribati,248
189,Nauru,151


In [16]:
# Column names of the frame produced by extract()
table_attribs = ["Country", "GDP_USD_millions"]

In [17]:
def extract(url, table_attribs, timeout=60):
    """Scrape country names and GDP figures from the page at `url`.

    The page is downloaded with `requests` and parsed with BeautifulSoup. The
    rows of the third <tbody> element are scanned; a row is kept when its
    first cell contains a link and its third cell does not contain the '—'
    placeholder.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    table_attribs : list of str
        Column names of the result. The first name receives the leading
        content of the link in a row's first cell, the second name the
        leading content of the row's third cell.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    pandas.DataFrame
        One row per kept table row, columns `table_attribs`, values as plain
        strings exactly as they appear in the page (no numeric conversion).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page holds fewer than three <tbody> elements.
    """
    # Fail fast on a hung or failed download instead of parsing an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('tbody')

    # Collect plain records and build the frame once; concatenating inside
    # the loop copies the whole frame on every row.
    records = []
    for row in tables[2].find_all('tr'):
        cells = row.find_all('td')
        # Header rows have no <td>; rows with fewer than three cells carry no GDP cell
        if len(cells) < 3:
            continue
        link = cells[0].find('a')
        # Test the cell *text*: `'—' in tag` only compares against the tag's
        # direct children and misses e.g. '—' followed by whitespace.
        if link is None or '—' in cells[2].get_text():
            continue
        # str() detaches the values from the parse tree so the frame does not
        # keep the whole document alive
        values = (str(link.contents[0]), str(cells[2].contents[0]))
        records.append(dict(zip(table_attribs, values)))

    return pd.DataFrame(records, columns=table_attribs)

In [28]:
# Scrape the GDP table into a frame with the configured column names
df = extract(url=url, table_attribs=table_attribs)
df

Unnamed: 0,Country,GDP_USD_millions
0,United States,26854599
1,China,19373586
2,Japan,4409738
3,Germany,4308854
4,India,3736882
...,...,...
186,Marshall Islands,291
187,Palau,262
188,Kiribati,248
189,Nauru,151


# Task 2: Transforming information

In [23]:
def transform(df):
    """Convert the GDP column from millions to billions of USD.

    Values in `GDP_USD_millions` may be strings with thousands separators
    (e.g. '26,854,599') or plain numbers. They are converted to float,
    divided by 1000 and rounded to two decimals, and the column is renamed to
    `GDP_USD_billions` in the same position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a `GDP_USD_millions` column. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with `GDP_USD_billions` (float) in place of
        `GDP_USD_millions`.

    Raises
    ------
    KeyError
        If `df` has no `GDP_USD_millions` column.
    ValueError
        If a value cannot be parsed as a number.
    """
    gdp_billions = (
        df["GDP_USD_millions"]
        .astype(str)                          # accept numeric input as well as text
        .str.replace(",", "", regex=False)    # strip thousands separators
        .astype(float)
        .div(1000)
        .round(2)
    )
    # Work on a copy: writing into `df` would leave the caller's frame holding
    # billions under a column still named "...millions".
    result = df.copy()
    result["GDP_USD_millions"] = gdp_billions
    return result.rename(columns={"GDP_USD_millions": "GDP_USD_billions"})

In [31]:
# Rebind `df` to the billions-denominated frame. transform() reads the
# GDP_USD_millions column, so run this cell once per extract.
df = transform(df=df)
df

Unnamed: 0,Country,GDP_USD_billions
0,United States,26854.60
1,China,19373.59
2,Japan,4409.74
3,Germany,4308.85
4,India,3736.88
...,...,...
186,Marshall Islands,0.29
187,Palau,0.26
188,Kiribati,0.25
189,Nauru,0.15


# Task 3: Loading information

In [32]:
def load_to_csv(df, csv_path):
    """Write `df` to `csv_path` as a CSV file using pandas' default options.

    With those defaults the frame's index is written as the first, unnamed
    column.
    """
    df.to_csv(path_or_buf=csv_path)

In [35]:
# Persist the transformed frame to the configured CSV file
load_to_csv(df=df, csv_path=csv_path)

In [36]:
def load_to_db(df, sql_connection, table_name):
    """Store `df` as table `table_name` through `sql_connection`.

    An existing table of the same name is replaced, and the frame's index is
    not written.
    """
    df.to_sql(
        name=table_name,
        con=sql_connection,
        if_exists='replace',
        index=False,
    )

In [38]:
# Persist the transformed frame to the configured SQLite table
load_to_db(df=df, sql_connection=sql_connection, table_name=table_name)

# Task 4: Querying the database table

In [39]:
def run_query(query_statement, sql_connection):
    """Echo a SQL statement, run it on `sql_connection`, and print the result.

    The statement is printed first, followed by the result rendered as a
    DataFrame. Nothing is returned.
    """
    print(query_statement)
    print(pd.read_sql(sql=query_statement, con=sql_connection))

# Task 5: Logging progress

In [None]:
def log_progress(message, log_file="./etl_project_log.txt"):
    """Append a timestamped progress message to the ETL log file.

    Each call adds one line of the form
    ``<Year>-<Mon>-<Day>-<Hour>:<Minute>:<Second> : <message>``.

    Parameters
    ----------
    message : str
        Text to record.
    log_file : str, optional
        Path of the log file; created if missing, appended to otherwise.
        Defaults to ``./etl_project_log.txt``.
    """
    # %b is the documented directive for the abbreviated month name; %h is a
    # platform-specific alias that not every C library accepts.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'  # Year-Monthname-Day-Hour:Minute:Second
    timestamp = datetime.now().strftime(timestamp_format)
    # Explicit encoding keeps the log readable regardless of the platform default
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(timestamp + ' : ' + message + '\n')