In [12]:
# Logging helper
def log(describe):
    """Append a timestamped entry to ``etl_project_log.txt``.

    The file is opened in append mode relative to the current working
    directory. Each entry is the current local time, a space, the message,
    a space and then an empty line.

    Args:
        describe: Message to record. It is formatted by ``print``, so any
            object with a string representation is accepted.
    """
    import datetime

    # The context manager closes the handle even if writing the entry raises.
    with open('etl_project_log.txt', 'a') as log_txt:
        print(datetime.datetime.now(), describe, '\n', file=log_txt)

# GDP data: Extract
def extract_gdp():
    """Scrape the Wikipedia nominal-GDP table and store it as raw JSON.

    Downloads the "List of countries by GDP (nominal)" page, parses the first
    ``<table class="wikitable">`` element with pandas and writes the table,
    unmodified, to ``raw_data_gdp.json`` in the current working directory.

    Returns:
        str: The path of the written file, ``'raw_data_gdp.json'``.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with a 4xx/5xx status.
        ValueError: If the page contains no ``wikitable`` table.
    """
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from io import StringIO

    log("Extract start")

    url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"

    # `.text` always reads the complete body, so `stream=True` had no effect
    # here. A timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(url, timeout=30)
    # Fail on an error page instead of trying to parse it as the GDP table.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Pick the first table carrying the `wikitable` class.
    gdp_html = soup.find("table", {"class": 'wikitable'})
    if gdp_html is None:
        raise ValueError(f"No table with class 'wikitable' found at {url}")

    df_raw = pd.read_html(StringIO(str(gdp_html)))[0]
    df_raw.to_json('raw_data_gdp.json')
    log("Extract finish")

    return 'raw_data_gdp.json'

# Reduce the raw table to the country / GDP pair
def transform1_gdp(json):
    """Load the raw JSON table and keep only its first two columns.

    Per the original author's note, the first two columns of the scraped
    table are the country name and the IMF GDP figure. The row labelled 0 is
    dropped and the two remaining columns are renamed.

    Args:
        json: Path of the JSON file to read with ``pandas.read_json``.

    Returns:
        pandas.DataFrame: Frame with the columns ``country`` and ``gdp``.
    """
    import pandas as pd

    log("Transform1 start")

    raw_table = pd.read_json(json)

    # Positional selection: columns 0 and 1, then discard the row labelled 0.
    first_two_columns = raw_table.iloc[:, [0, 1]]
    df_gdp = first_two_columns.drop(0)
    df_gdp.columns = ['country', 'gdp']

    log("Transform1 finish")

    return df_gdp

# Rescale the GDP column to billions of USD
def transform2_gdp(df_gdp):
    """Convert the ``gdp`` column to floats, divide by 1000 and round to 2 places.

    Values equal to the em-dash placeholder ``'—'`` become NaN; every other
    value is passed through ``float``. The original comment describes the
    result as billions of USD, which presumably means the input is in
    millions (not verifiable from this function alone).

    The conversion runs on a copy, so the caller's frame is left untouched
    and running the step again on the same input does not rescale it twice.

    Args:
        df_gdp: DataFrame with a ``gdp`` column.

    Returns:
        pandas.DataFrame: A new frame whose ``gdp`` column is float.

    Raises:
        ValueError: If a value other than ``'—'`` cannot be parsed by ``float``.
    """
    log("Transform2 start")

    converted = df_gdp.copy()
    parsed = converted['gdp'].apply(lambda x: float(x) if x != '—' else float('nan'))
    # Force a float dtype so division and rounding also work on an empty column.
    converted['gdp'] = (parsed.astype(float) / 1000).round(2)

    log("Transform2 finish")
    return converted

# Order countries from highest to lowest GDP
def transform3_gdp(df_gdp):
    """Return the rows of ``df_gdp`` ordered by ``gdp``, largest first.

    Args:
        df_gdp: DataFrame with a ``gdp`` column.

    Returns:
        pandas.DataFrame: The sorted frame produced by ``sort_values``; the
        original index labels are kept.
    """
    log("Transform3 start")

    ranked = df_gdp.sort_values(by='gdp', ascending=False)

    log("Transform3 finish")
    return ranked

# Add a continent code for every country
def transform4_gdp(df_gdp):
    """Add a ``continent`` column derived from the ``country`` column.

    Each country name is resolved with ``pycountry_convert``. Names that the
    package does not recognise are looked up in a small hand-written table.
    The mapping is applied to a copy, so the caller's frame is not modified.

    Args:
        df_gdp: DataFrame with a ``country`` column.

    Returns:
        pandas.DataFrame: A new frame with the extra ``continent`` column
        holding the continent codes returned by ``pycountry_convert`` or
        taken from the fallback table.

    Raises:
        KeyError: If a country is unknown to both ``pycountry_convert`` and
            the fallback table.
    """
    # Imported here, like the other functions' dependencies, so this does not
    # depend on a separate notebook cell having been executed first.
    import pycountry_convert as pc

    log("Transform4 start")

    # Names pycountry-convert does not recognise, mapped by hand.
    # NOTE(review): 'Sint Maarten' is mapped to 'EU' as in the original code;
    # confirm that is the intended region.
    fallback_continent = {
        'DR Congo': 'AF',
        'Kosovo': 'EU',
        'Sint Maarten': 'EU',
        'Zanzibar': 'AF',
        'East Timor': 'AS',
    }

    def country_to_continent(x):
        """Return the continent code for country name ``x``."""
        try:
            return pc.country_alpha2_to_continent_code(pc.country_name_to_country_alpha2(x))
        except KeyError:
            # Only "unknown name/code" falls through to the manual table;
            # any other failure is a real error and must stay visible.
            if x in fallback_continent:
                return fallback_continent[x]
            raise KeyError(
                f"No continent mapping for country {x!r}; add it to the fallback table"
            ) from None

    result = df_gdp.copy()
    result['continent'] = result['country'].apply(country_to_continent)
    log("Transform4 finish")

    return result
    
# GDP data: Load
def load_gdp(df_gdp):
    """Write the transformed frame to ``gdp_load.json``.

    Args:
        df_gdp: DataFrame to serialise with ``DataFrame.to_json``.

    Returns:
        str: The path of the written file, ``'gdp_load.json'``.
    """
    output_path = 'gdp_load.json'

    log("Load start")
    # Saving the JSON file is what this pipeline treats as its "load" step.
    df_gdp.to_json(output_path)
    log("Load finish")

    return output_path

# Report on the loaded GDP data
def display(json):
    """Print two summaries of the loaded GDP file.

    First prints every row whose ``gdp`` is at least 100, then prints, per
    ``continent``, the mean ``gdp`` of its five largest countries, ordered
    from the highest mean to the lowest.

    NOTE(review): this name shadows IPython's built-in ``display`` inside the
    notebook; it is kept because other cells call it under this name.

    Args:
        json: Path of a JSON file holding ``country``, ``gdp`` and
            ``continent`` columns.
    """
    # Imported locally, like the other functions' dependencies, so the report
    # does not depend on another notebook cell having been executed first.
    import pandas as pd

    df_gdp = pd.read_json(json)

    # Rows with a GDP of at least 100 (billions of USD, per the pipeline's comments).
    df_100B = df_gdp[df_gdp['gdp'] >= 100]
    print(df_100B)

    # Rank explicitly instead of trusting the row order stored in the file.
    # mergesort is stable, so ties keep their file order.
    ranked = df_gdp.sort_values('gdp', ascending=False, kind='mergesort')
    top5_per_continent = ranked.groupby('continent').head(5)

    # Select the column explicitly: the first positional argument of
    # GroupBy.mean is `numeric_only`, not a column name.
    df_region_top5 = (
        top5_per_continent
        .groupby('continent')[['gdp']]
        .mean()
        .sort_values('gdp', ascending=False)
    )
    print(df_region_top5)

# Complete GDP ETL run
def ETL_gdp():
    """Run extract, the four transforms, load and the report, in that order.

    Each stage receives the value returned by the previous one; the final
    report is produced by ``display`` from the file written by ``load_gdp``.
    """
    raw_path = extract_gdp()

    frame = transform1_gdp(raw_path)
    frame = transform2_gdp(frame)
    frame = transform3_gdp(frame)
    frame = transform4_gdp(frame)

    loaded_path = load_gdp(frame)
    display(loaded_path)

In [13]:
# Run the whole pipeline: extract, transform, load, then print the report.
ETL_gdp()

          country       gdp continent
1   United States  30337.16        NA
2           China  19534.89        AS
3         Germany   4921.56        EU
4           Japan   4389.33        AS
5           India   4271.92        AS
..            ...       ...       ...
69          Kenya    116.32        AF
70         Angola    113.29        AF
71      Guatemala    112.37        NA
72           Oman    110.99        AS
73      Venezuela    106.33        SA

[72 rows x 3 columns]
                gdp
continent          
NA         6946.500
AS         6327.178
EU         3318.112
SA          797.566
OC          436.658
AF          298.422


In [None]:
# Run the pipeline one stage at a time so every intermediate frame
# (df_gdp1 .. df_gdp4) stays available for inspection in later cells.
# The report step (`display`) is not called here.
json = extract_gdp()
df_gdp1 = transform1_gdp(json)
df_gdp2 = transform2_gdp(df_gdp1)
df_gdp3 = transform3_gdp(df_gdp2)
df_gdp4 = transform4_gdp(df_gdp3)
load_gdp(df_gdp4)

In [None]:
# Re-run only the continent-mapping step on the sorted frame `df_gdp3`,
# which must already exist from the step-by-step cell.
df4 = transform4_gdp(df_gdp3)

In [None]:
# Show the frame returned by transform4_gdp.
df4

In [4]:
import pycountry_convert as pc

In [10]:
import pandas as pd