In [45]:
import pandas as pd

import missions.W1.M3.etl_project_gdp as etl_basic
import missions.W1.M3.etl_project_gdp_with_sql as etl_sqlite

In [48]:
etl_sqlite.main()

[2025-01-09-17-52-45], [START] Starting GDP ETL Process ...
[2025-01-09-17-52-45], [START] Starting GDP extraction process...
[2025-01-09-17-52-45], [START] Attempting to fetch the webpage: https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29
[2025-01-09-17-52-45], [COMPLETE] Successfully fetched the webpage: https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29
[2025-01-09-17-52-45], [START] Starting to parse the webpage: https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29 ...
[2025-01-09-17-52-45], [COMPLETE] Succesfuly finished parsing the webpage: https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29
[2025-01-09-17-52-45], [START] Saving data to missions/W1/M3/data/Countries_by_GDP.json ...
[2025-01-09-17-52-45], [COMPLETE] Data successfully saved to missions/W1/M3/data/Countries_by_GDP.json.
[2025-01-09-17-52-45], [COMPLETE] GDP extraction process completed successfully.
[2025-01-09-17-52-45], [START] Starting GDP Tran

In [49]:
df_gdp = pd.read_json('missions/W1/M3/data/Countries_by_GDP.json')
df_region = pd.read_json('missions/W1/M3/data/cultural-geo-mapper.json')

df_gdp_region = df_gdp.merge(df_region, on='country', how='left')

df_gdp_region['gdp'] = pd.to_numeric(df_gdp_region['gdp'].str.replace(",", ""), errors='coerce')  # str2int
df_gdp_region['gdp'] = (df_gdp_region['gdp'] / 1e3).round(2)  # GDP의 단위는 1B USD이어야 하고 소수점 2자리까지만 표시해 주세요.
df_gdp_region = df_gdp_region.sort_values(by='gdp', ascending=False).reset_index(drop=True)  # 해당 테이블에는 GDP가 높은 국가들이 먼저 나와야 합니다.

df_gdp_region

Unnamed: 0,country,gdp,year,type,region
0,United States,30337.16,2025.0,IMF,North America
1,China,19534.89,2025.0,IMF,East Asia
2,Germany,4921.56,2025.0,IMF,Western Europe
3,Japan,4389.33,2025.0,IMF,East Asia
4,India,4271.92,2025.0,IMF,South Asia
...,...,...,...,...,...
204,Greenland,,,IMF,North America
205,Eritrea,,,IMF,Sub-Saharan Africa
206,Zanzibar,,,IMF,Sub-Saharan Africa
207,Sint Maarten,,,IMF,North America


In [50]:
# GDP가 100B USD이상이 되는 국가만을 구해서 화면에 출력해야 합니다.
df_gdp_region[df_gdp_region['gdp'] >= 100]

Unnamed: 0,country,gdp,year,type,region
0,United States,30337.16,2025.0,IMF,North America
1,China,19534.89,2025.0,IMF,East Asia
2,Germany,4921.56,2025.0,IMF,Western Europe
3,Japan,4389.33,2025.0,IMF,East Asia
4,India,4271.92,2025.0,IMF,South Asia
...,...,...,...,...,...
67,Kenya,116.32,2024.0,IMF,Sub-Saharan Africa
68,Angola,113.29,2024.0,IMF,Sub-Saharan Africa
69,Guatemala,112.37,2024.0,IMF,Latin America
70,Oman,110.99,2024.0,IMF,Middle East


In [None]:
# 1. 'gdp' 값이 NaN인 행 제거
df_region_avg_gdp = df_gdp_region.loc[df_gdp_region['gdp'].notna()].copy()

# 2. 그룹 내 순위 계산 (Rank), Like PARTITION BY
df_region_avg_gdp['rank'] = (
    df_region_avg_gdp
    .sort_values(by=['region', 'gdp'], ascending=[True, False])
    .groupby('region')
    .cumcount() + 1
)

# 3. 그룹별 평균 GDP 계산 (Region Average GDP)
df_region_avg_gdp['region_avg_gdp'] = (
    df_region_avg_gdp.groupby('region')['gdp'].transform('mean').round(2)
)

# 4. Rank <= 5 필터링
df_region_avg_gdp = df_region_avg_gdp.loc[df_region_avg_gdp['rank'] <= 5].copy()

# 5. Region_Avg_GDP 기준 정렬
df_region_avg_gdp = df_region_avg_gdp.sort_values(
    by=['region_avg_gdp', 'rank'], ascending=[False, True]
).reset_index(drop=True)

# 6. 필요한 열만 출력
df_region_avg_gdp[["country", "gdp", "region", "region_avg_gdp"]]


Unnamed: 0,country,gdp,region,region_avg_gdp
0,United States,30337.16,North America,8622.34
1,Canada,2330.31,North America,8622.34
2,Mexico,1817.82,North America,8622.34
3,Aruba,4.07,North America,8622.34
4,China,19534.89,East Asia,3883.61
5,Japan,4389.33,East Asia,3883.61
6,South Korea,1947.13,East Asia,3883.61
7,Taiwan,814.44,East Asia,3883.61
8,Hong Kong,422.06,East Asia,3883.61
9,Germany,4921.56,Western Europe,1093.44


In [None]:
import sqlite3
import pandas as pd

def read_from_sqlite(db_name="missions/W1/M3/data/Countries_by_GDP.db"):
    # SQLite DB 연결
    conn = sqlite3.connect(db_name)
    
    # SQL 쿼리로 테이블 읽기
    query = "SELECT * FROM Countries_by_GDP"
    list_of_gdp = pd.read_sql_query(query, conn)
    
    # 연결 닫기
    conn.close()
    
    return list_of_gdp

def analyze_gdp(list_of_gdp):
    # GDP_USD_billion이 100 이상인 데이터 필터링
    filtered_data = list_of_gdp[list_of_gdp['GDP_USD_billion'] >= 100]
    return filtered_data

list_of_gdp = read_from_sqlite()
analyze_gdp(list_of_gdp)


In [None]:
with sqlite3.connect("missions/W1/M3/data/Countries_by_GDP.db") as conn:
    cursor = conn.cursor()

    query = '''
        SELECT *
        FROM Countries_by_GDP
        WHERE GDP_USD_billion >= 100;
    '''
    cursor.execute(query)
    rows = cursor.fetchall()
    for row in rows:
        print(row)
    conn.rollback()

    cursor.close()

In [None]:
with sqlite3.connect("missions/W1/M3/data/Countries_by_GDP.db") as conn:
    cursor = conn.cursor()

    query = '''
        SELECT
            Region,
            Country,
            GDP_USD_billion,
            AVG(GDP_USD_billion) OVER (PARTITION BY Region) AS Region_Avg_GDP
        FROM(
            SELECT
                *,
                ROW_NUMBER() OVER (PARTITION BY Region ORDER BY GDP_USD_billion DESC) AS Rank
            FROM
                Countries_by_GDP
            WHERE GDP_USD_billion is not null
            )
        WHERE RANK <= 5
        ORDER BY Region_Avg_GDP DESC
    '''
    cursor.execute(query)
    rows = cursor.fetchall()
    for row in rows:
        print(row)
    conn.rollback()

    cursor.close()