### 라이브러리 임포트

In [1]:
import pandas as pd
from utils.converter import numstr2num

### 데이터 로드

In [2]:
# Read the country-level GDP records from the local JSON file into a
# DataFrame; the trailing bare name makes the notebook render the table.
df = pd.read_json(path_or_buf="./data/Countries_by_GDP.json")
df

Unnamed: 0,country,gdp,year,type
0,United States,30337162,2025.0,IMF
1,China,19534894,2025.0,IMF
2,Germany,4921563,2025.0,IMF
3,Japan,4389326,2025.0,IMF
4,India,4271922,2025.0,IMF
...,...,...,...,...
204,Kiribati,311,2024.0,IMF
205,Palau,308,2024.0,IMF
206,Marshall Islands,305,2024.0,IMF
207,Nauru,161,2024.0,IMF


In [3]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   country  209 non-null    object 
 1   gdp      194 non-null    object 
 2   year     194 non-null    float64
 3   type     209 non-null    object 
dtypes: float64(1), object(3)
memory usage: 6.7+ KB


### 분석을 위한 Transform (string -> float)

In [4]:
# Run every raw 'gdp' entry through the project helper numstr2num
# (string -> float, per the section heading) and write the converted
# values back into the same column.
gdp_converted = df['gdp'].apply(numstr2num)
df['gdp'] = gdp_converted
df

Unnamed: 0,country,gdp,year,type
0,United States,30337162.0,2025.0,IMF
1,China,19534894.0,2025.0,IMF
2,Germany,4921563.0,2025.0,IMF
3,Japan,4389326.0,2025.0,IMF
4,India,4271922.0,2025.0,IMF
...,...,...,...,...
204,Kiribati,311.0,2024.0,IMF
205,Palau,308.0,2024.0,IMF
206,Marshall Islands,305.0,2024.0,IMF
207,Nauru,161.0,2024.0,IMF


In [5]:
def _scale_gdp(value):
    """Return *value* divided by 1000 and rounded to two decimals.

    Missing entries (None, NaN, pd.NA) are handed back unchanged so they
    stay missing in the column instead of being pushed through arithmetic.
    """
    # pd.isna covers None as well as NaN; an `is not None` test alone never
    # matches the NaN values that a float column uses for missing data.
    if pd.isna(value):
        return value
    # Built-in round is kept deliberately so the displayed values stay
    # identical to what this cell produced before.
    return round(value / 1000, 2)


# Rescale GDP by a factor of 1/1000.
# NOTE(review): presumably millions -> billions of USD, judging by the later
# `over100b` filter at >= 100 -- confirm against the data source.
df['gdp'] = df['gdp'].apply(_scale_gdp)
df

Unnamed: 0,country,gdp,year,type
0,United States,30337.16,2025.0,IMF
1,China,19534.89,2025.0,IMF
2,Germany,4921.56,2025.0,IMF
3,Japan,4389.33,2025.0,IMF
4,India,4271.92,2025.0,IMF
...,...,...,...,...
204,Kiribati,0.31,2024.0,IMF
205,Palau,0.31,2024.0,IMF
206,Marshall Islands,0.30,2024.0,IMF
207,Nauru,0.16,2024.0,IMF


### Region 추가 

In [6]:
# Project helper that supplies the region for a given country name.
from utils.converter import country2region
# Derive a 'region' column by mapping every country name through country2region.
# NOTE(review): if country2region is a dict, countries absent from it become
# NaN in 'region' -- confirm the mapping covers every country in the data.
df['region'] = df['country'].map(country2region)
df

Unnamed: 0,country,gdp,year,type,region
0,United States,30337.16,2025.0,IMF,North America
1,China,19534.89,2025.0,IMF,East Asia
2,Germany,4921.56,2025.0,IMF,Western Europe
3,Japan,4389.33,2025.0,IMF,East Asia
4,India,4271.92,2025.0,IMF,South Asia
...,...,...,...,...,...
204,Kiribati,0.31,2024.0,IMF,Oceania
205,Palau,0.31,2024.0,IMF,Oceania
206,Marshall Islands,0.30,2024.0,IMF,Oceania
207,Nauru,0.16,2024.0,IMF,Oceania


In [7]:
# Keep only the rows whose rescaled GDP is at least 100. Rows with a
# missing GDP compare as False and are therefore left out as well.
gdp_at_least_100 = df['gdp'].ge(100)
over100b = df.loc[gdp_at_least_100]
over100b

Unnamed: 0,country,gdp,year,type,region
0,United States,30337.16,2025.0,IMF,North America
1,China,19534.89,2025.0,IMF,East Asia
2,Germany,4921.56,2025.0,IMF,Western Europe
3,Japan,4389.33,2025.0,IMF,East Asia
4,India,4271.92,2025.0,IMF,South Asia
...,...,...,...,...,...
68,Uzbekistan,112.65,2024.0,IMF,Central Asia
69,Guatemala,112.37,2024.0,IMF,Latin America
70,Oman,109.99,2024.0,IMF,Middle East
71,Bulgaria,108.42,2024.0,IMF,Eastern Europe


In [42]:
# Goal: for every region, list its (up to) five largest economies among the
# countries in over100b, together with the mean GDP of those countries.
# NOTE: a shorter variant -- df.groupby('region') followed by
# nlargest(5).mean() on 'gdp' -- yields only the averages, not the names.

# Rank all rows by GDP, highest first, then keep the first five per region.
ranked_by_gdp = over100b.sort_values('gdp', ascending=False)
top_5_by_group = ranked_by_gdp.groupby('region').head(5)

# Both summaries below are taken over the same per-region grouping.
top_5_grouped = top_5_by_group.groupby('region')

# Country names of each region, collected into one list per region.
countries_by_group = top_5_grouped['country'].agg(list).reset_index()

# Mean GDP of each region's selected countries (a Series indexed by region).
means_by_group = top_5_grouped['gdp'].mean()

# Join the name lists and the means on the shared 'region' key.
result = countries_by_group.merge(means_by_group, on='region')
result


Unnamed: 0,region,country,gdp
0,Central Asia,"[Kazakhstan, Uzbekistan]",209.64
1,East Asia,"[China, Japan, South Korea, Taiwan, Hong Kong]",5421.57
2,Eastern Europe,"[Russia, Turkey, Poland, Romania, Czech Republic]",1066.6
3,Latin America,"[Brazil, Argentina, Colombia, Chile, Peru]",791.566
4,Middle East,"[Saudi Arabia, United Arab Emirates, Israel, I...",598.134
5,North Africa,"[Egypt, Algeria, Morocco]",270.97
6,North America,"[United States, Canada, Mexico]",11495.096667
7,Oceania,"[Australia, New Zealand]",1072.03
8,South Asia,"[India, Bangladesh, Pakistan]",1699.46
9,Southeast Asia,"[Indonesia, Singapore, Thailand, Malaysia, Phi...",711.892
