In [8]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import sqlite3
import pycountry_convert as pc  # 반드시 설치 필요

In [9]:
# URL에서 HTML 문서 가져오기
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"
response = requests.get(url)

# BeautifulSoup 인스턴스 생성
soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
# 특정 클래스를 가진 테이블 가져오기
table = soup.find('table', {
    'class': 'wikitable'
})

In [11]:
table

<table class="wikitable sortable sticky-header-multi static-row-numbers" style="text-align:right">
<caption>GDP (million US$) by country
</caption>
<tbody><tr class="static-row-header" style="text-align:center;vertical-align:bottom;">
<th rowspan="2">Country/Territory
</th>
<th colspan="2"><a href="/wiki/International_Monetary_Fund" title="International Monetary Fund">IMF</a><sup class="reference" id="cite_ref-GDP_IMF_2-2"><a href="#cite_note-GDP_IMF-2"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference" id="cite_ref-15"><a href="#cite_note-15"><span class="cite-bracket">[</span>13<span class="cite-bracket">]</span></a></sup>
</th>
<th colspan="2"><a href="/wiki/World_Bank" title="World Bank">World Bank</a><sup class="reference" id="cite_ref-16"><a href="#cite_note-16"><span class="cite-bracket">[</span>14<span class="cite-bracket">]</span></a></sup>
</th>
<th colspan="2"><a href="/wiki/United_Nations" title="United Nations">United Na

In [12]:
# 현재 페이지에서 table 태그 모두 선택하기
#table1 = soup.select('table')

In [13]:
#for i in range(7):
#    print(i)
    #print(table1[i])

In [14]:
#table = table1[2]
#table

In [15]:
# 테이블 헤더 추출
#headers = [th.text.strip() for th in table.find_all('th')]
#headers = headers[:2]

headers = ['Country','GDP']
print("Headers:", headers)

Headers: ['Country', 'GDP']


In [16]:
# 테이블 행과 열 추출
rows = []
for tr in table.find_all('tr')[3:]:  # 첫 번째 행은 헤더이므로 제외
    cells = [td.text.strip() for td in tr.find_all('td')]
    #print(cells[1])
    if cells:  # 빈 행 건너뛰기
        rows.append(cells[0:2])

# 출력 확인

for row in rows:  # 상위 5개 행 출력
    print(row)

['United States', '30,337,162']
['China', '19,534,894']
['Germany', '4,921,563']
['Japan', '4,389,326']
['India', '4,271,922']
['United Kingdom', '3,730,261']
['France', '3,283,429']
['Italy', '2,459,597']
['Canada', '2,330,308']
['Brazil', '2,307,162']
['Russia', '2,195,708']
['South Korea', '1,947,133']
['Australia', '1,881,140']
['Spain', '1,827,576']
['Mexico', '1,817,818']
['Indonesia', '1,492,618']
['Turkey', '1,455,413']
['Netherlands', '1,272,960']
['Saudi Arabia', '1,136,580']
['Switzerland', '999,604']
['Poland', '915,451']
['Taiwan', '814,438']
['Belgium', '689,364']
['Sweden', '638,780']
['Argentina', '604,201']
['Ireland', '587,225']
['United Arab Emirates', '568,567']
['Singapore', '561,725']
['Austria', '559,218']
['Israel', '550,905']
['Thailand', '545,341']
['Vietnam', '506,426']
['Norway', '503,466']
['Malaysia', '488,250']
['Philippines', '507,670']
['Iran', '463,747']
['Bangladesh', '455,856']
['Czech Republic', '452,233']
['Denmark', '431,228']
['Hong Kong', '422,0

In [17]:
import json


# 데이터를 JSON 형태로 변환
data = [dict(zip(headers, row)) for row in rows]

# JSON 파일로 저장
with open('Countries_by_GDP.json', 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=4)

In [18]:
# Pandas DataFrame으로 변환
df = pd.DataFrame(rows, columns=headers)
#df = df[['Country/Territory','IMF[1][13]']][1:]

# DataFrame 출력
#df.columns = ['Country','GDP']

df['GDP'] = df['GDP'].str.replace(',', '')
df['GDP'] = df['GDP'].replace('—', '0')
df['GDP'] = df['GDP'].astype(int)
df['GDP'] = df['GDP'].replace(0, np.nan)


df

Unnamed: 0,Country,GDP
0,United States,30337162.0
1,China,19534894.0
2,Germany,4921563.0
3,Japan,4389326.0
4,India,4271922.0
...,...,...
204,Kiribati,311.0
205,Palau,308.0
206,Marshall Islands,305.0
207,Nauru,161.0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  209 non-null    object 
 1   GDP      194 non-null    float64
dtypes: float64(1), object(1)
memory usage: 3.4+ KB


In [20]:
df = df.sort_values(by=['GDP'], ascending =[False])
df.tail(20)

Unnamed: 0,Country,GDP
204,Kiribati,311.0
205,Palau,308.0
206,Marshall Islands,305.0
207,Nauru,161.0
208,Tuvalu,66.0
61,Cuba,
133,Syria,
144,North Korea,
156,New Caledonia,
157,Monaco,


In [21]:
#DR Congo, Kosovo, East Timor, Zanzibar, Sint Maarten

In [22]:
df['GDP'] = round(df['GDP'] * 0.001,2)
df

Unnamed: 0,Country,GDP
0,United States,30337.16
1,China,19534.89
2,Germany,4921.56
3,Japan,4389.33
4,India,4271.92
...,...,...
178,Greenland,
184,Eritrea,
185,Zanzibar,
192,Sint Maarten,


In [23]:
df['Region']=np.nan
df

Unnamed: 0,Country,GDP,Region
0,United States,30337.16,
1,China,19534.89,
2,Germany,4921.56,
3,Japan,4389.33,
4,India,4271.92,
...,...,...,...
178,Greenland,,
184,Eritrea,,
185,Zanzibar,,
192,Sint Maarten,,


In [25]:
# KeyError 발생 횟수 추적
num = 0

# 국가 이름 입력하면 대륙 알려주는 함수 정의
def country_to_continent(country_name):
    global num
    try:
        # 국가 이름을 대륙 이름으로 변환
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        return country_continent_name
    except KeyError:
        # KeyError 발생 시 num 증가 및 NaN 반환
        num += 1
        return np.nan

# 벡터화 처리
df['Region'] = df['Country'].apply(country_to_continent)

# 결과 출력
print(f"KeyError 발생 횟수: {num}")


KeyError 발생 횟수: 5


In [26]:
df

Unnamed: 0,Country,GDP,Region
0,United States,30337.16,North America
1,China,19534.89,Asia
2,Germany,4921.56,Europe
3,Japan,4389.33,Asia
4,India,4271.92,Asia
...,...,...,...
178,Greenland,,North America
184,Eritrea,,Africa
185,Zanzibar,,
192,Sint Maarten,,


In [27]:
# GDP 기준으로 내림차순 정렬
sorted_df = df.sort_values(by="GDP", ascending=False)

# 각 Region별 상위 5개 국가 선택
top5_per_region = (
    sorted_df.groupby("Region", group_keys=False)
             .apply(lambda x: x.nlargest(5, "GDP"))
)

# Region별 상위 5개 국가의 GDP 평균 계산
region_top5_avg_gdp = top5_per_region.groupby("Region")["GDP"].mean()

# 결과 출력
print(region_top5_avg_gdp)

Region
Africa            298.422
Asia             6327.178
Europe           3318.112
North America    6946.500
Oceania           436.658
South America     797.566
Name: GDP, dtype: float64


  .apply(lambda x: x.nlargest(5, "GDP"))


In [21]:
conn = sqlite3.connect('gdp.db')
df.to_sql('Countries_by_GDP', conn,if_exists='replace',index =False)

conn.close()
   

In [22]:
conn = sqlite3.connect('gdp.db')

pd.read_sql("""SELECT Region, ROUND(AVG(GDP), 2) AS Avg_Top5_GDP
                    FROM ( SELECT Region, GDP, ROW_NUMBER() OVER (PARTITION BY Region ORDER BY GDP DESC) AS rank
                    FROM Countries_by_GDP
                    WHERE REGION IS NOT NULL
                )
                WHERE rank <= 5
                GROUP BY Region; """,conn)

#conn.close()

Unnamed: 0,Region,Avg_Top5_GDP
0,Africa,298.42
1,Asia,6327.18
2,Europe,3318.11
3,North America,6946.5
4,Oceania,436.66
5,South America,797.57
