# Web scraping - przykład

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Source page: Wikipedia's list of sovereign states in Europe by nominal GDP.
URL = "https://en.wikipedia.org/wiki/List_of_sovereign_states_in_Europe_by_GDP_(nominal)"

# requests has no default timeout, so without one a stalled connection would
# block this cell forever.
r = requests.get(URL, timeout=30)
# Stop here on a 4xx/5xx answer instead of letting the following cells parse
# an error page as if it were the article.
r.raise_for_status()
r.status_code

200

In [2]:
# Parse the downloaded bytes with Python's built-in HTML parser.
soup = BeautifulSoup(r.content, "html.parser")

# Look up the first element carrying the CSS class "wikitable".
table_tag = soup.find(class_="wikitable")
# find() yields None when nothing matches; fail with a readable message here
# rather than with an AttributeError on None in a later cell.
if table_tag is None:
    raise ValueError('No element with class "wikitable" found in the downloaded page')
table_tag

<table class="wikitable sortable static-row-numbers sticky-header col1left" style="text-align:right">
<tbody><tr>
<th>Country 
</th>
<th>2024<sup class="reference" id="cite_ref-4"><a href="#cite_note-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup>
</th>
<th>2023<sup class="reference" id="cite_ref-IMFWEO_5-0"><a href="#cite_note-IMFWEO-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup>
</th>
<th>2022
</th>
<th>2021<sup class="reference" id="cite_ref-WPIMFWEO_6-0"><a href="#cite_note-WPIMFWEO-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup></th>
<th width="60">2020<sup class="reference" id="cite_ref-WPIMFWEO_6-1"><a href="#cite_note-WPIMFWEO-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup></th>
<th width="60">2019 </th>
<th width="60">2018 </th>
<th width="60">2017 </th>
<th width="60">2016 </th>
<th width="60">2015 </th>
<th width="60">2014 </th>

In [3]:
# All <tr> elements of the table; the first one is used as the caption row.
row_tags = table_tag.find_all("tr")

# Gather the whitespace-trimmed text of every <th> cell in that first row.
header = []
for th_tag in row_tags[0].find_all("th"):
    header.append(th_tag.text.strip())
print(header)

['Country', '2024[4]', '2023[5]', '2022', '2021[6]', '2020[6]', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010']


In [4]:
# One list of trimmed <td> texts per table row; the first <tr> (index 0) is
# skipped because it was already consumed as the header.
rows = [
    [td_tag.text.strip() for td_tag in row_tag.find_all("td")]
    for row_tag in row_tags[1:]
]
# Echo each parsed row on its own line.
for row in rows:
    print(row)

['Germany', '4,683.233', '4,429.840', '4,256.540', '4,230.172', '3,780.553', '3,863.344', '3,951.340', '3,664.511', '3,496.606', '3,383.091', '3,904.921', '3,753.687', '3,545.946', '3,761.142', '3,423.47']
['United Kingdom', '3,557.465', '3,332.060', '3,376.003', '3,108.416', '2,638.296', '2,743.586', '2,828.833', '2,640.067', '2,669.107', '2,897.060', '3,036.310', '2,755.356', '2,677.082', '2,635.799', '2,455.309']
['France', '3,156.325', '3,049.020', '2,936.702', '2,940.428', '2,551.451', '2,707.074', '2,780.152', '2,591.775', '2,466.152', '2,439.435', '2,856.697', '2,811.957', '2,685.311', '2,864.030', '2,647.537']
['Italy', '2,365.541', '2,186.080', '2,058.330', '2,120.232', '1,848.222', '2,001.440', '2,075.856', '1,950.703', '1,869.973', '1,833.195', '2,155.151', '2,131.159', '2,073.971', '2,278.376', '2,129.021']
['Russia', '2,158.786', '1,862.470', '2,133.092', '1,778.530', '1,464.078', '1,637.892', '1,657.290', '1,579.293', '1,282.663', '1,363.707', '2,056.583', '2,289.244', '2

In [5]:
# Combine the scraped rows with the scraped captions, then use the
# "Country" column as the row index.
df = pd.DataFrame(data=rows, columns=header).set_index(keys="Country")
df

Unnamed: 0_level_0,2024[4],2023[5],2022,2021[6],2020[6],2019,2018,2017,2016,2015,2014,2013,2012,2011,2010
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Germany,4683.233,4429.84,4256.54,4230.172,3780.553,3863.344,3951.34,3664.511,3496.606,3383.091,3904.921,3753.687,3545.946,3761.142,3423.47
United Kingdom,3557.465,3332.06,3376.003,3108.416,2638.296,2743.586,2828.833,2640.067,2669.107,2897.06,3036.31,2755.356,2677.082,2635.799,2455.309
France,3156.325,3049.02,2936.702,2940.428,2551.451,2707.074,2780.152,2591.775,2466.152,2439.435,2856.697,2811.957,2685.311,2864.03,2647.537
Italy,2365.541,2186.08,2058.33,2120.232,1848.222,2001.440,2075.856,1950.703,1869.973,1833.195,2155.151,2131.159,2073.971,2278.376,2129.021
Russia,2158.786,1862.47,2133.092,1778.530,1464.078,1637.892,1657.29,1579.293,1282.663,1363.707,2056.583,2289.244,2202.672,2044.618,1632.841
Spain,1724.379,1620.091,1446.498,1461.245,1289.784,1403.496,1431.643,1321.754,1243.016,1206.165,1380.245,1362.187,1330.995,1487.569,1427.989
Turkey,1312.932,1154.06,905.527,817.508,720.11,759.450,779.694,858.932,869.28,864.071,938.512,957.504,880.141,838.508,776.558
Netherlands,1209.072,1092.75,1013.595,1007.562,886.339,902.355,914.519,833.575,783.852,765.65,892.397,877.198,839.436,904.915,848.133
Switzerland,935.316,905.684,841.969,810.830,707.868,715.360,705.546,680.029,670.247,679.721,709.496,688.747,667.89,699.67,583.053
Poland,908.561,842.172,699.559,655.332,580.894,565.854,585.816,526.749,471.843,477.568,545.284,524.399,500.846,528.571,479.161


In [6]:
# Remove bracketed footnote markers such as "[4]" from the column labels and
# from the index labels.
# The pattern is a raw string because "\[" in an ordinary literal is an
# invalid escape sequence (it triggers a warning on current Python versions).
# "[^\]]*" stops at the first closing bracket, so only the marker itself is
# removed instead of everything between the first "[" and the last "]".
FOOTNOTE_PATTERN = r"\[[^\]]*\]"
df.columns = df.columns.str.replace(FOOTNOTE_PATTERN, "", regex=True)
df.index = df.index.str.replace(FOOTNOTE_PATTERN, "", regex=True)
df

Unnamed: 0_level_0,2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Germany,4683.233,4429.84,4256.54,4230.172,3780.553,3863.344,3951.34,3664.511,3496.606,3383.091,3904.921,3753.687,3545.946,3761.142,3423.47
United Kingdom,3557.465,3332.06,3376.003,3108.416,2638.296,2743.586,2828.833,2640.067,2669.107,2897.06,3036.31,2755.356,2677.082,2635.799,2455.309
France,3156.325,3049.02,2936.702,2940.428,2551.451,2707.074,2780.152,2591.775,2466.152,2439.435,2856.697,2811.957,2685.311,2864.03,2647.537
Italy,2365.541,2186.08,2058.33,2120.232,1848.222,2001.440,2075.856,1950.703,1869.973,1833.195,2155.151,2131.159,2073.971,2278.376,2129.021
Russia,2158.786,1862.47,2133.092,1778.530,1464.078,1637.892,1657.29,1579.293,1282.663,1363.707,2056.583,2289.244,2202.672,2044.618,1632.841
Spain,1724.379,1620.091,1446.498,1461.245,1289.784,1403.496,1431.643,1321.754,1243.016,1206.165,1380.245,1362.187,1330.995,1487.569,1427.989
Turkey,1312.932,1154.06,905.527,817.508,720.11,759.450,779.694,858.932,869.28,864.071,938.512,957.504,880.141,838.508,776.558
Netherlands,1209.072,1092.75,1013.595,1007.562,886.339,902.355,914.519,833.575,783.852,765.65,892.397,877.198,839.436,904.915,848.133
Switzerland,935.316,905.684,841.969,810.830,707.868,715.360,705.546,680.029,670.247,679.721,709.496,688.747,667.89,699.67,583.053
Poland,908.561,842.172,699.559,655.332,580.894,565.854,585.816,526.749,471.843,477.568,545.284,524.399,500.846,528.571,479.161


In [7]:
# Turn the placeholder strings "-", "" and "N/A" into pd.NA, one marker at a
# time and in that order.
for marker in ("-", "", "N/A"):
    df = df.replace(marker, pd.NA)
df

Unnamed: 0_level_0,2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Germany,4683.233,4429.84,4256.54,4230.172,3780.553,3863.344,3951.34,3664.511,3496.606,3383.091,3904.921,3753.687,3545.946,3761.142,3423.47
United Kingdom,3557.465,3332.06,3376.003,3108.416,2638.296,2743.586,2828.833,2640.067,2669.107,2897.06,3036.31,2755.356,2677.082,2635.799,2455.309
France,3156.325,3049.02,2936.702,2940.428,2551.451,2707.074,2780.152,2591.775,2466.152,2439.435,2856.697,2811.957,2685.311,2864.03,2647.537
Italy,2365.541,2186.08,2058.33,2120.232,1848.222,2001.44,2075.856,1950.703,1869.973,1833.195,2155.151,2131.159,2073.971,2278.376,2129.021
Russia,2158.786,1862.47,2133.092,1778.53,1464.078,1637.892,1657.29,1579.293,1282.663,1363.707,2056.583,2289.244,2202.672,2044.618,1632.841
Spain,1724.379,1620.091,1446.498,1461.245,1289.784,1403.496,1431.643,1321.754,1243.016,1206.165,1380.245,1362.187,1330.995,1487.569,1427.989
Turkey,1312.932,1154.06,905.527,817.508,720.11,759.45,779.694,858.932,869.28,864.071,938.512,957.504,880.141,838.508,776.558
Netherlands,1209.072,1092.75,1013.595,1007.562,886.339,902.355,914.519,833.575,783.852,765.65,892.397,877.198,839.436,904.915,848.133
Switzerland,935.316,905.684,841.969,810.83,707.868,715.36,705.546,680.029,670.247,679.721,709.496,688.747,667.89,699.67,583.053
Poland,908.561,842.172,699.559,655.332,580.894,565.854,585.816,526.749,471.843,477.568,545.284,524.399,500.846,528.571,479.161


In [8]:
def _drop_commas(text):
    """Return ``text`` with every comma removed."""
    return text.replace(",", "")


# Apply the helper to each cell; na_action="ignore" skips missing values so
# the helper is only called on actual cell contents.
df = df.map(_drop_commas, na_action="ignore")
df.head()

Unnamed: 0_level_0,2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Germany,4683.233,4429.84,4256.54,4230.172,3780.553,3863.344,3951.34,3664.511,3496.606,3383.091,3904.921,3753.687,3545.946,3761.142,3423.47
United Kingdom,3557.465,3332.06,3376.003,3108.416,2638.296,2743.586,2828.833,2640.067,2669.107,2897.06,3036.31,2755.356,2677.082,2635.799,2455.309
France,3156.325,3049.02,2936.702,2940.428,2551.451,2707.074,2780.152,2591.775,2466.152,2439.435,2856.697,2811.957,2685.311,2864.03,2647.537
Italy,2365.541,2186.08,2058.33,2120.232,1848.222,2001.44,2075.856,1950.703,1869.973,1833.195,2155.151,2131.159,2073.971,2278.376,2129.021
Russia,2158.786,1862.47,2133.092,1778.53,1464.078,1637.892,1657.29,1579.293,1282.663,1363.707,2056.583,2289.244,2202.672,2044.618,1632.841


In [9]:
# Run pd.to_numeric over the frame column by column and keep the result
# under the same name.
df = df.apply(pd.to_numeric, axis="index")
df.head()

Unnamed: 0_level_0,2024,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Germany,4683.233,4429.84,4256.54,4230.172,3780.553,3863.344,3951.34,3664.511,3496.606,3383.091,3904.921,3753.687,3545.946,3761.142,3423.47
United Kingdom,3557.465,3332.06,3376.003,3108.416,2638.296,2743.586,2828.833,2640.067,2669.107,2897.06,3036.31,2755.356,2677.082,2635.799,2455.309
France,3156.325,3049.02,2936.702,2940.428,2551.451,2707.074,2780.152,2591.775,2466.152,2439.435,2856.697,2811.957,2685.311,2864.03,2647.537
Italy,2365.541,2186.08,2058.33,2120.232,1848.222,2001.44,2075.856,1950.703,1869.973,1833.195,2155.151,2131.159,2073.971,2278.376,2129.021
Russia,2158.786,1862.47,2133.092,1778.53,1464.078,1637.892,1657.29,1579.293,1282.663,1363.707,2056.583,2289.244,2202.672,2044.618,1632.841


## Zadania
Zob.: https://www.scrapethissite.com/