# Scraping Wikipedia for World Coutnry Data

In [1]:
# Importing Cleaning Packages
import re
import pandas as pd

# Importing Data Viz Packages
import seaborn as sns
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Capturing World Population Data by Country

In [22]:
wikipedia = "https://en.wikipedia.org/wiki/" # Wikipedia Root URL
population = "List_of_countries_and_dependencies_by_population" # World Population Sub-URL

tables = pd.read_html(wikipedia + population)

population_df = tables[0]  # Index of the population data table

# Loop through each cell and remove brackets and their inner characters
for col in population_df.columns:
    population_df[col] = population_df[col].apply(lambda x: re.sub(r'\[.*?\]', '', str(x)))

# Drop Unecessary Columns

# Drop the columns
population_df = population_df.drop(columns=['Unnamed: 0', 'Unnamed: 6'])

# Dropping unnecessary columns
population_df = population_df.drop(columns=['Date', 'Source (official or from the United Nations)'])

# Rename Columns
population_df = population_df.rename(columns={'% of world': '% of World Population'})

# Display DataFrame
print(population_df)

# Save DataFrame to CSV
population_df.to_csv('population.csv', index=False)

                                Location  Population % of World Population
0                                  World  8098329000                  100%
1                                  China  1409670000                 17.4%
2                                  India  1392329000                 17.2%
3                          United States   335893238                  4.1%
4                              Indonesia   279118866                  3.4%
..                                   ...         ...                   ...
236                                 Niue        1689                    0%
237                         Tokelau (NZ)        1647                    0%
238                         Vatican City         764                    0%
239  Cocos (Keeling) Islands (Australia)         593                    0%
240                Pitcairn Islands (UK)          47                    0%

[241 rows x 3 columns]


### Capturing World Nominal GDP Data by Country

In [23]:
GDP_nominal = "List_of_countries_by_GDP_(nominal)" # World GDP Nominal Sub-URL

tables = pd.read_html(wikipedia + GDP_nominal)

GDP_nominal_df = tables[2]  # Index of the nominal GDP data table

# Loop through each cell and remove brackets and their inner characters
for col in GDP_nominal_df.columns:
    GDP_nominal_df[col] = GDP_nominal_df[col].apply(lambda x: re.sub(r'\[.*?\]', '', str(x)))

# Drop unnecessary columns (Only Using IMF Data)
GDP_nominal_df = GDP_nominal_df.drop(columns=['World Bank[14]', 'United Nations[15]'])

# Remove the first level of the MultiIndex for columns
GDP_nominal_df.columns = GDP_nominal_df.columns.droplevel(0)

# Drop year column now that level has been reduced
GDP_nominal_df = GDP_nominal_df.drop(columns=['Year'])

# Rename Columns
GDP_nominal_df = GDP_nominal_df.rename(columns={'Country/Territory': 'Country', 'UN region': 'Region', 'Forecast':'IMF Nominal Forecast (millions)'})

# Replace World Region With String "N/A"
GDP_nominal_df.loc[0, 'Region'] = "N/A"

# Display DataFrame
print(GDP_nominal_df)

# Save DataFrame to CSV
GDP_nominal_df.to_csv('nominal.csv', index=False)

           Country    Region IMF Nominal Forecast (millions)
0            World       N/A                       104476432
1    United States  Americas                        26949643
2            China      Asia                        17700899
3          Germany    Europe                         4429838
4            Japan      Asia                         4230862
..             ...       ...                             ...
208          Palau   Oceania                             267
209       Kiribati   Oceania                             246
210          Nauru   Oceania                             150
211     Montserrat  Americas                               —
212         Tuvalu   Oceania                              63

[213 rows x 3 columns]


  GDP_nominal_df = GDP_nominal_df.drop(columns=['World Bank[14]', 'United Nations[15]'])


### Capturing World PPP GDP Data by Country

In [42]:
GDP_ppp = "List_of_countries_by_GDP_(PPP)" # World GDP PPP Sub-URL

tables = pd.read_html(wikipedia + GDP_ppp)

GDP_ppp_df = tables[1]  # Index of the PPP GDP data table

# Loop through each cell and remove brackets and their inner characters
for col in GDP_ppp_df.columns:
    GDP_ppp_df[col] = GDP_ppp_df[col].apply(lambda x: re.sub(r'\[.*?\]', '', str(x)))

# Drop unnecessary columns (Only Using IMF Data)
GDP_ppp_df = GDP_ppp_df.drop(columns=['World Bank[6]', 'CIA[7][8][9]'])

# Remove the first level of the MultiIndex for columns
GDP_ppp_df.columns = GDP_ppp_df.columns.droplevel(0)


print(GDP_ppp_df.columns)
# Drop year column now that level has been reduced
GDP_ppp_df = GDP_ppp_df.drop(columns=['Year', 'UN region'])

# Rename Columns
GDP_ppp_df = GDP_ppp_df.rename(columns={'Country (or territory)': 'Country', 'Forecast':'IMF PPP Forecast (millions)'})

# Display DataFrame
print(GDP_ppp_df)

# Save DataFrame to CSV
GDP_ppp_df.to_csv('ppp.csv', index=False)

Index(['Country (or territory)', 'UN region', 'Forecast', 'Year'], dtype='object')
                                          Country IMF PPP Forecast (millions)
0                                           World                   183950000
1                                           China                    35042689
2                                   United States                    27966553
3                                           India                    14261176
4                                           Japan                     6710984
..                                            ...                         ...
224                                        Tuvalu                          63
225                             Wallis and Futuna                           —
226  Saint Helena, Ascension and Tristan da Cunha                           —
227                                          Niue                           —
228                                       Tokelau          

  GDP_ppp_df = GDP_ppp_df.drop(columns=['World Bank[6]', 'CIA[7][8][9]'])


### Capturing World Size Data by Country

In [44]:
size = "List_of_countries_and_dependencies_by_area" # World Country Size Sub-URL

tables = pd.read_html(wikipedia + size)

size_df = tables[1]  # Index of the size data table

# Loop through each cell and remove brackets and their inner characters
for col in size_df.columns:
    size_df[col] = size_df[col].apply(lambda x: re.sub(r'\[.*?\]', '', str(x)))

# Drop unnecessary columns
size_df = size_df.drop(columns=['Unnamed: 0', 'Unnamed: 6', '% water'])

# Rename Columns
size_df = size_df.rename(columns={'Country / dependency': 'Country', 'Total in km2 (mi2)':'Total in km^2', 'Land in km2 (mi2)': 'Land in km^2', 'Water in km2 (mi2)': 'Water in km^2'})

print(size_df)
size_df.to_csv('size.csv', index=False)

                                     Country              Total in km^2  \
0                                      Earth  510,072,000 (196,940,000)   
1                                     Russia     17,098,246 (6,601,667)   
2                                 Antarctica     14,200,000 (5,480,000)   
3                                     Canada      9,984,670 (3,855,100)   
4                                      China      9,596,960 (3,705,410)   
..                                       ...                        ...   
259  Ashmore and Cartier Islands (Australia)                  5.0 (1.9)   
260            Coral Sea Islands (Australia)                  3.0 (1.2)   
261               Spratly Islands (disputed)                 2.0 (0.77)   
262                                   Monaco                 2.0 (0.77)   
263                             Vatican City                0.49 (0.19)   

                 Land in km^2              Water in km^2  
0    148,940,000 (57,506,000)  361,132,0