In [1]:
import requests
from bs4 import BeautifulSoup
import csv

def crawl(url):
    response = requests.get(url)

    if response.status_code == 200:
        print(response.text)  # Print the page content to inspect its structure
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', class_='wikitable')

        if table:
            rows = table.find_all('tr')
            data = []

            for row in rows[1:]:
                cells = row.find_all(['th', 'td'])
                if len(cells) >= 3:
                    country = cells[1].text.strip()
                    elevation = cells[3].text.strip()
                    data.append((country, elevation))

            return data
        else:
            print("Table not found on the page.")
            return None
    else:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None
    
url = "https://en.wikipedia.org/wiki/List_of_countries_by_average_elevation#cite_note-1"
result = crawl(url)

if result:
    for country, elevation in result:
        print(f"{country}: {elevation}")

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>List of countries by average elevation - Wikipedia</title>
<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled";(function(){var cookie=document.cookie.match(/(?

In [2]:
def crawl_country_elevations(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', class_='wikitable')

        if table:
            data = []

            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if len(cells) >= 2:
                    country_cell = cells[0]
                    elevation_cell = None

                    # Find the elevation cell after the country cell
                    for i in range(len(cells) - 1):
                        if cells[i] == country_cell:
                            elevation_cell = cells[i + 1]
                            break

                    # Check if both country and elevation cells are found
                    if country_cell and elevation_cell:
                        # Check if country cell contains an anchor tag with the country name
                        country_link = country_cell.find('a')
                        if country_link:
                            country = country_link.text.strip()
                            elevation = elevation_cell.text.strip().replace(',', '')  # Remove commas from elevation
                            data.append((country, elevation))

            return data
        else:
            print("Table not found on the page.")
            return None
    else:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None

def clean_data(data):
    # Convert the elevation to numeric (integers)
    df = pd.DataFrame(data, columns=['Country', 'Elevation'])
    df['Elevation'] = pd.to_numeric(df['Elevation'], errors='coerce', downcast='integer')
    df.dropna(subset=['Elevation'], inplace=True)  # Drop rows with invalid elevation values
    return df    

In [3]:
import pandas as pd
url = "https://en.wikipedia.org/wiki/List_of_countries_by_average_elevation#cite_note-1"
result = crawl_country_elevations(url)
if result:
    for country, elevation in result:
        print(f"{country}: {elevation}")

    # Save the data to a DataFrame and perform cleaning
    df = result

Afghanistan: 1884 m (6181 ft)[2]
Albania: 708 m (2323 ft)[3]
Algeria: 800 m (2625 ft)
Andorra: 1996 m (6549 ft)[2]
Angola: 1112 m (3648 ft)
Antarctica: 2300 m (7546 ft)[2]
Argentina: 595 m (1952 ft)[4]
Armenia: 1792 m (5879 ft)
Australia: 330 m (1083 ft)
Austria: 910 m (2986 ft)
Azerbaijan: 384 m (1260 ft)[5]
Bangladesh: 85 m (279 ft)[6]
Belarus: 170 m (558 ft)[6]
Belgium: 181 m (594 ft)[6]
Belize: 173 m (568 ft)[4]
Benin: 273 m (896 ft)[6]
Bhutan: 3280 m (10761 ft)[2]
Bolivia: 1192 m (3911 ft)[4]
Bosnia and Herzegovina: 500 m (1640 ft)[6]
Botswana: 1013 m (3323 ft)[6]
Brazil: 320 m (1050 ft)[4]
Brunei: 478 m (1568 ft)[6]
Bulgaria: 470 m (1542 ft)
Burkina Faso: 297 m (974 ft)[6]
Burundi: 1504 m (4934 ft)[6]
Cambodia: 126 m (413 ft)[6]
Cameroon: 667 m (2188 ft)[6]
Canada: 487 m (1598 ft)[6]
Central African Republic: 635 m (2083 ft)[6]
Chad: 543 m (1781 ft)[6]
Chile: 1871 m (6138 ft)[4]
China: 1840 m (6037 ft)[2]
Colombia: 593 m (1946 ft)[4]
Costa Rica: 746 m (2448 ft)[4]
Croatia: 331 m 

In [4]:
print(df)

[('Afghanistan', '1884\xa0m (6181\xa0ft)[2]'), ('Albania', '708\xa0m (2323\xa0ft)[3]'), ('Algeria', '800\xa0m (2625\xa0ft)'), ('Andorra', '1996\xa0m (6549\xa0ft)[2]'), ('Angola', '1112\xa0m (3648\xa0ft)'), ('Antarctica', '2300\xa0m (7546\xa0ft)[2]'), ('Argentina', '595\xa0m (1952\xa0ft)[4]'), ('Armenia', '1792\xa0m (5879\xa0ft)'), ('Australia', '330\xa0m (1083\xa0ft)'), ('Austria', '910\xa0m (2986\xa0ft)'), ('Azerbaijan', '384\xa0m (1260\xa0ft)[5]'), ('Bangladesh', '85\xa0m (279\xa0ft)[6]'), ('Belarus', '170\xa0m (558\xa0ft)[6]'), ('Belgium', '181\xa0m (594\xa0ft)[6]'), ('Belize', '173\xa0m (568\xa0ft)[4]'), ('Benin', '273\xa0m (896\xa0ft)[6]'), ('Bhutan', '3280\xa0m (10761\xa0ft)[2]'), ('Bolivia', '1192\xa0m (3911\xa0ft)[4]'), ('Bosnia and Herzegovina', '500\xa0m (1640\xa0ft)[6]'), ('Botswana', '1013\xa0m (3323\xa0ft)[6]'), ('Brazil', '320\xa0m (1050\xa0ft)[4]'), ('Brunei', '478\xa0m (1568\xa0ft)[6]'), ('Bulgaria', '470\xa0m (1542\xa0ft)'), ('Burkina Faso', '297\xa0m (974\xa0ft)[6]'),

In [5]:
import re
# Function to extract the first number from the 'Elevation' string
def extract_elevation(elevation_str):
    elevation_match = re.search(r'\d+', elevation_str)
    return int(elevation_match.group()) if elevation_match else None

# Process the data list and create a new list with the desired output
result = [(country, extract_elevation(elevation)) for country, elevation in df]

print(result)

[('Afghanistan', 1884), ('Albania', 708), ('Algeria', 800), ('Andorra', 1996), ('Angola', 1112), ('Antarctica', 2300), ('Argentina', 595), ('Armenia', 1792), ('Australia', 330), ('Austria', 910), ('Azerbaijan', 384), ('Bangladesh', 85), ('Belarus', 170), ('Belgium', 181), ('Belize', 173), ('Benin', 273), ('Bhutan', 3280), ('Bolivia', 1192), ('Bosnia and Herzegovina', 500), ('Botswana', 1013), ('Brazil', 320), ('Brunei', 478), ('Bulgaria', 470), ('Burkina Faso', 297), ('Burundi', 1504), ('Cambodia', 126), ('Cameroon', 667), ('Canada', 487), ('Central African Republic', 635), ('Chad', 543), ('Chile', 1871), ('China', 1840), ('Colombia', 593), ('Costa Rica', 746), ('Croatia', 331), ('Cuba', 108), ('Cyprus', 91), ('Czech Republic', 430), ('Democratic Republic of the Congo', 726), ('Denmark', 34), ('Djibouti', 430), ('Dominican Republic', 424), ('Ecuador', 1117), ('Egypt', 321), ('El Salvador', 442), ('Equatorial Guinea', 577), ('Eritrea', 853), ('Estonia', 61), ('Eswatini', 305), ('Ethiopi

In [6]:
df = pd.DataFrame(result, columns=['Country', 'Elevation'])
print(df)

            Country  Elevation
0       Afghanistan       1884
1           Albania        708
2           Algeria        800
3           Andorra       1996
4            Angola       1112
..              ...        ...
167         Vietnam        398
168  Western Sahara        256
169           Yemen        999
170          Zambia       1138
171        Zimbabwe        961

[172 rows x 2 columns]


In [7]:
df.to_csv('R_average_elevation.csv', index=False)