In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

###---Creating a Function to Scrape Military Data from a Website called "Global Firepower"---###

def scrape_global_firepower_data(timeout=30):
    """Scrape the Global Firepower country listing and per-metric pages into one table.

    The base listing page supplies ``rank`` and ``country``. Every URL in
    ``other_sources`` is then downloaded, its (country, value) pairs are
    extracted, and the result is left-merged onto the ranking on ``country``,
    so the output keeps the rows of the base listing.

    After merging, every metric column (everything after ``rank`` and
    ``country``) is cleaned: commas and spaces are stripped and the first
    number found in each cell is kept. A column whose parsed values are all
    integers is stored as nullable ``Int64``; otherwise it is stored as
    ``float64``. Cells with no number, or countries missing from a metric
    page, become missing values. ``rank`` and ``country`` are left as scraped
    (text).

    Parameters
    ----------
    timeout : float or tuple, optional
        Timeout in seconds passed to every HTTP request (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per country found on the base listing, with ``rank``,
        ``country`` and one column per entry of ``other_sources``.

    Raises
    ------
    requests.RequestException
        If a page cannot be downloaded, times out, or answers with an HTTP
        error status.
    """
    # Defining URLs for different metrics
    base_url = 'https://www.globalfirepower.com/countries-listing.php'
    other_sources = {
        'https://www.globalfirepower.com/total-population-by-country.php': 'total_population',
        'https://www.globalfirepower.com/available-military-manpower.php': 'total_military_manpower',
        'https://www.globalfirepower.com/manpower-fit-for-military-service.php': 'fit_for_service',
        'https://www.globalfirepower.com/manpower-reaching-military-age-annually.php': 'population_reaching_military_age_annually',
        'https://www.globalfirepower.com/active-military-manpower.php': 'active_personnel',
        'https://www.globalfirepower.com/active-reserve-military-manpower.php': 'reserve_personnel',
        'https://www.globalfirepower.com/manpower-paramilitary.php': 'paramilitary',
        'https://www.globalfirepower.com/aircraft-total.php': 'total_military_aircraft',
        'https://www.globalfirepower.com/aircraft-total-fighters.php': 'fighter_aircraft',
        'https://www.globalfirepower.com/aircraft-total-attack-types.php': 'attack_aircraft',
        'https://www.globalfirepower.com/aircraft-total-transports.php': 'transport_aircraft',
        'https://www.globalfirepower.com/aircraft-total-trainers.php': 'trainer_aircraft',
        'https://www.globalfirepower.com/aircraft-total-special-mission.php': 'special_mission_aircraft',
        'https://www.globalfirepower.com/aircraft-total-tanker-fleet.php': 'tanker_aircraft',
        'https://www.globalfirepower.com/aircraft-helicopters-total.php': 'total_military_helicopters',
        'https://www.globalfirepower.com/aircraft-helicopters-attack.php': 'attack_helicopters',
        'https://www.globalfirepower.com/armor-tanks-total.php': 'tanks',
        'https://www.globalfirepower.com/armor-apc-total.php': 'armored_fighting_vehicles',
        'https://www.globalfirepower.com/armor-self-propelled-guns-total.php': 'self_propelled_artillery',
        'https://www.globalfirepower.com/armor-towed-artillery-total.php': 'towed_artillery',
        'https://www.globalfirepower.com/armor-mlrs-total.php': 'rocket_projectors',
        'https://www.globalfirepower.com/navy-ships.php': 'total_naval_fleet',
        'https://www.globalfirepower.com/navy-force-by-tonnage.php': 'total_naval_fleet_tonnage_mt',
        'https://www.globalfirepower.com/navy-aircraft-carriers.php': 'aircraft_carriers',
        'https://www.globalfirepower.com/navy-helo-carriers.php': 'helicopter_carriers',
        'https://www.globalfirepower.com/navy-submarines.php': 'submarines',
        'https://www.globalfirepower.com/navy-destroyers.php': 'destroyers',
        'https://www.globalfirepower.com/navy-frigates.php': 'frigates',
        'https://www.globalfirepower.com/navy-corvettes.php': 'corvettes',
        'https://www.globalfirepower.com/navy-patrol-coastal-craft.php': 'coastal_patrol_craft',
        'https://www.globalfirepower.com/navy-mine-warfare-craft.php': 'mine_warfare_craft',
        'https://www.globalfirepower.com/defense-spending-budget.php': 'defense_budget_usd',
        'https://www.globalfirepower.com/external-debt-by-country.php': 'external_debt_usd',
        'https://www.globalfirepower.com/purchasing-power-parity.php': 'purchasing_power_parity_usd',
        'https://www.globalfirepower.com/reserves-of-foreign-exchange-and-gold.php': 'foreign_exchange_and_gold_reserves_usd',
        'https://www.globalfirepower.com/major-serviceable-airports-by-country.php': 'total_serviceable_airports',
        'https://www.globalfirepower.com/labor-force-by-country.php': 'labour_force',
        'https://www.globalfirepower.com/major-ports-and-terminals.php': 'major_ports_and_terminals',
        'https://www.globalfirepower.com/merchant-marine-strength-by-country.php': 'total_merchant_marine_fleet',
        'https://www.globalfirepower.com/railway-coverage.php': 'railway_coverage_km',
        'https://www.globalfirepower.com/roadway-coverage.php': 'roadway_coverage_km',
        'https://www.globalfirepower.com/oil-production-by-country.php': 'oil_production_bbl',
        'https://www.globalfirepower.com/oil-consumption-by-country.php': 'oil_consumption_bbl',
        'https://www.globalfirepower.com/proven-oil-reserves-by-country.php': 'proven_oil_reserves_bbl',
        'https://www.globalfirepower.com/natural-gas-production-by-country.php': 'natural_gas_production_cum',
        'https://www.globalfirepower.com/natural-gas-consumption-by-country.php': 'natural_gas_consumption_cum',
        'https://www.globalfirepower.com/proven-natural-gas-reserves-by-country.php': 'proven_natural_gas_reserves_cum',
        'https://www.globalfirepower.com/coal-production-by-country.php': 'coal_production_cum',
        'https://www.globalfirepower.com/coal-consumption-by-country.php': 'coal_consumption_mt',
        'https://www.globalfirepower.com/proven-coal-reserves-by-country.php': 'proven_coal_reserves_cum',
        'https://www.globalfirepower.com/square-land-area.php': 'total_land_area_sq_km',
        'https://www.globalfirepower.com/coastline-coverage.php': 'coastline_coverage_km',
        'https://www.globalfirepower.com/border-coverage.php': 'border_coverage_km',
        'https://www.globalfirepower.com/waterway-coverage.php': 'waterway_coverage_km'
    }

    # A single session reuses the connection across all page downloads.
    with requests.Session() as session:
        # To scrape base ranking data
        ranks = []
        countries = []

        for item in _fetch_record_containers(session, base_url, timeout):
            try:
                rank = item.find('span', class_='textWhite textLarge textBold').text.strip()
                country = item.find('span', class_='textWhite textLarge textShadow').text.strip()
            except AttributeError:
                # find() returned None: this container lacks a rank or a country, skip it.
                continue
            ranks.append(rank)
            countries.append(country)

        df_main = pd.DataFrame({'rank': ranks, 'country': countries})

        # To scrape and merge other metrics
        for url, column_name in other_sources.items():
            sub_countries = []
            values = []

            for item in _fetch_record_containers(session, url, timeout):
                try:
                    country = item.find('span', class_='textWhite textLarge textShadow').text.strip()
                    value = item.find_all('span', class_='textWhite textLarge')[-1].text.strip()
                except (AttributeError, IndexError):
                    # AttributeError: no country span; IndexError: no value span.
                    # Either way the record is unusable, so skip it instead of aborting.
                    continue
                sub_countries.append(country)
                values.append(value)

            df_sub = pd.DataFrame({'country': sub_countries, column_name: values})
            # Keep one row per country so the left merge cannot multiply ranking rows.
            df_sub = df_sub.drop_duplicates(subset='country')
            df_main = df_main.merge(df_sub, on='country', how='left')

    # Clean and enforce data types
    string_columns = []

    for col in df_main.columns[2:]:
        cleaned_values = [_parse_numeric(val) for val in df_main[col]]
        non_null = [v for v in cleaned_values if v is not None]

        # _parse_numeric only yields int, float or None, so the choice is binary.
        # An all-missing column counts as integer (all() of an empty list is True).
        target_dtype = "Int64" if all(isinstance(v, int) for v in non_null) else "float64"

        try:
            df_main[col] = pd.Series(cleaned_values, index=df_main.index, dtype=target_dtype)
        except (ValueError, TypeError, OverflowError):
            # Conversion failed (e.g. an integer too large for 64 bits): keep the values as text.
            df_main[col] = pd.Series(cleaned_values, index=df_main.index, dtype="string")
            string_columns.append(col)

    if string_columns:
        print("The following columns contain mixed or non-numeric values and were stored as strings:")
        for col in string_columns:
            print(f" - {col}")

    return df_main


def _fetch_record_containers(session, url, timeout):
    """Download ``url`` with ``session`` and return the record-container ``div`` tags on the page."""
    response = session.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.find_all('div', class_='picTrans recordsetContainer boxShadow zoom')


def _parse_numeric(raw):
    """Return the first number in ``raw`` as int or float, or None if missing or not numeric."""
    if pd.isna(raw):
        return None

    text = str(raw).replace(',', '').replace(' ', '')
    match = re.search(r'-?\d+\.?\d*', text)
    if match is None:
        return None

    token = match.group()
    if '.' not in token:
        # Parse integers directly; going through float loses precision above 2**53.
        return int(token)

    number = float(token)
    return int(number) if number.is_integer() else number

# Run the function (it downloads the ranking page plus one page per metric,
# so this cell needs network access and takes a while to complete)
df = scrape_global_firepower_data()

# View the dataframe (a bare last expression is rendered as the cell output)
df

Unnamed: 0,rank,country,total_population,total_military_manpower,fit_for_service,population_reaching_military_age_annually,active_personnel,reserve_personnel,paramilitary,total_military_aircraft,...,natural_gas_production_cum,natural_gas_consumption_cum,proven_natural_gas_reserves_cum,coal_production_cum,coal_consumption_mt,proven_coal_reserves_cum,total_land_area_sq_km,coastline_coverage_km,border_coverage_km,waterway_coverage_km
0,1,United States,341963408,150463900,124816644,4445524,1328000,799500,0,13043,...,1029000000000,914301000000,13402000000000,548849000,476044000,248941000000,9833517,19924,12002,41009
1,2,Russia,140820810,69002197,46189226,1267387,1320000,2000000,250000,4292,...,617830000000,472239000000,47805000000000,508190000,310958000,162166000000,17098242,37653,22407,102000
2,3,China,1415043270,764123366,626864169,19810606,2035000,510000,625000,3309,...,225341000000,366160000000,6654000000000,4827000000,5313000000,143197000000,9596960,14500,22457,27700
3,4,India,1409128296,662290299,522786598,23955181,1455550,1155000,2527000,2229,...,33170000000,58867000000,1381000000000,985671000,1200000000,111052000000,3287263,7000,13888,14500
4,5,South Korea,52081799,26040900,21353538,416654,600000,3100000,120000,1592,...,55127000,59480000000,7079000000,15595000,136413000,326000000,99720,2413,237,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,141,Kosovo,1977093,917371,743387,17794,10000,5000,500,0,...,0,0,0,9074000,8927000,1564000000,10887,0,714,0
141,142,Somalia,13017273,2993973,1796384,130173,15000,0,2000,0,...,0,0,5663000000,0,0,0,637657,3025,2385,0
142,143,Central African Republic,5650957,2203873,1254512,101717,10000,0,1000,6,...,0,0,0,0,0,3000000,622984,0,5920,2800
143,144,Benin,14697052,5584880,3189260,308638,4750,0,0,5,...,0,182131000,1133000000,0,45000,0,112622,121,2123,150


In [9]:
# Save the scraped table as CSV (relative path, so it lands in the current
# working directory); index=False leaves the row index out of the file.
df.to_csv('scraped_dataset.csv', index=False)