In [2]:
import pandas as pd
import os

In [3]:
# Read the processed population-density table and show which columns it provides.
pop_density_path = '../../data/processed/population_density_data/population_density_2000_2024.csv'
pop_density_df = pd.read_csv(pop_density_path)
pop_density_df.columns

Index(['Country', 'Attribute', 'Value'], dtype='object')

In [4]:
# Rename columns: 'Attribute' becomes 'Year' and 'Value' becomes 'PopDensity'.
column_renames = {'Attribute': 'Year', 'Value': 'PopDensity'}
pop_density_df = pop_density_df.rename(columns=column_renames)
print(pop_density_df.head())

       Country  Year  PopDensity
0  Afghanistan  2000   30.863857
1  Afghanistan  2001   31.099924
2  Afghanistan  2002   32.776970
3  Afghanistan  2003   34.854350
4  Afghanistan  2004   36.123234


In [5]:
print("Creating data availability matrix...")

# Availability matrix: one row per country, one 'yes'/'no' column per year.
# NOTE(review): the range runs through 2025 although the source file is named
# "..._2000_2024.csv", so the '2025' column is expected to be all 'no' — confirm
# whether the extra year is intentional.
availability_years = range(2000, 2026)

# Rows with a missing PopDensity value do not count as available data.
has_density = pop_density_df['PopDensity'].notnull()

# year -> list of countries that have a non-null density value in that year
unique_countries_by_year = {}
for year in availability_years:
    in_year = has_density & (pop_density_df['Year'] == year)
    unique_countries_by_year[year] = pop_density_df.loc[in_year, 'Country'].unique().tolist()

# Set copies of the per-year lists so each membership test below is a hash
# lookup instead of a linear scan of the list.
country_sets_by_year = {
    year: set(countries) for year, countries in unique_countries_by_year.items()
}

# Every country that has data in at least one year, in alphabetical order.
all_countries = sorted(set().union(*country_sets_by_year.values()))

# Collect all rows first and build the frame once: growing a DataFrame with
# pd.concat inside a loop copies every existing row again on each iteration.
availability_rows = [
    {
        'Country': country,
        **{
            str(year): 'yes' if country in country_sets_by_year[year] else 'no'
            for year in availability_years
        },
    }
    for country in all_countries
]

# Passing `columns` keeps the column order (and the columns themselves) even
# when there are no rows at all.
availability_df = pd.DataFrame(
    availability_rows,
    columns=['Country'] + [str(year) for year in availability_years],
)

availability_df.head()

Creating data availability matrix...


Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,Afghanistan,yes,yes,yes,yes,yes,yes,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,no
1,Albania,yes,yes,yes,yes,yes,yes,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,no
2,Algeria,yes,yes,yes,yes,yes,yes,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,no
3,American Samoa,yes,yes,yes,yes,yes,yes,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,no
4,Andorra,yes,yes,yes,yes,yes,yes,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,no


In [8]:
from itertools import product

# === Define column names used in the dataset ===
country_column = 'Country'
year_column = 'Year'
value_column = 'PopDensity'

# === Expected range of years (2000–2024) ===
expected_years = list(range(2000, 2025))

# === Load expected country list from file (no header) ===
# NOTE(review): the expected-country list is read from a file named after the
# mortality data — presumably the countries covered by that dataset; confirm
# this is the intended reference list.
country_df = pd.read_csv('../../data/processed/country_list/mortality_data_2001_2015.csv',
                         header=None, names=[country_column])
expected_countries = set(country_df[country_column].dropna().unique())

# === Use the population density dataframe that was already loaded earlier ===
# Work on a copy so nothing in this cell can alter pop_density_df.
data_df = pop_density_df.copy()

# === Extract unique countries and years from the data ===
data_countries = set(data_df[country_column].dropna().unique())
data_years = set(data_df[year_column].dropna().unique())

# === Check missing or extra countries ===
missing_countries = expected_countries - data_countries
extra_countries = data_countries - expected_countries
common_countries = expected_countries & data_countries  # intersection

# === Check missing or extra years overall ===
missing_years = set(expected_years) - data_years
extra_years = data_years - set(expected_years)

# === Print overall country/year results ===
print("=== COUNTRY CHECK ===")
print(f"Total expected countries: {len(expected_countries)}")
print(f"Total countries in data: {len(data_countries)}")
print(f"Countries in both (common): {len(common_countries)}")
print(f"Missing countries in data: {len(missing_countries)} → {sorted(missing_countries)}")
print(f"Extra countries in data: {len(extra_countries)} → {sorted(extra_countries)}")

print("\n=== YEAR CHECK (OVERALL) ===")
print(f"Missing years in data: {sorted(missing_years)}")
print(f"Extra years in data: {sorted(extra_years)}")

# === Check if each country has data for all expected years ===
# Only rows with an actual value count as data: a country/year row whose value
# is NaN is still missing. This matches the non-null filter used when building
# the availability matrix.
rows_with_value = data_df[data_df[value_column].notnull()]

# year -> sorted list of expected countries without a value in that year
missing_country_years = {}

for year in expected_years:
    in_year = rows_with_value[year_column] == year
    countries_with_data = set(rows_with_value.loc[in_year, country_column].dropna().unique())
    missing_in_year = expected_countries - countries_with_data
    if missing_in_year:
        missing_country_years[year] = sorted(missing_in_year)

# === Print countries missing per year ===
print("\n=== COUNTRY-YEAR CHECK (MISSING DATA BY YEAR) ===")
if not missing_country_years:
    print("All countries have data for all years.")
else:
    for year, countries in missing_country_years.items():
        print(f"Year {year}: Missing {len(countries)} countries → {countries}")



=== COUNTRY CHECK ===
Total expected countries: 77
Total countries in data: 233
Countries in both (common): 74
Missing countries in data: 3 → ['United Kingdom, England and Wales', 'United Kingdom, Northern Ireland', 'United Kingdom, Scotland']
Extra countries in data: 159 → ['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Benin', 'Bhutan', 'Bolivia', 'Bonaire Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'British Virgin Islands', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'China', 'Comoros', 'Congo', 'Cook Islands', "Cote d'Ivoire", 'Curacao', 'Cyprus', 'Democratic Republic of Congo', 'Djibouti', 'East Timor', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Falkland Islands', 'Faroe Islands', 'Fiji', 'French Polynesia', 'Gabon', 'Gambia', 'Georgia',