# 05 â€” Destinations (top list + multi-year trend)

**Dataset (source file):** `raw_data/hdx_hapi_refugees_afg.csv`  
**Website section:** `js/sections/destinations.js`

This notebook:
1) reads raw refugees data,
2) applies the same filters as the website,
3) detects latest year,
4) finds top 12 destinations in the latest year,
5) builds multi-year series for those top 12,
6) saves:
   - cleaned filtered rows to `dataset/cleaned/refugees_destinations_clean.csv`
   - dropped rows to `dataset/cleaned/refugees_destinations_dropped.csv`
   - derived trend outputs to `dataset/derived/destinations_yearly_top12.csv` + `.json`


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

RAW_DIR = Path("raw_data")        # raw inputs live here
DATASET_DIR = Path("dataset")     # website reads from here
CLEAN_DIR = DATASET_DIR / "cleaned"
DERIVED_DIR = DATASET_DIR / "derived"

CLEAN_DIR.mkdir(parents=True, exist_ok=True)
DERIVED_DIR.mkdir(parents=True, exist_ok=True)

def to_number(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s.astype(str).str.replace(",", "").str.strip(), errors="coerce")

def parse_year(val) -> float:
    if pd.isna(val):
        return np.nan
    s = str(val).strip()
    if len(s) >= 4 and s[:4].isdigit():
        return float(s[:4])
    for sep in ("/", "-"):
        parts = s.split(sep)
        if len(parts) == 3 and parts[-1].isdigit():
            return float(parts[-1])
    return np.nan

def show_filter_report(df_before: pd.DataFrame, mask: pd.Series, title: str, sample_n: int = 10):
    kept = df_before[mask].copy()
    dropped = df_before[~mask].copy()
    print(f"=== {title} ===")
    print("rows_before:", len(df_before))
    print("rows_kept:", len(kept))
    print("rows_dropped:", len(dropped))
    if len(dropped):
        print("\nSample dropped rows:")
        display(dropped.head(sample_n))
    return kept, dropped


In [6]:

df = pd.read_csv("hdx_hapi_refugees_afg.csv")

df2 = df.copy()
df2["year"] = df2["reference_period_start"].apply(parse_year)
df2["population_num"] = to_number(df2["population"])

mask = (df2["origin_location_code"].astype(str).str.strip() == "AFG") &        (df2["asylum_location_code"].astype(str).str.strip() != "AFG") &        (df2["year"].notna()) &        (df2["population_num"].notna()) &        (df2["population_num"] > 0)

if "population_group" in df2.columns:
    g = df2["population_group"].astype(str).str.strip().str.upper()
    mask = mask & ((g == "") | (g.isin(["REF","ASY"])))

kept, dropped = show_filter_report(df2, mask, "Destinations filter (AFG origin, dest != AFG, valid year, pop>0, group REF/ASY)")


=== Destinations filter (AFG origin, dest != AFG, valid year, pop>0, group REF/ASY) ===
rows_before: 43927
rows_kept: 14602
rows_dropped: 29325

Sample dropped rows:


Unnamed: 0,origin_location_code,origin_has_hrp,origin_in_gho,asylum_location_code,asylum_has_hrp,asylum_in_gho,population_group,gender,age_range,min_age,max_age,population,reference_period_start,reference_period_end,year,population_num
0,AFG,True,True,AFG,True,True,OOC,f,0-4,0.0,4.0,8751,2020-01-01,2020-12-31,2020.0,8751
1,AFG,True,True,AFG,True,True,OOC,f,5-11,5.0,11.0,8775,2020-01-01,2020-12-31,2020.0,8775
2,AFG,True,True,AFG,True,True,OOC,f,12-17,12.0,17.0,5828,2020-01-01,2020-12-31,2020.0,5828
3,AFG,True,True,AFG,True,True,OOC,f,18-59,18.0,59.0,18958,2020-01-01,2020-12-31,2020.0,18958
4,AFG,True,True,AFG,True,True,OOC,f,60+,60.0,,587,2020-01-01,2020-12-31,2020.0,587
5,AFG,True,True,AFG,True,True,OOC,f,all,,,42899,2020-01-01,2020-12-31,2020.0,42899
6,AFG,True,True,AFG,True,True,OOC,m,0-4,0.0,4.0,8814,2020-01-01,2020-12-31,2020.0,8814
7,AFG,True,True,AFG,True,True,OOC,m,5-11,5.0,11.0,5017,2020-01-01,2020-12-31,2020.0,5017
8,AFG,True,True,AFG,True,True,OOC,m,12-17,12.0,17.0,4785,2020-01-01,2020-12-31,2020.0,4785
9,AFG,True,True,AFG,True,True,OOC,m,18-59,18.0,59.0,19273,2020-01-01,2020-12-31,2020.0,19273


In [8]:
clean = kept.copy()
clean["year"] = clean["year"].astype(int)
clean["iso3"] = clean["asylum_location_code"].astype(str).str.strip().str.upper()
clean = clean[(clean["iso3"] != "") & (clean["iso3"] != "AFG")]

clean_out = CLEAN_DIR / "refugees_destinations_clean.csv"
clean.to_csv(clean_out, index=False)

dropped_out = CLEAN_DIR / "refugees_destinations_dropped.csv"
dropped.to_csv(dropped_out, index=False)

(clean_out, dropped_out)


(WindowsPath('dataset/cleaned/refugees_destinations_clean.csv'),
 WindowsPath('dataset/cleaned/refugees_destinations_dropped.csv'))

In [10]:
latest_year = int(clean["year"].max())
latest = clean[clean["year"] == latest_year]

top12 = (latest.groupby("iso3", as_index=False)["population_num"].sum()
              .sort_values("population_num", ascending=False)
              .head(12))
top12_list = top12["iso3"].tolist()

yearly = (clean[clean["iso3"].isin(top12_list)]
            .groupby(["year","iso3"], as_index=False)["population_num"].sum()
            .sort_values(["iso3","year"]))
yearly.rename(columns={"population_num":"people"}, inplace=True)

out_csv = DERIVED_DIR / "destinations_yearly_top12.csv"
yearly.to_csv(out_csv, index=False)

out_json = DERIVED_DIR / "destinations_yearly_top12.json"
(Path(out_json)).write_text(json.dumps({
    "latest_year": latest_year,
    "top12": top12.to_dict(orient="records"),
    "series": yearly.to_dict(orient="records")
}, indent=2), encoding="utf-8")

(out_csv, out_json)


(WindowsPath('dataset/derived/destinations_yearly_top12.csv'),
 WindowsPath('dataset/derived/destinations_yearly_top12.json'))