# 04 — Flows (top destinations in the latest year)

**Dataset (source file):** `raw_data/hdx_hapi_refugees_afg.csv`  
**Geo boundary used by the website:** `raw_data/geo/world_countries.topojson`  
**Website section:** `js/sections/flows.js`

This notebook:
1) reads raw refugees data,
2) applies the same filters as `flows.js`,
3) detects latest year,
4) aggregates destinations in that latest year and picks Top 12,
5) saves:
   - cleaned latest-year rows to `dataset/cleaned/refugees_latest_year_clean.csv`
   - dropped rows to `dataset/cleaned/refugees_latest_year_dropped.csv`
   - top 12 to `dataset/derived/top_destinations_latest.csv` + `dataset/derived/top_destinations_latest.json`


In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Directory layout: raw inputs are read from RAW_DIR, while everything the
# website reads is written somewhere beneath DATASET_DIR.
RAW_DIR = Path("raw_data")
DATASET_DIR = Path("dataset")
CLEAN_DIR = DATASET_DIR.joinpath("cleaned")
DERIVED_DIR = DATASET_DIR.joinpath("derived")

# Create both output folders (and any missing parents) up front; an already
# existing folder is not an error.
for _out_dir in (CLEAN_DIR, DERIVED_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

def to_number(s: pd.Series) -> pd.Series:
    """Convert a column of number-like values to a numeric Series.

    Every value is rendered as text, its commas are removed and surrounding
    whitespace is stripped before parsing. Values that still cannot be parsed
    become NaN instead of raising.
    """
    as_text = s.astype(str)
    without_commas = as_text.str.replace(",", "")
    trimmed = without_commas.str.strip()
    return pd.to_numeric(trimmed, errors="coerce")

def parse_year(val) -> float:
    """Extract a year from a date-like value, returning NaN when none is found.

    Two shapes are recognised, in this order:
      1. the stripped text starts with four digits (e.g. "2023-01-01"), in
         which case those four digits are the year;
      2. the text splits into exactly three parts on "/" or on "-" and the
         last part is all digits (e.g. "01/02/2021"), in which case the last
         part is the year.

    Missing values and anything else yield NaN.
    """
    if pd.isna(val):
        return np.nan

    text = str(val).strip()

    # Leading four-digit year.
    prefix = text[:4]
    if len(text) >= 4 and prefix.isdigit():
        return float(prefix)

    # Three-part date with the year in the final position; "/" is tried
    # before "-".
    trailing = next(
        (
            pieces[-1]
            for pieces in (text.split(sep) for sep in ("/", "-"))
            if len(pieces) == 3 and pieces[-1].isdigit()
        ),
        None,
    )
    if trailing is None:
        return np.nan
    return float(trailing)

def show_filter_report(df_before: pd.DataFrame, mask: pd.Series, title: str, sample_n: int = 10):
    """Split a frame by a boolean mask and print a short summary of the split.

    Args:
        df_before: Frame to split.
        mask: Boolean selector; rows where it is true are kept, all others
            are dropped.
        title: Heading printed above the row counts.
        sample_n: Maximum number of dropped rows to show as a sample.

    Returns:
        A ``(kept, dropped)`` tuple of independent copies.
    """
    rows_kept = df_before[mask].copy()
    rows_dropped = df_before[~mask].copy()

    print(f"=== {title} ===")
    for label, frame in (
        ("rows_before:", df_before),
        ("rows_kept:", rows_kept),
        ("rows_dropped:", rows_dropped),
    ):
        print(label, len(frame))

    if len(rows_dropped) > 0:
        print("\nSample dropped rows:")
        # NOTE(review): `display` is not defined in this file; presumably it
        # is the helper injected by the notebook environment.
        display(rows_dropped.head(sample_n))

    return rows_kept, rows_dropped


In [None]:
# Load the raw refugees extract and attach the two derived columns used by
# the later cells: `year` (parsed from `reference_period_start`) and
# `population_num` (numeric version of `population`).
raw_path = RAW_DIR.joinpath("hdx_hapi_refugees_afg.csv")
df = pd.read_csv(raw_path).assign(
    year=lambda frame: frame["reference_period_start"].apply(parse_year),
    population_num=lambda frame: to_number(frame["population"]),
)
df.head()


In [None]:
# Most recent year present in the data; rows whose year could not be parsed
# (NaN) are ignored by nanmax.
latest_year = int(np.nanmax(df["year"].to_numpy()))
latest_year


In [None]:
# Row filter for the flows section: Afghan-origin rows hosted outside
# Afghanistan, dated in the latest year, with a positive population count.
mask = (
    (df["origin_location_code"].astype(str).str.strip() == "AFG")
    & (df["asylum_location_code"].astype(str).str.strip() != "AFG")
    & (df["year"].notna())
    & (df["year"].astype("Int64") == latest_year)
    & (df["population_num"].notna())
    & (df["population_num"] > 0)
)

if "population_group" in df.columns:
    # Keep rows whose group is blank, REF or ASY. Missing values are filled
    # with "" *before* the string cast: casting first would turn a missing
    # cell into the text "nan", so the blank-group test below could never
    # match and those rows would be dropped unintentionally.
    g = df["population_group"].fillna("").astype(str).str.strip().str.upper()
    mask = mask & ((g == "") | (g.isin(["REF", "ASY"])))

kept, dropped = show_filter_report(df, mask, f"Flows filter (latest_year={latest_year}, AFG origin, dest != AFG, pop>0, group REF/ASY)")


In [None]:
# Persist both halves of the filter split next to each other so the dropped
# rows can be inspected alongside the cleaned ones.
clean_out = CLEAN_DIR.joinpath("refugees_latest_year_clean.csv")
dropped_out = CLEAN_DIR.joinpath("refugees_latest_year_dropped.csv")

kept.to_csv(clean_out, index=False)
dropped.to_csv(dropped_out, index=False)

(clean_out, dropped_out)


In [None]:
# Aggregate the kept rows by destination country and export the 12 largest
# destinations as CSV and JSON.
dest = kept.copy()
# Fill missing codes with "" before the string cast; otherwise a missing code
# becomes the text "NAN", passes the empty-code check below and shows up as a
# bogus destination in the ranking.
dest["iso3"] = dest["asylum_location_code"].fillna("").astype(str).str.strip().str.upper()
dest = dest[(dest["iso3"] != "") & (dest["iso3"] != "AFG")]

top = (
    dest.groupby("iso3", as_index=False)["population_num"].sum()
    .sort_values("population_num", ascending=False)
    .head(12)
    .rename(columns={"population_num": "people_latest_year"})
)

out_csv = DERIVED_DIR / "top_destinations_latest.csv"
top.to_csv(out_csv, index=False)

# The JSON output carries the year alongside the ranking.
out_json = DERIVED_DIR / "top_destinations_latest.json"
out_json.write_text(
    json.dumps({"latest_year": latest_year, "top12": top.to_dict(orient="records")}, indent=2),
    encoding="utf-8",
)

(out_csv, out_json)
