# Step 1: Mobility Data Construction

## Objective
Construct the base panel dataset of international student flows (Origin-Destination-Year).

## Data Sources
1.  **OECD Mobility Data**: Detailed bilateral student flows (Enrolled, Graduated, New Entrants).
2.  **UNESCO Inbound/Outbound**: Aggregate totals used to estimate bilateral flows where direct OECD data is missing.

## Methodology: Gravity Weighting
Many country pairs have missing bilateral flow data. To maximize coverage, we use a **Gravity Weighting** approach:
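1.  Calculate the **share of total outbound students** from Origin $i$ that go to Destination $j$. Because most country pairs are not observed directly, this share is approximated by Destination $j$'s share of all inbound students worldwide in the same year (UNESCO inbound totals).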
2.  Multiply this share by the **total outbound students** from Origin $i$ (UNESCO) to estimate the bilateral flow $Students_{ij}$.
3.  Where direct OECD data exists (`flow_source = 'reported'`), we prioritize it over estimates (`flow_source = 'estimated'`).

## Validation
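We verify that the origin–destination weights sum to 1 (within a tolerance of ±0.001) for each origin-year, so that every origin's outbound students are fully allocated across destinations and no students are "lost" or double-counted. Note that the weights are normalised within each origin-year, so this check confirms the normalisation rather than the accuracy of the underlying flows.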

In [7]:
import pandas as pd
from pathlib import Path

# Folder holding the raw mobility CSV extracts read below and receiving the
# merged output. NOTE: machine-specific absolute path; adjust when running
# the notebook elsewhere.
DATA_PATH = Path("/Users/simonedinato/Documents/Classes/Applied Econometrics/Project/Datasets/01_mobility_OD")



In [8]:
# Every source CSV is decoded as UTF-8 with an optional byte-order mark.
csv_kwargs = dict(encoding="utf-8-sig")

# Tables stored directly under DATA_PATH, one CSV each.
mobile_counts_df = pd.read_csv(
    DATA_PATH.joinpath("Number of mobile students enrolled and graduated by country of origin.csv"),
    **csv_kwargs,
)
national_abroad_df = pd.read_csv(
    DATA_PATH.joinpath("Number of national tertiary students enrolled abroad.csv"),
    **csv_kwargs,
)
share_dest_df = pd.read_csv(
    DATA_PATH.joinpath("Share of mobile students enrolled at tertiary level by country of destination.csv"),
    **csv_kwargs,
)
share_origin_df = pd.read_csv(
    DATA_PATH.joinpath("Share of mobile students enrolled at tertiary level by country of origin.csv"),
    **csv_kwargs,
)
share_mobility_df = pd.read_csv(
    DATA_PATH.joinpath("Share of mobile enrolments, new entrants and graduates.csv"),
    **csv_kwargs,
)
total_students_df = pd.read_csv(
    DATA_PATH.joinpath("Total number of enrolled students, new entrants and graduates.csv"),
    **csv_kwargs,
)

# The UNESCO extracts live in their own sub-folders. A missing file is not an
# error at load time: the corresponding frame is simply left as None.
unesco_inbound_path = DATA_PATH.joinpath("UNESCO-inbound", "data.csv")
unesco_outbound_path = DATA_PATH.joinpath("UNESCO-outbound", "data.csv")

unesco_inbound_df = None
if unesco_inbound_path.exists():
    unesco_inbound_df = pd.read_csv(unesco_inbound_path, **csv_kwargs)

unesco_outbound_df = None
if unesco_outbound_path.exists():
    unesco_outbound_df = pd.read_csv(unesco_outbound_path, **csv_kwargs)



In [9]:
# Print the column names of every loaded table as a quick schema check.
# The six required tables always print; the two UNESCO tables print only if
# their files were found (i.e. the frames are not None).
loaded_tables = [
    ("mobile_counts_df", mobile_counts_df),
    ("national_abroad_df", national_abroad_df),
    ("share_dest_df", share_dest_df),
    ("share_origin_df", share_origin_df),
    ("share_mobility_df", share_mobility_df),
    ("total_students_df", total_students_df),
]
loaded_tables += [
    (label, frame)
    for label, frame in (
        ("unesco_inbound_df", unesco_inbound_df),
        ("unesco_outbound_df", unesco_outbound_df),
    )
    if frame is not None
]

for label, frame in loaded_tables:
    print(f"\n{label} columns:\n", frame.columns.tolist())




mobile_counts_df columns:
 ['STRUCTURE', 'STRUCTURE_ID', 'STRUCTURE_NAME', 'ACTION', 'REF_AREA', 'Reference area', 'EDUCATION_LEV', 'Education level', 'MEASURE', 'Measure', 'EDUCATION_TYPE', 'Education type', 'INTENSITY', 'Intensity', 'EDUCATION_FIELD', 'Field of education', 'GRADE', 'Grade', 'FREQ', 'Frequency of observation', 'ORIGIN', 'Origin', 'DESTINATION', 'Destination', 'INST_TYPE_EDU', 'Type of educational institution', 'MOBILITY', 'Mobility', 'UNIT_MEASURE', 'Unit of measure', 'SEX', 'Sex', 'AGE', 'Age', 'TIME_PERIOD', 'Time period', 'OBS_VALUE', 'Observation value', 'REF_YEAR_AGES', 'Reference date for ages', 'ORIGIN_CRITERION', 'Origin criterion', 'REPYEARSTART', 'Reference year start', 'REPYEAREND', 'Reference year end', 'OBS_STATUS', 'Observation status', 'CONF_STATUS', 'Confidentiality status', 'COMMENT_OBS', 'Observation comment', 'DECIMALS', 'Decimals', 'TIME_PER_COLLECT', 'Time period collection', 'UNIT_MULT', 'Unit multiplier']

national_abroad_df columns:
 ['STRUCTU

In [10]:
def _apply_filters(df, filters):
    """Return a copy of the rows of *df* that satisfy every applicable filter.

    *filters* maps a column name to a predicate that takes that column (a
    Series) and returns a boolean mask. Entries whose column is not present
    in *df* are skipped, so one filter set can serve tables with different
    columns.
    """
    keep = pd.Series(True, index=df.index)
    applicable = [(col, test) for col, test in filters.items() if col in df.columns]
    for col, test in applicable:
        keep = keep & test(df[col])
    return df.loc[keep].copy()

# Dimension selections applied to every table that has the column. Each value
# is a predicate taking the column (a Series) and returning a boolean mask.
# The education level is matched by membership in a list; every other
# dimension is matched against a single code.
common_filters = {
    "EDUCATION_LEV": lambda s: s.isin(["ISCED11_5T8"]),
    **{
        column: (lambda s, code=code: s.eq(code))
        for column, code in (
            ("EDUCATION_TYPE", "FE"),
            ("INTENSITY", "_T"),
            ("EDUCATION_FIELD", "_T"),
            ("GRADE", "_T"),
            ("FREQ", "A"),
            ("INST_TYPE_EDU", "INST_EDU"),
            ("SEX", "_T"),
            ("AGE", "_T"),
        )
    },
}

# First and last year of the panel window (both ends are kept by the
# `between(..., inclusive="both")` filters used throughout the notebook).
YEAR_START, YEAR_END = 2018, 2023

# A code qualifies only if it is exactly three uppercase ASCII letters.
alpha3_pattern = r"^[A-Z]{3}$"


def _alpha3_mask(series: pd.Series) -> pd.Series:
    """Flag the entries of *series* whose text form is a three-letter code.

    Values are converted to strings first, so non-string entries are tested
    on their string representation.
    """
    as_text = series.astype(str)
    return as_text.str.fullmatch(alpha3_pattern)


def _build_country_lookup():
    """Build a ``{country_code: country_name}`` dict from the loaded tables.

    Code/name column pairs are collected from the module-level frames listed
    below (pairs whose columns are missing are skipped), stacked, de-duplicated
    and restricted to three-letter codes. If a code appears with more than one
    name, the name occurring last in the stacked table is the one kept.
    Returns an empty dict when no pair is available.
    """
    candidates = [
        (mobile_counts_df, "REF_AREA", "Reference area"),
        (mobile_counts_df, "ORIGIN", "Origin"),
        (national_abroad_df, "REF_AREA", "Reference area"),
        (share_dest_df, "REF_AREA", "Reference area"),
        (share_origin_df, "REF_AREA", "Reference area"),
    ]
    pairs = [
        df[[code_col, name_col]]
        .dropna()
        .rename(columns={code_col: "country_code", name_col: "country_name"})
        for df, code_col, name_col in candidates
        if code_col in df.columns and name_col in df.columns
    ]
    if not pairs:
        return {}

    stacked = pd.concat(pairs, ignore_index=True).drop_duplicates()
    countries = stacked.loc[_alpha3_mask(stacked["country_code"])]
    return dict(zip(countries["country_code"], countries["country_name"]))


# --- Bilateral flows: one row per (destination, origin, year) -----------------
# Apply the shared dimension filters plus MOBILITY == "MOB" and
# UNIT_MEASURE == "PS" (filters on columns absent from the table are skipped
# by _apply_filters).
mobile_counts_clean = _apply_filters(
    mobile_counts_df,
    {**common_filters, "MOBILITY": lambda s: s.eq("MOB"), "UNIT_MEASURE": lambda s: s.eq("PS")},
)
# Keep only rows where both the reporting area and the origin carry a
# three-letter code; anything else (e.g. aggregate codes) is dropped.
mobile_counts_clean = mobile_counts_clean.loc[
    _alpha3_mask(mobile_counts_clean["REF_AREA"]) & _alpha3_mask(mobile_counts_clean["ORIGIN"])
].copy()
# The reporting area is treated as the destination of the flow.
mobile_counts_clean = mobile_counts_clean.rename(
    columns={
        "REF_AREA": "destination_country_code",
        "Reference area": "destination_country",
        "ORIGIN": "origin_country_code",
        "Origin": "origin_country",
        "TIME_PERIOD": "year",
        "OBS_VALUE": "obs_value",
        "MEASURE": "measure_code",
    }
)
mobile_counts_clean["year"] = pd.to_numeric(mobile_counts_clean["year"], errors="coerce").astype("Int64")
year_mask = mobile_counts_clean["year"].between(YEAR_START, YEAR_END, inclusive="both")
# Unparseable years compare as missing; treat them as outside the window.
# .copy() makes the frame independent before a column is assigned below.
mobile_counts_clean = mobile_counts_clean.loc[year_mask.fillna(False)].copy()
mobile_counts_clean["obs_value"] = pd.to_numeric(mobile_counts_clean["obs_value"], errors="coerce")

# Pivot the measures (ENRL / GRAD / ENTR) into columns.
# Rows without a usable observation are dropped BEFORE pivoting: summing a
# group that contains only missing values yields 0, which would turn "no data"
# into a reported flow of zero students and later override the estimate.
oecd_mobility_df = (
    mobile_counts_clean.dropna(subset=["obs_value"])[
        [
            "destination_country_code",
            "destination_country",
            "origin_country_code",
            "origin_country",
            "year",
            "measure_code",
            "obs_value",
        ]
    ]
    .pivot_table(
        index=[
            "destination_country_code",
            "destination_country",
            "origin_country_code",
            "origin_country",
            "year",
        ],
        columns="measure_code",
        values="obs_value",
        aggfunc="sum",
    )
    .reset_index()
)
oecd_mobility_df = oecd_mobility_df.rename(
    columns={
        "ENRL": "students_enrolled",
        "GRAD": "students_graduated",
        "ENTR": "students_new_entrants",
    }
)
# Guarantee the three flow columns exist even if a measure never occurs.
for column in ["students_enrolled", "students_graduated", "students_new_entrants"]:
    if column not in oecd_mobility_df.columns:
        oecd_mobility_df[column] = pd.NA

def _filter_alpha3(df, column):
    """Return a copy of the rows of *df* whose *column* holds a three-letter code."""
    is_country_code = _alpha3_mask(df[column])
    return df.loc[is_country_code].copy()


# --- Nationals enrolled abroad, per origin country and year -------------------
# Steps, in order: shared filters plus MOBILITY == "NMOB" and
# UNIT_MEASURE == "PS"; rename to panel names; keep four columns; keep
# three-letter origin codes; parse the year; keep the panel window (unparseable
# years are dropped); parse the value (unparseable values become missing).
national_abroad_clean = (
    _apply_filters(
        national_abroad_df,
        {
            **common_filters,
            "MOBILITY": lambda s: s.eq("NMOB"),
            "UNIT_MEASURE": lambda s: s.eq("PS"),
        },
    )
    .rename(
        columns={
            "REF_AREA": "origin_country_code",
            "Reference area": "origin_country",
            "TIME_PERIOD": "year",
            "OBS_VALUE": "students_national_abroad",
        }
    )
    .loc[:, ["origin_country_code", "origin_country", "year", "students_national_abroad"]]
    .pipe(_filter_alpha3, "origin_country_code")
    .assign(year=lambda d: pd.to_numeric(d["year"], errors="coerce").astype("Int64"))
    .loc[lambda d: d["year"].between(YEAR_START, YEAR_END, inclusive="both").fillna(False)]
    .assign(
        students_national_abroad=lambda d: pd.to_numeric(
            d["students_national_abroad"], errors="coerce"
        )
    )
)

# --- Share of mobile students, per destination country and year ---------------
# Only rows whose "Destination" label is "World" are kept (that column is
# renamed to origin_region here), then the usual clean-up: three-letter
# destination codes, parsed year inside the panel window, numeric share.
share_dest_clean = (
    _apply_filters(
        share_dest_df,
        {**common_filters, "UNIT_MEASURE": lambda s: s.eq("PT_ST")},
    )
    .rename(
        columns={
            "REF_AREA": "destination_country_code",
            "Reference area": "destination_country",
            "TIME_PERIOD": "year",
            "OBS_VALUE": "share_mobile_destination",
            "Destination": "origin_region",
            "DESTINATION": "origin_region_code",
        }
    )
    .loc[
        lambda d: d["origin_region"].eq("World"),
        ["destination_country_code", "destination_country", "year", "share_mobile_destination"],
    ]
    .pipe(_filter_alpha3, "destination_country_code")
    .assign(year=lambda d: pd.to_numeric(d["year"], errors="coerce").astype("Int64"))
    .loc[lambda d: d["year"].between(YEAR_START, YEAR_END, inclusive="both").fillna(False)]
    .assign(
        share_mobile_destination=lambda d: pd.to_numeric(
            d["share_mobile_destination"], errors="coerce"
        )
    )
)

# --- Share of mobile students, per origin country and year --------------------
# Mirror image of the destination-share table: only rows whose "Origin" label
# is "World" are kept (renamed to destination_region here), followed by the
# same code / year / numeric clean-up.
share_origin_clean = (
    _apply_filters(
        share_origin_df,
        {**common_filters, "UNIT_MEASURE": lambda s: s.eq("PT_ST")},
    )
    .rename(
        columns={
            "REF_AREA": "origin_country_code",
            "Reference area": "origin_country",
            "TIME_PERIOD": "year",
            "OBS_VALUE": "share_mobile_origin",
            "Origin": "destination_region",
            "ORIGIN": "destination_region_code",
        }
    )
    .loc[
        lambda d: d["destination_region"].eq("World"),
        ["origin_country_code", "origin_country", "year", "share_mobile_origin"],
    ]
    .pipe(_filter_alpha3, "origin_country_code")
    .assign(year=lambda d: pd.to_numeric(d["year"], errors="coerce").astype("Int64"))
    .loc[lambda d: d["year"].between(YEAR_START, YEAR_END, inclusive="both").fillna(False)]
    .assign(
        share_mobile_origin=lambda d: pd.to_numeric(d["share_mobile_origin"], errors="coerce")
    )
)

# --- Mobile share of enrolments / new entrants / graduates, per destination ---
# Long table first: shared filters with UNIT_MEASURE == "PT_ST", panel column
# names, MOBILITY == "MOB" (this table must have a MOBILITY column),
# three-letter destination codes, parsed year inside the panel window and a
# numeric share value.
share_mobility_clean = (
    _apply_filters(
        share_mobility_df,
        {**common_filters, "UNIT_MEASURE": lambda s: s.eq("PT_ST")},
    )
    .rename(
        columns={
            "REF_AREA": "destination_country_code",
            "Reference area": "destination_country",
            "MEASURE": "measure_code",
            "TIME_PERIOD": "year",
            "OBS_VALUE": "share_value",
        }
    )
    .loc[lambda d: d["MOBILITY"].eq("MOB")]
    .pipe(_filter_alpha3, "destination_country_code")
    .assign(year=lambda d: pd.to_numeric(d["year"], errors="coerce").astype("Int64"))
    .loc[lambda d: d["year"].between(YEAR_START, YEAR_END, inclusive="both").fillna(False)]
    .assign(share_value=lambda d: pd.to_numeric(d["share_value"], errors="coerce"))
)

# Wide table: one row per destination-year, one column per measure; repeated
# observations of the same cell are averaged.
# NOTE(review): the merged output captured in this notebook contains no
# share_mobile_* columns, which suggests this selection may come back empty -
# confirm the filter codes against the source table.
share_mobility_pivot = (
    pd.pivot_table(
        share_mobility_clean[
            [
                "destination_country_code",
                "destination_country",
                "year",
                "measure_code",
                "share_value",
            ]
        ],
        index=["destination_country_code", "destination_country", "year"],
        columns="measure_code",
        values="share_value",
        aggfunc="mean",
    )
    .reset_index()
    .rename(
        columns={
            "ENRL": "share_mobile_enrolled",
            "GRAD": "share_mobile_graduated",
            "ENTR": "share_mobile_new_entrants",
        }
    )
)



# --- Total enrolled / new entrants / graduates, per destination and year ------
# Shared filters plus MOBILITY == "MOB" and UNIT_MEASURE == "PS", renamed to
# the panel's column names.
# NOTE(review): the merged output captured in this notebook contains no
# total_students_* columns, which suggests this selection may come back empty
# - confirm that "MOB" is the intended MOBILITY code for the totals table.
total_students_clean = _apply_filters(
    total_students_df,
    {**common_filters, "MOBILITY": lambda s: s.eq("MOB"), "UNIT_MEASURE": lambda s: s.eq("PS")},
).rename(
    columns={
        "REF_AREA": "destination_country_code",
        "Reference area": "destination_country",
        "MEASURE": "measure_code",
        "TIME_PERIOD": "year",
        "OBS_VALUE": "total_value",
    }
)
total_students_clean = _filter_alpha3(total_students_clean, "destination_country_code")
total_students_clean["year"] = pd.to_numeric(total_students_clean["year"], errors="coerce").astype("Int64")
# Unparseable years compare as missing and are dropped with the out-of-window
# rows; .copy() makes the frame independent before the assignment below.
total_students_clean = total_students_clean.loc[
    total_students_clean["year"].between(YEAR_START, YEAR_END, inclusive="both").fillna(False)
].copy()
total_students_clean["total_value"] = pd.to_numeric(
    total_students_clean["total_value"], errors="coerce"
)

# Wide table: one row per destination-year, one column per measure.
# Missing observations are excluded from the pivot input: summing a group that
# holds only missing values yields 0, which would report "no data" as a total
# of zero students.
total_students_pivot = (
    total_students_clean.dropna(subset=["total_value"])[
        [
            "destination_country_code",
            "destination_country",
            "year",
            "measure_code",
            "total_value",
        ]
    ]
    .pivot_table(
        index=["destination_country_code", "destination_country", "year"],
        columns="measure_code",
        values="total_value",
        aggfunc="sum",
    )
    .reset_index()
    .rename(
        columns={
            "ENRL": "total_students_enrolled",
            "GRAD": "total_students_graduated",
            "ENTR": "total_students_new_entrants",
        }
    )
)

# Code -> name map used to label the UNESCO rows, which carry codes only.
country_lookup = _build_country_lookup()

# From here on both UNESCO extracts are mandatory.
if any(frame is None for frame in (unesco_inbound_df, unesco_outbound_df)):
    raise ValueError("UNESCO inbound and outbound datasets are required to build the full mobility matrix.")

# Inbound totals per destination: parse year/value, keep indicator "26637" and
# three-letter codes, drop rows lacking a year or a value, then attach a
# display name (falling back to the code itself when the lookup has none).
unesco_inbound_clean = (
    unesco_inbound_df.assign(
        year=lambda d: pd.to_numeric(d["year"], errors="coerce").astype("Int64"),
        value=lambda d: pd.to_numeric(d["value"], errors="coerce"),
    )
    .loc[lambda d: d["indicatorId"].astype(str).eq("26637") & _alpha3_mask(d["geoUnit"])]
    .rename(columns={"geoUnit": "destination_country_code", "value": "unesco_inbound_students"})
    .dropna(subset=["year", "unesco_inbound_students"])
    .assign(
        destination_country=lambda d: d["destination_country_code"]
        .map(country_lookup)
        .fillna(d["destination_country_code"])
    )
)

# Outbound totals per origin: same treatment with indicator "OE.5T8.40510".
unesco_outbound_clean = (
    unesco_outbound_df.assign(
        year=lambda d: pd.to_numeric(d["year"], errors="coerce").astype("Int64"),
        value=lambda d: pd.to_numeric(d["value"], errors="coerce"),
    )
    .loc[lambda d: d["indicatorId"].astype(str).eq("OE.5T8.40510") & _alpha3_mask(d["geoUnit"])]
    .rename(columns={"geoUnit": "origin_country_code", "value": "unesco_outbound_students"})
    .dropna(subset=["year", "unesco_outbound_students"])
    .assign(
        origin_country=lambda d: d["origin_country_code"]
        .map(country_lookup)
        .fillna(d["origin_country_code"])
    )
)

# Restrict both tables to the years present in both and inside the panel window.
common_years = [
    year
    for year in sorted(
        set(unesco_inbound_clean["year"].dropna()) & set(unesco_outbound_clean["year"].dropna())
    )
    if YEAR_START <= year <= YEAR_END
]
if not common_years:
    raise ValueError("No overlapping UNESCO data within the specified year range.")
unesco_inbound_clean = unesco_inbound_clean[unesco_inbound_clean["year"].isin(common_years)].copy()
unesco_outbound_clean = unesco_outbound_clean[unesco_outbound_clean["year"].isin(common_years)].copy()

# Each destination's share of that year's summed inbound students. Years whose
# inbound total is not positive are removed to avoid dividing by zero.
unesco_inbound_clean = (
    unesco_inbound_clean.assign(
        total_inbound_year=lambda d: d.groupby("year")["unesco_inbound_students"].transform("sum")
    )
    .loc[lambda d: d["total_inbound_year"] > 0]
    .assign(
        share_inbound_destination=lambda d: d["unesco_inbound_students"] / d["total_inbound_year"]
    )
)

# --- Estimated O-D matrix ------------------------------------------------------
# Joining on year alone pairs every origin with every destination observed in
# the same year. The estimated flow is the origin's outbound total multiplied
# by the destination's share of that year's inbound students. Graduates and
# new entrants have no estimate and start out missing.
# NOTE(review): a country present in both tables is also paired with itself
# (origin == destination) - confirm whether such rows should be excluded.
mobility_matrix = (
    pd.merge(
        unesco_outbound_clean,
        unesco_inbound_clean[
            [
                "destination_country_code",
                "destination_country",
                "year",
                "unesco_inbound_students",
                "share_inbound_destination",
            ]
        ],
        how="inner",
        on="year",
    )
    .assign(
        students_enrolled=lambda d: d["unesco_outbound_students"] * d["share_inbound_destination"],
        students_graduated=pd.NA,
        students_new_entrants=pd.NA,
        flow_source="estimated",
    )
    .rename(
        columns={
            "unesco_outbound_students": "students_outbound_total",
            "unesco_inbound_students": "students_inbound_destination",
        }
    )
)

# --- Overlay reported bilateral flows on the estimates -------------------------
# The join uses country CODES and year only. The name columns on the estimated
# side come from a code -> name lookup (or fall back to the code), so also
# matching on names could silently lose a reported flow whenever a label
# differs between tables.
if not oecd_mobility_df.empty:
    flow_columns = ["students_enrolled", "students_graduated", "students_new_entrants"]
    pair_keys = ["destination_country_code", "origin_country_code", "year"]
    # One reported row per pair-year; without the name columns, rows that
    # differed only by label would otherwise duplicate matrix rows.
    oecd_reported = oecd_mobility_df[pair_keys + flow_columns].drop_duplicates(subset=pair_keys)
    mobility_matrix = mobility_matrix.merge(
        oecd_reported,
        on=pair_keys,
        how="left",
        suffixes=("", "_oecd"),
        validate="many_to_one",  # fail loudly rather than multiply rows
    )
    # A row counts as "reported" only when a reported enrolment figure exists;
    # captured before the helper columns are dropped.
    actual_mask = mobility_matrix["students_enrolled_oecd"].notna()
    for column in flow_columns:
        # Reported value where available, otherwise keep the existing value.
        mobility_matrix[column] = mobility_matrix[f"{column}_oecd"].combine_first(
            mobility_matrix[column]
        )
        mobility_matrix.drop(columns=[f"{column}_oecd"], inplace=True)
    mobility_matrix.loc[actual_mask, "flow_source"] = "reported"

mobility_matrix["year"] = mobility_matrix["year"].astype("Int64")

# Name used by the following cells.
mobility_df = mobility_matrix



In [11]:
# Make sure the three flow columns exist (filled with missing values if not).
for column in ("students_enrolled", "students_graduated", "students_new_entrants"):
    if column in mobility_df.columns:
        continue
    mobility_df[column] = pd.NA

# Coerce the quantitative columns that are present to numeric; values that
# cannot be parsed become missing.
numeric_columns = [
    candidate
    for candidate in (
        "students_enrolled",
        "students_outbound_total",
        "students_inbound_destination",
        "share_inbound_destination",
    )
    if candidate in mobility_df.columns
]
for column in numeric_columns:
    mobility_df[column] = pd.to_numeric(mobility_df[column], errors="coerce")

# Attach destination-level and origin-level attributes to every O-D row.
# The joins use country CODE and year only: the display names in mobility_df
# come from a lookup (or fall back to the code), so also matching on names
# could silently lose attributes whenever a label differs between tables.
# The name column is therefore removed from each right-hand table, and rows
# that become identical once it is gone are collapsed so they cannot
# duplicate O-D rows.
dest_keys = ["destination_country_code", "year"]
origin_keys = ["origin_country_code", "year"]

merged_df = (
    mobility_df.merge(
        share_dest_clean.drop(columns=["destination_country"], errors="ignore").drop_duplicates(),
        on=dest_keys,
        how="left",
    )
    .merge(
        share_origin_clean.drop(columns=["origin_country"], errors="ignore").drop_duplicates(),
        on=origin_keys,
        how="left",
    )
    .merge(
        share_mobility_pivot.drop(columns=["destination_country"], errors="ignore").drop_duplicates(),
        on=dest_keys,
        how="left",
    )
    .merge(
        total_students_pivot.drop(columns=["destination_country"], errors="ignore").drop_duplicates(),
        on=dest_keys,
        how="left",
    )
    .merge(
        national_abroad_clean.drop(columns=["origin_country"], errors="ignore").drop_duplicates(),
        on=origin_keys,
        how="left",
    )
)

# Keep the panel window (rows with a missing year are dropped) and make sure
# the flow column is numeric.
merged_df = merged_df.loc[
    lambda d: d["year"].between(YEAR_START, YEAR_END, inclusive="both").fillna(False)
].assign(students_enrolled=lambda d: pd.to_numeric(d["students_enrolled"], errors="coerce"))

# Total flow per origin-year, broadcast back to every row. A total of exactly
# zero is replaced by a missing value so the division below yields a missing
# weight instead of dividing by zero.
origin_year_totals = (
    merged_df.groupby(["origin_country_code", "year"])["students_enrolled"]
    .transform("sum")
    .mask(lambda totals: totals.eq(0))
)

# weight_od: each destination's share of the origin's flows in that year.
merged_df = (
    merged_df.assign(weight_od=merged_df["students_enrolled"].div(origin_year_totals))
    .sort_values(["origin_country_code", "year", "destination_country_code"])
    .reset_index(drop=True)
)

# Sum of the defined weights per origin-year; groups with no defined weight
# get a missing value rather than 0 so they are not mistaken for violations.
weight_sums = (
    merged_df.groupby(["origin_country_code", "year"], dropna=False)["weight_od"]
    .apply(lambda s: pd.NA if s.notna().sum() == 0 else s.dropna().sum())
    .reset_index(name="weight_sum")
)

print("Merged mobility_df shape:", merged_df.shape)
print(
    f"Unique origins: {merged_df['origin_country_code'].nunique()}, "
    f"destinations: {merged_df['destination_country_code'].nunique()}"
)
print("Weight sum summary:")
display(weight_sums["weight_sum"].describe())

# Flag origin-years whose defined weights do not add up to 1 within +/-0.001.
has_sum = weight_sums["weight_sum"].notna()
near_one = weight_sums["weight_sum"].between(0.999, 1.001)
weight_issues = weight_sums.loc[has_sum & ~near_one]
if weight_issues.empty:
    print("All origin-year combinations have weights summing to ~1.")
else:
    print("Origin-year combinations with weight sums deviating from 1 (first 5 shown):")
    display(weight_issues.head())

print("Merged mobility_df head:")
display(merged_df.head())

# Persist the merged panel next to the source extracts (row index not written).
output_path = DATA_PATH.joinpath("merged_mobility.csv")
merged_df.to_csv(output_path, index=False)
print(f"Merged dataset saved to {output_path}")



Merged mobility_df shape: (134820, 19)
Unique origins: 210, destinations: 143
Weight sum summary:


count    1.260000e+03
mean     1.000000e+00
std      8.372475e-17
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: weight_sum, dtype: float64

All origin-year combinations have weights summing to ~1.
Merged mobility_df head:


Unnamed: 0,indicatorId,origin_country_code,year,students_outbound_total,qualifier,magnitude,origin_country,destination_country_code,destination_country,students_inbound_destination,share_inbound_destination,students_enrolled,students_graduated,students_new_entrants,flow_source,share_mobile_destination,share_mobile_origin,students_national_abroad,weight_od
0,OE.5T8.40510,ABW,2018,365.0,,,Aruba,ALB,Albania,1969.0,0.000364,0.13272,,,estimated,,,,0.000364
1,OE.5T8.40510,ABW,2018,365.0,,,Aruba,AND,Andorra,278.0,5.1e-05,0.018738,,,estimated,,,,5.1e-05
2,OE.5T8.40510,ABW,2018,365.0,,,Aruba,ARE,United Arab Emirates,199958.0,0.036926,13.478095,,,estimated,,,,0.036926
3,OE.5T8.40510,ABW,2018,365.0,,,Aruba,ARG,Argentina,109226.25,0.020171,7.362355,,,estimated,,,,0.020171
4,OE.5T8.40510,ABW,2018,365.0,,,Aruba,ARM,Armenia,4598.0,0.000849,0.309926,,,estimated,,,,0.000849


Merged dataset saved to /Users/simonedinato/Documents/Classes/Applied Econometrics/Project/Datasets/01_mobility_OD/merged_mobility.csv


## Finalize mobility O–D exposures (v2)



In [12]:
from pathlib import Path

import pandas as pd

# Root of the project datasets and the candidate locations of the merged
# mobility file, tried in the order listed.
DATA = Path("/Users/simonedinato/Documents/Classes/Applied Econometrics/Project/Datasets")
MOB_PATHS = [
    Path("/mnt/data/merged_mobility.csv"),
    Path("/home/oai/share/merged_mobility.csv"),
    DATA.joinpath("01_mobility_OD", "merged_mobility.csv"),
]

# Use the first candidate that exists; stop with an error if none does.
MOB_FILE = next((candidate for candidate in MOB_PATHS if candidate.exists()), None)
if MOB_FILE is None:
    raise FileNotFoundError("merged_mobility.csv not found in default paths")

mob = pd.read_csv(MOB_FILE)

# Columns the exposure construction cannot do without.
required_columns = [
    "origin_country_code",
    "origin_country",
    "destination_country_code",
    "destination_country",
    "year",
    "students_outbound_total",
    "students_inbound_destination",
    "share_inbound_destination",
]
available = set(mob.columns)
missing = [col for col in required_columns if col not in available]
if missing:
    raise ValueError(f"Missing required columns in mobility data: {missing}")

# Short names used from here on. DataFrame.rename ignores mapping keys that
# are not present, so the optional students_enrolled column is renamed only
# when it exists.
rename_map = {
    "students_outbound_total": "outbound_total_o",
    "students_inbound_destination": "inbound_total_d",
    "share_inbound_destination": "share_inbound_d",
    "students_enrolled": "students_od_raw",
}
mob = mob.rename(columns=rename_map)

# Origins that are not individual countries: the exact labels below, plus any
# label containing "unspecified" (case-insensitive).
region_terms = {
    "unspecified",
    "Northern America",
    "Latin America and the Caribbean",
    "Africa",
    "Asia",
    "Europe",
    "Oceania",
}
if "origin_country" in mob.columns:
    origin_names = mob["origin_country"].fillna("")
    is_listed_region = origin_names.isin(region_terms)
    mentions_unspecified = origin_names.str.contains("unspecified", case=False, na=False)
    mask_regions = is_listed_region | mentions_unspecified
    mob = mob.loc[~mask_regions].copy()

# Coerce the quantitative columns that are present; unparseable values become
# missing.
present_numeric = [
    name
    for name in ("outbound_total_o", "inbound_total_d", "share_inbound_d", "students_od_raw")
    if name in mob.columns
]
for numeric_col in present_numeric:
    mob[numeric_col] = pd.to_numeric(mob[numeric_col], errors="coerce")

# Choose the destination share: use the stored column when it exists and holds
# at least one value (missing shares are then treated as 0); otherwise rebuild
# the share from the inbound totals within each year.
has_stored_share = "share_inbound_d" in mob.columns and not mob["share_inbound_d"].isna().all()
if has_stored_share:
    share_col = "share_inbound_d"
    mob[share_col] = mob[share_col].fillna(0)
else:
    if "inbound_total_d" not in mob.columns:
        raise ValueError("Cannot compute destination shares without inbound totals")

    def _share_within_year(values: pd.Series) -> pd.Series:
        """Divide each value by the group total; all-missing if the total is 0 or missing."""
        total = values.sum(skipna=True)
        if pd.isna(total) or not total:
            return pd.Series(pd.NA, index=values.index)
        return values / total

    mob["dest_share_year"] = mob.groupby("year")["inbound_total_d"].transform(_share_within_year)
    share_col = "dest_share_year"

if "outbound_total_o" not in mob.columns:
    raise ValueError("Cannot construct O-D proxy without outbound totals")

# Proxy flow: the origin's outbound total times the destination share.
# NOTE(review): students_od_raw (the students_enrolled column of the input
# file, when present) is not used here - confirm that the exposure is meant
# to rely on the proxy alone.
mob["students_od"] = mob["outbound_total_o"] * mob[share_col]
missing_students = mob["students_od"].isna().sum()
if missing_students:
    print(f"Warning: {missing_students} rows with missing students_od after proxy computation")

# Normalise within each origin-year; where the group total is zero the weight
# is set to missing.
denominator = mob.groupby(["origin_country_code", "year"])["students_od"].transform("sum")
mob["weight_od"] = mob["students_od"] / denominator
mob.loc[denominator.eq(0), "weight_od"] = pd.NA

# Sanity check 1: defined weights must add up to 1 (+/-0.001) per origin-year.
# min_count=1 keeps the sum missing for origin-years without any defined
# weight; a plain sum would report 0 for them and count them as violations,
# even though those rows are dropped from the exported panel anyway.
chk = (
    mob.groupby(["origin_country_code", "year"])["weight_od"]
    .sum(min_count=1)
    .reset_index(name="w_sum")
)
viol = chk[chk["w_sum"].notna() & ~chk["w_sum"].between(0.999, 1.001)]
print("Weight-sum violations:", len(viol))
if not viol.empty:
    print(viol.head())

# Report origin-years with no defined weight separately so they stay visible.
undefined_groups = int(chk["w_sum"].isna().sum())
if undefined_groups:
    print(f"Origin-years with no defined weights: {undefined_groups}")

# Sanity check 2: every origin-year should be spread over at least two destinations.
dest_counts = mob.groupby(["origin_country_code", "year"])["destination_country_code"].nunique()
few = dest_counts[dest_counts < 2]
print("Origins with <2 destinations:", few.shape[0])
if not few.empty:
    print(few.head())

# Columns written to the exposure panel, in output order.
exposure_cols = [
    "origin_country_code",
    "origin_country",
    "destination_country_code",
    "destination_country",
    "year",
    "students_od",
    "outbound_total_o",
    "weight_od",
]
missing_exposure = list(filter(lambda col: col not in mob.columns, exposure_cols))
if missing_exposure:
    raise ValueError(f"Missing columns for exposure export: {missing_exposure}")

# Only rows with a defined weight are exported.
exposure = mob.loc[mob["weight_od"].notna(), exposure_cols].copy()

# Create the output folder if needed and write the panel without the row index.
output_dir = DATA.joinpath("01_mobility_OD")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir.joinpath("od_exposure_v2.csv")
exposure.to_csv(output_file, index=False)
print(f"Exposure panel saved to {output_file}")

display(exposure.head())



Weight-sum violations: 0
Origins with <2 destinations: 0
Exposure panel saved to /Users/simonedinato/Documents/Classes/Applied Econometrics/Project/Datasets/01_mobility_OD/od_exposure_v2.csv


Unnamed: 0,origin_country_code,origin_country,destination_country_code,destination_country,year,students_od,outbound_total_o,weight_od
0,ABW,Aruba,ALB,Albania,2018,0.13272,365.0,0.000364
1,ABW,Aruba,AND,Andorra,2018,0.018738,365.0,5.1e-05
2,ABW,Aruba,ARE,United Arab Emirates,2018,13.478095,365.0,0.036926
3,ABW,Aruba,ARG,Argentina,2018,7.362355,365.0,0.020171
4,ABW,Aruba,ARM,Armenia,2018,0.309926,365.0,0.000849
