# Combine GADM outputs from everyone


In [1]:
%reset -f

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
import os
import random
import re
import sys
from pathlib import Path
from functools import reduce
import warnings

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import dask.array as da
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask import delayed

from google.cloud import storage as gcs

In [4]:
import platform
print(platform.node())

import socket
print(socket.gethostname())

holy7c24103.rc.fas.harvard.edu
holy7c24103.rc.fas.harvard.edu


In [5]:
PROJ = Path(os.path.realpath("."))
if str(PROJ)=="/n/home10/shreyasgm":
    PROJ = Path("/n/holystore01/LABS/hausmann_lab/lab/glocal_aggregations/shreyas/proj/2021-07-28 - GEE")
ROOT = PROJ.parents[1]
DATA = ROOT / "data/"

In [6]:
sys.path.append(str(PROJ))
sys.path.append(str(ROOT / "src/"))
from gee_utils import *
from process_viirs import *
from general_utils import *

In [7]:
os.environ[
    "GOOGLE_APPLICATION_CREDENTIALS"
] = "/nfs/home/S/shg309/google_cloud_platform_auth/gcp_service_account_auth.json"

# Inputs


In [8]:
cntry_selected = "Colombia"
cntry_selected_abbr = "COL"
vcm_type = "vcmsl"

# Helper functions


In [9]:
def add_gadm_names(df, selected_admin_level):
    # Assign names
    gadm_names = pd.read_parquet(
        ROOT
        / f"data/intermediate/gadm_without_geometry/gadm36_{selected_admin_level}.parquet"
    )
    df = df.merge(
        gadm_names[[f"GID_{selected_admin_level}", f"NAME_{selected_admin_level}"]],
        on=f"GID_{selected_admin_level}",
    )

    return df

In [10]:
def add_gadm_ids(df, selected_admin_level):
    # Assign ids
    gadm = pd.read_parquet(
        ROOT
        / f"data/intermediate/gadm_without_geometry/gadm36_{selected_admin_level}.parquet"
    )
    if selected_admin_level == 1:
        id_vars = ["GID_0", "GID_1"]
        df = df.merge(
            gadm[id_vars],
            on="GID_1",
        )
        df = df[id_vars + [x for x in df.columns if x not in id_vars]]
    elif selected_admin_level == 2:
        id_vars = ["GID_0", "GID_1", "GID_2"]
        df = df.merge(
            gadm[id_vars],
            on=f"GID_2",
        )
        df = df[id_vars + [x for x in df.columns if x not in id_vars]]
    return df

In [11]:
def reorder_and_check_for_duplicates(df, id_cols, drop=True):
    """
    Check for duplicates in id_cols.
    If present, drop and warn if drop=True, or raise error if drop=False.
    """
    # Reorder columns
    df = df[id_cols + [x for x in df.columns if x not in id_cols]]
    # Check for duplicates
    if df.duplicated(id_cols).sum() > 0:
        if drop:
            warnings.warn("Duplicates present in index, dropping")
            return df.drop_duplicates(id_cols_level)
        else:
            raise ValueError("Duplicates present in index")
    else:
        return df

In [83]:
def rectangularize(df, cols_list, fill_var=None):
    """
    Rectangularize a dataframe
    
    Args:
        df: Pandas DataFrame
        cols_list: list of names of columns to use for rectangularization
        fill_var: (Optional) value to use to fill missing records
    """
    import pandas as pd

    # Get list of unique values in each column
    unique_vals_list = [df[x].unique() for x in cols_list]
    # Create multiindex with cartesian product
    index = pd.MultiIndex.from_product(unique_vals_list)
    # Create pandas dataframe
    df_index = pd.DataFrame(index=index).reset_index()
    df_index.columns = cols_list
    # Merge into original df
    df_rectangular = df.merge(df_index, on=cols_list, how='right')
    # Fill NA
    if fill_var is not None:
        cols_to_fill = [x for x in df.columns if x not in cols_list]
        df_rectangular[cols_to_fill] = df_rectangular[cols_to_fill].fillna(fill_var)
    return df_rectangular

## Supporting data


In [12]:
gadm_2 = pd.read_parquet(
    DATA / "intermediate/gadm_without_geometry/gadm36_2.parquet",
    columns=["GID_0", "GID_1", "GID_2"],
)
gadm_1 = pd.read_parquet(
    DATA / "intermediate/gadm_without_geometry/gadm36_1.parquet",
    columns=[
        "GID_0",
        "GID_1",
    ],
)
gadm_2.head()

Unnamed: 0,GID_0,GID_1,GID_2
0,AFG,AFG.1_1,AFG.1.1_1
1,AFG,AFG.1_1,AFG.1.2_1
2,AFG,AFG.1_1,AFG.1.3_1
3,AFG,AFG.1_1,AFG.1.4_1
4,AFG,AFG.1_1,AFG.1.5_1


# Read Individual Sources


## VIIRS


In [45]:
# # Download all missing files
# downloaded_files = download_missing_gcp(
#     local_folderpath=DATA / "intermediate/gee_viirs_agg/",
#     gcs_bucketname="earth_engine_aggregations",
# )

In [12]:
def prepare_viirs():
    # Get list of files from GCP
    csv_dict = {}
    for x in range(3):
        # read filelist
        csvlist = list(
            (DATA / "intermediate/gee_viirs_agg/").glob(fr"VIIRS_vcmsl_*_level{x}*")
        )
        # Make sure file exists and has data in it
        csvlist_valid = [x for x in csvlist if x.stat().st_size > 10]
        print(f"{len(csvlist)} became {len(csvlist_valid)}")
        # Read files
        viirs_df = dd.read_csv(csvlist_valid)
        viirs_df["date"] = dd.to_datetime(viirs_df["date"])
        viirs_df = viirs_df.compute()
        # Export
        viirs_df.to_parquet(
            PROJ / f"tables/viirs_vcmsl_world_level{x}.parquet", index=False
        )


# prepare_viirs()

### Monthly


In [13]:
def agg_viirs(df):
    """
    Some admin boundaries are too big for GEE so they were split, so further aggregate
    for those boundaries
    """
    viirs_sum = df["viirs_sum"].sum()
    viirs_count = df["viirs_count"].sum()
    viirs_mean = (df["viirs_mean"] * df["viirs_count"]).sum() / viirs_count
    viirs_median = (df["viirs_median"] * df["viirs_count"]).sum() / viirs_count
    return pd.DataFrame(
        {
            "viirs_sum": [viirs_sum],
            "viirs_count": [viirs_count],
            "viirs_mean": [viirs_mean],
            "viirs_median": [viirs_median],
        },
    )

In [14]:
for level in range(0, 3):
    print(level)
    viirs = pd.read_parquet(PROJ / f"tables/viirs_vcmsl_world_level{level}.parquet")
    viirs["year"] = viirs.date.dt.year
    viirs["month"] = viirs.date.dt.month
    viirs = viirs.drop(columns=["date", "min", "max", "stdDev"])
    viirs = viirs.rename(
        columns={x: f"viirs_{x}" for x in ["mean", "sum", "median", "count"]}
    )
    viirs = add_gadm_ids(viirs, level)
    # To make the groupby efficient, only aggregate those with dups
    id_cols_level = [f"GID_{x}" for x in range(level + 1)] + ["year", "month"]
    viirs_dup = viirs[viirs.duplicated(id_cols_level, keep=False)]
    viirs_dup_agg = viirs_dup.groupby(id_cols_level).apply(agg_viirs).reset_index()
    viirs_dup_agg = viirs_dup_agg.drop(
        columns=[x for x in viirs_dup_agg if x.startswith("level_")]
    )
    # Append dups back to original df
    viirs = viirs[~viirs.duplicated(id_cols_level, keep=False)].append(viirs_dup_agg)
    # Check if duplicates exist and warn if yes
    viirs = reorder_and_check_for_duplicates(viirs, id_cols_level)
    # Export
    viirs.to_parquet(
        DATA / f"intermediate/viirs_level_{level}_monthly.parquet",
        index=False,
    )

0
1


  viirs_mean = (df["viirs_mean"] * df["viirs_count"]).sum() / viirs_count
  viirs_median = (df["viirs_median"] * df["viirs_count"]).sum() / viirs_count


2


### Annual


In [15]:
convert_to_annual = (
    lambda df, id_col: df.groupby([pd.Grouper(key="date", freq="AS"), id_col])
    .mean()
    .reset_index()
)

In [16]:
# Read
viirs_2 = dd.read_parquet(PROJ / "tables/viirs_vcmsl_world_level2.parquet")
viirs_2_cntry = viirs_2[viirs_2.GID_2.str.startswith(cntry_selected_abbr)].compute()
viirs_2_cntry = convert_to_annual(viirs_2_cntry, "GID_2")
viirs_2_cntry.head()

Unnamed: 0,date,GID_2,mean,sum,median,stdDev,count,min,max
0,2014-01-01,COL.1.1_1,0.110212,632.687551,0.095657,0.064213,12239.583333,0.053963,1.32505
1,2014-01-01,COL.1.2_1,0.114649,2093.122687,0.100482,0.061478,25776.166667,0.053951,0.842272
2,2014-01-01,COL.1.3_1,0.135824,2599.870063,0.112501,0.082553,27307.916667,0.053994,1.205291
3,2014-01-01,COL.1.4_1,0.710867,712.109317,0.168952,2.034013,4104.583333,0.054192,21.233235
4,2014-01-01,COL.1.5_1,0.122466,1252.15334,0.106779,0.063444,15475.25,0.054161,0.617857


In [17]:
# Read
viirs_0 = pd.read_parquet(PROJ / "tables/viirs_vcmsl_world_level0.parquet")
viirs_0 = convert_to_annual(viirs_0, "GID_0")
viirs_0.head()

Unnamed: 0,date,GID_0,mean,sum,median,stdDev,count,min,max
0,2014-01-01,ABW,9.811129,3477.28283,7.393986,7.518052,388.0,3.081231,48.527639
1,2014-01-01,AFG,0.209336,510760.826644,0.1789,0.975774,2420530.0,0.090451,203.980865
2,2014-01-01,AGO,0.809596,358565.010883,0.393686,17.163215,840613.2,0.098184,5157.775736
3,2014-01-01,AIA,3.697973,1110.302671,3.131623,2.580392,362.75,0.28054,17.436017
4,2014-01-01,ALA,0.552814,3867.249845,0.239572,2.33388,10397.0,0.18689,62.198587


## DMSP-like trend data


In [74]:
def process_dmsp_ext():
    for admin_level in range(3):
        dmsp_ext = pd.read_csv(DATA / f"raw/remote_sensed/DMSP_ext/deduplicated/dmsp_ext_level{admin_level}.csv")
        # Clean varnames
        dmsp_ext = dmsp_ext.drop(columns=["Unnamed: 0"])
        id_cols_level = f"GID_{admin_level}"
        # Reshape to long
        dmsp_ext = dmsp_ext.melt(id_vars=id_cols_level, var_name="year_metric", value_name="metric_val")
        dmsp_ext[["year", "metric"]] = dmsp_ext.year_metric.str.split("_", n=1, expand=True)
        dmsp_ext["year"] = dmsp_ext["year"].astype(int)
        # Drop unnecessary cols
        dmsp_ext = dmsp_ext[~dmsp_ext.metric.isin(["min", "max", "stdDev"])]
        dmsp_ext["metric"] = "ntl_harmonised_ext_" + dmsp_ext["metric"]
        # Pivot back into shape
        dmsp_ext = dmsp_ext.pivot(index=[id_cols_level, "year"], columns="metric", values="metric_val").reset_index().rename_axis(None, axis=1)
        dmsp_ext = dmsp_ext.rename(columns={"ntl_harmonised_ext_weighted_mean": "ntl_harmonised_ext_mean"})
        # Add other identifiers in
        if admin_level == 1:
            dmsp_ext = dmsp_ext.merge(gadm_1, on=id_cols_level)
        elif admin_level==2:
            dmsp_ext = dmsp_ext.merge(gadm_2, on=id_cols_level)
        # Reorder and check for duplicates
        id_cols_level = [f"GID_{x}" for x in range(admin_level + 1)]
        dmsp_ext = reorder_and_check_for_duplicates(dmsp_ext, id_cols_level + ["year"])
        # Export
        dmsp_ext.to_parquet(DATA / f"intermediate/dmsp_ext_level_{admin_level}_yearly.parquet", index=False)

# process_dmsp_ext()

In [75]:
def process_dvnl():
    for admin_level in range(3):
        dvnl = pd.read_csv(DATA / f"raw/remote_sensed/DMSP_DVNL/deduplicated/dvnl_level{admin_level}.csv")
        # Clean varnames
        dvnl = dvnl.drop(columns=["Unnamed: 0"])
        dvnl = dvnl.drop(columns=[x for x in dvnl.columns if x.endswith("_y")])
        dvnl = dvnl.rename(columns={x: x[:-2] for x in dvnl.columns if x.endswith("_x")})
        id_cols_level = f"GID_{admin_level}"
        # Reshape to long
        dvnl = dvnl.melt(id_vars=id_cols_level, var_name="year_metric", value_name="metric_val")
        dvnl[["year", "metric"]] = dvnl.year_metric.str.split("_", n=1, expand=True)
        dvnl["year"] = dvnl["year"].astype(int)
        # Drop unnecessary cols
        dvnl = dvnl[~dvnl.metric.isin(["min", "max", "stdDev"])]
        dvnl["metric"] = "ntl_harmonised_dvnl_" + dvnl["metric"]
        # Pivot back into shape
        dvnl = dvnl.pivot(index=[id_cols_level, "year"], columns="metric", values="metric_val").reset_index().rename_axis(None, axis=1)
        dvnl = dvnl.rename(columns={"ntl_harmonised_dvnl_weighted_mean": "ntl_harmonised_dvnl_mean"})
        # Add other identifiers in
        if admin_level == 1:
            dvnl = dvnl.merge(gadm_1, on=id_cols_level)
        elif admin_level==2:
            dvnl = dvnl.merge(gadm_2, on=id_cols_level)
        # Reorder and check for duplicates
        id_cols_level = [f"GID_{x}" for x in range(admin_level + 1)]
        dvnl = reorder_and_check_for_duplicates(dvnl, id_cols_level + ["year"])
        # Export
        dvnl.to_parquet(DATA / f"intermediate/dvnl_level_{admin_level}_yearly.parquet", index=False)

# process_dvnl()

## Forest change


In [18]:
def prepare_forest_loss():
    # Process global forest change aggregations from Sarah
    forest = pd.read_csv(
        DATA / "raw/forest_change/GADM2toreshape.csv",
        dtype={"GID_4": str, "GID_5": str},
    )
    forest = forest.drop(columns=["Unnamed: 0"])
    # Read counts to aggregate means
    forest_count = pd.read_csv(
        DATA / "raw/forest_change/GADM2.csv",
        dtype={"GID_4": str, "GID_5": str},
    )
    forest_count = forest_count.drop(
        columns=["system:index", "NAME_1", "NAME_2", ".geo"]
    )
    forest_count = forest_count.rename(columns={"treecover2000": "count"})
    gid_cols = ["GID_0", "GID_1", "GID_2", "GID_3", "GID_4", "GID_5"]
    forest_count = forest_count.drop_duplicates(gid_cols)
    # Merge in counts
    forest = forest.merge(forest_count, on=gid_cols, how="left")
    # Aggregate sums and counts
    for admin_level in range(0, 3):
        print(admin_level)
        cols_to_keep = [x for x in list(forest.columns) if x.endswith("_sum")]
        cols_to_keep = [f"GID_{admin_level}", "count"] + cols_to_keep
        forest_level = forest[cols_to_keep].groupby([f"GID_{admin_level}"]).sum()
        # Reshape
        for year in range(2001, 2021):
            forest_level[f"{year}_count"] = forest_level["count"]
        #     forest_level[f"{year}_mean"] = forest_level[f"{year}_sum"] / count
        forest_level = forest_level.drop(columns="count")
        forest_level = (
            forest_level.stack().reset_index().rename(columns={0: "forest_loss"})
        )
        forest_level[["year", "metric"]] = forest_level["level_1"].str.split(
            "_", expand=True
        )
        forest_level = (
            forest_level.pivot(
                index=[f"GID_{admin_level}", "year"],
                columns="metric",
                values="forest_loss",
            )
            .reset_index()
            .rename_axis(None, axis=1)
        )
        forest_level = forest_level.rename(
            columns={"sum": "forest_loss_sum", "count": "forest_loss_count"}
        )
        forest_level["forest_loss_mean"] = (
            forest_level["forest_loss_sum"] / forest_level["forest_loss_count"]
        )
        forest_level = add_gadm_ids(forest_level, admin_level)
        # Make sure year is int
        forest_level["year"] = forest_level["year"].astype(int)
        # Check if duplicates exist and warn if yes
        id_cols_level = [f"GID_{x}" for x in range(admin_level + 1)] + ["year"]
        forest_level = reorder_and_check_for_duplicates(forest_level, id_cols_level)
        # Export
        forest_level.to_parquet(
            DATA / f"intermediate/forest_loss_level_{admin_level}_yearly.parquet",
            index=False,
        )


# prepare_forest_loss()

0
1
2


In [19]:
forest_2 = pd.read_parquet(
    DATA / f"intermediate/forest_loss_level_0_yearly.parquet"
)
forest_2.head()

Unnamed: 0,GID_0,year,forest_loss_count,forest_loss_sum,forest_loss_mean
0,ABW,2001,208333.0,0.0,0.0
1,ABW,2002,208333.0,4663.783,0.022386
2,ABW,2003,208333.0,5196.54,0.024943
3,ABW,2004,208333.0,0.0,0.0
4,ABW,2005,208333.0,0.0,0.0


## Accessibility


In [12]:
# Read access data
access_dict = {}
for level in range(3):
    dflist = []
    for x in ["cities", "large_cities", "medium_cities", "ports", "airports"]:
        df = pd.read_parquet(
            ROOT
            / f"proj/2021-08-11 - Accessibility/tables/time_to_cities/{x}_travel_time_level_{level}.parquet"
        )
        df = df[["date", f"GID_{level}", "median"]]
        df = df.rename(columns={"median": f"time_to_{x}_mins"})
        dflist.append(df)
    df_level = reduce(
        lambda left, right: pd.merge(
            left, right, on=["date", f"GID_{level}"], how="outer"
        ),
        dflist,
    )
    # Add GADM ids
    df_level = add_gadm_ids(df_level, level)
    # Check if duplicates exist and warn if yes
    id_cols_level = [f"GID_{x}" for x in range(level + 1)]
    df_level = reorder_and_check_for_duplicates(df_level, id_cols_level)
    # Fix date
    df_level["date"] = pd.to_datetime("2019-01-01")
    # Export
    access_dict[level] = df_level
    df_level.to_parquet(
        DATA / f"intermediate/access_level_{level}_cross_section.parquet",
        index=False,
    )
access_dict[1].head()



Unnamed: 0,GID_0,GID_1,date,time_to_cities_mins,time_to_large_cities_mins,time_to_medium_cities_mins,time_to_ports_mins,time_to_airports_mins
0,PSE,PSE.1_1,2019-01-01,88.849348,88.849348,88.849348,,
1,PSE,PSE.2_1,2019-01-01,20.756536,86.467379,61.521715,,
2,CIV,CIV.2_1,2019-01-01,66.124239,358.130309,125.995983,,
3,CIV,CIV.3_1,2019-01-01,48.996337,198.219897,198.219897,,
4,CIV,CIV.4_1,2019-01-01,242.498073,435.11964,258.970207,,


In [13]:
# Access from urban areas
urban_access_dict = {}
for level in range(3):
    # Read data
    df = pd.read_parquet(
        ROOT
        / f"proj/2021-08-11 - Accessibility/tables/time_to_cities/dest_travel_time_ghs_gadm_level_{level}.parquet"
    )
    # Convert back to mins
    df["w_median"] = df["w_median"] * 60
    # Explicit column names
    df["dest_type"] = "urban_time_to_" + df["dest_type"] + "_mins"
    df = (
        df.pivot(index=f"GID_{level}", columns="dest_type", values="w_median")
        .rename_axis(None, axis=1)
        .reset_index()
    )
    # Add date
    df["date"] = "2019-01-01"
    df["date"] = df["date"].astype("datetime64[ns]").dt.date
    # Add GADM ids
    df = add_gadm_ids(df, level)
    # Fix date
    df["date"] = pd.to_datetime("2019-01-01")
    # Check if duplicates exist and warn if yes
    id_cols_level = [f"GID_{x}" for x in range(level + 1)]
    df = reorder_and_check_for_duplicates(df, id_cols_level)
    # Export
    urban_access_dict[level] = df
    df.to_parquet(
        DATA / f"intermediate/urban_access_level_{level}_cross_section.parquet",
        index=False,
    )
urban_access_dict[1].head()

Unnamed: 0,GID_0,GID_1,urban_time_to_airports_mins,urban_time_to_large_cities_mins,urban_time_to_medium_cities_mins,urban_time_to_ports_mins,date
0,AFG,AFG.10_1,,392.714577,378.720111,,2019-01-01
1,AFG,AFG.11_1,,116.874328,45.617087,,2019-01-01
2,AFG,AFG.12_1,,41.157663,41.157663,,2019-01-01
3,AFG,AFG.13_1,,382.240754,98.629767,,2019-01-01
4,AFG,AFG.14_1,,6.845313,6.845313,,2019-01-01


## JR Data Dump


### Function definitions


In [87]:
def rename_gadm(df):
    df = df.rename(
        columns={x: x.replace("adm", "GID_") for x in ["adm0", "adm1", "adm2"]}
    )
    return df

In [88]:
def split_by_level(df, additional_ids: list, metric_cols_agg: dict, export=False):
    """
    Split JR's data into multiple levels
    """
    for level in range(3):
        print(level)
        id_cols_level = [f"GID_{x}" for x in range(level + 1)] + additional_ids
        # Aggregate
        df_level = df.groupby(id_cols_level).agg(metric_cols_agg).reset_index()
        # Check for dups
        df_level = reorder_and_check_for_duplicates(df_level, id_cols_level)
        # Export
        if export:
            if additional_ids == ["year", "month"]:
                freq = "monthly"
            elif additional_ids == ["year"]:
                freq = "yearly"
            df_level.to_parquet(
                DATA / f"intermediate/{export}_level_{level}_{freq}.parquet"
            )
    print(df_level.head())

In [89]:
def fix_identifiers(df):
    """Split GADM identifiers into adm0, adm1 and adm2"""
    id_vars = ["GID_0", "GID_1", "GID_2"]
    # Merge with GADM ids
    df = df.merge(gadm_2[id_vars], left_on="id", right_on="GID_2", how="inner")
    df = df.drop(columns="id")
    # Reorder
    df = df[id_vars + [col for col in df.columns if col not in id_vars]]
    return df

### Run aggregations


#### ACLED


In [121]:
# acled = pd.read_csv(DATA / "raw/remote_sensed_corrected/ACLED/ACLED_mun_monthly.csv")
# # Only keep adm2 as id
# acled = acled.drop(columns=["adm0", "adm1"])
# id_vars = [
#     "adm2",
#     "year",
#     "month",
# ]
# acled = acled.groupby(id_vars + ["event_type"])["count"].sum().reset_index()
# acled = (
#     acled.pivot(index=id_vars, columns="event_type", values="count")
#     .rename_axis(None, axis=1)
#     .fillna(0)
# )
# acled = (
#     acled.rename(columns={x: f"violence_{x}" for x in acled.columns})
#     .reset_index()
#     .sort_values(id_vars)
# )
# # Fill in implicitly missing values at the region level
# acled["yearmon"] = acled["year"].astype(str) + "-" + acled["month"].astype(str).str.zfill(2) + "-01"
# acled["yearmon"] = pd.to_datetime(df["yearmon"])
# def fill_in_acled(df):
#     print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#     print(df)
#     print("====================================================================")
#     daterange = list(pd.date_range(start=df["yearmon"].min(), end=df["yearmon"].max(), freq="MS"))
#     index = pd.MultiIndex.from_product([[adm], daterange], names=["adm2", "yearmon"])
#     df = df.set_index(["adm2", "yearmon"]).reindex(index, fill_value=0).sort_values("yearmon").reset_index(drop=True)
#     print("====================================================================")
#     print(df)
#     print("====================================================================")
#     return df

# acled = acled.groupby("adm2").apply(fill_in_acled)
# acled.head()


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
          adm2  year  month  violence_Battles  \
0   AFG.1.10_1  2018      9               1.0   
1   AFG.1.10_1  2019      5               1.0   
2   AFG.1.10_1  2019      7               3.0   
3   AFG.1.10_1  2019      9               1.0   
4   AFG.1.10_1  2020      2               2.0   
5   AFG.1.10_1  2020      3               1.0   
6   AFG.1.10_1  2020      6               1.0   
7   AFG.1.10_1  2020      8               2.0   
8   AFG.1.10_1  2020     10               1.0   
9   AFG.1.10_1  2020     11               1.0   
10  AFG.1.10_1  2021      1               1.0   

    violence_Explosions/Remote violence  violence_Protests  violence_Riots  \
0                                   0.0                0.0             0.0   
1                                   0.0                0.0             0.0   
2                                   0.0                0.0             0.0   
3                         

ValueError: cannot handle a non-unique multi-index!

In [130]:
# adm = list(acled.adm2.unique())[0]

# df.head()

Unnamed: 0,adm2,yearmon,year,month,violence_Battles,violence_Explosions/Remote violence,violence_Protests,violence_Riots,violence_Strategic developments,violence_Violence against civilians
0,AFG.1.10_1,2018-09-01,2018,9,1.0,0.0,0.0,0.0,0.0,0.0
1,AFG.1.10_1,2018-10-01,2019,5,1.0,0.0,0.0,0.0,0.0,0.0
2,AFG.1.10_1,2018-11-01,2019,7,3.0,0.0,0.0,0.0,0.0,0.0
3,AFG.1.10_1,2018-12-01,2019,9,1.0,0.0,0.0,0.0,0.0,0.0
4,AFG.1.10_1,2019-01-01,2020,2,2.0,0.0,1.0,0.0,0.0,0.0


In [131]:

# df = acled[acled.adm2==adm].copy()

# daterange = list(pd.date_range(start=df["yearmon"].min(), end=df["yearmon"].max(), freq="MS"))
# index = pd.MultiIndex.from_product([[adm], daterange], names=["adm2", "yearmon"])
# # df = df.set_index(["adm2", "yearmon"]).reindex(index, fill_value=0).reset_index()


In [92]:
# ACLED
def process_acled():
    acled = pd.read_csv(DATA / "raw/remote_sensed_corrected/ACLED/ACLED_mun_monthly.csv")
    # Only keep adm2 as id
    acled = acled.drop(columns=["adm0", "adm1"])
    id_vars = [
        "adm2",
        "year",
        "month",
    ]
    acled = acled.groupby(id_vars + ["event_type"])["count"].sum().reset_index()
    acled = (
        acled.pivot(index=id_vars, columns="event_type", values="count")
        .rename_axis(None, axis=1)
        .fillna(0)
    )
    acled = (
        acled.rename(columns={x: f"violence_{x}" for x in acled.columns})
        .reset_index()
        .sort_values(id_vars)
    )
    # Fill in implicitly missing values at the region level
    acled_adm_list = []
    adm_vals = acled["adm2"].unique()
    def fill_in_acled(df):
        year_vals = df["year"].unique()
        month_vals = range(1, 13)
        index = pd.MultiIndex.from_product([adm_vals, year_vals, month_vals], names=id_vars)
        acled = acled.set_index(id_vars).reindex(index, fill_value=0).reset_index()
    # Add nan for explicitly missing values
    acled[id_vars] = acled[id_vars].replace("Not available", np.nan)
    # Add other vars
    acled = acled.rename(columns={"adm2": "GID_2"})
    acled = acled.merge(gadm_2, on="GID_2")
    # Check for dups
    id_cols_level = [f"GID_{x}" for x in range(3)] + ["year", "month"]
    acled = reorder_and_check_for_duplicates(acled, id_cols_level)
    # Split and export
    metric_cols = [x for x in acled.columns if x.startswith("violence")]
    metric_cols_agg = {k: "sum" for k in metric_cols}
    split_by_level(
        acled,
        additional_ids=["year", "month"],
        metric_cols_agg=metric_cols_agg,
        export="acled",
    )


# process_acled()

0
1
2
  GID_0     GID_1       GID_2  year  month  violence_Battles  \
0   AFG  AFG.10_1  AFG.10.1_1  1997      1               0.0   
1   AFG  AFG.10_1  AFG.10.1_1  1997      2               0.0   
2   AFG  AFG.10_1  AFG.10.1_1  1997      3               0.0   
3   AFG  AFG.10_1  AFG.10.1_1  1997      4               0.0   
4   AFG  AFG.10_1  AFG.10.1_1  1997      5               0.0   

   violence_Explosions/Remote violence  violence_Protests  violence_Riots  \
0                                  0.0                0.0             0.0   
1                                  0.0                0.0             0.0   
2                                  0.0                0.0             0.0   
3                                  0.0                0.0             0.0   
4                                  0.0                0.0             0.0   

   violence_Strategic developments  violence_Violence against civilians  
0                              0.0                                  0.0 

#### GDELT


In [66]:
def process_gdelt():
    # Prepare GDELT files
    gdelt_files = list((DATA / "raw/gdelt_jr").glob("*.csv"))
    gdelt_files_df = pd.DataFrame({"filepath": gdelt_files, "filename": [x.stem for x in gdelt_files]})
    gdelt_files_df[["region", "freq"]] = gdelt_files_df.filename.str.replace(r"^GDELT_(.+)_new$", r"\1", regex=True).str.split("_", expand=True)
    gdelt_files_df["level"] = gdelt_files_df["region"].replace({"country": 0, "edo": 1, "mun": 2})
    gdelt_files_df.head()

    # Aggregate GDELT
    for i, row in tqdm(gdelt_files_df.iterrows(), total=len(gdelt_files_df)):
        gdelt_df = pd.read_csv(row.filepath)
        # Rename time columns
        if row.freq=="yearly":
            gdelt_df = gdelt_df.rename(columns={"time_value": "year"})
            additional_cols = ["year"]
        elif row.freq=="monthly":
            gdelt_df["year"] = gdelt_df.time_value.astype(str).str[:4].astype(int)
            gdelt_df["month"] = gdelt_df.time_value.astype(str).str[-2:].astype(int)
            gdelt_df = gdelt_df.drop(columns="time_value")
            additional_cols = ["year", "month"]
        # Rename other columns
        gdelt_df = gdelt_df.rename(columns={"gid": f"GID_{row.level}"})
        gdelt_df = gdelt_df.rename(columns={x: f"gdelt_{x}" for x in ["coercion", "protest"]})
        # Add other GID values if any
        if row.level==1:
            gdelt_df = gdelt_df.merge(gadm_1, on="GID_1")
        elif row.level==2:
            gdelt_df = gdelt_df.merge(gadm_2, on="GID_2")
        # Check if duplicates exist and warn if yes
        id_cols_level = [f"GID_{x}" for x in range(row.level + 1)] + additional_cols
        gdelt_df = reorder_and_check_for_duplicates(gdelt_df, id_cols_level)
        
        # Export
        gdelt_df.to_parquet(
            DATA
            / f"intermediate/gdelt_level_{row.level}_{row.freq}.parquet",
            index=False,
        )

100%|██████████| 6/6 [00:04<00:00,  1.41it/s]


#### GPCP


In [15]:
# Rain - GPCP
def process_gpcp():
    rain_gpcp = pd.read_csv(DATA / "raw/remote_sensed_corrected/gpcp/gpcp.csv")
    rain_gpcp = rain_gpcp.drop(columns=["intersection_area"])
    # Fix identifiers
    rain_gpcp = fix_identifiers(rain_gpcp)
    # Reshape
    id_vars = ["GID_0", "GID_1", "GID_2"]
    # Only consider average precipitation
    rain_gpcp = rain_gpcp[
        id_vars + [x for x in rain_gpcp.columns if x.startswith("average_precip_")]
    ]
    rain_gpcp = rain_gpcp.melt(
        id_vars=id_vars, var_name="yearmon", value_name="rain_gpcp"
    )
    # Only consider mean precipitation
    rain_gpcp["year"] = rain_gpcp["yearmon"].str[-6:-2].astype(int)
    rain_gpcp["month"] = rain_gpcp["yearmon"].str[-2:].astype(int)
    rain_gpcp = (
        rain_gpcp.drop(columns=["yearmon"])
        .sort_values(id_vars + ["year", "month"])
        .reset_index(drop=True)
    )
    rain_gpcp = rain_gpcp[id_vars + ["year", "month"] + ["rain_gpcp"]]
    # Split and export
    metric_cols_agg = {"rain_gpcp": "mean"}
    split_by_level(
        rain_gpcp,
        additional_ids=["year", "month"],
        metric_cols_agg=metric_cols_agg,
        export="rain_gpcp",
    )


# process_gpcp()

0
1
2
  GID_0     GID_1       GID_2  year  month  rain_gpcp
0   AFG  AFG.10_1  AFG.10.1_1  1979      1   2.153200
1   AFG  AFG.10_1  AFG.10.1_1  1979      2   0.579116
2   AFG  AFG.10_1  AFG.10.1_1  1979      3   1.807815
3   AFG  AFG.10_1  AFG.10.1_1  1979      4   0.049770
4   AFG  AFG.10_1  AFG.10.1_1  1979      5   0.359804


#### DMSP


In [16]:
# Nightlights
def process_dmsp():
    dmsp = pd.read_csv(DATA / "raw/remote_sensed_corrected/nightlights/nightlights.csv")
    dmsp = dmsp.drop(columns="intersection_area")
    dmsp = dmsp.rename(columns={"value": "nightlights_dmsp"})
    dmsp = rename_gadm(dmsp)
    # Split and export
    metric_cols_agg = {"nightlights_dmsp": "mean"}
    split_by_level(
        dmsp,
        additional_ids=["year"],
        metric_cols_agg=metric_cols_agg,
        export="dmsp",
    )


# process_dmsp()

0
1
2
  GID_0 GID_1 GID_2  year  nightlights_dmsp
0   ABW   ABW   ABW  1992            7823.0
1   ABW   ABW   ABW  1993            8076.0
2   ABW   ABW   ABW  1994            8607.0
3   ABW   ABW   ABW  1995            8886.0
4   ABW   ABW   ABW  1996            8591.0


#### Elevation


In [17]:
# Elevation
def process_elevation():
    elevation = pd.read_csv(DATA / "raw/remote_sensed_corrected/elevation/elevation.csv")
    elevation = elevation.drop(columns="intersection_area").rename(
        columns={"value": "elevation"}
    )
    elevation = fix_identifiers(elevation)
    elevation["year"] = 1996
    # Split and export
    metric_cols_agg = {"elevation": "mean"}
    split_by_level(
        elevation,
        additional_ids=["year"],
        metric_cols_agg=metric_cols_agg,
        export="elevation",
    )


# process_elevation()

0
1
2
  GID_0     GID_1       GID_2  year    elevation
0   AFG  AFG.10_1  AFG.10.1_1  1996  2754.222065
1   AFG  AFG.10_1  AFG.10.2_1  1996  3186.425143
2   AFG  AFG.10_1  AFG.10.3_1  1996  2642.572782
3   AFG  AFG.10_1  AFG.10.4_1  1996  2165.457552
4   AFG  AFG.10_1  AFG.10.5_1  1996  2682.581492


#### Ruggedness


In [29]:
# Ruggedness
def process_ruggedness():
    ruggedness = pd.read_csv(JR_DATA/ "ruggedness.csv")
    ruggedness = ruggedness.drop(columns=["intersection_area", "cellarea"]).rename(
        columns={"tri": "terrain_ruggedness_index"}
    )
    elevation = fix_identifiers(ruggedness)
    elevation["year"] = 2012
    # Split and export
    metric_cols_agg = {"slope": "mean", "terrain_ruggedness_index": "mean"}
    split_by_level(
        elevation,
        additional_ids=["year"],
        metric_cols_agg=metric_cols_agg,
        export="ruggedness",
    )

process_ruggedness()

0
1
2
  GID_0     GID_1       GID_2  year         slope  terrain_ruggedness_index
0   AFG  AFG.10_1  AFG.10.1_1  2012   8557.155437             291848.755339
1   AFG  AFG.10_1  AFG.10.2_1  2012  11008.951185             372732.049248
2   AFG  AFG.10_1  AFG.10.3_1  2012   9815.227268             336322.599480
3   AFG  AFG.10_1  AFG.10.4_1  2012  12100.534802             415293.840364
4   AFG  AFG.10_1  AFG.10.5_1  2012   8048.132153             274906.837092


#### Population


In [26]:
# Population
def process_population():
    pop = pd.read_csv(DATA / "raw/remote_sensed_corrected/population/population.csv")
    pop = pop.drop(columns="intersection_area")
    # Fix identifiers
    pop = fix_identifiers(pop)
    # Reshape
    id_vars = ["GID_0", "GID_1", "GID_2"]
    # Drop extra cols
    pop = pop.drop(
        columns=[
            x for x in pop.columns if x.startswith(("average_Count", "sum_Density"))
        ]
    )
    pop = pop.drop(columns=["n_grids"])
    pop = (
        pd.wide_to_long(
            df=pop,
            stubnames=["sum_Count", "average_Density"],
            i=id_vars,
            j="year",
            sep="_",
        )
        .reset_index()
        .rename(
            columns={
                "sum_Count": "population_count",
                "average_Density": "population_density",
            }
        )
    )
    pop = pop.sort_values(id_vars + ["year"]).reset_index(drop=True)
    # Split and export
    metric_cols_agg = {"population_count": "sum"}
    split_by_level(
        pop,
        additional_ids=["year"],
        metric_cols_agg=metric_cols_agg,
        export="population",
    )


# process_population()

0
1
2
  GID_0     GID_1       GID_2  year  population_count
0   AFG  AFG.10_1  AFG.10.1_1  2000          0.000000
1   AFG  AFG.10_1  AFG.10.1_1  2005     163473.081512
2   AFG  AFG.10_1  AFG.10.1_1  2010     187808.197848
3   AFG  AFG.10_1  AFG.10.1_1  2015     215773.896657
4   AFG  AFG.10_1  AFG.10.1_1  2020     247912.926893


#### Temperature


In [16]:
# Temperature
def process_temperature():
    temperature = pd.read_csv(DATA / "raw/remote_sensed_corrected/temperature/temperature.csv")
    temperature = temperature.drop(columns="intersection_area")
    # Fix identifiers
    temperature = fix_identifiers(temperature)
    # Only keep post-1990 monthly values
    id_vars = ["GID_0", "GID_1", "GID_2"]
    cols = pd.Series([x for x in temperature.columns if x not in id_vars])
    cols_expanded = pd.DataFrame({"cols": cols})
    cols_expanded[["aggtype", "type", "yearmon"]] = cols_expanded.cols.str.split(
        "_", expand=True
    )
    cols_expanded = cols_expanded[cols_expanded["yearmon"].notnull()]
    cols_expanded["year"] = cols_expanded["yearmon"].str[0:4].astype(int)
    drop_cols = cols_expanded[
        (cols_expanded["type"] == "stn")
        | (cols_expanded["aggtype"] == "sum")
        | (cols_expanded["year"] < 1990)
        | (cols_expanded["yearmon"].str.contains("\."))
    ]
    temperature = temperature.drop(columns=drop_cols.cols)
    temperature = temperature.drop(columns="n_grids")
    # Reshape
    temperature = temperature.melt(
        id_vars=id_vars, var_name="yearmon", value_name="temperature"
    )
    temperature["year"] = temperature["yearmon"].str[-6:-2].astype(int)
    temperature["month"] = temperature["yearmon"].str[-2:].astype(int)
    temperature = (
        temperature.drop(columns=["yearmon"])
        .sort_values(id_vars + ["year", "month"])
        .reset_index(drop=True)
    )
    temperature = temperature[id_vars + ["year", "month"] + ["temperature"]]
    # Split and export
    metric_cols_agg = {"temperature": "mean"}
    split_by_level(
        temperature,
        additional_ids=["year", "month"],
        metric_cols_agg=metric_cols_agg,
        export="temperature",
    )


# process_temperature()

0
1
2
  GID_0     GID_1       GID_2  year  month  temperature
0   AFG  AFG.10_1  AFG.10.1_1  1990      1    -5.066602
1   AFG  AFG.10_1  AFG.10.1_1  1990      2    -3.249969
2   AFG  AFG.10_1  AFG.10.1_1  1990      3     1.001768
3   AFG  AFG.10_1  AFG.10.1_1  1990      4     5.815816
4   AFG  AFG.10_1  AFG.10.1_1  1990      5    13.402400


#### FAO


In [23]:
# FAO
def process_fao():
    fao = pd.read_csv(DATA / "raw/remote_sensed_corrected/all_FAO/all_FAO.csv")
    fao = fao.drop(columns="intersection_area")
    # Fix identifiers
    fao = fix_identifiers(fao)
    fao.to_parquet(
        DATA / "intermediate/fao_cross_section.parquet", index=False
    )
    # Selected FAO
    # Select aggregate cols
    id_vars = ["GID_0", "GID_1", "GID_2"]
    selected_cols = [
        "ActualProductionValue_All_Crops_2000",
        "ActualProductionValue_All_Crops_2010",
        "ActualProductionYield_All_2000",
        "ActualProductionYield_All_2010",
        "AggregateYieldAchievementRatio_All_Crops_2000",
        "AggregateYieldAchievementRatio_All_Crops_2010",
    ]
    fao_selected = fao[id_vars + selected_cols].copy()
    stubs = {
        "ActualProductionValue_All_Crops": "fao_crop_production_value",
        "ActualProductionYield_All": "fao_crop_production_yield",
        "AggregateYieldAchievementRatio_All_Crops": "fao_crop_yield_achievement_ratio",
    }

    fao_selected = pd.wide_to_long(
        fao_selected, stubnames=stubs.keys(), i=id_vars, j="year", sep="_"
    ).reset_index()
    fao_selected = fao_selected.rename(columns=stubs)
    # Aggregate
    metric_cols_agg = {
        "fao_crop_production_value": "sum",
        "fao_crop_production_yield": "mean",
        "fao_crop_yield_achievement_ratio": "mean",
    }
    split_by_level(
        fao_selected,
        additional_ids=[
            "year",
        ],
        metric_cols_agg=metric_cols_agg,
        export="fao_selected",
    )


process_fao()

0
1
2
  GID_0     GID_1       GID_2  year  fao_crop_production_value  \
0   AFG  AFG.10_1  AFG.10.1_1  2000                  48.810312   
1   AFG  AFG.10_1  AFG.10.1_1  2010                  66.453074   
2   AFG  AFG.10_1  AFG.10.2_1  2000                  14.850281   
3   AFG  AFG.10_1  AFG.10.2_1  2010                  20.053469   
4   AFG  AFG.10_1  AFG.10.3_1  2000                  27.491401   

   fao_crop_production_yield  fao_crop_yield_achievement_ratio  
0                   2.417479                          2.319956  
1                   2.279900                          2.679670  
2                   0.501677                          1.742954  
3                   0.764499                          2.320466  
4                   1.299344                          0.632035  


# Combine everything


In [68]:
# Get file lists
files_dest = DATA / "intermediate/"
monthly = list(files_dest.glob("*monthly.parquet"))
yearly = list(files_dest.glob("*yearly.parquet"))
cross_section = list(files_dest.glob("*cross_section.parquet"))

In [69]:
def has_level(fname):
    """Whether the file is collected at each level or only at the deepest available level"""
    m = re.search(r"level_(\d)_", str(fname))
    return m is not None

## Monthly


In [77]:
# Bring together datasets
monthly_dict = {}
for level in range(3):
    # Aggregate without level data
    id_cols_level = [f"GID_{x}" for x in range(level + 1)] + ["year", "month"]
    # Prepare data with level
    monthly_dict[level] = reduce(
        lambda x, y: x.merge(y, on=id_cols_level, how="outer"),
        [pd.read_parquet(x) for x in monthly if f"level_{level}" in str(x)],
    )
monthly_dict[1].head()

Unnamed: 0,GID_0,GID_1,year,month,rain_gpcp,temperature,gdelt_protest,gdelt_coercion,violence_Battles,violence_Explosions/Remote violence,violence_Protests,violence_Riots,violence_Strategic developments,violence_Violence against civilians,viirs_mean,viirs_sum,viirs_median,viirs_count
0,AFG,AFG.10_1,1979,1,2.587696,,,,,,,,,,,,,
1,AFG,AFG.10_1,1979,2,1.085056,,,,,,,,,,,,,
2,AFG,AFG.10_1,1979,3,2.07838,,,,,,,,,,,,,
3,AFG,AFG.10_1,1979,4,0.016657,,,,,,,,,,,,,
4,AFG,AFG.10_1,1979,5,0.206155,,,,,,,,,,,,,


In [78]:
# Aggregate monthly to yearly
sum_cols = [
    "violence_Riots",
    "violence_Battles",
    "violence_Protests",
    "violence_Strategic developments",
    "violence_Violence against civilians",
    "violence_Explosions/Remote violence",
]
mean_cols = [
    "rain_gpcp",
    "temperature",
    "viirs_mean",
    "viirs_sum",
    "viirs_median",
    "viirs_count",
]

monthly_annualized = {}
for level, df in monthly_dict.items():
    print(level)
    # Aggregate monthly to yearly
    id_cols_level = [f"GID_{x}" for x in range(level + 1)] + ["year"]
    df_sum = df.groupby(id_cols_level)[sum_cols].sum(min_count=1)
    df_mean = df.groupby(id_cols_level)[mean_cols].mean()
    monthly_annualized[level] = df_sum.merge(
        df_mean, left_index=True, right_index=True, how="outer"
    ).reset_index()

monthly_annualized[1].head()

0
1
2


Unnamed: 0,GID_0,GID_1,year,violence_Riots,violence_Battles,violence_Protests,violence_Strategic developments,violence_Violence against civilians,violence_Explosions/Remote violence,rain_gpcp,temperature,viirs_mean,viirs_sum,viirs_median,viirs_count
0,AFG,AFG.10_1,1979,,,,,,,0.730008,,,,,
1,AFG,AFG.10_1,1980,,,,,,,0.987897,,,,,
2,AFG,AFG.10_1,1981,,,,,,,0.477647,,,,,
3,AFG,AFG.10_1,1982,,,,,,,0.900926,,,,,
4,AFG,AFG.10_1,1983,,,,,,,1.19982,,,,,


In [80]:
# Export
for level in range(3):
    monthly_dict[level].to_parquet(
        DATA
        / f"processed/imagery_aggregations/monthly_level_{level}.parquet",
        index=False,
    )

In [81]:
# Export
for level in range(3):
    monthly_annualized[level].to_parquet(
        DATA
        / f"processed/imagery_aggregations/monthly_annualized_level_{level}.parquet",
        index=False,
    )

## Yearly


In [76]:
yearly_dict = {}
for level in range(3):
    id_cols_level = [f"GID_{x}" for x in range(level + 1)] + ["year"]
    # Prepare data with level
    def read_df(x):
        df = pd.read_parquet(x)
        if df.year.dtype != "int":
            raise TypeError(f"year column is not int in df: {x}")
        return df

    df = reduce(
        lambda x, y: x.merge(y, on=id_cols_level, how="outer"),
        [read_df(x) for x in yearly if f"level_{level}" in str(x)],
    )
    # Merge
    df["year"] = pd.to_numeric(df["year"])
    yearly_dict[level] = df
yearly_dict[1].head()

Unnamed: 0,GID_0,GID_1,year,nightlights_dmsp,gdelt_coercion,gdelt_protest,fao_crop_production_value,fao_crop_production_yield,fao_crop_yield_achievement_ratio,ntl_harmonised_ext_count,...,ntl_harmonised_dvnl_count,ntl_harmonised_dvnl_sum,ntl_harmonised_dvnl_mean,slope,terrain_ruggedness_index,population_count,elevation,forest_loss_count,forest_loss_sum,forest_loss_mean
0,ABW,ABW,1992,7823.0,,,,,,,...,,,,,,,,,,
1,ABW,ABW,1993,8076.0,,,,,,,...,,,,,,,,,,
2,ABW,ABW,1994,8607.0,,,,,,,...,,,,,,,,,,
3,ABW,ABW,1995,8886.0,,,,,,,...,,,,,,,,,,
4,ABW,ABW,1996,8591.0,,,,,,,...,,,,,,,,,,


In [77]:
# Export
for level in range(3):
    yearly_dict[level].to_parquet(
        DATA
        / f"processed/imagery_aggregations/yearly_level_{level}.parquet",
        index=False,
    )

## Cross section


In [24]:
cross_section = [x for x in cross_section if "fao_cross_section" not in str(x)]

In [25]:
cs_dict = {}
for level in range(3):
    id_cols_level = [f"GID_{x}" for x in range(level + 1)] + ["year"]

    def read_df_with_level(x):
        df = pd.read_parquet(x)
        if "year" not in df.columns:
            df["year"] = df["date"].dt.year
            df = df.drop(columns="date")
        df = df.drop(columns=[x for x in df.columns if x.startswith("NAME_")])
        return df

    df = reduce(
        lambda x, y: x.merge(y, on=id_cols_level, how="outer"),
        [read_df_with_level(x) for x in cross_section if f"level_{level}" in str(x)],
    )
    # Merge
    cs_dict[level] = df
cs_dict[1].head()

Unnamed: 0,GID_0,GID_1,time_to_cities_mins,time_to_large_cities_mins,time_to_medium_cities_mins,time_to_ports_mins,time_to_airports_mins,year,urban_time_to_airports_mins,urban_time_to_large_cities_mins,urban_time_to_medium_cities_mins,urban_time_to_ports_mins
0,PSE,PSE.1_1,88.849348,88.849348,88.849348,,,2019,,87.742774,87.742774,
1,PSE,PSE.2_1,20.756536,86.467379,61.521715,,,2019,,35.582735,22.311492,
2,CIV,CIV.2_1,66.124239,358.130309,125.995983,,,2019,,340.209929,57.056629,
3,CIV,CIV.3_1,48.996337,198.219897,198.219897,,,2019,,159.224587,159.224587,
4,CIV,CIV.4_1,242.498073,435.11964,258.970207,,,2019,,,,


In [26]:
# Export
for level in range(3):
    cs_dict[level].to_parquet(
        DATA / f"
        / f"data/processed/imagery_aggregations/cross_section_level_{level}.parquet",
        index=False,
    )

## Combine


In [80]:
for level in range(3):
    print(level)
    monthly_df = pd.read_parquet(
        DATA /
        f"processed/imagery_aggregations/monthly_annualized_level_{level}.parquet"
    )
    yearly_df = pd.read_parquet(
        DATA /
        f"processed/imagery_aggregations/yearly_level_{level}.parquet"
    )
    cs_df = pd.read_parquet(
        DATA /
        f"processed/imagery_aggregations/cross_section_level_{level}.parquet"
    )
    # Merge
    id_cols_level = [f"GID_{x}" for x in range(level + 1)] + ["year"]
    annualized = monthly_df.merge(yearly_df, on=id_cols_level, how="outer")
    annualized = annualized.merge(cs_df, on=id_cols_level, how="outer")
    annualized.to_parquet(
        DATA /
        f"processed/imagery_aggregations/annualized_level_{level}.parquet",
        index=False,
    )
    annualized.to_stata(
        DATA /
        f"processed/imagery_aggregations/annualized_level_{level}.dta",
        write_index=False,
        variable_labels={k: k for k in annualized.columns},
    )
    annualized.to_csv(
        DATA /
        f"processed/imagery_aggregations/annualized_level_{level}.csv",
        index=False,
    )

0


/n/home10/shreyasgm/.conda/envs/cid/lib/python3.8/site-packages/pandas/io/stata.py:2491: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    violence_Strategic developments   ->   violence_Strategic_developments
    violence_Violence against civilians   ->   violence_Violence_against_civili
    violence_Explosions/Remote violence   ->   violence_Explosions_Remote_viole

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)



1


/n/home10/shreyasgm/.conda/envs/cid/lib/python3.8/site-packages/pandas/io/stata.py:2491: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    violence_Strategic developments   ->   violence_Strategic_developments
    violence_Violence against civilians   ->   violence_Violence_against_civili
    violence_Explosions/Remote violence   ->   violence_Explosions_Remote_viole

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)



2


/n/home10/shreyasgm/.conda/envs/cid/lib/python3.8/site-packages/pandas/io/stata.py:2491: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    violence_Strategic developments   ->   violence_Strategic_developments
    violence_Violence against civilians   ->   violence_Violence_against_civili
    violence_Explosions/Remote violence   ->   violence_Explosions_Remote_viole

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)



In [81]:
annualized.head()

Unnamed: 0,GID_0,GID_1,GID_2,year,violence_Riots,violence_Battles,violence_Protests,violence_Strategic developments,violence_Violence against civilians,violence_Explosions/Remote violence,...,ntl_harmonised_ext_mean,time_to_cities_mins,time_to_large_cities_mins,time_to_medium_cities_mins,time_to_ports_mins,time_to_airports_mins,urban_time_to_airports_mins,urban_time_to_large_cities_mins,urban_time_to_medium_cities_mins,urban_time_to_ports_mins
0,AFG,AFG.10_1,AFG.10.1_1,1979,,,,,,,...,,,,,,,,,,
1,AFG,AFG.10_1,AFG.10.1_1,1980,,,,,,,...,,,,,,,,,,
2,AFG,AFG.10_1,AFG.10.1_1,1981,,,,,,,...,,,,,,,,,,
3,AFG,AFG.10_1,AFG.10.1_1,1982,,,,,,,...,,,,,,,,,,
4,AFG,AFG.10_1,AFG.10.1_1,1983,,,,,,,...,,,,,,,,,,
