# Test ingested data for the monthly run

In [None]:
# VIEWS 3
import numpy as np
import pandas as pd
from viewser import Queryset, Column
from views_mapper2.label_writer import *

## Prepare query for given time period and country

In [None]:
# Helper: turn a VIEWS month id into a "year/month" label

def vid2date(i):
    """Return the year and month of VIEWS month id `i` as a 'year/month' string.

    Both parts are rendered with plain `str()`, so no zero padding is applied
    to the month.
    """
    views_month = ViewsMonth(i)
    return "/".join((str(views_month.year), str(views_month.month)))

In [None]:
# Choose a year and month below to print the corresponding `month_id`

print(ViewsMonth.from_year_month(year=2024, month=10)) 

In [None]:
# Period to query.
# Monthly window, given as month_id values; both bounds are inclusive in the
# country-month .query() filters further down.
StartOfHistory, EndOfHistory = 539, 548

# Yearly window, used by the WDI (country-year) query; bounds are inclusive.
StartYear, EndYear = 2010, 2025

In [None]:
# Preview the country ID of the country to query for. This only displays the
# value as the cell output; `country_id` itself is assigned in the next cell.
name_date2cid('Nigeria','2017-07-01') # The date here is irrelevant

In [None]:

## Country name to country ID
# `country_id` is interpolated into the .query() filters of the sections below.
country_id = name_date2cid('Nigeria','2017-07-01') # The date here is irrelevant

## Create df with identifiers

In [None]:
# Identifiers: fetch the `year`, `month`, `isoab` and `country` columns for
# every country-month, to label the data frames built in the sections below.

identifiers_cm = (
    Queryset("identifiers", "country_month")
    .with_column(Column("year", from_loa="country_year", from_column="year_id"))
    .with_column(Column("month", from_loa="month", from_column="month"))
    .with_column(Column("isoab", from_loa="country", from_column="isoab"))
    .with_column(Column("country", from_loa="country", from_column="name"))
    .publish()
    .fetch()
)

# Sanity summary: column count, range of the first index level (time) and
# number of distinct values in the second index level (units).
print(
    f"A dataset with {len(identifiers_cm.columns)} columns, with "
    f"data between t {min(identifiers_cm.index.get_level_values(0))} "
    f"and {max(identifiers_cm.index.get_level_values(0))}. "
    f"({len(np.unique(identifiers_cm.index.get_level_values(1)))} units)"
)

In [None]:
# Display the fetched identifiers frame as the cell output
identifiers_cm

## Create df with UCDP data

In [None]:
# UCDP GED: only the `sb` column is fetched for this check.
ucdp_data_cm = (
    Queryset("fatalities002_ucdp_cm", "country_month")
    .with_column(
        Column(
            "ucdp_ged_sb_best_sum",
            from_loa="country_month",
            from_column="ged_sb_best_sum_nokgi",
        )
    )
    # The `os` and `ns` counterparts are left disabled; re-enable if needed:
    # .with_column(Column("ucdp_ged_os_best_sum", from_loa="country_month", from_column="ged_os_best_sum_nokgi"))
    # .with_column(Column("ucdp_ged_ns_best_sum", from_loa="country_month", from_column="ged_ns_best_sum_nokgi"))
    .publish()
    .fetch()
)

# Sanity summary: column count, range of the first index level (time) and
# number of distinct values in the second index level (units).
print(
    f"A dataset with {len(ucdp_data_cm.columns)} columns, with "
    f"data between t {min(ucdp_data_cm.index.get_level_values(0))} "
    f"and {max(ucdp_data_cm.index.get_level_values(0))}. "
    f"({len(np.unique(ucdp_data_cm.index.get_level_values(1)))} units)"
)


In [None]:
# Summary statistics of the fetched UCDP column(s), shown as the cell output
ucdp_data_cm.describe()


In [None]:
# Display the fetched UCDP frame as the cell output
ucdp_data_cm

In [None]:
# Attach the identifier columns to the UCDP data.
# Left join on the shared keys, so every row of the identifiers frame is kept.

cols = ["country_id", "month_id"]

ucdp_data_cm_w_identifiers = identifiers_cm.merge(ucdp_data_cm, how="left", on=cols)

ucdp_data_cm_w_identifiers

In [None]:
# Subset the merged UCDP GED frame to the chosen month_id window
# (StartOfHistory..EndOfHistory, both inclusive) and the chosen country

ged_cm_subset= ucdp_data_cm_w_identifiers.query(f'month_id >= {StartOfHistory} & month_id <= {EndOfHistory} & country_id == {country_id}')

ged_cm_subset # displays the subset

## Create df with ACLED data

In [None]:
# ACLED: every column keeps the name it has in the source table.
acled_data_cm = (
    Queryset("fatalities002_acled_cm", "country_month")
    .with_column(Column("acled_sb_count", from_loa="country_month", from_column="acled_sb_count"))
    .with_column(Column("acled_sb_fat", from_loa="country_month", from_column="acled_sb_fat"))
    .with_column(Column("acled_os_fat", from_loa="country_month", from_column="acled_os_fat"))
    .with_column(Column("acled_ns_fat", from_loa="country_month", from_column="acled_ns_fat"))
    .with_column(Column("acled_bat_gov_fat", from_loa="country_month", from_column="acled_bat_gov_fat"))
    .with_column(Column("acled_bat_reb_fat", from_loa="country_month", from_column="acled_bat_reb_fat"))
    .with_column(Column("acled_pr_count", from_loa="country_month", from_column="acled_pr_count"))
    .with_column(Column("acled_prx_count", from_loa="country_month", from_column="acled_prx_count"))
    .with_column(Column("acled_prx_fat", from_loa="country_month", from_column="acled_prx_fat"))
    .publish()
    .fetch()
)

# Sanity summary: column count, range of the first index level (time) and
# number of distinct values in the second index level (units).
print(
    f"A dataset with {len(acled_data_cm.columns)} columns, with "
    f"data between t {min(acled_data_cm.index.get_level_values(0))} "
    f"and {max(acled_data_cm.index.get_level_values(0))}. "
    f"({len(np.unique(acled_data_cm.index.get_level_values(1)))} units)"
)

In [None]:
# Display the fetched ACLED frame as the cell output
acled_data_cm

In [None]:
# Summary statistics of the fetched ACLED columns, shown as the cell output
acled_data_cm.describe()

In [None]:
# Attach the identifier columns to the ACLED data.
# Left join on the shared keys, so every row of the identifiers frame is kept.

cols = ["country_id", "month_id"]

acled_data_cm_w_identifiers = identifiers_cm.merge(acled_data_cm, how="left", on=cols)

acled_data_cm_w_identifiers

In [None]:
# Subset the merged ACLED frame to the chosen month_id window
# (StartOfHistory..EndOfHistory, both inclusive) and the chosen country

acled_cm_subset= acled_data_cm_w_identifiers.query(f'month_id >= {StartOfHistory} & month_id <= {EndOfHistory} & country_id == {country_id}')

acled_cm_subset # displays the subset

## Create df with Topics data

In [None]:
# Topics: a single column (`topic_ste_theta13`) is fetched as a spot check.
topics_data_cm = (
    Queryset("fatalities002_topics_cm", "country_month")
    .with_column(
        Column(
            "randomly_selected__column",
            from_loa="country_month",
            from_column="topic_ste_theta13",
        )
    )
    .publish()
    .fetch()
)

# Sanity summary: column count, range of the first index level (time) and
# number of distinct values in the second index level (units).
print(
    f"A dataset with {len(topics_data_cm.columns)} columns, with "
    f"data between t {min(topics_data_cm.index.get_level_values(0))} "
    f"and {max(topics_data_cm.index.get_level_values(0))}. "
    f"({len(np.unique(topics_data_cm.index.get_level_values(1)))} units)"
)

In [None]:
# Summary statistics of the fetched Topics column, shown as the cell output
topics_data_cm.describe()

In [None]:
# Attach the identifier columns to the Topics data.
# Left join on the shared keys, so every row of the identifiers frame is kept.

cols = ["country_id", "month_id"]

topics_data_cm_w_identifiers = identifiers_cm.merge(topics_data_cm, how="left", on=cols)

topics_data_cm_w_identifiers

In [None]:
# Subset the merged Topics frame to the chosen month_id window
# (StartOfHistory..EndOfHistory, both inclusive) and the chosen country

topics_cm_subset= topics_data_cm_w_identifiers.query(f'month_id >= {StartOfHistory} & month_id <= {EndOfHistory} & country_id == {country_id}')

topics_cm_subset # displays the subset

## Create df with V-Dem data

In [None]:
# V-Dem: queried at the country_year level (despite the `_cm` suffix).
# NOTE(review): the output column is labelled "vdem_v2x_libdem" but is read
# from "vdem_v2x_gender" — confirm which indicator is intended.
vdem_data_cm = (
    Queryset("fatalities002_vdem_cm", "country_year")
    .with_column(
        Column(
            "vdem_v2x_libdem",
            from_loa="country_year",
            from_column="vdem_v2x_gender",
        )
    )
    .publish()
    .fetch()
)

# Sanity summary: column count, range of the first index level (time) and
# number of distinct values in the second index level (units).
print(
    f"A dataset with {len(vdem_data_cm.columns)} columns, with "
    f"data between t {min(vdem_data_cm.index.get_level_values(0))} "
    f"and {max(vdem_data_cm.index.get_level_values(0))}. "
    f"({len(np.unique(vdem_data_cm.index.get_level_values(1)))} units)"
)

In [None]:
# Summary statistics of the fetched V-Dem column, shown as the cell output
vdem_data_cm.describe()

In [None]:
# Display the fetched V-Dem frame as the cell output
vdem_data_cm

In [None]:
# Align the key names with the identifiers frame: the yearly key arrives as
# `year_id`, whereas the identifiers frame (and the merge below) call it `year`.

vdem_data_cm = vdem_data_cm.reset_index()
vdem_data_cm = vdem_data_cm.rename(columns={"year_id": "year"})
vdem_data_cm = vdem_data_cm.set_index(["year", "country_id"])

In [None]:
# Display the V-Dem frame again to check the renamed `year` index level
vdem_data_cm


In [None]:
# Merge identifier df with data df
#
# The identifiers frame is keyed by the month_id / country_id index levels and
# carries `year` as a column, while the V-Dem frame is indexed by
# (year, country_id). pd.merge drops index levels that are not join keys, so
# merging the indexed frames directly would lose `month_id`. Both frames are
# therefore flattened first and the country-month index is restored afterwards.

cols = ["country_id", "year"]

vdem_data_cm_w_identifiers = pd.merge(
    identifiers_cm.reset_index(),
    vdem_data_cm.reset_index(),
    on=cols,
    how="left",  # keep every country-month, even without a V-Dem value
).set_index(["month_id", "country_id"])

vdem_data_cm_w_identifiers

In [None]:
# Query the data
#
# The subset gets its own name (matching `ged_cm_subset`, `acled_cm_subset`,
# ...) instead of overwriting `vdem_data_cm`. Overwriting meant that re-running
# this cell for another country or period filtered the already-filtered frame
# and silently returned an empty or incomplete result.
# NOTE(review): the year window is hard-coded here rather than taken from
# StartYear / EndYear — confirm this is intended.

vdem_cm_subset = vdem_data_cm.query(f'year >= 2023 & year <= 2025 & country_id == {country_id}')

vdem_cm_subset # displays the subset

## Create df with WDI data

In [None]:
# WDI: queried at the country_year level (despite the `_cm` suffix); each
# output column is the source column name prefixed with `wb_`.
wdi_data_cm = (
    Queryset("fatalities002_wdi_cm", "country_year")
    .with_column(Column("wb_wdi_ny_gdp_pcap_kd", from_loa="country_year", from_column="wdi_ny_gdp_pcap_kd"))
    .with_column(Column("wb_wdi_sp_dyn_le00_in", from_loa="country_year", from_column="wdi_sp_dyn_le00_in"))
    .with_column(Column("wb_wdi_se_prm_nenr", from_loa="country_year", from_column="wdi_se_prm_nenr"))
    .with_column(Column("wb_wdi_sp_pop_totl", from_loa="country_year", from_column="wdi_sp_pop_totl"))
    .with_column(Column("wb_wdi_sp_dyn_imrt_in", from_loa="country_year", from_column="wdi_sp_dyn_imrt_in"))
    .publish()
    .fetch()
)

# Sanity summary: column count, range of the first index level (time) and
# number of distinct values in the second index level (units).
print(
    f"A dataset with {len(wdi_data_cm.columns)} columns, with "
    f"data between t {min(wdi_data_cm.index.get_level_values(0))} "
    f"and {max(wdi_data_cm.index.get_level_values(0))}. "
    f"({len(np.unique(wdi_data_cm.index.get_level_values(1)))} units)"
)

In [None]:
# Summary statistics of the fetched WDI columns, shown as the cell output
wdi_data_cm.describe()

In [None]:
# Display the fetched WDI frame as the cell output
wdi_data_cm

In [None]:
# Align the key names with the identifiers frame: the yearly key arrives as
# `year_id`, whereas the identifiers frame (and the merge below) call it `year`.

wdi_data_cm = wdi_data_cm.reset_index()
wdi_data_cm = wdi_data_cm.rename(columns={"year_id": "year"})
wdi_data_cm = wdi_data_cm.set_index(["year", "country_id"])

In [None]:
# Merge identifier df with data df
#
# The identifiers frame is keyed by the month_id / country_id index levels and
# carries `year` as a column, while the WDI frame is indexed by
# (year, country_id). pd.merge drops index levels that are not join keys, so
# merging the indexed frames directly would lose `month_id`. Both frames are
# therefore flattened first and the country-month index is restored afterwards.

cols = ["country_id", "year"]

wdi_data_cm_w_identifiers = pd.merge(
    identifiers_cm.reset_index(),
    wdi_data_cm.reset_index(),
    on=cols,
    how="left",  # keep every country-month, even without a WDI value
).set_index(["month_id", "country_id"])

wdi_data_cm_w_identifiers

In [None]:
# Query the data
#
# The subset gets its own name (matching `ged_cm_subset`, `acled_cm_subset`,
# ...) instead of overwriting `wdi_data_cm`. Overwriting meant that re-running
# this cell for another country or period filtered the already-filtered frame
# and silently returned an empty or incomplete result.
# Both year bounds (StartYear..EndYear) are inclusive.

wdi_cm_subset = wdi_data_cm.query(f'year >= {StartYear} & year <= {EndYear} & country_id == {country_id}')

wdi_cm_subset # displays the subset

In [None]:
# Printed once execution reaches this final cell
print("All done!")