In [1]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
# Views 3
from viewser.operations import fetch
from viewser import Queryset, Column
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views

In [7]:
def get_cm_querysets():
    """Build, publish and fetch the "fatalities002_baseline" country-month queryset.

    The queryset is assembled one column at a time, tagged with the
    "fatalities" theme, given a free-text description, published and then
    fetched. A one-line summary (column count, first/last value of index
    level 0, number of distinct values in index level 1) is printed.

    Note: a later cell in this notebook defines another function with the
    same name, which shadows this one once that cell has been run.

    Returns:
        The object returned by ``publish().fetch()``; the code below reads its
        ``columns`` and a two-level ``index`` (displayed in the notebook as
        month_id, country_id).
    """
    ged_table = "ged2_cm"
    sb_source = "ged_sb_best_sum_nokgi"
    os_source = "ged_os_best_sum_nokgi"

    queryset = Queryset("fatalities002_baseline", "country_month")

    # Target variable: ln-transformed sb counts with missing values filled.
    queryset = queryset.with_column(
        Column("ln_ged_sb_dep", from_table=ged_table, from_column=sb_source)
        .transform.ops.ln()
        .transform.missing.fill()
    )

    # Time lag 0 of the target: same source and transforms, used as a feature.
    queryset = queryset.with_column(
        Column("ln_ged_sb", from_table=ged_table, from_column=sb_source)
        .transform.ops.ln()
        .transform.missing.fill()
    )

    # Decay features: replace_na -> bool.gte(5) -> time_since -> decay(24).
    # NOTE(review): presumably "time since the count was last >= 5, decayed
    # with a 24-month parameter" - confirm against the viewser transform docs.
    queryset = queryset.with_column(
        Column("decay_ged_sb_5", from_table=ged_table, from_column=sb_source)
        .transform.missing.replace_na()
        .transform.bool.gte(5)
        .transform.temporal.time_since()
        .transform.temporal.decay(24)
        .transform.missing.replace_na()
    )
    queryset = queryset.with_column(
        Column("decay_ged_os_5", from_table=ged_table, from_column=os_source)
        .transform.missing.replace_na()
        .transform.bool.gte(5)
        .transform.temporal.time_since()
        .transform.temporal.decay(24)
        .transform.missing.replace_na()
    )

    # Same sb decay chain, followed by a country spatial lag.
    queryset = queryset.with_column(
        Column("splag_1_decay_ged_sb_5", from_table=ged_table, from_column=sb_source)
        .transform.missing.replace_na()
        .transform.bool.gte(5)
        .transform.temporal.time_since()
        .transform.temporal.decay(24)
        .transform.spatial.countrylag(1, 1, 0, 0)
        .transform.missing.replace_na()
    )

    # From the wdi_cy table: filled, lagged by 12 time steps, filled again.
    # NOTE(review): presumably total population (WDI SP.POP.TOTL) - confirm.
    queryset = queryset.with_column(
        Column("wdi_sp_pop_totl", from_table="wdi_cy", from_column="wdi_sp_pop_totl")
        .transform.missing.fill()
        .transform.temporal.tlag(12)
        .transform.missing.fill()
    )

    # The description text (including its whitespace) is kept exactly as it
    # was written in the original triple-quoted literal.
    description = (
        "Fatalities conflict history, cm level\n"
        "    \n"
        "                   Predicting ln(fatalities) using conflict predictors, ultrashort\n"
        "    \n"
        "                             "
    )
    queryset = queryset.with_theme("fatalities").describe(description)

    data = queryset.publish().fetch()

    time_ids = data.index.get_level_values(0)
    unit_ids = data.index.get_level_values(1)
    n_columns = len(data.columns)
    n_units = len(np.unique(unit_ids))
    print(f"fatalities002_baseline; "
          f"A dataset with {n_columns} columns, with "
          f"data between t {min(time_ids)} "
          f"and {max(time_ids)}. "
          f"({n_units} units)"
          )
    return data

In [8]:
# Publish the queryset and fetch its data; the function also prints a one-line summary.
df = get_cm_querysets()

 .    fatalities002_baseline; A dataset with 6 columns, with data between t 1 and 852. (213 units)


In [9]:
# Display the fetched frame (the output shows a month_id / country_id index).
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ln_ged_sb_dep,ln_ged_sb,wdi_sp_pop_totl,decay_ged_sb_5,decay_ged_os_5,splag_1_decay_ged_sb_5
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0.00000,0.00000,778176.0,0.000000,0.000000e+00,0.0
1,2,0.00000,0.00000,375112.0,0.000000,0.000000e+00,0.0
1,3,0.00000,0.00000,1127852.0,0.000000,0.000000e+00,0.0
1,4,0.00000,0.00000,15210443.0,0.000000,0.000000e+00,0.0
1,5,0.00000,0.00000,164905.0,0.000000,0.000000e+00,0.0
...,...,...,...,...,...,...,...
852,242,0.00000,0.00000,63588334.0,0.000044,3.051758e-05,0.0
852,243,0.00000,0.00000,37076584.0,0.000046,5.282816e-05,0.0
852,244,0.00000,0.00000,4614974.0,0.000069,9.586124e-10,0.0
852,245,6.23637,6.23637,45657202.0,0.000081,7.689948e-05,0.0


In [13]:
def get_cm_querysets(queryset_name: str = "fatalities003"):
    """Build, publish and fetch a country-month conflict-history queryset.

    The queryset is tagged with the "fatalities" theme, published under
    ``queryset_name`` and then fetched. A one-line summary (column count,
    first/last value of index level 0, number of distinct values in index
    level 1) is printed, labelled with the name that was actually published.

    NOTE(review): the original summary line was hard-coded to
    "fatalities003_month_to_annual" while the queryset was published as
    "fatalities003". The default below keeps the published name unchanged;
    confirm which name was intended, since ``publish()`` is called on
    whatever name is used here.

    Note: this redefinition shadows the earlier ``get_cm_querysets`` cell in
    this notebook once it has been run.

    Args:
        queryset_name: Name under which the queryset is published and which
            is reported in the printed summary.

    Returns:
        The object returned by ``publish().fetch()``; the code below reads its
        ``columns`` and a two-level ``index`` (displayed in the notebook as
        month_id, country_id).
    """

    qs_baseline = (Queryset(queryset_name, "country_month")

                   # target variable
                   .with_column(Column("ln_ged_sb_dep", from_table="ged2_cm", from_column="ged_sb_best_sum_nokgi")
                                .transform.ops.ln()
                                .transform.missing.fill()
                                )

                   # timelag 0 of target variable
                   .with_column(Column("ln_ged_sb", from_table="ged2_cm", from_column="ged_sb_best_sum_nokgi")
                                .transform.ops.ln()
                                .transform.missing.fill()
                                )
                   # Decay functions: replace_na -> bool.gte(5) -> time_since -> decay(24)
                   # sb
                   .with_column(Column("decay_ged_sb_5", from_table="ged2_cm", from_column="ged_sb_best_sum_nokgi")
                                .transform.missing.replace_na()
                                .transform.bool.gte(5)
                                .transform.temporal.time_since()
                                .transform.temporal.decay(24)
                                .transform.missing.replace_na()
                                )
                   # os
                   .with_column(Column("decay_ged_os_5", from_table="ged2_cm", from_column="ged_os_best_sum_nokgi")
                                .transform.missing.replace_na()
                                .transform.bool.gte(5)
                                .transform.temporal.time_since()
                                .transform.temporal.decay(24)
                                .transform.missing.replace_na()
                                )
                   # Spatial lag decay: the sb decay chain followed by a country lag
                   .with_column(Column("splag_1_decay_ged_sb_5", from_table="ged2_cm",
                                       from_column="ged_sb_best_sum_nokgi")
                                .transform.missing.replace_na()
                                .transform.bool.gte(5)
                                .transform.temporal.time_since()
                                .transform.temporal.decay(24)
                                .transform.spatial.countrylag(1, 1, 0, 0)
                                .transform.missing.replace_na()
                                )

                   # From the wdi_cy table: filled, lagged by 12 time steps, filled again
                   # NOTE(review): presumably total population (WDI SP.POP.TOTL) - confirm

                   .with_column(Column("wdi_sp_pop_totl", from_table="wdi_cy", from_column="wdi_sp_pop_totl")
                                .transform.missing.fill()
                                .transform.temporal.tlag(12)
                                .transform.missing.fill()
                                )

                   .with_theme("fatalities")
                   .describe("""Fatalities conflict history, cm level
    
                   Predicting ln(fatalities) using conflict predictors, ultrashort
    
                             """)
                   )

    data = qs_baseline.publish().fetch()

    # Label the summary with the published name so the log cannot drift from
    # the queryset that was actually fetched.
    print(f"{queryset_name}; "
          f"A dataset with {len(data.columns)} columns, with "
          f"data between t {min(data.index.get_level_values(0))} "
          f"and {max(data.index.get_level_values(0))}. "
          f"({len(np.unique(data.index.get_level_values(1)))} units)"
          )
    return data

In [14]:
# Calls the redefined get_cm_querysets and rebinds `df`; the frame fetched by
# the earlier cell is no longer reachable under this name.
df = get_cm_querysets()

 .    fatalities003_month_to_annual; A dataset with 6 columns, with data between t 1 and 852. (213 units)


In [15]:
# Display the re-fetched frame (the output shows a month_id / country_id index).
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ln_ged_sb_dep,ln_ged_sb,wdi_sp_pop_totl,decay_ged_sb_5,decay_ged_os_5,splag_1_decay_ged_sb_5
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0.00000,0.00000,778176.0,0.000000,0.000000e+00,0.0
1,2,0.00000,0.00000,375112.0,0.000000,0.000000e+00,0.0
1,3,0.00000,0.00000,1127852.0,0.000000,0.000000e+00,0.0
1,4,0.00000,0.00000,15210443.0,0.000000,0.000000e+00,0.0
1,5,0.00000,0.00000,164905.0,0.000000,0.000000e+00,0.0
...,...,...,...,...,...,...,...
852,242,0.00000,0.00000,63588334.0,0.000044,3.051758e-05,0.0
852,243,0.00000,0.00000,37076584.0,0.000046,5.282816e-05,0.0
852,244,0.00000,0.00000,4614974.0,0.000069,9.586124e-10,0.0
852,245,6.23637,6.23637,45657202.0,0.000081,7.689948e-05,0.0
