# Download subsets of GED data

This notebook allows you to extract and download a subset of UCDP-GED data aggregated to the levels of analysis used in VIEWS. 

If you change the data query, please rename the queryset (the first argument to `Queryset(...)`) using a descriptive label.

In [None]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
# Views 3
from viewser.operations import fetch
from viewser import Queryset, Column
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
import views_dataviz

import os
# Resolve the current user's home directory, then the Desktop folder inside it.
# The CSV exports further down in this notebook are written under `desktop`.
home = os.path.expanduser('~')
desktop = f'{home}/Desktop'

In [None]:
# !viewser tables list

In [None]:
# !viewser tables show 'ged2_cm'      ## will show all variables in that table


In [None]:
from views_mapper2.label_writer import *

In [None]:
# Look up the country_id for a country name at a given date; the returned id
# can be used in the country_id filter of the subset cell below.
name_date2cid('Kenya', '2017-07-01')

# CM (country-month) data

In [None]:
# Columns requested for the country-month queryset.
cm_columns = [
    # month_id and country_id translations
    Column('year', from_table='month', from_column='year_id'),
    Column('month', from_table='month', from_column='month'),
    Column('country_name', from_table='country', from_column='name'),
    # non-logged target variables: GED fatalities
    # (no transform is applied, so missing values are not filled)
    Column("ged_best_sb", from_table="ged2_cm", from_column="ged_sb_best_sum_nokgi"),
    Column("ged_best_os", from_table="ged2_cm", from_column="ged_os_best_sum_nokgi"),
    Column("ged_best_ns", from_table="ged2_cm", from_column="ged_ns_best_sum_nokgi"),
]

# Start from an empty queryset at the country_month level and attach the
# columns one by one, in the order listed above.
ged_data = Queryset("ALM_cm_ged_data_no_transforms", "country_month")
for cm_column in cm_columns:
    ged_data = ged_data.with_column(cm_column)

# publish() the queryset, then fetch() the resulting DataFrame.
data = ged_data.publish().fetch()

# Short summary of the fetched frame: index level 0 is reported as the time
# axis ("t") and index level 1 as the units.
time_values = data.index.get_level_values(0)
unit_values = data.index.get_level_values(1)
n_columns = len(data.columns)
n_units = np.unique(unit_values).size
print(f"A dataset with {n_columns} columns, with "
      f"data between t {time_values.min()} "
      f"and {time_values.max()}. "
      f"({n_units} units)"
     )

In [None]:
# Subset parameters -- change the time period and country of interest here.
# (name_date2cid(<country name>, <date>) returns the country_id for a country.)
FIRST_MONTH_ID = 451   # first month_id to keep (inclusive)
LAST_MONTH_ID = 511    # last month_id to keep (inclusive)
COUNTRY_ID = 237       # country_id to keep

# `@NAME` inside a query string refers to the Python variable NAME, so the
# filter below selects exactly the rows bounded by the constants above.
subset = data.query(
    'month_id >= @FIRST_MONTH_ID and month_id <= @LAST_MONTH_ID '
    'and country_id == @COUNTRY_ID'
)
subset

In [None]:
# Display the folder path that the CSV export cells write into.
desktop

In [None]:
# Write the country-month subset to a CSV file in the Desktop folder.
# Change the name of the file here.
cm_csv_path = desktop + '/Kenya_ged_cm.csv'
subset.to_csv(cm_csv_path)

# PGM (PRIO-GRID month) data

In [None]:
# Columns requested for the priogrid-month queryset.
pgm_columns = [
    # month_id and country_id translations
    Column('year', from_table='month', from_column='year_id'),
    Column('month', from_table='month', from_column='month'),
    Column('country_name', from_table='country', from_column='name'),
    # non-logged target variables: GED fatalities
    # (no transform is applied, so missing values are not filled)
    Column("ged_best_sb", from_table="ged2_pgm", from_column="ged_sb_best_sum_nokgi"),
    Column("ged_best_os", from_table="ged2_pgm", from_column="ged_os_best_sum_nokgi"),
    Column("ged_best_ns", from_table="ged2_pgm", from_column="ged_ns_best_sum_nokgi"),
]

# Start from an empty queryset at the priogrid_month level and attach the
# columns one by one, in the order listed above.
ged_data = Queryset("ALM_pgm_ged_data_no_transforms", "priogrid_month")
for pgm_column in pgm_columns:
    ged_data = ged_data.with_column(pgm_column)

# publish() the queryset, then fetch() the resulting DataFrame.
data = ged_data.publish().fetch()

# Short summary of the fetched frame: index level 0 is reported as the time
# axis ("t") and index level 1 as the units.
time_values = data.index.get_level_values(0)
unit_values = data.index.get_level_values(1)
n_columns = len(data.columns)
n_units = np.unique(unit_values).size
print(f"A dataset with {n_columns} columns, with "
      f"data between t {time_values.min()} "
      f"and {time_values.max()}. "
      f"({n_units} units)"
     )

In [None]:
# Step 1: restrict the fetched data to the time window.
# Change time period here.
kenya_pg_step1 = data.query('month_id >= 451 and month_id <= 511')

# Step 2: keep only the rows whose country_name matches the country of interest.
# Change country of interest here.
is_selected_country = kenya_pg_step1['country_name'].eq('Kenya')
kenya_pg = kenya_pg_step1.loc[is_selected_country]
kenya_pg

In [None]:
# Write the priogrid-month subset to a CSV file in the Desktop folder.
# Change the name of the file here.
pgm_csv_path = desktop + '/Kenya_ged_pgm.csv'
kenya_pg.to_csv(pgm_csv_path)