# Download subsets of UCDP GED or UCDP Candidate data

**NOTE: Requires a certificate for access to the VIEWS database.**

This notebook allows you to extract and download a subset of UCDP GED and UCDP Candidate data from the VIEWS database, aggregated to the levels of analysis used in VIEWS (country-month and PRIO-GRID-month).

If you change the data query, please rename the queryset (the first argument passed to `Queryset(...)`) using a descriptive label.



In [1]:
# Basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
# sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
# Views 3
from viewser.operations import fetch
from viewser import Queryset, Column
import views_runs
from views_partitioning import data_partitioner, legacy
from stepshift import views
import views_dataviz
# Extra functions
from views_mapper2.label_writer import *
import os
# Resolve the current user's Desktop folder; the CSV exports below are written there
home = os.path.expanduser('~')
desktop = '/'.join([home, 'Desktop'])

In [6]:
# Functions to easily obtain month_ids and country_ids

def name2iso(i):
    """Look up the alpha-3 country code (pycountry's ``alpha_3``) for a country name.

    Parameters
    ----------
    i : str or str-convertible
        Country name; it is converted with ``str()`` and matched against
        pycountry's ``name`` field.

    Returns
    -------
    str
        The ``alpha_3`` code on an exact name match. Otherwise the text
        'no match, did you mean <first fuzzy match>' when the fuzzy search
        returns a candidate, or 'check_spelling' when the fuzzy search raises
        ``LookupError``.
    """
    # Keep the coerced value so non-string inputs are looked up by their text
    i = str(i)
    try:
        output = pycountry.countries.get(name=i).alpha_3
    except AttributeError:
        # The lookup result has no .alpha_3 (presumably get() returned None):
        # fall back to a fuzzy search and suggest its first hit
        try:
            output = 'no match, did you mean' + ' ' + str(pycountry.countries.search_fuzzy(i)[0])
        except LookupError:
            output = 'check_spelling'
    return output

# Use for quick checks in the cells below. Note that ingester does a better job of matching
# country ids based on year for countries that change their ids over time, e.g. Ethiopia.
def name2id(i):
    """Look up the VIEWS country id for a country name.

    The name is resolved to its ``alpha_3`` code through pycountry, and that
    code is passed to ``Country.from_iso`` to obtain the ``id``.

    Parameters
    ----------
    i : str or str-convertible
        Country name; it is converted with ``str()`` and matched against
        pycountry's ``name`` field.

    Returns
    -------
    int or str
        ``Country.from_iso(<alpha_3>).id`` on an exact name match (presumably
        an integer, e.g. 237 for 'Kenya'). Otherwise the text
        'no match, did you mean <first fuzzy match>' when the fuzzy search
        returns a candidate, or 'check_spelling' when the fuzzy search raises
        ``LookupError``.
    """
    # Keep the coerced value so non-string inputs are looked up by their text
    i = str(i)
    try:
        output = Country.from_iso(pycountry.countries.get(name=i).alpha_3).id
    except AttributeError:
        # The lookup result has no .alpha_3 (presumably get() returned None):
        # fall back to a fuzzy search and suggest its first hit
        try:
            output = 'no match, did you mean' + ' ' + str(pycountry.countries.search_fuzzy(i)[0])
        except LookupError:
            output = 'check_spelling'
    return output

def vid2date(i):
    """Translate a VIEWS month id into a 'year/month' string.

    Parameters
    ----------
    i : int
        Month id accepted by ``ViewsMonth``.

    Returns
    -------
    str
        ``str(year) + '/' + str(month)``; the month is not zero-padded
        (e.g. '2017/1').
    """
    # Build the ViewsMonth once and read both fields from it
    views_month = ViewsMonth(i)
    return str(views_month.year) + '/' + str(views_month.month)

In [2]:
# !viewser tables list

In [3]:
# !viewser tables show 'ged2_cm'      ## Will show all variables in the chosen table


## Find the country_ids and month_ids of interest

In [8]:
# Look up the country_id for the chosen country (change the name here)

kenya_lookup = name2id('Kenya')
print(kenya_lookup)

237


In [9]:
# Look up the month_ids that bound the chosen period (change year/month here)

period_start = ViewsMonth.from_year_month(year=2017, month=1)
period_end = ViewsMonth.from_year_month(year=2022, month=7)
print(period_start)
print(period_end)

ViewsMonth(id=445) #=> year:2017, month:1
ViewsMonth(id=511) #=> year:2022, month:7


## CM data

In [None]:
# Columns of the country-month queryset, as (output column, source table, source column)
cm_column_specs = [
    # Columns with translated month_ids and country_ids
    ('year', 'month', 'year_id'),
    ('month', 'month', 'month'),
    ('country_name', 'country', 'name'),
    # Non-logged and non-transformed sums of GED fatalities at the chosen level of analysis
    # (chain .transform.missing.fill() onto a Column to fill missing values)
    ("ged_best_sb", "ged2_cm", "ged_sb_best_sum_nokgi"),
    ("ged_best_os", "ged2_cm", "ged_os_best_sum_nokgi"),
    ("ged_best_ns", "ged2_cm", "ged_ns_best_sum_nokgi"),
]

# Build the queryset one column at a time, in the order listed above
ged_cm_data = Queryset("ALM_ged_cm", "country_month")
for cm_out_name, cm_table, cm_source in cm_column_specs:
    ged_cm_data = ged_cm_data.with_column(
        Column(cm_out_name, from_table = cm_table, from_column = cm_source)
    )

# Publish the queryset definition and fetch the resulting table
data = ged_cm_data.publish().fetch()

# Summarise the fetched table using its two index levels (0: time, 1: unit)
cm_time_index = data.index.get_level_values(0)
cm_unit_index = data.index.get_level_values(1)
print(f"A dataset with {len(data.columns)} columns, with "
      f"data between t {min(cm_time_index)} "
      f"and {max(cm_time_index)}. "
      f"({len(np.unique(cm_unit_index))} units)"
     )

In [None]:
# Change the time period and/or country of interest here
cm_row_filter = 'month_id >= 451 and month_id <= 511 and country_id == 237'
subset = data.query(cm_row_filter)
subset

In [None]:
# Change the name of the csv-file here
cm_csv_path = desktop+'/Kenya_ged_cm.csv'
subset.to_csv(cm_csv_path)

## PGM data

In [None]:
# PRIO-GRID-month queryset.
# It has its own name ("ALM_ged_pgm") so that it does not share the name of the
# country-month queryset "ALM_ged_cm" defined in the CM section.
ged_pgm_data = (Queryset("ALM_ged_pgm", "priogrid_month")

# Adds columns with translated month_ids and country_ids to the table

.with_column(Column('year', from_table = 'month', from_column = 'year_id'))
.with_column(Column('month', from_table = 'month', from_column = 'month'))
.with_column(Column('country_name', from_table = 'country', from_column = 'name'))

# Adds columns with non-logged and non-transformed sums of GED fatalities at the chosen
# level of analysis. They are read from the grid-level table "ged2_pgm" rather than the
# country-level table "ged2_cm", so each grid cell carries its own value.

.with_column(Column("ged_best_sb", from_table = "ged2_pgm", from_column = "ged_sb_best_sum_nokgi")
             #.transform.missing.fill()
            )
.with_column(Column("ged_best_os", from_table = "ged2_pgm", from_column = "ged_os_best_sum_nokgi")
             #.transform.missing.fill()
            )
.with_column(Column("ged_best_ns", from_table = "ged2_pgm", from_column = "ged_ns_best_sum_nokgi")
             #.transform.missing.fill()
            )
              )

# Publish the queryset definition and fetch the resulting table.
# NOTE: this rebinds `data`, which the CM section also assigns; re-run the CM fetch
# cell before re-running the CM subset cell.
data = ged_pgm_data.publish().fetch()

# Summarise the fetched table using its two index levels (0: time, 1: unit)
print(f"A dataset with {len(data.columns)} columns, with "
      f"data between t {min(data.index.get_level_values(0))} "
      f"and {max(data.index.get_level_values(0))}. "
      f"({len(np.unique(data.index.get_level_values(1)))} units)"
     )

In [None]:
# Step 1: keep only the months of interest (change time period here)
pgm_month_filter = 'month_id >= 451 and month_id <= 511'
kenya_pg_step1 = data.query(pgm_month_filter)

# Step 2: keep only the rows of the chosen country (change country of interest here)
pgm_in_country = kenya_pg_step1['country_name'] == 'Kenya'
kenya_pg = kenya_pg_step1[pgm_in_country]
kenya_pg

In [None]:
# Change the name of the file here
pgm_csv_path = desktop+'/Kenya_ged_pgm.csv'
kenya_pg.to_csv(pgm_csv_path)