# Test ingested data for the monthly run

In [1]:
# VIEWS 3
from viewser import Queryset, Column
from views_mapper2.label_writer import *

## Prepare the query for a given time period and country

In [2]:
# Helper: turn a VIEWS month id into a 'year/month' label

def vid2date(i):
    """Return the year and month of VIEWS month id `i` as a 'year/month' string.

    Both parts are plain `str()` conversions, so the month is not zero-padded
    (month 1 is rendered as '1', not '01').
    """
    views_month = ViewsMonth(i)
    return '/'.join((str(views_month.year), str(views_month.month)))

In [3]:
# Pick a year and month below to print the corresponding `month_id`

lookup_month = ViewsMonth.from_year_month(year=2024, month=10)
print(lookup_month)

ViewsMonth(id=538) #=> year:2024, month:10


In [4]:
# Set period to query for (month_id bounds; both ends are included by the
# `>=` / `<=` filters used in the subset queries)

StartOfHistory = 539
EndOfHistory = 542

# Guard against an inverted period, which would silently yield empty subsets.
assert StartOfHistory <= EndOfHistory, "StartOfHistory must not be after EndOfHistory"

In [5]:
# Set country to query for

## Resolve the country name to its country ID
country_name = 'Sudan'
lookup_date = '2017-07-01' # The date here is irrelevant
country_id = name_date2cid(country_name, lookup_date)

## Create df with identifiers

**TODO:** Merge this identifiers df with the data dfs below for a better overview.

In [6]:
# Identifiers

# Build, publish and fetch in one chain so `identifiers_cm` is only ever bound
# to the fetched DataFrame, never to the intermediate Queryset object.
identifiers_cm = (
    Queryset("identifiers", "country_month")
    .with_column(Column('year', from_loa = "country_year", from_column = 'year_id'))
    .with_column(Column('month', from_loa = "month", from_column = 'month'))
    .with_column(Column('isoab', from_loa = "country", from_column = 'isoab'))
    .with_column(Column('country', from_loa = "country", from_column = 'name'))
    .publish()
    .fetch()
)

# Summarise the fetched frame: column count, range of index level 0 and number
# of distinct values in index level 1. pandas Index methods are used so the
# cell does not rely on `np`, which this notebook never imports explicitly.
print(f"A dataset with {len(identifiers_cm.columns)} columns, with "
      f"data between t {identifiers_cm.index.get_level_values(0).min()} "
      f"and {identifiers_cm.index.get_level_values(0).max()}. "
      f"({identifiers_cm.index.get_level_values(1).nunique()} units)"
     )

Queryset identifiers read successfully 
A dataset with 4 columns, with data between t 1 and 852. (213 units)


In [7]:
# Display the fetched identifiers frame as the cell output.
identifiers_cm

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month,isoab,country
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,1980,1,GUY,Guyana
1,2,1980,1,SUR,Suriname
1,3,1980,1,TTO,Trinidad and Tobago
1,4,1980,1,VEN,Venezuela
1,5,1980,1,WSM,Samoa
...,...,...,...,...,...
852,242,2050,12,TZA,Tanzania
852,243,2050,12,MAR,Morocco
852,244,2050,12,MRT,Mauritania
852,245,2050,12,SDN,Sudan


## Create df with UCDP data

In [18]:
# GED

# Build, publish and fetch in one chain so `ucdp_data_cm` is only ever bound
# to the fetched DataFrame, never to the intermediate Queryset object.
ucdp_data_cm = (
    Queryset("fatalities002_ucdp_cm", "country_month")
    .with_column(Column("ucdp_ged_sb_best_sum", from_loa = "country_month", from_column = "ged_sb_best_sum_nokgi"))
    .with_column(Column("ucdp_ged_os_best_sum", from_loa = "country_month", from_column = "ged_os_best_sum_nokgi"))
    .with_column(Column("ucdp_ged_ns_best_sum", from_loa = "country_month", from_column = "ged_ns_best_sum_nokgi"))
    .publish()
    .fetch()
)

# Summarise the fetched frame: column count, range of index level 0 and number
# of distinct values in index level 1. pandas Index methods are used so the
# cell does not rely on `np`, which this notebook never imports explicitly.
print(f"A dataset with {len(ucdp_data_cm.columns)} columns, with "
      f"data between t {ucdp_data_cm.index.get_level_values(0).min()} "
      f"and {ucdp_data_cm.index.get_level_values(0).max()}. "
      f"({ucdp_data_cm.index.get_level_values(1).nunique()} units)"
     )

Queryset fatalities002_ucdp_cm read successfully                                            
A dataset with 3 columns, with data between t 1 and 852. (213 units)


In [19]:
# Summary statistics for each fetched UCDP column.
ucdp_data_cm.describe()

Unnamed: 0,ucdp_ged_sb_best_sum,ucdp_ged_os_best_sum,ucdp_ged_ns_best_sum
count,158230.0,158230.0,158230.0
mean,14.839891,7.509979,2.49423
std,502.536534,1601.759971,63.267255
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,124427.0,629870.0,11133.0


In [21]:
# Restrict the UCDP frame to the chosen period and country

ged_filter = f'month_id >= {StartOfHistory} & month_id <= {EndOfHistory} & country_id == {country_id}'
ged_cm_subset = ucdp_data_cm.query(ged_filter)

ged_cm_subset # displays the subset

Unnamed: 0_level_0,Unnamed: 1_level_0,ucdp_ged_sb_best_sum,ucdp_ged_os_best_sum,ucdp_ged_ns_best_sum
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
539,245,565,170,0
540,245,499,159,2
541,245,554,135,120
542,245,0,0,0


## Create df with ACLED data

In [22]:
# ACLED

# Build, publish and fetch in one chain so `acled_data_cm` is only ever bound
# to the fetched DataFrame, never to the intermediate Queryset object.
acled_data_cm = (
    Queryset("fatalities002_acled_cm", "country_month")
    .with_column(Column("acled_sb_count", from_loa = "country_month", from_column = "acled_sb_count"))
    .with_column(Column("acled_sb_fat", from_loa = "country_month", from_column = "acled_sb_fat"))
    .with_column(Column("acled_os_fat", from_loa = "country_month", from_column = "acled_os_fat"))
    .with_column(Column("acled_ns_fat", from_loa = "country_month", from_column = "acled_ns_fat"))
    .publish()
    .fetch()
)

# Summarise the fetched frame: column count, range of index level 0 and number
# of distinct values in index level 1. pandas Index methods are used so the
# cell does not rely on `np`, which this notebook never imports explicitly.
print(f"A dataset with {len(acled_data_cm.columns)} columns, with "
      f"data between t {acled_data_cm.index.get_level_values(0).min()} "
      f"and {acled_data_cm.index.get_level_values(0).max()}. "
      f"({acled_data_cm.index.get_level_values(1).nunique()} units)"
     )

Queryset fatalities002_acled_cm read successfully                                            
A dataset with 4 columns, with data between t 1 and 852. (213 units)


In [23]:
# Display the fetched ACLED frame as the cell output.
acled_data_cm

Unnamed: 0_level_0,Unnamed: 1_level_0,acled_sb_count,acled_sb_fat,acled_os_fat,acled_ns_fat
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,,,,
1,2,,,,
1,3,,,,
1,4,,,,
1,5,,,,
...,...,...,...,...,...
852,242,,,,
852,243,,,,
852,244,,,,
852,245,,,,


In [24]:
# Summary statistics for each fetched ACLED column.
acled_data_cm.describe()

Unnamed: 0,acled_sb_count,acled_sb_fat,acled_os_fat,acled_ns_fat
count,64078.0,64078.0,64078.0,64078.0
mean,8.239848,18.494397,8.489778,4.511642
std,101.297426,288.386951,66.882121,39.231146
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,4782.0,39588.0,5237.0,2051.0


In [25]:
# Restrict the ACLED frame to the chosen period and country

acled_filter = f'month_id >= {StartOfHistory} & month_id <= {EndOfHistory} & country_id == {country_id}'
acled_cm_subset = acled_data_cm.query(acled_filter)

acled_cm_subset # displays the subset

Unnamed: 0_level_0,Unnamed: 1_level_0,acled_sb_count,acled_sb_fat,acled_os_fat,acled_ns_fat
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
539,245,151.0,682.0,759.0,70.0
540,245,211.0,1499.0,726.0,99.0
541,245,217.0,1165.0,586.0,144.0
542,245,88.0,515.0,347.0,54.0


## Create df with Topics data

In [26]:
# Topics (a single topic column is fetched as a spot check)

# Build, publish and fetch in one chain so `topics_data_cm` is only ever bound
# to the fetched DataFrame, never to the intermediate Queryset object.
topics_data_cm = (
    Queryset("fatalities002_topics_cm", "country_month")
    .with_column(Column("randomly_selected__column", from_loa = "country_month", from_column = "topic_ste_theta13"))
    .publish()
    .fetch()
)

# Summarise the fetched frame: column count, range of index level 0 and number
# of distinct values in index level 1. pandas Index methods are used so the
# cell does not rely on `np`, which this notebook never imports explicitly.
print(f"A dataset with {len(topics_data_cm.columns)} columns, with "
      f"data between t {topics_data_cm.index.get_level_values(0).min()} "
      f"and {topics_data_cm.index.get_level_values(0).max()}. "
      f"({topics_data_cm.index.get_level_values(1).nunique()} units)"
     )

Queryset fatalities002_topics_cm read successfully                                             
A dataset with 1 columns, with data between t 1 and 852. (213 units)


In [27]:
# Summary statistics for the fetched topic column.
topics_data_cm.describe()

Unnamed: 0,randomly_selected__column
count,69664.0
mean,0.099542
std,0.081543
min,0.000817
25%,0.039548
50%,0.078493
75%,0.136976
max,0.69378


In [28]:
# Restrict the topics frame to the chosen period and country

topics_filter = f'month_id >= {StartOfHistory} & month_id <= {EndOfHistory} & country_id == {country_id}'
topics_cm_subset = topics_data_cm.query(topics_filter)

topics_cm_subset # displays the subset

Unnamed: 0_level_0,Unnamed: 1_level_0,randomly_selected__column
month_id,country_id,Unnamed: 2_level_1
539,245,0.214122
540,245,0.178804
541,245,0.160913
542,245,
