In [1]:
# Import functions
import pandas as pd
from data_gathering import gather_data_actuals, gather_data_features
from data_preparation import preprocess_data

In [2]:
# STEP: LOAD DATA
# gather_data_actuals() returns five objects; they are unpacked into the
# per-year names (2018-2021) and one all-years name.
(
    data_cm_actual_2018,
    data_cm_actual_2019,
    data_cm_actual_2020,
    data_cm_actual_2021,
    data_cm_actual_allyears,
) = gather_data_actuals()

# gather_data_features() also returns five objects; only the fifth one is kept,
# the first four are discarded.
(_, _, _, _, data_cm_features_allyears) = gather_data_features()

In [3]:
from mappings import map_date_to_month_id

In [4]:
# Last training month per forecast year: October of the preceding year.
end_training_2018 = map_date_to_month_id(2017, 10)
end_training_2019 = map_date_to_month_id(2018, 10)
end_training_2020 = map_date_to_month_id(2019, 10)
end_training_2021 = map_date_to_month_id(2020, 10)

# Last actuals month per forecast year: December of that year.
end_actuals_2018 = map_date_to_month_id(2018, 12)
end_actuals_2019 = map_date_to_month_id(2019, 12)
end_actuals_2020 = map_date_to_month_id(2020, 12)
end_actuals_2021 = map_date_to_month_id(2021, 12)

# "Composed" window: same training end as 2018, same actuals end as 2021.
end_training_composed = map_date_to_month_id(2017, 10)
end_actuals_composed = map_date_to_month_id(2021, 12)

Determine the quantiles of the share of structured zeros in fatalities and of the average fatalities, based on the training data of the respective year.
Export year-specific data for the previously determined countries up to the end of the respective actuals data.
Later, the adjacency matrix also needs to be year-specific
(it remains to be seen whether this makes a difference; changes in the set of countries are more likely for feature set 2 than for feature set 1).

In [5]:
# Extend the features data with the actuals for the months it does not cover.
# data_cm_features_allyears runs up to month_id 490 (2020-10), while
# data_cm_actual_allyears also contains month_id 491 (2020-11) and onwards.
last_feature_month_id = 490

# Restrict the features frame to its own months before appending. On a first
# run this changes nothing; on a re-run it drops the previously appended
# actuals rows, so they are not appended a second time (the cell reassigns
# data_cm_features_allyears and would otherwise grow on every execution).
features_up_to_cutoff = data_cm_features_allyears[
    data_cm_features_allyears['month_id'] <= last_feature_month_id
]
additional_actual_data = data_cm_actual_allyears[
    data_cm_actual_allyears['month_id'] > last_feature_month_id
]
data_cm_features_allyears = pd.concat([features_up_to_cutoff, additional_actual_data])

In [6]:
# Run configuration used by the cells below.
# Training window ends at the 2018 cut-off, actuals window at the 2021 cut-off.
end_training, end_actuals = end_training_2018, end_actuals_2021

# Both values are interpolated into the export file names
# (..._feature_set{feature_set}{year}); an empty `year` adds no suffix.
feature_set, year = 4, ""

# NOTE(review): actuals_data is not referenced by the other cells shown in this
# notebook - confirm whether it is still needed.
actuals_data = data_cm_actual_allyears

In [7]:
# Small set of linear predictors (8 columns).
small_set_lin_predictors = [
    'vdem_v2x_accountability',
    'vdem_v2xpe_exlecon',
    'vdem_v2xpe_exlpol',
    'wdi_sm_pop_netm',
    'wdi_sm_pop_refg_or',
    'wdi_ms_mil_xpnd_zs',
    'wdi_dt_oda_odat_pc_zs',
    'wdi_sp_pop_totl',
]

# Large set of linear predictors (16 columns).
large_set_lin_predictors = [
    'vdem_v2x_horacc',
    'vdem_v2x_veracc',
    'vdem_v2xnp_client',
    'vdem_v2x_divparctrl',
    'vdem_v2xpe_exlecon',
    'vdem_v2xpe_exlpol',
    'vdem_v2xpe_exlsocgr',
    'vdem_v2xpe_exlgeo',
    'vdem_v2xpe_exlgender',
    'vdem_v2x_libdem',
    'wdi_sm_pop_netm',
    'wdi_sm_pop_refg_or',
    'wdi_ms_mil_xpnd_zs',
    'wdi_dt_oda_odat_pc_zs',
    'wdi_sp_pop_totl',
    'wdi_sp_dyn_imrt_in',
]

In [8]:
# Per-country summary of 'ged_sb' over the training window
# (rows with month_id <= end_training).
training_rows = data_cm_features_allyears[data_cm_features_allyears['month_id'] <= end_training]
ged_sb_by_country = training_rows.groupby('country_id')['ged_sb']

# Share of training rows per country in which 'ged_sb' equals zero.
share_of_zeros = ged_sb_by_country.apply(lambda monthly: (monthly == 0).mean())

# Mean of 'ged_sb' per country over the training rows.
average_fatalities = ged_sb_by_country.mean()

# Put both statistics side by side, one row per country_id.
desired_data = pd.DataFrame(
    {
        'country_id': share_of_zeros.index,
        'share_structured_zeros': share_of_zeros.values,
        'avg_fatalities_per_month': average_fatalities.values,
    }
)

In [9]:
# Quartiles (0.25, 0.5, 0.75) of the per-country share of structured zeros and
# of the per-country average fatalities per month (fatalities = ged_sb).
# NOTE(review): the previous comment stated that countries with a structured
# zeros share of 1 are excluded, but no such filter is applied to desired_data
# here - confirm which behaviour is intended.
quartile_levels = [0.25, 0.5, 0.75]
structured_zeros_quantiles = desired_data['share_structured_zeros'].quantile(quartile_levels)
avg_fatalities_quantiles = desired_data['avg_fatalities_per_month'].quantile(quartile_levels)

In [10]:
# All countries that appear in the actuals up to end_actuals.
actuals_in_window = data_cm_actual_allyears['month_id'] <= end_actuals
actual_countries = data_cm_actual_allyears.loc[actuals_in_window, 'country_id'].unique()

# Countries with at least one row with ged_sb > 0 within the training window.
fatalities_in_training = (data_cm_features_allyears['ged_sb'] > 0) & (
    data_cm_features_allyears["month_id"] <= end_training
)
feature_countries_non_zero = data_cm_features_allyears.loc[fatalities_in_training, 'country_id'].unique()

# Countries whose average fatalities per month are strictly greater than the
# 0.75 quantile of that average across countries.
upper_quartile_avg_fatalities = avg_fatalities_quantiles[0.75]
above_upper_quartile = desired_data['avg_fatalities_per_month'] > upper_quartile_avg_fatalities
countries_over_quantile = desired_data.loc[above_upper_quartile, 'country_id'].unique()

In [11]:
# Build each set once and reuse it for both intersections.
non_zero_country_set = set(feature_countries_non_zero)
actual_country_set = set(actual_countries)

# Countries with at least one conflict fatality that are also in the actuals data.
feature_and_actuals_countries_non_zero = list(non_zero_country_set & actual_country_set)

# Same as above, additionally restricted to the countries in "countries_over_quantile".
feature_and_actuals_countries_non_zero_above_avg_quantile = list(
    non_zero_country_set & actual_country_set & set(countries_over_quantile)
)

In [12]:
# Set country filter: countries with at least one row with ged_sb > 0 in the
# training window that also appear in the actuals data. This list is used below
# to select the rows of data_cm_features_allyears that get preprocessed.
country_filter = feature_and_actuals_countries_non_zero

In [13]:
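# Alternative country filter (disabled): non-zero countries in the actuals data that are also above the 0.75 quantile of average fatalities per month
# country_filter = feature_and_actuals_countries_non_zero_above_avg_quantile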

In [14]:
# Set country filter to all countries
# country_filter = actual_countries

In [124]:
# STEP: FEATURE SELECTION AND STANDARDIZATION
# Variant: ged_sb-derived columns plus the small linear predictor set, all
# passed as lagged covariates; no unlagged covariates.
covariates = []
lagged_covariates = [
    'ged_sb',
    'decay_ged_sb_5',
    'decay_ged_sb_100',
    'decay_ged_sb_500',
    'ged_sb_tsum_24',
    *small_set_lin_predictors,
]

# Keep only the rows of the countries selected by country_filter, then preprocess.
in_country_filter = data_cm_features_allyears['country_id'].isin(country_filter)
filtered_data = preprocess_data(
    data_cm_features_allyears[in_country_filter],
    covariates,
    lagged_covariates,
    standardize=True,
    lags_needed=3,
    drop_na=True,
)

# The export is limited to months up to end_actuals.
filtered_data_export = filtered_data[filtered_data['month_id'] <= end_actuals]

  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['count

In [142]:
# STEP: FEATURE SELECTION AND STANDARDIZATION
# Variant: ged_sb-derived columns plus the large linear predictor set, all
# passed as lagged covariates; no unlagged covariates.
covariates = []
lagged_covariates = [
    'ged_sb',
    'decay_ged_sb_5',
    'decay_ged_sb_100',
    'decay_ged_sb_500',
    'ged_sb_tsum_24',
    *large_set_lin_predictors,
]

# Keep only the rows of the countries selected by country_filter, then preprocess.
in_country_filter = data_cm_features_allyears['country_id'].isin(country_filter)
filtered_data = preprocess_data(
    data_cm_features_allyears[in_country_filter],
    covariates,
    lagged_covariates,
    standardize=True,
    lags_needed=3,
    drop_na=True,
)

# The export is limited to months up to end_actuals.
filtered_data_export = filtered_data[filtered_data['month_id'] <= end_actuals]

  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['country_id'] == country_id).astype(int)
  df[new_column_name] = (df['count

In [17]:
# STEP: FEATURE SELECTION AND STANDARDIZATION
# Variant: only the ged_sb-derived columns as lagged covariates, no linear
# predictor set, and drop_na=False (the other variants pass drop_na=True).
covariates = []
lagged_covariates = [
    'ged_sb',
    'decay_ged_sb_5',
    'decay_ged_sb_100',
    'decay_ged_sb_500',
    'ged_sb_tsum_24',
]

# Keep only the rows of the countries selected by country_filter, then preprocess.
in_country_filter = data_cm_features_allyears['country_id'].isin(country_filter)
filtered_data = preprocess_data(
    data_cm_features_allyears[in_country_filter],
    covariates,
    lagged_covariates,
    standardize=True,
    lags_needed=3,
    drop_na=False,
)

# The export is limited to months up to end_actuals.
filtered_data_export = filtered_data[filtered_data['month_id'] <= end_actuals]

In [14]:
# Export the prepared data as CSV; feature_set and year are interpolated into
# the file name.
csv_export_path = fr'C:\Users\Uwe Drauz\Documents\bachelor_thesis_local\personal_competition_data\data\cm_features_allyears_feature_set{feature_set}{year}.csv'
filtered_data_export.to_csv(csv_export_path)

In [143]:
# Export the prepared data as parquet; feature_set and year are interpolated
# into the file name.
parquet_export_path = fr'C:\Users\Uwe Drauz\Documents\bachelor_thesis_local\personal_competition_data\data\cm_features_allyears_feature_set{feature_set}{year}.parquet'
filtered_data_export.to_parquet(parquet_export_path)

In [107]:
# Number of distinct country_id values in the actuals data up to end_actuals.
len(actual_countries)

191

In [144]:
# Number of distinct country_id values left in the preprocessed, filtered data.
filtered_data['country_id'].nunique()

94

In [0]:
# Read back the year-specific exports of feature set 1 (2018-2021). The files
# share one path prefix and differ only in the year suffix.
yearly_export_prefix = r'C:\Users\Uwe Drauz\Documents\bachelor_thesis_local\personal_competition_data\data\cm_features_allyears_feature_set1_'
data_2018, data_2019, data_2020, data_2021 = (
    pd.read_parquet(f'{yearly_export_prefix}{export_year}.parquet')
    for export_year in (2018, 2019, 2020, 2021)
)

In [109]:
# Read back the export of feature set 1 that has no year suffix.
composed_export_path = fr'C:\Users\Uwe Drauz\Documents\bachelor_thesis_local\personal_competition_data\data\cm_features_allyears_feature_set1.parquet'
data_composed = pd.read_parquet(composed_export_path)

In [15]:
# Persist the list of countries that are non-zero, present in the actuals data
# and above the 0.75 quantile of average fatalities, as a pickle file.
import pickle

country_list_pickle_path = fr'C:\Users\Uwe Drauz\Documents\bachelor_thesis_local\personal_competition_data\data\feature_and_actuals_countries_non_zero_above_avg_quantile.pkl'
with open(country_list_pickle_path, 'wb') as pickle_file:
    pickle.dump(feature_and_actuals_countries_non_zero_above_avg_quantile, pickle_file)