In [1]:
# Import functions
import os

import pandas as pd

from data_gathering import gather_data_actuals, gather_data_features
from data_preparation import preprocess_data

In [2]:
# STEP: LOAD DATA
# Both gather functions return five values; only the fifth (the all-years
# country-month frame) is needed here, the first four are discarded.

# 'Actuals' data
(_, _, _, _, data_cm_actual_allyears) = gather_data_actuals()

# Features data
(_, _, _, _, data_cm_features_allyears) = gather_data_features()

In [3]:
# Per-country summary of the fatalities column ('ged_sb').
# Both statistics come from the same grouping, so they share one index
# (sorted 'country_id') and can be combined position by position.
ged_sb_by_country = data_cm_features_allyears.groupby('country_id')['ged_sb']

# Share of rows per country where 'ged_sb' equals zero.
share_of_zeros = ged_sb_by_country.apply(lambda fatalities: (fatalities == 0).mean())

# Mean of 'ged_sb' per country.
average_fatalities = ged_sb_by_country.mean()

# One row per country with both statistics.
desired_data = pd.DataFrame(
    {
        'country_id': share_of_zeros.index,
        'share_structured_zeros': share_of_zeros.values,
        'avg_fatalities_per_month': average_fatalities.values,
    }
)

In [4]:
# Quartiles (0.25, 0.5, 0.75) of the per-country share of zeros in 'ged_sb'
# and of the per-country average of 'ged_sb'.
# NOTE(review): the quartiles are computed over every row of desired_data; no filter
# removing countries with share_structured_zeros == 1 is applied here, although the
# earlier wording of this comment said they were excluded — confirm which is intended.
quartile_levels = [0.25, 0.5, 0.75]
structured_zeros_quantiles = desired_data['share_structured_zeros'].quantile(quartile_levels)
avg_fatalities_quantiles = desired_data['avg_fatalities_per_month'].quantile(quartile_levels)

In [5]:
# All countries present in the actuals data
actual_countries = data_cm_actual_allyears['country_id'].unique()

# Countries with at least one row where 'ged_sb' is positive
has_fatalities = data_cm_features_allyears['ged_sb'] > 0
feature_countries_non_zero = data_cm_features_allyears.loc[has_fatalities, 'country_id'].unique()

# Countries whose average 'ged_sb' is strictly greater than the 0.75 quantile
# of the per-country averages
upper_quartile_avg_fatalities = avg_fatalities_quantiles[0.75]
above_upper_quartile = desired_data['avg_fatalities_per_month'] > upper_quartile_avg_fatalities
countries_over_quantile = desired_data.loc[above_upper_quartile, 'country_id'].unique()

In [6]:
# Work on sets so the overlaps can be taken with '&'
non_zero_country_set = set(feature_countries_non_zero)
actual_country_set = set(actual_countries)
over_quantile_country_set = set(countries_over_quantile)

# Countries with at least one conflict fatality that also appear in the actuals data
feature_and_actuals_countries_non_zero = list(non_zero_country_set & actual_country_set)

# Same overlap, additionally restricted to the countries in "countries_over_quantile"
feature_and_actuals_countries_non_zero_above_avg_quantile = list(
    non_zero_country_set & actual_country_set & over_quantile_country_set
)

In [7]:
# Set country filter (option 1): countries that appear in the actuals data and have
# at least one row with ged_sb > 0 in the features data.
# NOTE(review): the next cell reassigns country_filter, so on a top-to-bottom run this
# value is overwritten before anything reads it.
country_filter = feature_and_actuals_countries_non_zero

In [10]:
# Set country filter (option 2): countries in the actuals data with at least one row
# with ged_sb > 0, further restricted to those whose average ged_sb is above the
# 0.75 quantile of the per-country averages.
# This assignment replaces the value set in the previous cell and is the one the
# feature-selection step below reads on a top-to-bottom run.
country_filter = feature_and_actuals_countries_non_zero_above_avg_quantile

In [11]:
# STEP: FEATURE SELECTION AND STANDARDIZATION
# 'covariates' is left empty; the ged_sb-based columns are handed to
# preprocess_data as 'lagged_covariates'.
covariates = []
lagged_covariates = [
    'ged_sb',
    'decay_ged_sb_5',
    'decay_ged_sb_100',
    'decay_ged_sb_500',
    'ged_sb_tsum_24',
]

# Keep only the rows whose country_id is in the currently active country_filter,
# then run the project preprocessing with standardization switched on.
in_country_filter = data_cm_features_allyears['country_id'].isin(country_filter)
filtered_data = preprocess_data(
    data_cm_features_allyears[in_country_filter],
    covariates,
    lagged_covariates,
    standardize=True,
)

In [12]:
# Export data
# The destination used to be a fixed, user-specific absolute path, which only works on
# one machine. It can now be overridden with the CM_FEATURES_OUTPUT_PATH environment
# variable; when that variable is unset the original path is used, so existing runs
# write to exactly the same file as before.
output_path = os.environ.get(
    'CM_FEATURES_OUTPUT_PATH',
    r'C:\Users\Uwe Drauz\Documents\bachelor_thesis_local\personal_competition_data\data\cm_features_allyears_feature_set2.parquet',
)
filtered_data.to_parquet(output_path)