In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local `util` package imported below) are re-imported before each
# cell execution and code edits take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Analysis of Article Category Specific Features: Disasters by Deaths and GDP per capita

In [2]:
import pandas as pd
import numpy as np
import requests as rq
from util.prediction import transform_vars_for_regression

## Retrieving Deaths and Injuries from Wikidata

In [27]:
# Shortcut: load the already prepared disaster dataframe from disk. When this
# cell is used, the Wikidata retrieval in the next section can be skipped.
df_disasters = (
    pd.read_csv('data/events/df_disasters.csv.gz')
    .pipe(transform_vars_for_regression)
)

### Load data from WikiData
This step is not necessary if you load the prepared dataframe in the cell above.

In [3]:
# Read every crawled event, remove exact duplicate rows and the leftover
# index column written by an earlier to_csv call.
df_crawled = (
    pd.read_csv('data/events/all_events.csv.gz')
    .drop_duplicates()
    .drop(columns=['Unnamed: 0'])
)

# Keep only events categorised as disasters, then derive the regression variables.
is_disaster = df_crawled['cat'] == 'disaster'
df_disasters = transform_vars_for_regression(df_crawled[is_disaster])

# Retrieve disaster info from Wikidata
# One entity JSON document is downloaded per unique event id and stored in
# `dict_jsons`, keyed by that id.
WIKIDATA_ENTITY_URL = 'https://www.wikidata.org/wiki/Special:EntityData/{qid}.json'
REQUEST_TIMEOUT_S = 30  # without a timeout, requests may block indefinitely

dict_jsons = {}
# A single session reuses the underlying connection across all requests.
with rq.Session() as session:
    for qid in df_disasters.event_id.unique():
        response = session.get(WIKIDATA_ENTITY_URL.format(qid=qid), timeout=REQUEST_TIMEOUT_S)
        # Fail with the HTTP status (e.g. 404 / 429) instead of a confusing
        # JSON decoding error on a non-JSON error page.
        response.raise_for_status()
        dict_jsons[qid] = response.json()

# Extract PIDS
#  number of survivors (P1561) 
#  number of injured (P1339) 
#  number of deaths (P1120) 
def get_dataval(entity, property_id):
    """Return the integer quantity of the first statement of a property.

    Parameters
    ----------
    entity : dict
        A single Wikidata entity document (the value stored under
        ``entities`` in the EntityData JSON).
    property_id : str
        Property id to look up, e.g. ``'P1120'`` (number of deaths),
        ``'P1339'`` (number of injured) or ``'P1561'`` (number of survivors).

    Returns
    -------
    int or None
        The amount of the first statement as an integer. ``None`` when the
        entity has no claims, the property is absent, the property has no
        statements, or the statement carries no ``datavalue`` (unknown value).
        Non-integral amounts are truncated towards zero.
    """
    statements = entity.get('claims', {}).get(property_id)
    if not statements:
        return None

    mainsnak = statements[0]['mainsnak']
    if 'datavalue' not in mainsnak:
        return None  # Unknown

    amount = mainsnak['datavalue']['value']['amount']
    try:
        # int() accepts the leading '+' / '-' sign of the amount string directly.
        return int(amount)
    except ValueError:
        # Amounts may be written as decimal strings (e.g. '+12.0'), which
        # int() rejects; parse them as float first.
        return int(float(amount))

# Build one row per downloaded event: [event_id, survivors, injured, deaths].
casualty_pids = ('P1561', 'P1339', 'P1120')  # survivors, injured, deaths

pd_data = []
for event_id, payload in dict_jsons.items():
    # The entity is taken as the first value of the 'entities' mapping
    # rather than looked up by the requested id.
    record = list(payload['entities'].values())[0]
    pd_data.append([event_id] + [get_dataval(record, pid) for pid in casualty_pids])

df_casualties = pd.DataFrame(
    pd_data,
    columns=['event_id', 'survivors', 'injured', 'deaths'],
)

# Merge DF
# Values missing on Wikidata are replaced by 0 before joining on event_id.
casualties_filled = df_casualties.fillna(0)
df_disasters = transform_vars_for_regression(
    df_disasters.merge(casualties_filled, on='event_id')
)

# Derived targets: total casualties and log1p-transformed counts.
df_disasters = df_disasters.assign(
    casualties=lambda d: d.deaths + d.injured,
    deaths_log=lambda d: np.log1p(d.deaths),
    casualties_log=lambda d: np.log1p(d.casualties),
    injured_log=lambda d: np.log1p(d.injured),
)

# NOTE(review): this writes 'df_disasters_wikidata.csv', while the shortcut
# cell above reads 'df_disasters.csv.gz' - confirm how the two files relate.
df_disasters.to_csv('data/events/df_disasters_wikidata.csv', index=False)

## Fit XGB Tree and Compute SHAP values for Experiment

In [28]:
from xgboost import XGBRegressor
from util.prediction import ModelEvaluator
import shap
from util.plot import plot_disaster_results, build_colormap

In [181]:
# Regression sample: events whose 7-day view sum exceeds 10.
# `.copy()` makes this an independent frame, so the column assignments below
# do not operate on a slice of `df_disasters` (avoids SettingWithCopyWarning
# and keeps `df_disasters` untouched).
df_disasters_reg = df_disasters.loc[df_disasters['views_7_sum'] > 10].copy()
df_disasters_reg['views_7_sum_log'] = np.log1p(df_disasters_reg['views_7_sum'])
df_disasters_reg['event_date'] = pd.to_datetime(df_disasters_reg['event_date'])

In [203]:
# fit model via random search
# Candidate hyper-parameter values handed to both evaluators.
# NOTE(review): XGBoost documents `scale_pos_weight` as a class-balancing
# parameter - confirm it is intended for a regressor.
param_grid_xgboost_reg = {
    'max_depth': [4, 5, 6, 10, 15, 20],
    'n_estimators': [10, 100, 500, 1000],
    'learning_rate': [1, 0.5, 0.1, 0.01, 0.001],
    'gamma': [0, 1, 2, 5, 10],
    'reg_lambda': [0, 1, 10],
    'scale_pos_weight': [0.05, 0.1, 1, 2],
}

# Continuous features of the full model and of the reduced ("simple") model.
cont_cols_full = [
    'GDP_pc_log',
    'deaths_log',
    'country_articles_log',
    'population_log',
    'cat_articles_log',
    'view_country_article_log',
    'views_baseline_log',
]
cont_cols_simple = ['GDP_pc_log', 'deaths_log']

# NOTE(review): n_jobs=144 is a hard-coded thread count - adjust to the machine.
xgb_disasters = ModelEvaluator(
    XGBRegressor(n_jobs=144),
    df_disasters_reg,
    'views_7_sum_log',
    'neg_mean_squared_error',
    params=param_grid_xgboost_reg,
    cont_cols=cont_cols_full,
    factor_cols=['code'],
)
xgb_disasters.grid_search(n_jobs=1)

xgb_disasters_simple = ModelEvaluator(
    XGBRegressor(n_jobs=144),
    df_disasters_reg,
    'views_7_sum_log',
    'neg_mean_squared_error',
    params=param_grid_xgboost_reg,
    cont_cols=cont_cols_simple,
    factor_cols=['code'],
)
xgb_disasters_simple.grid_search(n_jobs=1)

# Refit both evaluators after the search (order kept: full model first).
xgb_disasters.retrain_full()
xgb_disasters_simple.retrain_full()

RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categorical=False,
                                          eval_metric=None, gamma=None,
                                          gpu_id=None, grow_policy=None,
                                          importance_type=None,
                                          interaction_co...
                                          n_estimators=100, n_jobs=144,
                                          num_parallel_tree=None,
                                          predict

In [None]:
# SHAP analysis of the full model: stack the evaluator's train and test
# frames, keep only the encoded feature columns, then compute SHAP values
# and SHAP interaction values on that frame and plot them.
train_and_test = [xgb_disasters.df_train_full, xgb_disasters.df_test]
df_full = pd.concat(train_and_test)[xgb_disasters.encoded_columns]

explainer = shap.Explainer(xgb_disasters.full_model)
shap_values = explainer(df_full)
shap_interaction = explainer.shap_interaction_values(df_full)

plot_disaster_results(
    df_full,
    shap_values,
    shap_interaction,
    save_path=None,
)

In [None]:
# SHAP analysis of the simple model: stack the evaluator's train and test
# frames, keep only the encoded feature columns, then compute SHAP values
# and SHAP interaction values on that frame and plot them.
train_and_test = [xgb_disasters_simple.df_train_full, xgb_disasters_simple.df_test]
df_full = pd.concat(train_and_test)[xgb_disasters_simple.encoded_columns]

explainer = shap.Explainer(xgb_disasters_simple.full_model)
shap_values = explainer(df_full)
shap_interaction = explainer.shap_interaction_values(df_full)

plot_disaster_results(
    df_full,
    shap_values,
    shap_interaction,
    save_path=None,
)

In [None]:
# Horizontal colour bar for the GDP per capita (log) feature, spanning the
# observed min/max of that column in `df_full`, saved as a PDF.
gdp_col = 'GDP pc. (log)'
gdp_range = (df_full[gdp_col].min(), df_full[gdp_col].max())
cmp = build_colormap(
    gdp_col,
    gdp_range,
    cmap='viridis',
    labelpad=-13,
    horizontal=True,
    adjust_limits=False,
    save_path='figures/disasters/deaths_col_cm_horizontal.pdf',
    ticks=[5.0, 8, 11.],
)