In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from util.prediction import prepare_experiment_dfs
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import itertools

In [3]:

df_crawled, df_reg, df_editreg, df_class = prepare_experiment_dfs('data/events/all_events.csv.gz')
df_class['higher'] = df_class.gni_class.isin(['H', 'UM'])

# Tests for "Descriptive analysis of event articles"

### Difference between noticed articles across languages

### Tests for "Descriptive analysis of event articles"

#### Attention by category and income

In [4]:
from util.stats import make_contingency_table, calc_and_print_chi2, perform_chi2_3d, perform_chi2_2d
from scipy.stats import chi2_contingency, ttest_1samp, ttest_ind

In [5]:
# Worldwide (en/es) Wikis are more likely to have noticed/edited articles
df_code_noticed = make_contingency_table(df_class, 'worldwide', 'noticed')
p_val, expected, cont_tables = calc_and_print_chi2(df_code_noticed, text='Chi2 Overall')
df_code_noticed.subtract(expected, axis='columns')
df_code_edited = make_contingency_table(df_class, 'worldwide', 'edited')
p_val, expected, cont_tables = calc_and_print_chi2(df_code_edited, text='Chi2 Overall')
df_code_edited.subtract(expected, axis='columns')

Chi2 Overall 0.00000 (val=1641.997, dof=1)
Chi2 Overall 0.00000 (val=632.292, dof=1)


edited,0,1
worldwide,Unnamed: 1_level_1,Unnamed: 2_level_1
False,774.343911,-774.343911
True,-774.343911,774.343911


In [6]:
#  differences in attention and coverage between language editions when partitioned into the four article categories
p_val, expected, cont_tables = perform_chi2_3d(df_class, 'code', 'cat', 'noticed', pairwise=False)
multipletests(list(p_val.values()), method='bonferroni')

Test separation by de
Chi2 Overall 0.00000 (val=265.652, dof=3)
Test separation by en
Chi2 Overall 0.00000 (val=125.391, dof=3)
Test separation by es
Chi2 Overall 0.00000 (val=191.433, dof=3)
Test separation by it
Chi2 Overall 0.00000 (val=46.128, dof=3)


(array([ True,  True,  True,  True]),
 array([1.07683069e-56, 2.12913403e-26, 1.19733863e-40, 2.13098725e-09]),
 0.012741455098566168,
 0.0125)

In [7]:
# politics are less likely to be viewed or edited in Italian
p_val, expected, cont_tables = perform_chi2_3d(df_class[df_class.cat.isin(['politics'])], 'cat', 'code', 'noticed', pairwise=True)
print(multipletests(list(p_val.values()), method='bonferroni'))
p_val, expected, cont_tables = perform_chi2_3d(df_class[df_class.cat.isin(['politics'])], 'cat', 'code', 'edited', pairwise=True)
multipletests(list(p_val.values()), method='bonferroni')

Test separation by politics
Chi2 Overall 0.00000 (val=391.322, dof=3)
de vs. en
Chi2 0.00000 (val=29.818, dof=1)
de vs. es
Chi2 0.01015 (val=6.608, dof=1)
de vs. it
Chi2 0.00000 (val=89.042, dof=1)
en vs. es
Chi2 0.00000 (val=123.307, dof=1)
en vs. it
Chi2 0.00000 (val=382.740, dof=1)
es vs. it
Chi2 0.00000 (val=71.250, dof=1)
(array([ True,  True, False,  True,  True,  True,  True]), array([1.17444081e-83, 3.32147370e-07, 7.10623252e-02, 2.70500683e-20,
       8.36191962e-28, 2.20529587e-84, 2.20249134e-16]), 0.007300831979014655, 0.0071428571428571435)
Test separation by politics
Chi2 Overall 0.00000 (val=321.471, dof=3)
de vs. en
Chi2 0.06269 (val=3.465, dof=1)
de vs. es
Chi2 0.00000 (val=57.057, dof=1)
de vs. it
Chi2 0.00000 (val=80.610, dof=1)
en vs. es
Chi2 0.00000 (val=198.412, dof=1)
en vs. it
Chi2 0.00000 (val=200.491, dof=1)
es vs. it
Chi2 0.00389 (val=8.336, dof=1)


(array([ True, False,  True,  True,  True,  True,  True]),
 array([1.56845571e-68, 4.38803184e-01, 2.96338021e-13, 1.92474169e-18,
        3.24740824e-44, 1.14207815e-44, 2.72103452e-02]),
 0.007300831979014655,
 0.0071428571428571435)

In [8]:
# disasters are less likely to be viewed or edited in Italian
p_val, expected, cont_tables = perform_chi2_3d(df_class[df_class.cat.isin(['disaster'])], 'cat', 'code', 'noticed', pairwise=True)
print(multipletests(list(p_val.values()), method='bonferroni'))
p_val, expected, cont_tables = perform_chi2_3d(df_class[df_class.cat.isin(['disaster'])], 'cat', 'code', 'edited', pairwise=True)
print(multipletests(list(p_val.values()), method='bonferroni'))

Test separation by disaster
Chi2 Overall 0.00000 (val=54.860, dof=3)
de vs. en
Chi2 0.65959 (val=0.194, dof=1)
de vs. es
Chi2 0.00048 (val=12.174, dof=1)
de vs. it
Chi2 0.00000 (val=29.279, dof=1)
en vs. es
Chi2 0.00001 (val=18.935, dof=1)
en vs. it
Chi2 0.00000 (val=39.330, dof=1)
es vs. it
Chi2 0.00427 (val=8.167, dof=1)
(array([ True, False,  True,  True,  True,  True,  True]), array([5.14827804e-11, 1.00000000e+00, 3.39147073e-03, 4.38642310e-07,
       9.46943341e-05, 2.50540767e-09, 2.98578929e-02]), 0.007300831979014655, 0.0071428571428571435)
Test separation by disaster
Chi2 Overall 0.00000 (val=91.530, dof=3)
de vs. en
Chi2 0.06527 (val=3.398, dof=1)
de vs. es
Chi2 0.00000 (val=45.238, dof=1)
de vs. it
Chi2 0.00000 (val=37.229, dof=1)
en vs. es
Chi2 0.00000 (val=52.247, dof=1)
en vs. it
Chi2 0.00000 (val=33.750, dof=1)
es vs. it
Chi2 0.45629 (val=0.555, dof=1)
(array([ True, False,  True,  True,  True,  True, False]), array([7.19488396e-19, 4.56866252e-01, 1.22130520e-10, 7.35

In [15]:
# in all languages users are more likely to view or edit articles from countries in the upper-middle or high-income classes
p_val, expected, cont_tables = perform_chi2_3d(df_class, 'code', 'higher', 'noticed', pairwise=False)
multipletests(list(p_val.values()), method='bonferroni')
p_val, expected, cont_tables = perform_chi2_3d(df_class, 'code', 'higher', 'edited', pairwise=False)
multipletests(list(p_val.values()), method='bonferroni')

Test separation by de
Chi2 Overall 0.05664 (val=3.633, dof=1)
Test separation by en
Chi2 Overall 0.00002 (val=17.949, dof=1)
Test separation by es
Chi2 Overall 0.00000 (val=41.968, dof=1)
Test separation by it
Chi2 Overall 0.00000 (val=46.228, dof=1)
Test separation by de
Chi2 Overall 0.29280 (val=1.107, dof=1)
Test separation by en
Chi2 Overall 0.01236 (val=6.259, dof=1)
Test separation by es
Chi2 Overall 0.00310 (val=8.745, dof=1)
Test separation by it
Chi2 Overall 0.00000 (val=34.854, dof=1)


(array([False,  True,  True,  True]),
 array([1.00000000e+00, 4.94267442e-02, 1.24191192e-02, 1.42169079e-08]),
 0.012741455098566168,
 0.0125)

In [16]:
# In Spanish, although more articles about events in high-income countries exist, events in upper-middle income countries are more likely to be viewed or edited
df_class['is_UM'] = df_class.gni_class == 'UM'
p_val, expected, cont_tables = perform_chi2_2d(df_class[df_class.code == 'es'], 'higher', 'noticed')
print(multipletests(list(p_val.values()), method='bonferroni'))
p_val, expected, cont_tables = perform_chi2_2d(df_class[df_class.code == 'es'], 'higher', 'edited')
print(multipletests(list(p_val.values()), method='bonferroni'))

Chi2 Overall 0.00000 (val=41.968, dof=1)
(array([ True]), array([9.2798465e-11]), 0.050000000000000044, 0.05)
Chi2 Overall 0.00310 (val=8.745, dof=1)
(array([ True]), array([0.00310478]), 0.050000000000000044, 0.05)


In [17]:
# lower and middle-income countries are more likely to have articles that are noticed or edited
p_val, expected, cont_tables = perform_chi2_2d(df_class, 'cat', 'higher')
print(multipletests(list(p_val.values()), method='bonferroni'))
p_val, expected, cont_tables = perform_chi2_2d(df_class, 'cat', 'higher')
print(multipletests(list(p_val.values()), method='bonferroni'))

Chi2 Overall 0.00000 (val=1081.972, dof=3)
culture vs. disaster
Chi2 0.00000 (val=190.946, dof=1)
culture vs. politics
Chi2 0.00000 (val=42.675, dof=1)
culture vs. sports
Chi2 0.00008 (val=15.447, dof=1)
disaster vs. politics
Chi2 0.00000 (val=127.549, dof=1)
disaster vs. sports
Chi2 0.00000 (val=1059.916, dof=1)
politics vs. sports
Chi2 0.00000 (val=378.313, dof=1)
(array([ True,  True,  True,  True,  True,  True,  True]), array([2.07648758e-233, 1.38315006e-042, 4.52437481e-010, 5.94008765e-004,
       9.86390865e-029, 1.19154477e-231, 2.02895174e-083]), 0.007300831979014655, 0.0071428571428571435)
Chi2 Overall 0.00000 (val=1081.972, dof=3)
culture vs. disaster
Chi2 0.00000 (val=190.946, dof=1)
culture vs. politics
Chi2 0.00000 (val=42.675, dof=1)
culture vs. sports
Chi2 0.00008 (val=15.447, dof=1)
disaster vs. politics
Chi2 0.00000 (val=127.549, dof=1)
disaster vs. sports
Chi2 0.00000 (val=1059.916, dof=1)
politics vs. sports
Chi2 0.00000 (val=378.313, dof=1)
(array([ True,  True,  

In [18]:
p_val, expected, cont_tables = perform_chi2_2d(df_class, 'cat', 'higher', pairwise=False)
for key, val in cont_tables.items():
    display(val.subtract(expected[key]))

Chi2 Overall 0.00000 (val=1081.972, dof=3)


higher,False,True
cat,Unnamed: 1_level_1,Unnamed: 2_level_1
culture,-24.964437,24.964437
disaster,344.260949,-344.260949
politics,192.536993,-192.536993
sports,-511.833505,511.833505


In [9]:
p_val, expected, cont_tables = perform_chi2_2d(df_class.query('cat == "disaster"'), 'code', 'cat', pairwise=False)
for key, val in cont_tables.items():
    display(key)
    display(val.subtract(expected[key]))

Chi2 Overall 1.00000 (val=0.000, dof=0)


('code', 'all')

cat,disaster
code,Unnamed: 1_level_1
de,0.0
en,0.0
es,0.0
it,0.0


### Geography and attention to events

In [10]:
from scipy.stats import mannwhitneyu

In [11]:
df_continent_de = df_class[df_class.code == 'de'].continent.value_counts()
df_continent_de['Europe'] / df_continent_de.sum()

0.46064673581452104

In [12]:
df_de_articles = df_class[df_class.code == 'de'].country.value_counts()
df_de_articles[['United States', 'Brazil']] / df_de_articles.sum()

United States    0.093655
Brazil           0.081757
Name: country, dtype: float64

In [13]:
df_it_articles = df_class[df_class.code == 'it'].country.value_counts()
df_it_articles[['United States', 'Brazil']] / df_it_articles.sum()

United States    0.082938
Brazil           0.119109
Name: country, dtype: float64

In [14]:
df_continent_it = df_class[df_class.code == 'it'].continent.value_counts()
df_continent_it['Europe'] / df_continent_it.sum()

0.5436609426379247

In [15]:
df_it_articles = df_class[df_class.code == 'it'].country.value_counts()
df_it_articles[['United States', 'Brazil']] / df_it_articles.sum()

United States    0.082938
Brazil           0.119109
Name: country, dtype: float64

In [16]:
df_es_articles = df_class[df_class.code == 'es'].country.value_counts()
df_es_articles[['United States', 'Spain']] / df_it_articles.sum()

United States    0.133358
Spain            0.138838
Name: country, dtype: float64

In [17]:
df_region_es = df_class[df_class.code == 'es'].gni_region.value_counts()
df_region_es['Latin America & Caribbean'] / df_region_es.sum()

0.23100731570061903

In [18]:
from statsmodels.stats.proportion import proportions_ztest
country = 'Brazil'
code = 'it'
other_codes = ['es', 'en', 'de']
df_country_it = df_class[df_class.code == code].country.value_counts()
for c in other_codes:
    df_country_c = df_class[df_class.code == c].country.value_counts()
    z_stat, p_value = proportions_ztest(
        [df_country_it[country], df_country_c[country]], [df_country_it.sum(), df_country_c.sum()], alternative='larger')
    print(f'{code} vs. {c} ({df_country_it[country]/df_country_it.sum():.2f} vs. {df_country_c[country] / df_country_c.sum():.2f})', p_value)

it vs. es (0.12 vs. 0.07) 4.928308246213556e-13
it vs. en (0.12 vs. 0.05) 2.250516796081022e-38
it vs. de (0.12 vs. 0.08) 6.643688702432038e-07


In [19]:
from statsmodels.stats.proportion import proportions_ztest
country = 'Spain'
code = 'es'
other_codes = ['es', 'en', 'it','de']
other_codes.remove(code)
df_country_it = df_class[df_class.code == code].country.value_counts()
for c in other_codes:
    df_country_c = df_class[df_class.code == c].country.value_counts()
    z_stat, p_value = proportions_ztest(
        [df_country_it[country], df_country_c[country]], [df_country_it.sum(), df_country_c.sum()], alternative='larger')
    print(f'{code} vs. {c} ({df_country_it[country]/df_country_it.sum():.2f} vs. {df_country_c[country] / df_country_c.sum():.2f})', p_value)

es vs. en (0.11 vs. 0.02) 1.4290592650903739e-78
es vs. it (0.11 vs. 0.04) 7.0539613101279e-22
es vs. de (0.11 vs. 0.02) 1.0783953788610659e-49


In [21]:
from statsmodels.stats.proportion import proportions_ztest
region = 'Latin America & Caribbean'
code = 'es'
other_codes = ['es', 'en', 'it','de']
other_codes.remove(code)
df_region = df_class[df_class.code == code].gni_region.value_counts()
for c in other_codes:
    df_region_c = df_class[df_class.code == c].gni_region.value_counts()
    z_stat, p_value = proportions_ztest(
        [df_region[region], df_region_c[region]], [df_region.sum(), df_region_c.sum()], alternative='larger')
    print(f'{code} vs. {c} ({df_region_c[region]/df_region_c.sum():.2f} vs. {df_region[region] / df_region.sum():.2f})', p_value)

es vs. en (0.10 vs. 0.23) 3.5920626073691374e-77
es vs. it (0.18 vs. 0.23) 4.363731375578671e-06
es vs. de (0.15 vs. 0.23) 1.856978044853246e-18


In [22]:
from util.preprocessing import label_language

df_class = label_language(df_class)

In [23]:
df_anglophone = df_class[df_class.in_code_lang & (df_class.code=='en')].country.value_counts()
df_non_anglophone = df_class[~df_class.in_code_lang & (df_class.code=='en')].country.value_counts()
usa_anglo_articles = df_anglophone.values
other_articles = df_non_anglophone.values

In [47]:
for code, df_code in df_class.groupby(['code']):
     df_lang = df_code.groupby(['in_code_lang']).views_7_sum.agg(list)
     print(f'{code}', mannwhitneyu(df_lang[True], df_lang[False], alternative='greater'))

de MannwhitneyuResult(statistic=719503.0, pvalue=2.504895318194536e-17)
en MannwhitneyuResult(statistic=8416175.0, pvalue=5.681794966514316e-12)
es MannwhitneyuResult(statistic=1399975.5, pvalue=3.157464293691152e-10)
it MannwhitneyuResult(statistic=455290.0, pvalue=1.6497448895645982e-11)


# SHAP Values

In [26]:
from util.prediction import load_rf_and_xgb_models
from util.shapley import combine_categories_SHAP_all, load_shapval_results
from scipy.stats import spearmanr, pearsonr, f_oneway, kruskal

In [27]:
models, model_eval = load_rf_and_xgb_models(df_class, df_reg, df_editreg)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-

In [28]:
shap_vals, shap_vals_acv, shap_vals_acv_int = load_shapval_results('shap/ACV_SHAP_SHAPINT_noreg.pkl')
shap_vals = combine_categories_SHAP_all(shap_vals)

  from .autonotebook import tqdm as notebook_tqdm


In [67]:
model_prefix, model_postfix ='noreg_', '_xgb'
model_str = 'views'
full_model_str = f'{model_prefix}{model_str}{model_postfix}'

model, shapvals = model_eval[full_model_str], shap_vals[full_model_str]
df_full = model.get_full_dataset(decoded=True, all_cols=True)
x_cont = 'GDP_pc_log'
x_col = 'GDP_pc_log'
shap_vals_plot = pd.Series(shapvals[:, x_cont if x_cont else x_col].values,
                           name=x_cont if x_cont else x_col).to_frame()
df_full['shap_vals'] = shap_vals_plot.values

In [152]:
rescomp = three_way_mwu(df_full, 'shap_vals', adjust='fdr_bh')
rep_dict = {'de': 'German', 'en': 'English', 'it': 'Italian', 'es': 'Spanish',
            'culture': 'Culture', 'disaster': 'Disaster', 'sports': 'Sports', 'politics': 'Politics', 'cat': 'Category', 'code': 'Language', 'gni_region': 'Region'}
rescomp = rescomp.replace(rep_dict)
rescomp = rescomp.rename(rep_dict, axis=1)
rescomp.to_csv('shap/reg_cat_comp.csv', index=False)
rescomp

gni_region,Language,Category,Region,South Asia,East Asia & Pacific,Latin America & Caribbean,Middle East & North Africa,Europe & Central Asia,Sub-Saharan Africa,North America
0,German,Culture,South Asia,,,,,,,
1,German,Culture,East Asia & Pacific,,,,,,,
2,German,Culture,Latin America & Caribbean,,,,,,,
3,German,Culture,Middle East & North Africa,,,,,,,
4,German,Culture,Europe & Central Asia,,,,0.157,,,
...,...,...,...,...,...,...,...,...,...,...
107,Italian,Sports,Latin America & Caribbean,0.168,0.000,,,,,
108,Italian,Sports,Middle East & North Africa,0.449,0.467,0.000,,,,
109,Italian,Sports,Europe & Central Asia,0.214,0.000,0.000,0.000,,,
110,Italian,Sports,Sub-Saharan Africa,1.000,0.930,0.100,1.000,0.151,,


In [165]:
for code, df_code in df_full.groupby(['code']):
    print(code, pearsonr(df_code['GDP_pc_log'], df_code['shap_vals']))

de (-0.5032945112314637, 1.4819369703383136e-107)
en (-0.557800934412676, 0.0)
es (-0.6003776889639811, 2.134521816819079e-274)
it (-0.5056593320458029, 7.263629413285498e-103)
