In [1]:
import pandas as pd, numpy as np
from datetime import date, datetime, timedelta
from scipy.stats import ks_2samp, mannwhitneyu, anderson_ksamp

# statistical tests

| Test             | Data type     | Sensitive to shape?    | Null Hypothesis                            |
| ---------------- | ------------- | ---------------------- | ------------------------------------------ |
| KS Test          | Continuous    | Yes                    | Same distribution                          |
| Mann–Whitney U   | Ordinal/cont. | Less so                | Same distribution (equal medians if shapes match) |
| Anderson–Darling | Continuous    | Yes (especially tails) | All samples drawn from the same population |

# parameters

In [2]:
# Analysis period, given as strings: first/last year and first/last week number.
yeari = '2024'
yearf = '2024'
weeki = '18'
weekf = '31'

In [3]:
# Translate the (year, week) bounds into calendar dates.
# "%W" is the week of the year with Monday as first weekday; the trailing "-1" with "%w" selects Monday.
di = datetime.strptime(f'{yeari}-{weeki}-1', "%Y-%W-%w").date()
# Last day of the period: Monday of the final week plus six days, i.e. that week's Sunday.
df = datetime.strptime(f'{yearf}-{weekf}-1', "%Y-%W-%w").date() + timedelta(days=6)
# Every date from di to df (both inclusive); `ds` and `daylist` are two names for the same list.
daylist = [di + timedelta(days=offset) for offset in range((df - di).days + 1)]
ds = daylist
print(di, 'until', df)

2024-04-29 until 2024-08-04


In [4]:
# Selected definition key; listed alternatives: 'tl5_10m' 'tl6_10m' 'tl7_10m' 'tl8_10m' 'tl8_60m'
# Alternative label for the same choice (presumably the value used in the `resolution` column — TODO confirm);
# leading number per key: tl5: 62 ... tl7: 16   tl8: 8
cdef, cdef_alt = 'tl7_10m', '16m_10min'

# load data

In [5]:
# Read the city-level table (BOM-tolerant encoding) and turn the `day` column into datetime.date objects.
data = pd.read_csv('data/fig2/sensitivity_cities.csv', encoding='utf-8-sig')
data['day'] = pd.to_datetime(data['day']).dt.date
data

Unnamed: 0,resolution,city,day,event,contacts_nonunique,contacts_unique,to_baseline_nonunique,to_baseline_unique
0,16m_10min,Berlin,2024-04-29,,6086,2624,1.222768,1.064337
1,16m_10min,Berlin,2024-04-30,,6004,3042,1.123083,1.137229
2,16m_10min,Berlin,2024-05-01,,3182,1370,0.641921,0.525969
3,16m_10min,Berlin,2024-05-02,,5222,2604,0.932833,0.948683
4,16m_10min,Berlin,2024-05-03,,6342,2920,1.060726,1.026723
...,...,...,...,...,...,...,...,...
4920,8m_60min,Stuttgart,2024-07-31,,816,618,0.713968,0.820618
4921,8m_60min,Stuttgart,2024-08-01,,844,594,0.750111,0.795536
4922,8m_60min,Stuttgart,2024-08-02,,698,490,0.644885,0.674931
4923,8m_60min,Stuttgart,2024-08-03,SDP,2116,1348,2.059094,1.965536


In [6]:
# Read the country-level table (BOM-tolerant encoding) and turn the `day` column into datetime.date objects.
data_ctry = pd.read_csv('output/00_sensitivity_germany.csv', encoding='utf-8-sig')
data_ctry['day'] = pd.to_datetime(data_ctry['day']).dt.date
data_ctry

Unnamed: 0,resolution,day,contacts_nonunique,contacts_unique,to_baseline_nonunique,to_baseline_unique
0,16m_10min,2024-04-29,117296,84830,1.099284,1.093875
1,16m_10min,2024-04-30,135926,102094,1.266744,1.297274
2,16m_10min,2024-05-01,103092,66764,0.937958,0.826169
3,16m_10min,2024-05-02,112532,82434,0.993173,0.976744
4,16m_10min,2024-05-03,121838,92254,0.949512,0.937602
...,...,...,...,...,...,...
485,8m_60min,2024-07-31,162784,139872,0.950318,0.956297
486,8m_60min,2024-08-01,168258,146124,0.950898,0.960743
487,8m_60min,2024-08-02,196900,173368,0.964589,0.970422
488,8m_60min,2024-08-03,193568,167440,1.038441,1.063260


# analyses

## cities: test if event & non-event have same contacts

In [7]:
# City-level KPI table: `to_baseline_unique` is exposed under the generic name `val`.
# (disabled option on the input: [data.day>=date(2024,6,1)])
data_kpi = (
    data.loc[:, ['resolution','city','day','event','to_baseline_unique']]
    .copy(deep=True)
    .rename(columns={'to_baseline_unique':'val'})
)
# public holidays; only the disabled filter below uses them (it would also drop the day before each holiday)
holis = [date(2024,5,1), date(2024,5,9), date(2024,5,20)]
#data_kpi = data_kpi[~data_kpi.day.isin(holis+[d-timedelta(1) for d in holis])]
# disabled: exclude non-German UEFA EURO 2024 matches
#data_kpi['event'] = [e if len(str(e))!=7 else (e if e[3]!='-' else (e if 'GER' in e else np.nan)) for e in data_kpi.event]

# Per resolution, collect the values of rows without an event label ...
nonevent = (
    data_kpi.loc[data_kpi['event'].isna()]
    .drop(columns=['event'])
    .drop_duplicates()
    .groupby(['resolution'])['val']# ,'city'
    .apply(list)
    .reset_index()
)
# ... and of rows that carry an event label.
event = (
    data_kpi.loc[data_kpi['event'].notna()]
    .drop(columns=['event'])
    .drop_duplicates()
    .groupby(['resolution'])['val']# ,'city'
    .apply(list)
    .reset_index()
)
# One row per resolution holding both value lists (val_nonevent, val_event).
for_test = pd.merge(nonevent, event, on=['resolution'], suffixes=('_nonevent','_event'))# ,'city'

# Two-sided Mann–Whitney U test per resolution; the alternatives below are kept for reference.
#for_test['pval'] = [ks_2samp(list1, list2)[1] for list1, list2 in zip(for_test.val_nonevent, for_test.val_event)]
for_test['pval'] = [
    mannwhitneyu(vals_nonevent, vals_event, alternative='two-sided').pvalue
    for vals_nonevent, vals_event in zip(for_test['val_nonevent'], for_test['val_event'])
]
#for_test['pval'] = [anderson_ksamp([list1, list2]).significance_level for list1, list2 in zip(for_test.val_nonevent, for_test.val_event)]
# flag p-values below the 5 % level
for_test['significant'] = for_test['pval'].lt(.05)
for_test

Unnamed: 0,resolution,val_nonevent,val_event,pval,significant
0,16m_10min,"[1.0643369734789392, 1.1372289641686315, 0.525...","[1.3297974927675988, 1.0704206602768904, 0.948...",3.549977e-29,True
1,31m_10min,"[1.0836585219427268, 1.1126946206442043, 0.497...","[1.2690654354402169, 1.081140350877193, 0.9667...",2.091233e-31,True
2,62m_10min,"[1.0963097153030468, 1.1086104419960374, 0.500...","[1.2953730578794604, 1.1123506549589752, 1.005...",4.57236e-36,True
3,8m_10min,"[1.0738042880703682, 1.107851814388671, 0.5792...","[1.429460580912863, 1.1507510729613732, 1.0269...",2.3431800000000003e-27,True
4,8m_60min,"[1.0765098184512782, 1.0533776054019364, 0.485...","[1.2661893914741296, 1.07958984375, 0.92206313...",4.500664e-24,True


In [8]:
# sample sizes (non-event, event) in the first row of for_test
print(*(for_test[col].map(len).iloc[0] for col in ('val_nonevent', 'val_event')))

816 164


In [9]:
# Median and quartiles of the non-event values, one figure per resolution row.
for_test['median_nonevent'] = [np.median(vals) for vals in for_test['val_nonevent']]
for_test['q1_nonevent'] = [np.percentile(vals, 25) for vals in for_test['val_nonevent']]
for_test['q3_nonevent'] = [np.percentile(vals, 75) for vals in for_test['val_nonevent']]

# Same statistics for the event values.
for_test['median_event'] = [np.median(vals) for vals in for_test['val_event']]
for_test['q1_event'] = [np.percentile(vals, 25) for vals in for_test['val_event']]
for_test['q3_event'] = [np.percentile(vals, 75) for vals in for_test['val_event']]

# Display order of the resolutions; a label missing from this list would turn into NaN.
order = ['62m_10min','31m_10min','16m_10min','8m_10min','8m_60min']
for_test['resolution'] = for_test['resolution'].astype(pd.CategoricalDtype(categories=order, ordered=True))

# Summary table without the raw value lists, sorted by the categorical order above.
for_test.drop(columns=['val_nonevent','val_event']).sort_values(['resolution'])# 'city',

Unnamed: 0,resolution,pval,significant,median_nonevent,q1_nonevent,q3_nonevent,median_event,q1_event,q3_event
2,62m_10min,4.57236e-36,True,1.005564,0.902375,1.103002,1.24456,1.06946,1.569338
1,31m_10min,2.091233e-31,True,0.998761,0.892547,1.109221,1.25591,1.053804,1.570453
0,16m_10min,3.549977e-29,True,0.998067,0.881928,1.118015,1.254251,1.038835,1.562078
3,8m_10min,2.3431800000000003e-27,True,0.996428,0.863226,1.110393,1.266511,1.030083,1.553829
4,8m_60min,4.500664e-24,True,1.00588,0.871605,1.114232,1.249005,1.001794,1.496185


## cities: test if non-event & non-event have same contacts

In [10]:
# Negative control: test non-event values against a random subsample of non-event values.
# Fixed seed so the drawn subsample is reproducible.
np.random.seed(42)
# Subsample size = number of event values in the first row of the current for_test
# (i.e. this cell relies on for_test from the preceding event/non-event comparison).
nsam = for_test['val_event'].map(len).iloc[0]

# All non-event values per resolution.
nonevent = (
    data_kpi.loc[data_kpi['event'].isna()]
    .groupby(['resolution'])['val']# ,'city'
    .apply(list)
    .reset_index()
)
# Stand-in "event" group: nsam values per resolution, drawn without replacement from those same values.
# NOTE(review): the subsample is a subset of the list it is tested against, so the two samples overlap;
# Mann–Whitney U assumes independent samples — consider testing against the remaining values instead.
event = nonevent[['resolution']].copy()
event['val'] = [np.random.choice(vals, size=nsam, replace=False) for vals in nonevent['val']]
for_test = pd.merge(nonevent, event, on=['resolution'], suffixes=('_nonevent','_event'))# ,'city'

# Two-sided Mann–Whitney U test per resolution; the alternatives below are kept for reference.
#for_test['pval'] = [ks_2samp(list1, list2)[1] for list1, list2 in zip(for_test.val_nonevent, for_test.val_event)]
for_test['pval'] = [
    mannwhitneyu(vals_full, vals_sub, alternative='two-sided').pvalue
    for vals_full, vals_sub in zip(for_test['val_nonevent'], for_test['val_event'])
]
#for_test['pval'] = [anderson_ksamp([list1, list2]).significance_level for list1, list2 in zip(for_test.val_nonevent, for_test.val_event)]
# flag p-values below the 5 % level
for_test['significant'] = for_test['pval'].lt(.05)
for_test

Unnamed: 0,resolution,val_nonevent,val_event,pval,significant
0,16m_10min,"[1.0643369734789392, 1.1372289641686315, 0.525...","[1.1428571428571428, 0.975, 1.461139896373057,...",0.853562,False
1,31m_10min,"[1.0836585219427268, 1.1126946206442043, 0.497...","[1.3880048959608324, 0.8571946357375861, 0.976...",0.522045,False
2,62m_10min,"[1.0963097153030468, 1.1086104419960374, 0.500...","[0.8074141800349098, 1.0727243013564307, 1.065...",0.688719,False
3,8m_10min,"[1.0738042880703682, 1.107851814388671, 0.5792...","[0.96640826873385, 1.1032132424537489, 0.54199...",0.253927,False
4,8m_60min,"[1.0765098184512782, 1.0533776054019364, 0.485...","[1.0065125366330183, 1.3042967514571004, 0.953...",0.636965,False


In [11]:
# sample sizes (non-event, subsample) in the first row of for_test
print(*(for_test[col].map(len).iloc[0] for col in ('val_nonevent', 'val_event')))

816 164


## Germany: test if event & non-event have same contacts

In [12]:
# Country-level KPI table: `to_baseline_unique` is exposed under the generic name `val`.
# (disabled option on the input: [data.day>=date(2024,6,1)])
data_kpi = (
    data_ctry.loc[:, ['resolution','day','to_baseline_unique']]
    .copy(deep=True)
    .rename(columns={'to_baseline_unique':'val'})
)
# public holidays; only the disabled filter below uses them (it would also drop the day before each holiday)
holis = [date(2024,5,1), date(2024,5,9), date(2024,5,20)]
#data_kpi = data_kpi[~data_kpi.day.isin(holis+[d-timedelta(1) for d in holis])]

# days of German matches: distinct days on which any row of the city table has an event label containing 'GER'
dgers = list({
    day
    for day, label in zip(data['day'], data['event'])
    if pd.notna(label) and 'GER' in label
})
# Per resolution, values on all other days ...
nonevent = (
    data_kpi.loc[~data_kpi['day'].isin(dgers)]
    .drop_duplicates()
    .groupby(['resolution'])['val']
    .apply(list)
    .reset_index()
)
# ... and values on the German match days.
event = (
    data_kpi.loc[data_kpi['day'].isin(dgers)]
    .drop_duplicates()
    .groupby(['resolution'])['val']
    .apply(list)
    .reset_index()
)
# One row per resolution holding both value lists (val_nonevent, val_event).
for_test = pd.merge(nonevent, event, on=['resolution'], suffixes=('_nonevent','_event'))

# Two-sided Mann–Whitney U test per resolution; the alternatives below are kept for reference.
#for_test['pval'] = [ks_2samp(list1, list2)[1] for list1, list2 in zip(for_test.val_nonevent, for_test.val_event)]
for_test['pval'] = [
    mannwhitneyu(vals_nonevent, vals_event, alternative='two-sided').pvalue
    for vals_nonevent, vals_event in zip(for_test['val_nonevent'], for_test['val_event'])
]
#for_test['pval'] = [anderson_ksamp([list1, list2]).significance_level for list1, list2 in zip(for_test.val_nonevent, for_test.val_event)]
# flag p-values below the 5 % level
for_test['significant'] = for_test['pval'].lt(.05)
for_test

Unnamed: 0,resolution,val_nonevent,val_event,pval,significant
0,16m_10min,"[1.0938749194068342, 1.2972742388707768, 0.826...","[0.9819543959753468, 0.947537569761049, 0.9321...",0.327151,False
1,31m_10min,"[1.078404909918549, 1.27920788971967, 0.732840...","[0.9873381783983988, 0.9408991174896651, 0.923...",0.343287,False
2,62m_10min,"[1.0711959243053293, 1.2721700447427484, 0.692...","[0.9911248574171456, 0.9412932135323177, 0.927...",0.376964,False
3,8m_10min,"[1.114320898968649, 1.2828935923456075, 0.9416...","[0.988651010344178, 0.9344671594195734, 0.9329...",0.267335,False
4,8m_60min,"[1.0871584311747813, 1.3058665725379457, 0.791...","[0.9940322974015744, 0.9311920019455994, 0.928...",0.227429,False


In [13]:
# sample sizes (non-event, event) in the first row of for_test
print(*(for_test[col].map(len).iloc[0] for col in ('val_nonevent', 'val_event')))

93 5


In [14]:
# Median and quartiles of the non-event values, one figure per resolution row.
for_test['median_nonevent'] = [np.median(vals) for vals in for_test['val_nonevent']]
for_test['q1_nonevent'] = [np.percentile(vals, 25) for vals in for_test['val_nonevent']]
for_test['q3_nonevent'] = [np.percentile(vals, 75) for vals in for_test['val_nonevent']]

# Same statistics for the event values.
for_test['median_event'] = [np.median(vals) for vals in for_test['val_event']]
for_test['q1_event'] = [np.percentile(vals, 25) for vals in for_test['val_event']]
for_test['q3_event'] = [np.percentile(vals, 75) for vals in for_test['val_event']]

# Display order of the resolutions; a label missing from this list would turn into NaN.
order = ['62m_10min','31m_10min','16m_10min','8m_10min','8m_60min']
for_test['resolution'] = for_test['resolution'].astype(pd.CategoricalDtype(categories=order, ordered=True))

# Summary table without the raw value lists, sorted by the categorical order above.
for_test.drop(columns=['val_nonevent','val_event']).sort_values(['resolution'])

Unnamed: 0,resolution,pval,significant,median_nonevent,q1_nonevent,q3_nonevent,median_event,q1_event,q3_event
2,62m_10min,0.376964,False,0.995797,0.949677,1.037698,0.990714,0.941293,0.991125
1,31m_10min,0.343287,False,0.991003,0.950149,1.033824,0.983932,0.940899,0.987338
0,16m_10min,0.327151,False,0.990359,0.956323,1.029708,0.975735,0.947538,0.981954
3,8m_10min,0.267335,False,0.985845,0.952936,1.026919,0.973224,0.934467,0.984883
4,8m_60min,0.227429,False,0.992716,0.942061,1.035222,0.980493,0.931192,0.987886


## Germany: test if non-event & non-event have same contacts

In [15]:
# Negative control: test non-match-day values against a random subsample of non-match-day values.
# Fixed seed so the drawn subsample is reproducible.
np.random.seed(42)
# Subsample size = number of event values in the first row of the current for_test
# (i.e. this cell relies on for_test from the preceding event/non-event comparison).
nsam = for_test['val_event'].map(len).iloc[0]

# All values per resolution on days that are not in dgers.
nonevent = (
    data_kpi.loc[~data_kpi['day'].isin(dgers)]
    .drop_duplicates()
    .groupby(['resolution'])['val']
    .apply(list)
    .reset_index()
)
# Stand-in "event" group: nsam values per resolution, drawn without replacement from those same values.
# NOTE(review): the subsample is a subset of the list it is tested against, so the two samples overlap;
# Mann–Whitney U assumes independent samples — consider testing against the remaining values instead.
event = nonevent[['resolution']].copy()
event['val'] = [np.random.choice(vals, size=nsam, replace=False) for vals in nonevent['val']]
for_test = pd.merge(nonevent, event, on=['resolution'], suffixes=('_nonevent','_event'))# ,'city'

# Two-sided Mann–Whitney U test per resolution; the alternatives below are kept for reference.
#for_test['pval'] = [ks_2samp(list1, list2)[1] for list1, list2 in zip(for_test.val_nonevent, for_test.val_event)]
for_test['pval'] = [
    mannwhitneyu(vals_full, vals_sub, alternative='two-sided').pvalue
    for vals_full, vals_sub in zip(for_test['val_nonevent'], for_test['val_event'])
]
#for_test['pval'] = [anderson_ksamp([list1, list2]).significance_level for list1, list2 in zip(for_test.val_nonevent, for_test.val_event)]
# flag p-values below the 5 % level
for_test['significant'] = for_test['pval'].lt(.05)
for_test

Unnamed: 0,resolution,val_nonevent,val_event,pval,significant
0,16m_10min,"[1.0938749194068342, 1.2972742388707768, 0.826...","[0.9936086003967524, 1.121261517684047, 0.9334...",0.36165,False
1,31m_10min,"[1.078404909918549, 1.27920788971967, 0.732840...","[0.8577064941639277, 1.0070997288575692, 0.972...",0.827456,False
2,62m_10min,"[1.0711959243053293, 1.2721700447427484, 0.692...","[1.1720685830809765, 0.9734164691497248, 0.967...",0.993559,False
3,8m_10min,"[1.114320898968649, 1.2828935923456075, 0.9416...","[0.985845242557832, 0.9827282629780504, 0.9927...",0.611043,False
4,8m_60min,"[1.0871584311747813, 1.3058665725379457, 0.791...","[0.9660713025061772, 1.0008054628552996, 1.305...",0.942081,False


In [16]:
# sample sizes (non-event, subsample) in the first row of for_test
print(*(for_test[col].map(len).iloc[0] for col in ('val_nonevent', 'val_event')))

93 5
