In [451]:
import pandas as pd
from collections import defaultdict
import os
from datetime import datetime
from scipy.stats import ttest_ind
from numpy import mean

In [482]:
#set constants

sources = ["Matteo Salvini"]

start_collection = '2019-11-10'
start_polarization = '2019-11-20'


timerange = ('2019-11-21',
             '2019-11-28')


folder = '../outputs/fb/summary'
file_users =  '../config/users.csv'


In [489]:
#load all impressions, filtering useful fields and replacing hased names with fb names


def absoluteFilePaths(directory):
    for dirpath, _, filenames in os.walk(directory):
        for f in filenames:
            if str(f)[0] == '.':
                pass
            else:
                yield os.path.abspath(os.path.join(dirpath, f))

def getImpressions(folder):
    impressions = pd.DataFrame()
    files = absoluteFilePaths(folder)
    tokens2names = pd.read_csv(file_users)
    for impressions_file in files:
        user_impressions = pd.read_csv(impressions_file)[['impressionTime', 'user', 'source']] #you can add fields of interest here
        name = tokens2names[tokens2names['token'] == impressions_file.split('/')[-1].split('.')[0]]['name'].values[0]
        user_impressions['user'] = name
        impressions = pd.concat([impressions, user_impressions])      
    return impressions

impressions = getImpressions(folder)



                  impressionTime    user  \
0      2020-01-19 00:21:32+00:00  Nienke   
1      2020-01-19 00:21:26+00:00  Nienke   
2      2020-01-19 00:21:26+00:00  Nienke   
3      2020-01-19 00:21:26+00:00  Nienke   
4      2020-01-19 00:21:22+00:00  Nienke   
5      2020-01-16 17:46:33+00:00  Nienke   
6      2020-01-16 17:46:33+00:00  Nienke   
7      2020-01-16 17:46:32+00:00  Nienke   
8      2020-01-16 17:46:31+00:00  Nienke   
9      2020-01-16 17:00:16+00:00  Nienke   
10     2020-01-16 16:59:57+00:00  Nienke   
11     2020-01-16 16:59:50+00:00  Nienke   
12     2020-01-16 16:59:49+00:00  Nienke   
13     2020-01-16 16:59:49+00:00  Nienke   
14     2020-01-16 16:59:49+00:00  Nienke   
15     2020-01-16 16:59:48+00:00  Nienke   
16     2020-01-16 16:59:37+00:00  Nienke   
17     2020-01-16 16:59:37+00:00  Nienke   
18     2020-01-16 16:59:37+00:00  Nienke   
19     2020-01-16 16:59:35+00:00  Nienke   
20     2020-01-16 16:59:35+00:00  Nienke   
21     2020-01-16 10:29:44+00:00

In [490]:
#filter out for desired time range for experiment


impressions_exp = impressions[(impressions['impressionTime'] >= timerange[0]) 
                        & (impressions['impressionTime'] <= timerange[1])]




In [491]:
#count specific sources per user


def count_sources(impressions, sources):
    sources_users = {"source": [],
                     "user": [],
                     "count": []}
    for source in sources:
        for user, user_impressions in impressions_exp.groupby('user'):
            sources_users["source"].append(source)
            sources_users["user"].append(user)
            sources_users["count"].append(user_impressions[user_impressions["source"] == source]["source"].count())       
    return pd.DataFrame(data = sources_users)

sources_users = count_sources(impressions_exp, sources)



            source          user  count
0   Matteo Salvini        Aarend     45
1   Matteo Salvini         Bjoke     40
2   Matteo Salvini  CorTimmerman     63
3   Matteo Salvini       Doortje     31
4   Matteo Salvini        Erwijn     49
5   Matteo Salvini       Friedie     37
6   Matteo Salvini          Gert     44
7   Matteo Salvini       Hanneke     55
8   Matteo Salvini           Ivo     78
9   Matteo Salvini       Juultje     49
10  Matteo Salvini          Kris     42
11  Matteo Salvini       Lonneke     26
12  Matteo Salvini       Martijn     22
13  Matteo Salvini        Nienke     18
14  Matteo Salvini          Omar     17
15  Matteo Salvini        Phoebe      5
16  Matteo Salvini      Quintijn     26
17  Matteo Salvini       Roelfke      9
18  Matteo Salvini       Stephan     25
19  Matteo Salvini        Terese     14


In [486]:
#exclude salvini+lega in order to harmonize shares across polarized and control

impressions_exp = impressions_exp[(impressions_exp["source"] != "Matteo Salvini") 
                        & (impressions_exp["source"] != "Lega - Salvini Premier")]

In [495]:
#normalize counts dividing for total impression per user

users_totalcount = impressions_exp.groupby(['user'])["source"].count()


sources_users["tot_count"] = [users_totalcount[user]
                              for user in sources_users["user"]] 

sources_users["share_count"] = (sources_users["count"] / sources_users["tot_count"])*100



user
Aarend          1272
Bjoke           1300
CorTimmerman    1558
Doortje         1445
Erwijn          1345
Friedie         1297
Gert            1193
Hanneke         1246
Ivo             1474
Juultje         1429
Kris            1595
Lonneke         1150
Martijn         1189
Nienke          1508
Omar            1148
Phoebe          1305
Quintijn        1511
Roelfke         1181
Stephan         1176
Terese          1213
Name: source, dtype: int64


In [493]:
#fiter impressions pre-polarization and harmonize counts on initial distribution of sources

impressions_pre = impressions[(impressions['impressionTime'] <= start_polarization) & (impressions['impressionTime'] <= start_collection)] 

sources_users_pre = count_sources(impressions_pre, sources)

users_totalcount = impressions_pre.groupby(['user'])["source"].count()

sources_users_pre["tot_count"] = [users_totalcount[user]
                              for user in sources_users_pre["user"]] 

sources_users_pre["share_count"] = (sources_users_pre["count"] / sources_users_pre["tot_count"])*100


sources_users["share_count_adj"] = sources_users["share_count"] / sources_users_pre["share_count"]

sources_users_pre["share_count"]

0     1.698113
1     1.344086
2     1.982379
3     0.961837
4     1.527908
5     0.983258
6     1.020645
7     1.749364
8     2.791696
9     1.779876
10    1.417004
11    0.713306
12    0.611281
13    0.296834
14    0.711000
15    0.259471
16    1.080183
17    0.358566
18    1.019160
19    0.490196
Name: share_count, dtype: float64

In [480]:
#add column with test group the user belongs to
polarized_users = ('Aarend','Bjoke','CorTimmerman','Doortje','Erwijn','Friedie','Gert','Hanneke','Ivo','Juultje')

sources_users["user_group"] = ["polarized" if user in polarized_users 
                                         else "control" 
                             for user in sources_users["user"]]        

In [481]:
#compare means and run t-test

polarized = sources_users[sources_users['user_group'] == 'polarized']["share_count_adj"]
control = sources_users[sources_users['user_group'] == 'control']["share_count_adj"]


print(mean(polarized))
print(mean(control))

ttest_ind(polarized, control)

#3.45396947496875
#2.6749734757740766 0.16691875551730254


inf
inf


  x = asanyarray(arr - arrmean)
  d = mean1 - mean2


Ttest_indResult(statistic=nan, pvalue=nan)