In [1]:
import os
import warnings

import numpy as np
import pandas as pd

from ipynb.fs.full.func_lib import get_class_labels

In [2]:
def get_timeseries(lbl):
    """Load the journalist and MP activity time series for one label and tag each row with its class.

    Reads ``Journalist_activity_timeseries_lbl{lbl}.csv`` and
    ``MP_activity_timeseries_lbl{lbl}.csv`` from the working directory, stacks
    them (journalists first), renames ``Screen_name`` to ``Node`` and
    inner-merges the result with ``get_class_labels()`` on ``Node``.

    Parameters
    ----------
    lbl
        Value interpolated into the two CSV file names.

    Returns
    -------
    pandas.DataFrame
        The merged frame with an integer ``Label`` column at position 1 and
        its human-readable name ``Label_str`` at position 2; every other
        column keeps its relative order.
    """
    journalist_rows = pd.read_csv(f'Journalist_activity_timeseries_lbl{lbl}.csv')
    mp_rows = pd.read_csv(f'MP_activity_timeseries_lbl{lbl}.csv')

    combined = pd.concat([journalist_rows, mp_rows]).reset_index(drop=True)
    # presumably get_class_labels() supplies the 'Label' column used below -- verify against func_lib
    combined = combined.rename(columns={"Screen_name": "Node"}).merge(get_class_labels(), on='Node')

    combined['Label'] = combined['Label'].astype(int)

    names_by_code = {
        0: 'Backbench MP',
        1: 'Cabinet MP',
        2: 'Shadow cabinet MP',
        3: 'Opposition backbench MP',
        4: 'Prominent journalists',
        5: 'Rest of the journalists',
    }
    # replace() leaves codes that are missing from the mapping untouched
    combined['Label_str'] = combined['Label'].replace(to_replace=names_by_code)

    # Pull both label columns out and re-insert them right after the first column.
    label_codes = combined.pop('Label')
    label_names = combined.pop('Label_str')
    combined.insert(1, 'Label', label_codes)
    combined.insert(2, 'Label_str', label_names)
    return combined

In [3]:
def split_timeseries(x_lbl, timeseries):
    """Partition ``timeseries`` by whether ``Label_str`` equals ``x_lbl``.

    Parameters
    ----------
    x_lbl
        Value compared against the ``Label_str`` column.
    timeseries : pandas.DataFrame
        Frame that has a ``Label_str`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(rows where Label_str == x_lbl, rows where Label_str != x_lbl)``;
        both keep the original index values and columns.
    """
    group_names = timeseries['Label_str']
    is_member = group_names == x_lbl
    is_other = group_names != x_lbl
    return timeseries.loc[is_member], timeseries.loc[is_other]

In [4]:
def get_timeseries_array(min_day, delta, timeseries):
    """Sum consecutive column windows of ``timeseries`` into ``Week1``, ``Week2``, ... columns.

    Window starts are column positions ``min_day + 1``, ``min_day + 8``, ...
    (step 7) for as long as the start is smaller than the position of the last
    column. Each window covers ``delta`` columns from its start and is
    truncated at the end of the frame.

    Positions are counted over *all* columns of ``timeseries``, not only the
    numeric ones, so callers decide via ``min_day`` where the data begins.

    Parameters
    ----------
    min_day : int
        The first window starts at column position ``min_day + 1``.
    delta : int
        Number of columns summed per window.
    timeseries : pandas.DataFrame
        One row per series, columns ordered in time.

    Returns
    -------
    pandas.DataFrame
        One ``Week{k}`` column per window, indexed like ``timeseries``; an
        empty frame when no window start qualifies.
    """
    last_position = len(timeseries.columns) - 1
    all_columns = timeseries.columns

    weekly_sums = pd.DataFrame()
    window_starts = range(min_day + 1, last_position, 7)
    for week_number, start in enumerate(window_starts, start=1):
        window = all_columns[start:start + delta]
        weekly_sums[f'Week{week_number}'] = timeseries[window].sum(axis=1)

    return weekly_sums

In [5]:
def H(a):
    """Shannon entropy, in bits, of the empirical distribution of the entries of ``a``.

    Entries are taken along axis 0, so each row of a 2-D array is one joint
    outcome and each element of a 1-D array is one outcome.
    """
    counts = np.unique(a, axis=0, return_counts=True)[1]
    probs = counts / counts.sum()
    return -(probs * np.log2(probs)).sum()

def get_mi(a,b):
    """Mutual information in bits between ``a`` and ``b``: ``H(a) + H(b) - H(a, b)``.

    ``a`` and ``b`` must have the same number of samples; they are stacked
    column-wise to form the joint outcomes.
    """
    joint = np.column_stack((a, b))
    return H(a) + H(b) - H(joint)

def get_cmi(a,b,c):
    """Conditional mutual information ``I(a; b | c)`` in bits.

    Evaluated with the chain rule as ``I(a; (b, c)) - I(a; c)``, where the
    pair ``(b, c)`` is formed by stacking the two inputs column-wise.
    """
    b_and_c = np.column_stack((b, c))
    info_with_b = get_mi(a, b_and_c)
    info_from_c_alone = get_mi(a, c)
    return info_with_b - info_from_c_alone


def get_mi_diff_pop(X_biweekly, Y_biweekly, Y_weekly, niter=100, rng=None):
    """Average per-week conditional mutual information between two populations of different size.

    For every week index ``i >= 1`` this evaluates
    ``get_cmi(X_biweekly[:, i-1], Y_weekly[:, i], Y_biweekly[:, i-1])``,
    pairing row ``k`` of the X sample with row ``k`` of the Y sample. Because
    the populations generally differ in size, the larger one is randomly
    subsampled (without replacement) to the size of the smaller one; this is
    repeated ``niter`` times and the results are averaged.

    Parameters
    ----------
    X_biweekly : numpy.ndarray
        2-D array, one row per member of population X.
    Y_biweekly : numpy.ndarray
        2-D array with the same number of rows as ``Y_weekly``.
    Y_weekly : numpy.ndarray
        2-D array of shape ``(M, T)``; ``T`` sets the number of weeks.
    niter : int, default 100
        Number of random subsamples to average over.
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. ``None`` (default) keeps the previous behaviour
        of drawing from the global ``np.random`` state, so ``np.random.seed``
        still controls the result. An int seed or a ``Generator`` makes the
        call reproducible without touching global state.

    Returns
    -------
    numpy.ndarray
        Length ``T - 1`` array, entry ``i - 1`` holding the mean over
        iterations for week ``i``. With ``niter=0`` the mean of an empty list
        is returned (NaN, with a NumPy warning).
    """
    n_x = np.shape(X_biweekly)[0]
    n_y, n_weeks = np.shape(Y_weekly)

    # None -> legacy global state; anything else is normalised to a Generator
    # (default_rng returns an existing Generator unchanged).
    sampler = np.random if rng is None else np.random.default_rng(rng)

    per_iteration = []
    for _ in range(niter):
        # Subsample whichever population is larger so rows can be paired 1:1.
        if n_x <= n_y:
            rows = sampler.choice(n_y, size=n_x, replace=False)
            x_past, y_now, y_past = X_biweekly, Y_weekly[rows], Y_biweekly[rows]
        else:
            rows = sampler.choice(n_x, size=n_y, replace=False)
            x_past, y_now, y_past = X_biweekly[rows], Y_weekly, Y_biweekly

        per_iteration.append([
            get_cmi(x_past[:, i - 1], y_now[:, i], y_past[:, i - 1])
            for i in range(1, n_weeks)
        ])

    return np.mean(per_iteration, axis=0)

In [6]:
# Population groups; these strings are compared against the 'Label_str' column.
groups = [
    'Backbench MP',
    'Cabinet MP',
    'Shadow cabinet MP',
    'Opposition backbench MP',
    'Prominent journalists',
    'Rest of the journalists',
]

# topics[i] is analysed using the time-series files whose name ends in lbl{labels[i]}.
topics = [
    'ukraine',
    'covid',
    'costofliving',
    'brexit',
]
labels = [
    1,
    3,
    5,
    6,
]

In [10]:
# For every topic, estimate the information flow from each group (x) to the
# rest of the population (y) and write one CSV per topic, one row per group.
OUTPUT_DIR = 'Transfer_entropy'
SEED = 0

# Fresh checkouts may not have the output folder yet.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# get_mi_diff_pop subsamples at random; seed the global state it draws from by
# default so Restart & Run All reproduces the same files.
np.random.seed(SEED)

with warnings.catch_warnings():
    # All warnings raised by the pandas/numpy calls below are silenced on purpose.
    warnings.simplefilter('ignore')

    for topic, lbl in zip(topics, labels):
        # Load once per topic; the split below only takes row subsets of it.
        timeseries = get_timeseries(lbl)

        te_by_group = {}
        for group in groups:
            # x: group of interest
            # y: rest of the population
            timeseries_x, timeseries_y = split_timeseries(group, timeseries)

            # NOTE(review): get_timeseries places 'Label' and 'Label_str' at
            # column positions 1 and 2, and min_day=0 starts the first
            # biweekly window at position 1, so that window appears to span
            # the label columns -- confirm this offset is intended.
            X_biweekly = get_timeseries_array(0, 14, timeseries_x).to_numpy()
            Y_biweekly = get_timeseries_array(0, 14, timeseries_y).to_numpy()
            Y_weekly = get_timeseries_array(7, 7, timeseries_y).to_numpy()

            # Only the x -> y direction is saved, so the reverse direction
            # (which would double the run time) is not computed here.
            te_by_group[group] = get_mi_diff_pop(X_biweekly, Y_biweekly, Y_weekly)

        # One row per group; the number of week columns follows the data
        # instead of being fixed in advance.
        df = pd.DataFrame.from_dict(te_by_group, orient='index')
        col_list = [f'Week{i + 1}' for i in range(df.shape[1])]
        df.columns = col_list

        df.to_csv(f'{OUTPUT_DIR}/{topic}.csv')