Diffusion model based on https://github.com/iconvk/LearningIndependentCascadeOnVK repurposed for the ParlaMint data set.

In [None]:
import re
import os
import glob
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

Preparing the ParlaMint data

In [None]:
# Root directory of the raw ParlaMint download (machine-specific absolute path).
dir_path = r'C:\Users\soren\Documents\beyond_rep\parlamint_raw'
# Keep every walked directory whose path has 'txt' at positions [-8:-5].
# NOTE(review): this relies on the exact folder naming of the raw download — verify.
folders = [x[0] for x in os.walk(dir_path) if x[0][-8:-5] == 'txt']


# `df` accumulates the rows of all folders; `counter` is only used for progress output.
df = pd.DataFrame()
counter = 1
for folder in folders:
    # Rows collected for the current folder only.
    df_country = pd.DataFrame()

    # Changes the process-wide working directory so the glob below is relative to `folder`.
    # NOTE(review): the relative 'parlamint_csv/...' path written further down is therefore
    # also resolved inside `folder` — confirm that this is intended.
    os.chdir(folder)
    
    # Metadata files (one per transcript file).
    tsv_files = glob.glob('*meta-en.tsv')

    for file in tsv_files:
        path = folder + '/' + file
        d_temp = pd.read_csv(path, sep='\t')

        try:
            # path[:-12] strips the last 12 characters (the length of '-meta-en.tsv') to get
            # the companion text file; it is read as two tab-separated columns: ID and text.
            df_text = pd.read_csv(path[:-12] + '.txt', sep = '\t', header = None).rename(columns={0:'ID', 1:'text'})

            # Texts are attached by row position/index, not by matching on ID.
            # NOTE(review): assumes both files list utterances in the same order — verify.
            d_temp['text'] = df_text['text'] 
        except:
            # Best effort: on any failure the metadata is kept without a 'text' column.
            # NOTE(review): the bare except also hides unrelated errors (and Ctrl-C).
            print('failed... ' + path[:-12] + '.txt')
            pass

        # NOTE(review): `_append` is a private pandas method; pd.concat is the public API.
        df_country = df_country._append(d_temp)

    # The country code is sliced out of the folder path; which slice is used depends on the
    # total path length (71), so this only works for this exact absolute `dir_path`.
    if len(folder) == 71:
        country = folder[-11:-9]
    else:
        country = folder[-15:-10]

    print(f'{country}: {counter} of {len(folders)}')
    counter += 1

    df_country = df_country.assign(country = country)

    # NOTE(review): `df_country` is rebuilt for every folder, so if several folders map to
    # the same country code this file is overwritten on each iteration — verify.
    df_country.to_csv(f"parlamint_csv/{country}.csv")

    df = df._append(df_country)


# Replace the '-' placeholder by NaN and turn every other birth value into a string.
df['Speaker_birth'] = [str(i) if i != '-' else np.nan for i in df.Speaker_birth]

# Parse the sitting date ('From') and the birth year.
# NOTE(review): only the string 'nan' is skipped below; a float NaN produced by the line
# above would be passed to strptime and raise a TypeError — verify with real data.
df = df.assign(date = [datetime.strptime(i, '%Y-%m-%d') for i in df['From']],
                    Speaker_birth = [datetime.strptime(i, '%Y') if i != 'nan' else i for i in df['Speaker_birth']])

# Age in years: day difference divided by 365 and rounded (leap days are ignored).
df['Speaker_age'] = [i if pd.isna(i) else round(int(i)/365) for i in ((df.date - df.Speaker_birth).dt.days)]

# Calendar year taken from the first four characters of 'From'.
df['year'] = [int(i[0:4]) for i in df.From]

# Binary gender flag: 0 for 'M', 1 for every other value (including missing ones).
df['gender'] = [0 if i == 'M' else 1 for i in df.Speaker_gender]

df.to_csv("PATH_TO/ParlaMint_data.csv")

Fitting the diffusion model

In [1]:
import networkx as nx
import itertools

import pickle as pkl
from tqdm import tqdm
import pandas as pd

from collections import defaultdict

from scipy.spatial.distance import cosine
from sklearn.model_selection import train_test_split


import string
import matplotlib.pyplot as plt
import numpy as np

from collections import defaultdict
from multiprocessing import Pool
from collections import Counter
from math import log
from scipy.spatial.distance import cosine


import pickle
from sklearn import metrics
from sentence_transformers import SentenceTransformer
import torch_directml

In [2]:
def data_to_network_year(data, country, house):
    """Build the speaker co-occurrence graph for one period and pickle it.

    Every distinct combination of speaker attributes in `data` becomes a node
    (the first combination seen for a name wins). Two speakers are linked when
    they share a `dateRank`; the edge weight counts how many speech pairs they
    share. Edges are stored from the alphabetically smaller name to the larger
    one. The graph is written to
    ``yearWise\\network{country}{house}{max year}{min year}.pkl``.
    """
    attribute_columns = ['Speaker_name', 'Speaker_minister', 'Speaker_party',
                         'Speaker_gender', 'Speaker_birth', 'Party_status']
    G = nx.DiGraph()

    # One row per distinct attribute combination, most frequent first.
    profiles = pd.DataFrame(data[attribute_columns].value_counts().reset_index())
    for _, profile in profiles.iterrows():
        speaker = profile['Speaker_name']
        if speaker in G.nodes():
            # Keep the attributes of the first (most frequent) combination.
            continue
        G.add_node(
            speaker,
            age=profile['Speaker_birth'],
            gender=profile['Speaker_gender'],
            party=profile['Speaker_party'],
            party_status=profile['Party_status'],
            minister=profile['Speaker_minister'],
        )

    # Link all pairs of different speakers who spoke on the same date rank.
    for rank in set(data['dateRank'].to_list()):
        same_day_speakers = data.loc[data['dateRank'] == rank, 'Speaker_name'].tolist()
        for first, second in itertools.combinations(same_day_speakers, 2):
            if first == second:
                continue
            source, target = sorted((first, second))
            if G.has_edge(source, target):
                G[source][target]['weight'] += 1
            else:
                G.add_edge(source, target, weight=1)

    with open(f'yearWise\\network{country}{house}{max(data.year)}{min(data.year)}.pkl', 'wb') as f:
        pickle.dump(G, f, pickle.HIGHEST_PROTOCOL)


In [3]:
def filter_relevant_nodes_year(data, country, house):
    """Restrict the period graph to active speakers and collect their speeches.

    Loads the graph pickled by the network-building step, keeps the speakers
    with at least five non-null texts in `data`, converts the result to an
    undirected graph and keeps one connected component of it. The filtered
    graph and the speeches of its nodes are pickled under ``yearWise``.

    Raises:
        ValueError: if no speaker passes the filter, so there is no component.
    """
    suffix = f'{country}{house}{max(data.year)}{min(data.year)}'
    with open(f'yearWise\\network{suffix}.pkl', 'rb') as f:
        G = pickle.load(f)

    # Speakers with at least 5 speeches that have a (non-null) text.
    speech_counts = data.groupby('Speaker_name')['text'].count()
    frequent_speakers = set(speech_counts[speech_counts >= 5].index)
    valid_nodes = [n for n in G.nodes() if n in frequent_speakers]

    g = G.subgraph(valid_nodes).to_undirected()
    # NOTE(review): this keeps the FIRST component yielded by networkx, which is
    # not necessarily the largest one; use max(..., key=len) if the giant
    # component is what is wanted.
    first_component = next(nx.connected_components(g), None)
    if first_component is None:
        raise ValueError(f'No speaker with at least 5 speeches for {suffix}')
    g = g.subgraph(first_component)

    with open(f'yearWise\\filtered_network{suffix}.pkl', 'wb') as f:
        pickle.dump(g, f, pickle.HIGHEST_PROTOCOL)

    # Speeches held by speakers that survived the filtering.
    posts_in_graph = pd.DataFrame(data[data['Speaker_name'].isin(list(g.nodes()))])
    posts_in_graph = posts_in_graph[['Speaker_name', 'timeOfDay', 'dateRank', 'text']].reset_index()
    # Context manager so the file is flushed and closed deterministically.
    with open(f'yearWise\\new_collected_posts_{suffix}.pkl', "wb") as f:
        pkl.dump(posts_in_graph, f)
    

In [4]:
def mean_document_pool(embeddings):
    """Return the element-wise mean of a non-empty collection of embeddings.

    Raises:
        ValueError: if `embeddings` is empty.
    """
    if not len(embeddings):
        raise ValueError("Empty list of embeddings")

    # Average over the first axis, i.e. across the individual embeddings.
    stacked = list(embeddings)
    return np.mean(stacked, axis=0)

def recursive_split(string, embedding_model, max_tokens=512):
    """Split `string` into parts of at most `max_tokens` tokens each.

    The text is cut at the middle sentence terminator ('.', '!', '?'); if there
    is none, at the middle of the secondary punctuation marks; if there is none
    either, at the middle whitespace character; and as a last resort at the
    middle character. The separator character itself is dropped (except for
    the last-resort cut, which keeps every character). Parts that are still
    too long are split again recursively.

    Args:
        string: text to split.
        embedding_model: object exposing ``tokenizer(text)['input_ids']``.
        max_tokens: maximum number of token ids allowed per part.

    Returns:
        List of text parts in their original order.
    """
    if len(embedding_model.tokenizer(string)['input_ids']) <= max_tokens:
        return [string]
    if len(string) <= 1:
        # Cannot be split any further; avoids endless recursion.
        return [string]

    # Candidate cut positions, from most to least preferred.
    cut_positions = [i for i, char in enumerate(string) if char in ['.', '!', '?']]
    if not cut_positions:
        cut_positions = [i for i, char in enumerate(string) if char in ['/', ',', ':', ';', '(', ')']]
    if not cut_positions:
        # Previously an IndexError: text without any punctuation at all.
        cut_positions = [i for i, char in enumerate(string) if char.isspace()]

    if cut_positions:
        # Middle candidate; for an even count take the one before the midpoint.
        cut = cut_positions[(len(cut_positions) - 1) // 2]
        first_part = string[:cut]
        second_part = string[cut + 1:]
    else:
        # No separator of any kind: hard cut in the middle, keeping all characters.
        cut = len(string) // 2
        first_part = string[:cut]
        second_part = string[cut:]

    # The recursive call returns the part unchanged when it already fits.
    return (recursive_split(first_part, embedding_model, max_tokens)
            + recursive_split(second_part, embedding_model, max_tokens))


def getSpeechEmbd(speech, model):
    """Embed a whole speech as the mean of the embeddings of its parts.

    The speech is first cut into model-sized parts with `recursive_split`;
    every part is encoded on its own and the part vectors are averaged.
    """
    parts = recursive_split(speech, model)
    part_vectors = [model.encode(part) for part in parts]
    return mean_document_pool(part_vectors)

def are_posts_close(th, post1_emb, post2_emb):
    """Compare the cosine similarity of two embeddings with every cut-off.

    Returns a list with one boolean per value in `th`: True where the
    similarity is at least that cut-off.
    """
    # scipy's `cosine` is a distance (1 - cos), so convert it back to a similarity.
    similarity = 1. - cosine(post1_emb, post2_emb)
    return [similarity >= cutoff for cutoff in th]


In [5]:
def speech2vec_year(country, year):
    """Embed every collected speech and pickle the result.

    Reads ``yearWise\\new_collected_posts_{country}{year}.pkl``, adds a
    ``speechVector`` column and writes
    ``yearWise\\speechVectors{country}{year}.pkl``.

    Args:
        country: country code used in the file names.
        year: text placed directly after `country` in the file names.
            NOTE(review): the other pipeline steps name their files
            ``{country}{house}{max year}{min year}``, so this presumably has to
            be that whole ``{house}{max year}{min year}`` suffix — verify
            against the caller.
    """
    device = torch_directml.device()

    model = SentenceTransformer('intfloat/multilingual-e5-base', device = device)

    with open(f'yearWise\\new_collected_posts_{country}{year}.pkl', "rb") as f:
        posts_in_graph = pkl.load(f)

    speechVectors = []
    # Embedding used for rows whose text is not a string (e.g. missing text);
    # computed at most once instead of once per such row.
    placeholder_vector = None

    for speech in tqdm(posts_in_graph.text):
        if isinstance(speech, str):
            speechVectors.append(getSpeechEmbd(speech, model))
        else:
            if placeholder_vector is None:
                placeholder_vector = getSpeechEmbd('(...)', model)
            speechVectors.append(placeholder_vector)

    posts_in_graph['speechVector'] = speechVectors
    with open(f'yearWise\\speechVectors{country}{year}.pkl', "wb") as f:
        pkl.dump(posts_in_graph, f)


In [6]:
def calculate_Av2u_year(data, country, house):
    """Count, per similarity cut-off, how often a neighbor's speech was followed
    by a similar speech of a speaker (training split only).

    Loads the embedded speeches and the filtered graph of the period, splits
    the speeches 70/30 into train and test (both pickled), then walks the
    training speeches in chronological order. For every speech of speaker `u`
    and every graph neighbor, the neighbor's earlier speeches are scanned from
    newest to oldest. A pair counts for cut-off index `i` when the embeddings
    are close enough, the neighbor speech has not already been counted for `u`,
    and the neighbor has not already been counted for the current speech.
    The result, a list of ``Counter`` objects keyed by ``(neighbor, u)`` with
    one entry per value in the module-level `th`, is pickled as ``Av2u``.

    Relies on the module-level list `th` of similarity cut-offs.
    NOTE(review): `train_test_split` is called without `random_state`, so the
    split differs between runs.
    """
    suffix = f'{country}{house}{max(data.year)}{min(data.year)}'
    with open(f'yearWise\\speechVectors{suffix}.pkl', "rb") as f:
        posts_in_graph = pkl.load(f)
    with open(f'yearWise\\filtered_network{suffix}.pkl', 'rb') as f:
        G_with_posts = pickle.load(f)

    train_posts_in_graph, test_posts_in_graph = train_test_split(posts_in_graph, test_size=0.3)
    train_posts_in_graph.to_pickle(f'yearWise\\train_posts_in_graph{suffix}.pkl')
    test_posts_in_graph.to_pickle(f'yearWise\\test_posts_in_graph{suffix}.pkl')

    # One counter / one "already counted" set per tau cut-off.
    Av2u = [Counter() for _ in th]
    already_success = [set() for _ in th]
    # Speeches seen so far per speaker, oldest first (scanned in reverse below).
    published = defaultdict(list)
    time_threshold = 3  # maximum distance in date ranks

    train_posts_in_graph = train_posts_in_graph.sort_values(["dateRank", 'timeOfDay'])
    for post_ind in tqdm(train_posts_in_graph.index):
        post = train_posts_in_graph.loc[post_ind]
        timeOfDay, time, post1embd, post1, u = post['timeOfDay'], post['dateRank'], post['speechVector'], post['text'], post['Speaker_name']
        for neighbor in G_with_posts.neighbors(u):
            # Cut-offs for which this neighbor already influenced the current speech.
            forbidden_i = set()
            for post2, post2embd, time2, timeOfDay2 in reversed(published[neighbor]):
                if time - time2 > time_threshold:
                    break  # this and all older speeches are too old
                # NOTE(review): a same-day earlier speech stops the scan, so older
                # speeches of this neighbor are not examined either; check whether
                # `continue` was intended.
                if (time == time2) and (timeOfDay > timeOfDay2):
                    break
                try:
                    th_succ = are_posts_close(th, post1embd, post2embd)
                except Exception:
                    # Best effort: skip pairs whose similarity cannot be computed.
                    # `Exception` (not a bare except) keeps Ctrl-C working.
                    continue
                for i, is_close in enumerate(th_succ):
                    if is_close and ((post2, u) not in already_success[i]) and (i not in forbidden_i):
                        Av2u[i][(neighbor, u)] += 1
                        # u cannot be credited again for the same neighbor speech.
                        already_success[i].add((post2, u))
                        forbidden_i.add(i)
        published[u].append((post1, post1embd, time, timeOfDay))

    with open(f'yearWise\\Av2u{suffix}.pkl', "wb") as f:
        pkl.dump(Av2u, f)


In [7]:
def assign_probabilities_to_edges_year(data, country, house):
    """Turn the influence counts into edge weights, one graph per cut-off.

    For every cut-off index in the module-level `th`, each counted pair
    ``(v, u)`` that lies on an edge of the filtered graph gets the weight
    ``Av2u[(v, u)] / Au[v]``, where ``Au[v]`` is the number of non-null texts
    of `v` in the collected speeches. The resulting list of directed graphs is
    pickled as ``weighted_Gs``.
    """
    suffix = f'{country}{house}{max(data.year)}{min(data.year)}'
    with open(f'yearWise\\Av2u{suffix}.pkl', "rb") as f:
        Av2u = pkl.load(f)
    with open(f'yearWise\\filtered_network{suffix}.pkl', 'rb') as f:
        G_with_posts = pickle.load(f)
    with open(f'yearWise\\new_collected_posts_{suffix}.pkl', "rb") as f:
        posts_in_graph = pkl.load(f)

    # Number of speeches per speaker (the denominator of the probability).
    Au = posts_in_graph.groupby("Speaker_name")['text'].count()

    weighted_Gs = []
    for th_index in tqdm(range(len(th))):
        counts = Av2u[th_index]
        all_edges = []
        # The filtered graph is undirected, so test both directions of each edge.
        for first, second in G_with_posts.edges():
            for source, target in ((first, second), (second, first)):
                if (source, target) in counts:
                    all_edges.append((source, target, counts[(source, target)] / Au[source]))
        weighted_graph = nx.DiGraph()
        weighted_graph.add_weighted_edges_from(all_edges)
        weighted_Gs.append(weighted_graph)

    with open(f'yearWise\\weighted_Gs{suffix}.pkl', "wb") as f:
        pkl.dump(weighted_Gs, f)


In [8]:
def test_prediction_year(data, country, house):
    """Count observed successes/failures per directed edge on the test split.

    Walks the test speeches in chronological order. Each speech of `u` adds one
    failure (per cut-off) to every outgoing edge ``u -> n``. When a later speech
    of a neighbor is close enough to it, one failure on that edge is converted
    into a success. For every cut-off in the module-level `th`, the tuples
    ``(successes, failures, learned probability)`` of all edges with at least
    one observation are collected and pickled as ``rates``.
    """
    suffix = f'{country}{house}{max(data.year)}{min(data.year)}'
    with open(f'yearWise\\filtered_network{suffix}.pkl', 'rb') as f:
        G_with_posts = pickle.load(f)
    with open(f'yearWise\\weighted_Gs{suffix}.pkl', "rb") as f:
        weighted_Gs = pkl.load(f)
    with open(f'yearWise\\test_posts_in_graph{suffix}.pkl', 'rb') as f:
        test_posts_in_graph = pkl.load(f)
    test_posts_in_graph = test_posts_in_graph.sort_values(["dateRank", 'timeOfDay'])

    n_thresholds = len(th)
    G_with_posts_directed = G_with_posts.to_directed()
    for source, target in G_with_posts_directed.edges():
        G_with_posts_directed[source][target]['success'] = [0] * n_thresholds
        G_with_posts_directed[source][target]['failure'] = [0] * n_thresholds

    # Speeches seen so far per speaker, oldest first (scanned in reverse below).
    published = defaultdict(list)
    posts_that_has_been_reposted = set()  # tuples <source post, target_user, threshold>
    time_threshold = 3  # maximum distance in date ranks

    for post_index in tqdm(test_posts_in_graph.index):
        post = test_posts_in_graph.loc[post_index]
        timeOfDay, time, post1embd, post1, u = post['timeOfDay'], post['dateRank'], post['speechVector'], post['text'], post['Speaker_name']
        for neighbor in G_with_posts.neighbors(u):
            edge_stats = G_with_posts_directed[neighbor][u]
            thresholds_where_neighbor_have_influenced = set()
            for post2, post2embd, time2, timeOfDay2 in reversed(published[neighbor]):
                if (time - time2 > time_threshold) or (len(thresholds_where_neighbor_have_influenced) == n_thresholds):
                    break  # other posts of neighbor are either too old, or already influenced u on the post
                try:
                    th_succ = are_posts_close(th, post1embd, post2embd)
                except Exception:
                    # Best effort: skip pairs whose similarity cannot be computed.
                    # `Exception` (not a bare except) keeps Ctrl-C working.
                    continue
                for i, is_close in enumerate(th_succ):
                    if is_close and ((post2, u, i) not in posts_that_has_been_reposted) and (i not in thresholds_where_neighbor_have_influenced):
                        posts_that_has_been_reposted.add((post2, u, i))
                        edge_stats['success'][i] += 1
                        edge_stats['failure'][i] -= 1
                        # Invariant: every success was first booked as a failure when
                        # the neighbor published. No longer hidden by a bare except.
                        assert edge_stats['failure'][i] >= 0
                        thresholds_where_neighbor_have_influenced.add(i)
        published[u].append((post1, post1embd, time, timeOfDay))
        # Until someone reacts, the new speech is a failure on every outgoing edge.
        for n in G_with_posts_directed.successors(u):
            failures = G_with_posts_directed[u][n]['failure']
            for i in range(n_thresholds):
                failures[i] += 1

    rates = []
    for i in tqdm(range(n_thresholds)):
        rates_for_threshold = []
        for source, target, attrs in G_with_posts_directed.edges(data=True):
            if weighted_Gs[i].has_edge(source, target):
                prob = weighted_Gs[i][source][target]['weight']
            else:
                prob = 0
            if attrs['success'][i] > 0 or attrs['failure'][i] > 0:
                rates_for_threshold.append((attrs['success'][i], attrs['failure'][i], prob))
        rates.append(rates_for_threshold)

    with open(f'yearWise\\rates{suffix}.pkl', "wb") as f:
        pkl.dump(rates, f)


In [9]:
def get_tpr_fpr(data):
    """Compute the cumulative TPR/FPR points of an ROC curve.

    `data` holds ``(successes, failures, probability)`` tuples. They are ranked
    by decreasing probability and the running shares of all successes (TPR) and
    of all failures (FPR) are recorded after each tuple. Both returned lists
    start with 0; a rate stays 0 when its total is 0.
    """
    ranked = sorted(data, key=lambda item: -item[2])  # highest probability first
    total_positives = sum(item[0] for item in ranked)
    total_negatives = sum(item[1] for item in ranked)

    tpr_list = [0]
    fpr_list = [0]
    true_positives = 0
    false_positives = 0
    for successes, failures, _ in ranked:
        true_positives += successes
        false_positives += failures
        tpr_list.append(true_positives / total_positives if total_positives != 0 else 0)
        fpr_list.append(false_positives / total_negatives if total_negatives != 0 else 0)

    return tpr_list, fpr_list


def calculate_roc_curve(data, country, house):
    """Pick the similarity cut-off with the highest ROC AUC and plot AUC vs cut-off.

    Loads the pickled ``rates`` of the period, computes the ROC curve and its
    AUC for every value in the module-level `th`, plots the AUC values and
    prints the best cut-off.

    Returns:
        A one-element list containing the cut-off with the highest AUC.
    """
    # The file does not depend on the threshold, so load it once.
    with open(f'yearWise\\rates{country}{house}{max(data.year)}{min(data.year)}.pkl', "rb") as f:
        rates = pkl.load(f)

    auc_values = []
    for i, _ in tqdm(enumerate(th)):
        tpr, fpr = get_tpr_fpr(rates[i])
        # metrics.auc takes (x, y): FPR on the x-axis, TPR on the y-axis.
        # With the arguments swapped the result is the area of the mirrored
        # curve, so the argmax below would select the worst cut-off.
        auc_values.append(metrics.auc(fpr, tpr))

    max_auc_index = np.argmax(auc_values)
    best_threshold = th[max_auc_index]
    best_auc = auc_values[max_auc_index]
    best_thresholds = [best_threshold]

    plt.plot(th, auc_values)
    plt.xlabel('Threshold')
    plt.ylabel('AUC')
    plt.title(f'{country} Area Under the ROC Curve (AUC) for Different Thresholds')
    plt.grid(True)
    plt.show()
    print(f"Best threshold value: {best_threshold}")
    print(f"Highest AUC: {best_auc}")

    return best_thresholds


In [10]:
def retrain_with_best_th_year(data, country, house, best_threshold):
    """Re-estimate the edge probabilities on ALL speeches with the chosen cut-off.

    Same counting procedure as the training step, but run on the complete set
    of embedded speeches and only for `best_threshold`. The weighted graph(s)
    are pickled as ``weighted_Gs_retrained`` and the edge list of the last
    cut-off as ``all_edges``.

    Args:
        data: speeches of the period (only `data.year` is read, for file names).
        country, house: identifiers used in the file names.
        best_threshold: a single cut-off, or a list/tuple/array of cut-offs.
    """
    # Accept both a scalar and a sequence of cut-offs.
    if isinstance(best_threshold, (list, tuple, np.ndarray)):
        thresholds = list(best_threshold)
    else:
        thresholds = [best_threshold]

    suffix = f'{country}{house}{max(data.year)}{min(data.year)}'
    with open(f'yearWise\\speechVectors{suffix}.pkl', "rb") as f:
        posts_in_graph = pkl.load(f)
    with open(f'yearWise\\filtered_network{suffix}.pkl', 'rb') as f:
        G_with_posts = pickle.load(f)

    Av2u = [Counter() for _ in thresholds]
    already_success = [set() for _ in thresholds]
    # Speeches seen so far per speaker, oldest first (scanned in reverse below).
    published = defaultdict(list)
    time_threshold = 3  # maximum distance in date ranks

    posts_in_graph = posts_in_graph.sort_values(["dateRank", 'timeOfDay'])
    for post_ind in tqdm(posts_in_graph.index):
        post = posts_in_graph.loc[post_ind]
        timeOfDay, time, post1embd, post1, u = post['timeOfDay'], post['dateRank'], post['speechVector'], post['text'], post['Speaker_name']
        for neighbor in G_with_posts.neighbors(u):
            # Cut-offs for which this neighbor already influenced the current speech.
            forbidden_i = set()
            for post2, post2embd, time2, timeOfDay2 in reversed(published[neighbor]):
                if time - time2 > time_threshold:
                    break  # this and all older speeches are too old
                # NOTE(review): a same-day earlier speech stops the scan, so older
                # speeches of this neighbor are not examined either; check whether
                # `continue` was intended.
                if (time == time2) and (timeOfDay > timeOfDay2):
                    break
                try:
                    # Compare against the chosen cut-off(s). Using the global `th`
                    # here made index 0 mean th[0] instead of the best threshold.
                    th_succ = are_posts_close(thresholds, post1embd, post2embd)
                except Exception:
                    # Best effort: skip pairs whose similarity cannot be computed.
                    continue
                for i, is_close in enumerate(th_succ):
                    if is_close and ((post2, u) not in already_success[i]) and (i not in forbidden_i):
                        Av2u[i][(neighbor, u)] += 1
                        # u cannot be credited again for the same neighbor speech.
                        already_success[i].add((post2, u))
                        forbidden_i.add(i)
        published[u].append((post1, post1embd, time, timeOfDay))

    # Number of speeches per speaker (the denominator of the probability).
    Au = posts_in_graph.groupby("Speaker_name")['text'].count()

    weighted_Gs = []
    all_edges = []
    for th_index in tqdm(range(len(thresholds))):
        counts = Av2u[th_index]
        all_edges = []
        # The filtered graph is undirected, so test both directions of each edge.
        for first, second in G_with_posts.edges():
            for source, target in ((first, second), (second, first)):
                if (source, target) in counts:
                    all_edges.append((source, target, counts[(source, target)] / Au[source]))
        weighted_graph = nx.DiGraph()
        weighted_graph.add_weighted_edges_from(all_edges)
        weighted_Gs.append(weighted_graph)

    with open(f'yearWise\\weighted_Gs_retrained{suffix}.pkl', "wb") as f:
        pkl.dump(weighted_Gs, f)
    with open(f'yearWise\\all_edges{suffix}.pkl', "wb") as f:
        pkl.dump(all_edges, f)

#retrain_with_best_th_year(df, country, house, best_threshold)

In [13]:
# (country code, house) pairs to process. The two parts are concatenated to look
# up the matching entry of `elections` (e.g. 'BE' + 'low' -> 'BElow') and are
# used in the names of all files written under yearWise.
countryHouseList = [('AT', 'all'), 
                    ('BA', 'all'), 
                    ('BE', 'com'),
                    ('BE', 'low'),
                    ('BG', 'all'), 
                    ('CZ', 'all'), 
                    ('DK', 'all'), 
                    ('EE', 'all'), 
                    ('ES', 'all'), 
                    ('ES-CT', 'all'), 
                    ('ES-GA', 'all'), 
                    ('ES-PV', 'all'), 
                    ('FI', 'all'), 
                    ('FR', 'all'),  
                    ('GB', 'low'), 
                    ('GB', 'upp'), 
                    ('GR', 'all'), 
                    ('HR', 'all'), 
                    ('HU', 'all'), 
                    ('IS', 'all'), 
                    ('IT', 'all'), 
                    ('LV', 'all'), 
                    ('NL', 'low'), 
                    ('NL', 'upp'), 
                    ('NO', 'all'), 
                    ('PL', 'low'), 
                    ('PL', 'upp'), 
                    ('PT', 'all'), 
                    ('RS', 'all'),  
                    ('SE', 'all'),  
                    ('SI', 'all'),  
                    ('TR', 'all'), 
                    ('UA', 'all') 
                    ]

In [11]:
# Period boundaries per '{country}{house}' key. Consecutive entries delimit one
# analysis window. Dates are written day-first (DD-MM-YYYY), with either '-' or
# '/' as separator; the first and last entry of every list are the same outer
# bounds ('01-01-1996' and '16/09/2024').
# Fix: the stray empty string in 'BAall' was removed, because an empty entry
# cannot be compared with the speech dates.
# NOTE(review): some dates look like they need checking against the official
# election dates (e.g. 'RSall' lists '21-06-2021').
elections = {'ATall':['01-01-1996', '03/10/1999', '24/11/2002', '01/10/2006', '28/09/2008', '29/09/2013', '15/10/2017', '29/09/2019', '16/09/2024'],
             'BAall':['01-01-1996', '05/10/2002', '01-10-2006','03-10-2010','12-10-2014','07-10-2018','02-10-2022','16/09/2024'],
             'BEcom':['01-01-1996', '25-05-2014', '26-05-2019', '16/09/2024'],
             'BElow':['01-01-1996', '25-05-2014', '26-05-2019', '16/09/2024'],
             'BGall':['01-01-1996', '05-10-2014', '26-03-2017', '04-04-2021', '11-07-2021', '21-11-2021', '16/09/2024'],
             'CZall':['01-01-1996', '25-10-2013', '20-10-2017', '08-10-2021', '16/09/2024'],
             'DKall':['01-01-1996', '18-06-2015', '05-06-2019', '16/09/2024'],
             'EEall':['01-01-1996', '06-03-2011', '01-03-2015', '03-03-2019', '16/09/2024'],
             'ESall':['01-01-1996', '20-12-2015', '26-06-2016', '28-04-2019', '10-11-2019', '23-07-2023', '16/09/2024'],
             'ES-CTall':['01-01-1996', '27-09-2015', '21-12-2017', '14-02-2021', '16/09/2024'],
             'ES-GAall':['01-01-1996', '25-09-2016', '12-07-2020', '16/09/2024'],
             'ES-PVall':['01-01-1996', '25-09-2016', '12/07/2020', '16/09/2024'],
             'FIall':['01-01-1996', '19-04-2015', '14-04-2019', '16/09/2024'],
             'FRall':['01-01-1996', '18-06-2017', '19-06-2022', '16/09/2024'],
             'GBlow':['01-01-1996', '06-01-2016', '06-01-2017', '06-01-2018', '12-12-2019', '06-01-2020', '06-01-2021', '16/09/2024'],
             'GBupp':['01-01-1996', '16/09/2016', '16/09/2018', '16/09/2020', '16/09/2022', '16/09/2024'],
             'GRall':['01-01-1996', '25-01-2015', '20-09-2015', '07-07-2019', '16/09/2024'],
             'HRall':['01-01-1996', '25-11-2007', '04-12-2011', '08-11-2015', '11-09-2016', '05-07-2020', '16/09/2024'],
             'HUall':['01-01-1996', '06-04-2014', '08-04-2018', '03-04-2022', '16/09/2024'],
             'ISall':['01-01-1996', '29/10/2016', '28-10-2017', '25-09-2021', '16/09/2024'],
             'ITall':['01-01-1996', '24-02-2013', '04-03-2018', '25-09-2022', '16/09/2024'],
             'LVall':['01-01-1996', '04-10-2014', '01-10-2018', '01-10-2022', '16/09/2024'],
             'NLlow':['01-01-1996', '12-09-2012', '15-03-2017', '17-03-2021', '16/09/2024'],
             'NLupp':['01-01-1996', '26-05-2015', '27-05-2019', '16/09/2024'],
             'NOall':['01-01-1996', '14-09-2009', '09-09-2013', '11-09-2017', '13-09-2021', '16/09/2024'],
             'PLlow':['01-01-1996', '25-10-2015', '13-10-2019', '16/09/2024'],
             'PLupp':['01-01-1996', '25-10-2015', '13-10-2019', '16/09/2024'],
             'PTall':['01-01-1996', '04-10-2015', '06/10/2019', '30-01-2022', '16/09/2024'],
             'RSall':['01-01-1996', '05-10-1997', '23-12-2000', '28-12-2003', '21-01-2007', '11-05-2008', '06-05-2012', '16-03-2014', '24-04-2016', '21-06-2021', '03-04-2022', '16/09/2024'],
             'SEall':['01-01-1996', '14/09/2014', '09-09-2018', '11-09-2022', '16/09/2024'],
             'SIall':['01-01-1996', '15-10-2000', '03-10-2004', '21-09-2008', '04-12-2011', '13-07-2014', '03-06-2018', '24-04-2022', '16/09/2024'],
             'TRall':['01-01-1996', '12-06-2011', '07-06-2015', '01-11-2015', '24-06-2018', '16/09/2024'],
             'UAall':['01-01-1996', '28-10-2012', '26-10-2014', '21-07-2019','16/09/2024']
             }

In [None]:
# Similarity cut-offs (tau). Module-level on purpose: the pipeline functions
# read `th` as a global. It does not depend on the country, so it is built once.
th = list(np.arange(0.7,0.95,0.005)) + list(np.arange(0.95,0.999,0.001))

for country, house in countryHouseList:
    # NOTE(review): the CSV is selected by country only; `house` is used for the
    # election dates and file names but not to filter the speeches — verify.
    df = pd.read_csv(f'parlamint_csv\\{country}.csv')

    df = df.reset_index(drop=True)

    # Position of the speech within its day, and rank of the day itself.
    df['timeOfDay'] = df.groupby('Date').cumcount()
    df['dateRank'] = df.groupby(df['Date']).ngroup()
    df['year'] = [int(i[0:4]) for i in df.Date]
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

    df = df[df['Speaker_role'] != 'Chairperson']

    # Election dates are written day-first with '-' or '/' as separator. Parse
    # them explicitly: comparing the raw strings with a datetime column lets
    # pandas guess the format, which reads e.g. '03/10/1999' month-first.
    # Empty entries are skipped.
    boundaries = [(raw, pd.to_datetime(raw.replace('/', '-'), format='%d-%m-%Y'))
                  for raw in elections[f'{country}{house}'] if raw.strip()]

    for (start_raw, start), (end_raw, end) in zip(boundaries, boundaries[1:]):
        print(f'{start_raw} - {end_raw}')
        df_year = df[(df['Date'] > start) & (df['Date'] < end)]

        if len(df_year) != 0:
            # NOTE(review): all files are named after the max/min year of the
            # window, so two windows inside the same calendar year(s) write to
            # the same files.
            period_suffix = f'{house}{max(df_year.year)}{min(df_year.year)}'

            data_to_network_year(df_year, country, house) 
            filter_relevant_nodes_year(df_year, country, house)
            # speech2vec_year takes (country, suffix) and appends the suffix to the
            # country in its file names; the previous three-argument call raised
            # a TypeError.
            speech2vec_year(country, period_suffix)
            calculate_Av2u_year(df_year, country, house)
            assign_probabilities_to_edges_year(df_year, country, house)
            test_prediction_year(df_year, country, house)
            best_threshold = calculate_roc_curve(df_year, country, house)
            retrain_with_best_th_year(df_year, country, house, best_threshold)

Extracting influences and saving ROC data

In [None]:
import glob
import pandas as pd
import numpy as np
import os
import pickle as pkl
from sklearn import metrics
from tqdm import tqdm

In [None]:
def calculate_roc_curve(data):
    """Return the (TPR list, FPR list) of the ROC curve described by `data`.

    `data` is an iterable of ``(successes, failures, probability)`` tuples.
    The tuples are visited from the highest to the lowest probability while the
    cumulative successes and failures are normalised by their totals. Both
    lists start at 0, and a rate is 0 whenever its total is 0.
    """
    ranked = sorted(data, key=lambda entry: -entry[2])  # descending edge probability
    total_positives = sum(entry[0] for entry in ranked)
    total_negatives = sum(entry[1] for entry in ranked)

    def _share(count, total):
        # Guard against an empty class.
        if total != 0:
            return count / total
        return 0

    tpr_list, fpr_list = [0], [0]
    cumulative_successes = 0
    cumulative_failures = 0
    for successes, failures, _ in ranked:
        cumulative_successes += successes
        cumulative_failures += failures
        tpr_list.append(_share(cumulative_successes, total_positives))
        fpr_list.append(_share(cumulative_failures, total_negatives))

    return tpr_list, fpr_list

In [None]:
# extract auc and thresholds

os.chdir('PATH_TO\\yearWise')
ratesFiles = glob.glob('rates*') # weighted Gs

#ratesFiles = [i for i in ratesFiles if i[5:7] in ['NL']]

# Same cut-off grid that was used when the rates files were produced.
th = list(np.arange(0.7, 0.95, 0.005)) + list(np.arange(0.95, 0.999, 0.001))

# INITIALIZE EMPTY DATAFRAMES
d_ratesBest = pd.DataFrame()
d_ratesAll = pd.DataFrame()
d_ROC = pd.DataFrame()

# Rows/frames are collected in lists and combined once after the loop instead
# of growing DataFrames through the private `_append` method.
best_rows = []
roc_frames = []

downsample_size = 500  # Set the desired sample size here

for ratesFile in tqdm(ratesFiles):
    print(ratesFile)
    # LOAD RATES DATA FROM FILE
    with open(ratesFile, "rb") as f:
        rates = pkl.load(f)

    # COLLECT AUC VALUES FOR EACH THRESHOLD
    auc_values = []
    fpr_list = []
    tpr_list = []

    for i, threshold in enumerate(th):
        data = rates[i]
        tpr, fpr = calculate_roc_curve(data)
        fpr_list.append(fpr)
        tpr_list.append(tpr)
        auc = metrics.auc(fpr, tpr)
        auc_values.append(auc)

    # FIND THE BEST AUC AND CORRESPONDING THRESHOLD
    max_auc_index = np.argmax(auc_values)
    best_threshold = th[max_auc_index]
    best_auc = auc_values[max_auc_index]

    # EXTRACT UPPERCASE LETTERS FROM FILE NAME (country code without hyphen);
    # house and years are taken from fixed positions at the end of the name.
    country = ''.join([char for char in ratesFile if char.isupper()])
    house = ratesFile[-15:-12]
    year = ratesFile[-12:-4]

    # STORE BEST AUC AND THRESHOLD
    best_rows.append({'country': country,
                      'house': house,
                      'year': year,
                      'threshold': best_threshold,
                      'auc': best_auc
                      })

    # DOWNSAMPLE THE BEST ROC CURVE
    best_fpr = fpr_list[max_auc_index]
    best_tpr = tpr_list[max_auc_index]
    # Never ask for more points than the curve has: `sample` raises a ValueError
    # when n exceeds the population size (sampling without replacement).
    sample_size = min(downsample_size, len(best_fpr))
    # NOTE(review): the sampled points keep their random order; sort them before
    # plotting the curve as a line.
    sampled_indices = pd.Series(range(len(best_fpr))).sample(n=sample_size, random_state=1).tolist()
    downsampled_data = pd.DataFrame({
        'fpr': [best_fpr[i] for i in sampled_indices],
        'tpr': [best_tpr[i] for i in sampled_indices],
        'country': country,
        'year': year,
        'house': house})

    roc_frames.append(downsampled_data)

if best_rows:
    d_ratesBest = pd.DataFrame(best_rows)
if roc_frames:
    d_ROC = pd.concat(roc_frames, ignore_index=True)

d_ratesBest.to_csv('influence\\ratesBest.csv')
d_ROC.to_csv('influence\\ROC.csv')


In [None]:
# extract influence sum

os.chdir('C:\\Users\\soren\\Documents\\beyond_rep\\yearWise')
edgesFiles = glob.glob('all_edges*')

influence_frames = []

for edgesFile in tqdm(edgesFiles):
    with open(edgesFile, "rb") as f:
        edges = pkl.load(f)

    # Sum the outgoing edge weights per source speaker. Each edge is a
    # (source, target, weight) tuple.
    influence_sums = {}
    for edge in edges:
        source, weight = edge[0], edge[2]
        influence_sums[source] = influence_sums.get(source, 0) + weight

    # Built column-wise so that a file with an empty edge list yields an empty
    # frame with the expected columns instead of a KeyError on 'sum'.
    averages = pd.DataFrame({'name': list(influence_sums.keys()),
                             'sum': list(influence_sums.values())})

    # Country = uppercase letters of the file name (hyphen dropped); house and
    # years are taken from fixed positions at the end of the file name.
    averages['country'] = ''.join([char for char in edgesFile if char.isupper()])
    averages['house'] = edgesFile[-15:-12]
    averages['years'] = edgesFile[-12:-4]

    # Select relevant columns
    averages = averages[['name', 'sum', 'country', 'house', 'years']]

    influence_frames.append(averages)

if influence_frames:
    influence = pd.concat(influence_frames, ignore_index=True)
else:
    influence = pd.DataFrame(columns=['name', 'sum', 'country', 'house', 'years'])

# Speaker attributes, collected as parallel lists (one entry per influence row).
birthdays = []
gender = []
party = []
party_status = []
party_orientation = []
body = []
role = []
mp = []
minister = []
name = []

missingGender = pd.read_excel(f'C:\\Users\\soren\\Documents\\beyond_rep\\missingData\\missing_gender.xlsx')
missingGender = missingGender[['country', 'Speaker_name', 'gender']]
missingAge = pd.read_excel(f'C:\\Users\\soren\\Documents\\beyond_rep\\missingData\\missing_age.xlsx')

for country in tqdm(np.unique(influence.country)):
    influence_country = influence[influence['country'] == country]
    # Four-letter codes are regional parliaments whose hyphen was lost above
    # (e.g. 'ESCT' -> 'ES-CT'). Built outside the f-string so the code also
    # parses on Python versions before 3.12.
    if len(country) == 4:
        csv_name = country[0:2] + '-' + country[2:]
    else:
        csv_name = country
    d_country = pd.read_csv(f'C:\\Users\\soren\\Documents\\beyond_rep\\parlamint_csv\\{csv_name}.csv')

    # Adding missing gender
    d_country = pd.merge(d_country, missingGender, on=['country', 'Speaker_name'], how='left')
    d_country['Speaker_gender'] = [None if i == '-' else i for i in d_country['Speaker_gender']]
    d_country['Speaker_gender'] = d_country['Speaker_gender'].combine_first(d_country['gender'])

    # Adding missing birthdays
    # NOTE(review): assumes missing_age.xlsx provides a 'birth' column — verify.
    d_country = pd.merge(d_country, missingAge, on=['country', 'Speaker_name'], how='left')

    d_country['Speaker_birth'] = [None if i == '-' else i for i in d_country['Speaker_birth']]
    d_country['Speaker_birth'] = d_country['Speaker_birth'].combine_first(d_country['birth'])

    for i, row in influence_country.iterrows():
        matches = d_country[d_country.Speaker_name == row['name']]
        if matches.empty:
            # Previously an IndexError from iloc[0]; report and skip instead.
            print(f"no metadata for {row['name']} in {csv_name}")
            continue
        # The first speech row of the speaker supplies the attributes.
        speaker_info = matches.iloc[0]

        birthdays.append(speaker_info.Speaker_birth)
        gender.append(speaker_info.Speaker_gender)
        party.append(speaker_info.Speaker_party)
        party_status.append(speaker_info.Party_status)
        party_orientation.append(speaker_info.Party_orientation)
        body.append(speaker_info.Body)
        role.append(speaker_info.Speaker_role)
        mp.append(speaker_info.Speaker_MP)
        minister.append(speaker_info.Speaker_minister)
        name.append(speaker_info.Speaker_name)

d_temp = pd.DataFrame({'birthdays' : birthdays,
                       'gender' : gender,
                       'party' : party,
                       'party_status' : party_status,
                       'party_orientation' : party_orientation,
                       'body' : body,
                       'role' : role,
                       'mp' : mp,
                       'minister' : minister,
                       'name' : name})

# NOTE(review): de-duplicating and merging on the name alone mixes up speakers
# from different countries who share a name — consider keying on country too.
d_temp = d_temp.drop_duplicates(subset='name', keep='first')


influence.merge(d_temp, on = 'name').to_csv('influence\\influenceSum.csv')