#### To skip straight to the simulation, run the imports cell, then scroll down to "Simulate Run with TwitterSim"

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from networkx.algorithms import community as comm
import pickle
import os.path
import community as community_louvain
import matplotlib.pyplot as plt

import random
from sklearn.metrics import pairwise_distances
import progressbar
import argparse
import operator
from numpy.random import choice

## Detect Communities

In [None]:
# Location of the raw edge list that the graph is built from.
path = './data/higgs-social_network.edgelist'

In [None]:
%%time
# Read the edge list at `path` into a directed graph.
G = nx.read_edgelist(path, create_using=nx.DiGraph)

In [None]:
%%time
# Reuse the cached community partition when it exists; otherwise run Louvain
# on G and cache the result for the next run.
print('checking if communities file exists...')
if not os.path.isfile('./data/communities.pkl'):
    print('communities file not found, using lovain to generate communities...')
    communities = comm.louvain_communities(G)
    print('pickle dumping communities')
    with open("./data/communities.pkl", "wb") as f:
        pickle.dump(np.array(communities), f)
else:
    print('found communities file, loading communities')
    with open("./data/communities.pkl", "rb") as f:
        communities = pickle.load(f)

In [None]:
# Store each node's 1-based community number in its 'Community' attribute.
node_to_label = {
    member: label
    for label, members in enumerate(communities, start=1)
    for member in members
}
nx.set_node_attributes(G, node_to_label, 'Community')
# community_label ends up one past the highest label that was assigned.
community_label = len(communities) + 1

In [None]:
community_label

In [None]:
%%time
# Persist the labelled graph with plain pickle: nx.write_gpickle was removed
# in NetworkX 3.0 and was only a thin wrapper around pickle.dump, so the
# resulting file is interchangeable.
with open("./data/nodes_with_community.gpickle", "wb") as f:
    pickle.dump(G, f, pickle.HIGHEST_PROTOCOL)

In [None]:
%%time
# Also export the community-labelled graph in GEXF format.
nx.write_gexf(G, "./data/nodes_with_community.gexf")

## Calculate Community Substats

In [None]:
def calc_edges_between_communities(G, outpath):
    """
    Collapse G into a community-level graph and print the edge weights
    between distinct communities.

    param G: graph whose nodes carry a 'Community' attribute
    param outpath: intended CSV destination; unused while the export is disabled
    return: the induced community graph
    """
    partition = nx.get_node_attributes(G, 'Community')
    ig = community_louvain.induced_graph(partition, G, weight = 'WEIGHT')

    # Only edges joining two different communities are tabulated.
    cross_edges = [(src, dst, attrs['WEIGHT'])
                   for src, dst, attrs in ig.edges(data=True)
                   if src != dst]

    edges_between_communities = pd.DataFrame()
    edges_between_communities['community1'] = [src for src, _, _ in cross_edges]
    edges_between_communities['community2'] = [dst for _, dst, _ in cross_edges]
    edges_between_communities['edges'] = [w for _, _, w in cross_edges]
    print(edges_between_communities.head())
    #edges_between_communities.to_csv(outpath)
    print('\n\nEdges Between Communities Calculated...\n\n')
    return ig

In [None]:
def calc_stats_for_communities(G, community_network, outpath):
    """
    Print per-community size, degree statistics (max, mean, median) and
    betweenness centrality.

    param G: graph whose nodes carry a 'Community' attribute
    param community_network: community-level graph (one node per community)
        used for the betweenness centrality calculation
    param outpath: intended CSV destination; unused while the export is disabled
    return: None
    """
    communities = pd.DataFrame.from_dict(nx.get_node_attributes(G, 'Community'), orient='index')\
        .rename(columns={0:'com'})

    community_size = communities.groupby(['com']).size().sort_values(ascending=False)

    print('\n\nCommunity Size Calculated...\n\n')
    # Look degrees up by node id instead of attaching them positionally, so the
    # result stays correct even when some nodes have no 'Community' attribute.
    degree_by_node = dict(G.degree)
    communities['degree'] = [degree_by_node[node] for node in communities.index]
    degree_stat = communities.groupby('com').agg(max_degree=('degree', 'max'),
                                                mean_degree=('degree', 'mean'),
                                                median_degree=('degree', 'median'))
    stats = degree_stat.merge(community_size.rename('nodes'), left_index=True, right_index=True)

    btwn_c = nx.betweenness_centrality(community_network, weight='WEIGHT')
    stats = stats.merge(pd.DataFrame.from_dict(btwn_c, orient='index'), left_index=True,
                        right_index=True).rename(columns={0:'betweenness_centrality'})
    print(stats.head())
    #stats.to_csv(outpath, index=True)
    print('\n\nDegree Stats Calculated ... \n\n\n')
    return None

In [None]:
def calc_stats_for_network(G, outpath):
    """
    Print a one-row table of whole-network statistics.

    Diameter and average shortest path length are best-effort: when NetworkX
    raises, the error message is stored in the cell instead of a number.

    param G: input graph
    param outpath: intended CSV destination; unused while the export is disabled
    return: None
    """
    stats = pd.DataFrame(columns=['Avg. Degree', 'Density', 'Diameter', 'Clustering Coefficient',
                                  'Avg. Shortest Path Length'])
    # NOTE(review): this is the mean of nx.average_degree_connectivity's values,
    # which is not the same thing as the mean node degree - confirm it is intended.
    stats.loc[0, 'Avg. Degree'] = np.array([*nx.average_degree_connectivity(G).values()]).mean()
    stats.loc[0, 'Density'] = nx.density(G)
    try:
        stats.loc[0, 'Diameter'] = nx.diameter(G)
    except Exception as err:
        # Record the actual failure reason, not the repr of the Exception class.
        stats.loc[0, 'Diameter'] = str(err)
    stats.loc[0, 'Clustering Coefficient'] = nx.average_clustering(G)
    try:
        stats.loc[0, 'Avg. Shortest Path Length'] = nx.average_shortest_path_length(G)
    except Exception as err:
        stats.loc[0, 'Avg. Shortest Path Length'] = str(err)

    print('Here are yo network stats:')
    print(stats.head())
    #stats.to_csv(outpath, index=False)
    return None

In [None]:
# Input graph and the destinations used by the statistics helpers.
path = './data/nodes_with_community.gpickle'
stats_outpath = './test_output/community_stats.csv'
edges_outpath = './test_output/edges_between_communities.csv'
network_stats_outpath = './test_output/network_stats.csv'

# Load with plain pickle: nx.read_gpickle was removed in NetworkX 3.0 and the
# .gpickle file is an ordinary pickle of the graph object.
with open(path, 'rb') as f:
    G = pickle.load(f)

In [None]:
%%time
ig = calc_edges_between_communities(G, outpath=edges_outpath)

In [None]:
%%time
calc_stats_for_communities(G, ig, outpath=stats_outpath)

## Subset Community

In [None]:
def subset_graph(G, outpath, communities=None):
    """
    Write a (possibly filtered) copy of G to GEXF and plot its degree distribution.

    If communities is not None, only nodes whose 'Community' attribute is in
    communities are kept; nodes without that attribute are dropped as well.
    Two figures are shown: a log-scale degree PDF and a log-log
    rank-frequency plot. Nothing is plotted when the subset is empty.

    param G: input graph
    param outpath: path of the GEXF file to write
    param communities: list of int
    return: None
    """

    #filter graph to desired community subset
    G2 = G.copy()
    if communities is not None:
        comm_list = nx.get_node_attributes(G, 'Community')
        G2.remove_nodes_from([node for node in G.nodes
                              if comm_list.get(node) not in communities])

    nx.write_gexf(G2, outpath)

    #get log degree distribution
    degrees = [degree for _, degree in G2.degree]
    if not degrees:
        # An empty subset has no distribution to plot.
        return None

    #log scale pdf
    plt.clf()
    # `density=True` replaces the `normed` argument, which NumPy has removed.
    hist, bins = np.histogram(degrees, bins=10, density=True)
    bin_centers = (bins[1:]+bins[:-1])*0.5

    plt.plot(np.log(bin_centers), np.log(hist), color='red')
    plt.title('Twitter Log Scale PDF')
    plt.xlabel('Log(Degree)')
    plt.ylabel('log(Probability)')
    plt.show()

    #log-log rank-frequency
    plt.clf()
    unique, counts = np.unique(degrees, return_counts=True)
    rank_freq = pd.DataFrame({'degree': unique, 'frequency': counts})
    # sort_values returns a new frame; keep it so rank 1 is the most frequent degree.
    rank_freq = rank_freq.sort_values(by='frequency', ascending=False)
    rank_freq['rank'] = range(1, len(unique)+1)
    plt.scatter(np.log(rank_freq['rank']), np.log(rank_freq['frequency']))
    plt.xlabel('log(rank)')
    plt.ylabel('log(frequency)')
    plt.title('Twitter Degree Log-Log Rank-Frequency Plot')
    plt.show()

    return None

In [None]:
def export_community_net(G, outpath, min_size=10000):
    """
    Build the community-level graph of G, annotate it and write it to GEXF.

    Every community node gets a 'SIZE' attribute (number of member nodes).
    Every edge whose two endpoints are communities with more than min_size
    members gets a 'prob_links_to' attribute: the edge weight divided by the
    total weight of such edges that share the same first endpoint.

    param G: graph whose nodes carry a 'Community' attribute
    param outpath: path of the GEXF file to write
    param min_size: communities must be strictly larger than this to receive
        'prob_links_to' values
    return: None
    """
    from collections import Counter

    partition = nx.get_node_attributes(G, 'Community')
    ig = community_louvain.induced_graph(partition, G, weight = 'WEIGHT')

    community_size = dict(Counter(partition.values()))
    nx.set_node_attributes(ig, community_size, 'SIZE')
    large = {label for label, size in community_size.items() if size > min_size}

    # Edges (self-loops included) whose endpoints are both large communities.
    kept = [(u, v, attrs['WEIGHT'])
            for u, v, attrs in ig.edges(data=True)
            if u in large and v in large]

    # Total kept weight per first endpoint, used to normalise each edge.
    totals = Counter()
    for u, _, weight in kept:
        totals[u] += weight

    prob_links_to = {
        (u, v): {'prob_links_to': weight / totals[u] if totals[u] else float('nan')}
        for u, v, weight in kept
    }
    nx.set_edge_attributes(ig, prob_links_to)

    #remove self edges
    #ig.remove_edges_from(nx.selfloop_edges(ig))

    nx.write_gexf(ig, outpath)
    return None

In [None]:
# Input graph and the GEXF destinations for the subset and community networks.
path = './data/nodes_with_community.gpickle'
net_outpath = './test_output/subset_net.gexf'
com_outpath = './test_output/community_net.gexf'

# Load with plain pickle: nx.read_gpickle was removed in NetworkX 3.0 and the
# .gpickle file is an ordinary pickle of the graph object.
with open(path, 'rb') as f:
    G = pickle.load(f)

In [None]:
%%time
subset_graph(G, net_outpath, communities=[3, 56, 43])

In [None]:
%%time
export_community_net(G, com_outpath)

## Simulate Run with TwitterSim

In [2]:
# True: rebuild the simulation network from the community GEXF file.
# False: resume from the saved graph and metadata checkpoints.
start_from_scratch = False
# Community-level network that a from-scratch run is built from.
com_outpath = './test_output/community_net.gexf'

#### Create Network

In [3]:
if start_from_scratch:
    G = nx.read_gexf(com_outpath).to_directed()
else:
    # Checkpoints are ordinary pickles; nx.read_gpickle was removed in
    # NetworkX 3.0, so load the graph with pickle directly.
    with open("./data/nodes_simulation_checkpoint.gpickle", 'rb') as g:
        G = pickle.load(g)
    with open('./data/metadata_checkpoint.pickle', 'rb') as m:
        metadata = pickle.load(m)
# Edge count, then node count.
print(G.size())
print(len(G.nodes()))

24731
2967


In [4]:
G.nodes(data=True)

NodeDataView({'1': {'SIZE': 13520, 'label': '1', 'lambda': 0.4276488988668722, 'wake': 9.0, 'inbox': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [5]:
if start_from_scratch:
    # Give every node its activity rate, first wake-up time, empty mailboxes,
    # an initial belief and a kind derived from that belief.
    for node, data in G.nodes(data=True):
        rate = np.random.uniform(0.001,0.75)
        data['lambda'] = rate
        data['wake'] = 0 + np.round(np.random.exponential(scale = 1 / rate))
        data['inbox'] = []
        data['mentioned_by'] = []
        # The belief range depends on how the node id compares with its SIZE.
        low, high = (0, 0.5) if int(node) < int(data['SIZE']) else (0.5, 1.0)
        data['belief'] = np.random.uniform(low, high)
        data['kind'] = 'beacon' if data['belief'] < 0.2 else 'normal'

In [6]:
if start_from_scratch:
    perc_bots = 0.05
    bot_initial_links= 2

    #  Add Bots
    num_bots = int(np.round(len(G.nodes)*perc_bots))
    # Number bots upward from len(G), skipping any id that is already a node
    # so an existing node is never silently turned into a bot.
    existing_ids = set(G.nodes)
    bot_names = []
    candidate = len(G)
    while len(bot_names) < num_bots:
        if str(candidate) not in existing_ids:
            bot_names.append(str(candidate))
        candidate += 1

    for bot_name in bot_names:
        # random.sample needs a sequence; a NodeView is rejected on Python 3.11+.
        # Sampling before the bot is added means it cannot link to itself.
        initial_links = random.sample(list(G.nodes), bot_initial_links)
        G.add_node(bot_name)
        for link in initial_links:
            G.add_edge(bot_name,link)

    # Add Bot Data
    for bot_name in bot_names:
        data = G.nodes[bot_name]
        data['lambda'] = np.random.uniform(0.1,0.75)
        data['wake'] = 0 + np.round(np.random.exponential(scale = 1 / data['lambda']))
        data['inbox'] = []
        data['belief'] = np.random.uniform(0.95,1.0)
        data['kind'] = 'bot'
        data['mentioned_by'] = []
        data['label'] = bot_name

In [7]:
if start_from_scratch:
    # Drop self-referencing edges first, then every node left without any edge.
    self_loops = list(nx.selfloop_edges(G))
    G.remove_edges_from(self_loops)
    isolated = list(nx.isolates(G))
    G.remove_nodes_from(isolated)

In [8]:
if start_from_scratch:
    # b lists the adjacency-matrix rows that contain no entries, i.e. the
    # positions of nodes with no outgoing edge.
    A = nx.adjacency_matrix(G).astype(bool)
    row_totals = np.asarray(A.sum(axis = 1))
    b = np.argwhere(np.squeeze(row_totals) == 0)

In [9]:
if start_from_scratch:
    # Give every node listed in b (a row position in G's node order) one
    # outgoing edge to a randomly chosen node.
    node_order = list(G.nodes)
    for row in b:
        source = node_order[row[0]]
        # Ask the graph for the node's successors; the attribute dict is not a
        # valid node bunch and always produced an empty edge list.
        connected = set(G.successors(source))
        # Exclude the node itself so no self-loop is re-introduced.
        unconnected = [n for n in node_order if n not in connected and n != source]
        if unconnected:
            G.add_edge(source, random.sample(unconnected, 1)[0])

#### Start Run

In [10]:
def scale(x):
    '''
    Returns x as an array with every element divided by the largest element.
    '''
    values = np.array(x)
    peak = values.max()
    return values / peak

In [11]:
def link_prediction(G, node,similarity):
    '''
    This function takes the graph G, a given node, and the jaccard similarity
    matrix for the nodes, and returns a recommended link based on similarity.

    Candidates are the other accounts that follow the same accounts as node;
    the one most similar to node is picked. When there are no candidates, a
    random predecessor of node is picked instead.

    Returns a list holding the single (node, target) edge, or an empty list
    when nothing can be picked or the edge already exists.
    '''
    predecessors = list(G.predecessors(node))
    ## Potential links are drawn from those who follow the same accounts
    potential = [friend
                 for successor in G.successors(node)
                 for friend in G.predecessors(successor)
                 if friend != node]

    if potential:
        scores = similarity[get_idx(node), [get_idx(p) for p in potential]]
        target = potential[np.argmax(scores)]
    elif predecessors:
        target = random.sample(predecessors, 1)[0]
    else:
        return []

    # `not`, rather than bitwise `~`: ~True and ~False are both truthy ints,
    # so existing edges were never filtered out.
    if G.has_edge(node, target):
        return []
    return [(node, target)]

In [12]:
if start_from_scratch:
    #Create initial similarity and prestige arrays
    A = nx.adjacency_matrix(G).astype(bool)
    # .toarray() always gives a plain ndarray; .todense() can return an
    # np.matrix, which newer scikit-learn releases reject.
    similarity = 1 - pairwise_distances(A.toarray(), metric = 'jaccard')
    prestige = scale(list(dict(G.degree()).values()))
else:
    # Resume with the arrays stored in the metadata checkpoint.
    A = metadata['A']
    similarity = metadata['similarity']
    prestige = metadata['prestige']

In [13]:
bar = progressbar.ProgressBar()
if start_from_scratch:
    # Fresh run: create empty result collectors and the default parameters,
    # then record all of them in the metadata that gets checkpointed.
    total_tweets = []
    all_beliefs = {'time':[],'user':[],'beliefs':[], 'kind':[]}

    influence_proportion = 0.1
    bucket1 = [0,1]
    bucket2 = [0,-1]
    probability_of_link = 0.05
    dynamic_network = True
    global_perception = 0.00000001
    retweet_perc = 0.25
    allowed_successors = 0.2
    strategy = 'normal'
    last_step = 0

    metadata = {
        'total_tweets': total_tweets,
        'all_beliefs': all_beliefs,
        'influence_proportion': influence_proportion,
        'bucket1': bucket1,
        'bucket2': bucket2,
        'probability_of_link': probability_of_link,
        'dynamic_network': dynamic_network,
        'global_perception': global_perception,
        'retweet_perc': retweet_perc,
        'allowed_successors': allowed_successors,
        'strategy': strategy,
        'last_step': last_step,
        'perc_bots': perc_bots,
        'bot_initial_links': bot_initial_links,
    }

else:
    # Resumed run: restore the collectors and parameters from the checkpoint.
    total_tweets, all_beliefs = metadata['total_tweets'], metadata['all_beliefs']

    influence_proportion = metadata['influence_proportion']
    bucket1, bucket2 = metadata['bucket1'], metadata['bucket2']
    probability_of_link = metadata['probability_of_link']
    dynamic_network = metadata['dynamic_network']
    global_perception = metadata['global_perception']
    retweet_perc = metadata['retweet_perc']
    allowed_successors = metadata['allowed_successors']
    strategy = metadata['strategy']
    last_step = metadata['last_step']
    perc_bots, bot_initial_links = metadata['perc_bots'], metadata['bot_initial_links']

In [14]:
def get_idx(node):
    '''
    Returns the position of node's 'label' within the node order of the
    global graph G.
    '''
    label = G.nodes[node]['label']
    ordered_nodes = list(G.nodes)
    return ordered_nodes.index(label)

In [15]:
G.nodes(data=True)

NodeDataView({'1': {'SIZE': 13520, 'label': '1', 'lambda': 0.4276488988668722, 'wake': 9.0, 'inbox': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [16]:
# Main simulation loop. Each step every node's belief is recorded; nodes whose
# wake time has passed read their inbox, update their belief, tweet to their
# followers, mention a neighbour and possibly create new links.
for step in bar(range(last_step, 15)):#bar(range(last_step, 1680)):
    # Periodically refresh the similarity matrix, prestige and global
    # perception, then checkpoint the run.
    #if (step % 168) == 0:
    if (step % 5) == 0:
        A = nx.adjacency_matrix(G).astype(bool)
        # .toarray() always gives a plain ndarray; .todense() can return an
        # np.matrix, which newer scikit-learn releases reject.
        similarity = 1 - pairwise_distances(A.toarray(), metric = 'jaccard')
        prestige = scale(list(dict(G.in_degree()).values()))

        ## Update Global Perception
        if len(total_tweets) > 0:
            df = pd.concat(total_tweets)
            global_perception = 0.001*df['tweets'].mean()

        #save progress (plain pickle; nx.write_gpickle was removed in NetworkX 3.0)
        with open("./data/nodes_simulation_checkpoint.gpickle", 'wb') as g:
            pickle.dump(G, g, protocol=pickle.HIGHEST_PROTOCOL)
        metadata['A'] = A
        metadata['similarity'] = similarity
        metadata['prestige'] = prestige
        metadata['global_perception'] = global_perception
        metadata['all_beliefs'] = all_beliefs
        metadata['last_step'] = step
        with open('./data/metadata_checkpoint.pickle', 'wb') as m:
            pickle.dump(metadata, m, protocol=pickle.HIGHEST_PROTOCOL)

    # Loop over all nodes
    for node, data in G.nodes(data=True):
        all_beliefs['time'].append(step)
        all_beliefs['user'].append(node)
        all_beliefs['beliefs'].append(data['belief'])
        all_beliefs['kind'].append(data['kind'])
        # Check if User logs on for this Time Step
        if data['wake'] < step:
            retweets = []
            # Reset per wake-up so a user never acts on tweets that a
            # previously processed node read.
            read_tweets = []
            # Get new 'wake' time
            data['wake'] = data['wake'] + np.round(np.random.exponential(scale = 1 / data['lambda']))
            # Read Tweets
            if len(data['inbox']) > 0:
                number_to_read = min(random.randint(4,20),len(data['inbox']))
                read_tweets = data['inbox'][-number_to_read:]
                perc = np.mean(read_tweets)
                # Update Belief: move toward 1 on a positive signal, toward 0 otherwise
                if (perc + global_perception) > 0:
                    new_belief = data['belief'] +   (perc + global_perception) * (1-data['belief'])
                else:
                    new_belief = data['belief'] +   (perc + global_perception) * (data['belief'])
                data['belief'] = new_belief
                # Get retweets from read tweets
                retweets = random.sample(read_tweets, round(retweet_perc*len(read_tweets)))
            # Send Tweets for bots
            if data['kind'] == 'bot':
                chance = 0.8
                tweets = list(choice(bucket1, np.random.randint(0,10),p=[1-chance, chance]))

            # Send Tweets for Stiflers/Beacons: one -1 tweet per positive tweet just read
            elif data['kind'] == 'beacon':
                num_dis = int(np.sum(np.array(read_tweets) > 0))
                tweets = [-1] * num_dis

            # Send Tweets for normal users
            else:
                chance = 0   # Normal users only send disinformation with retweets
                tweets = list(choice(bucket1, np.random.randint(0,10),p=[1-chance, chance]))
            tweets.extend(retweets)
            total_tweets.append(pd.DataFrame({'tweets': tweets, 'time' :[step] * len(tweets)}))

            sender_idx = get_idx(node)
            for follower in G.predecessors(node):
                follower_idx = get_idx(follower)
                homophily = similarity[sender_idx, follower_idx]
                importance = prestige[follower_idx]
                # Weight a fresh copy for each follower; rebinding `tweets`
                # here would compound the weights from one follower to the next.
                weighted = [homophily * importance * i for i in tweets]
                G.nodes[follower]['inbox'].extend(weighted)

            # Send Mentions (skipped when the node has nobody to mention)
            neighbors = list(G.neighbors(node))
            if neighbors:
                mention = random.sample(neighbors,1)[0]
                G.nodes[mention]['mentioned_by'].append(node)

            # Make sure doesn't have too many successors already
            successors = list(G.successors(node)) + [node]
            if len(successors) < allowed_successors * len(G.nodes) and (dynamic_network):
                # If probability right, add link for non-bot users
                if (np.random.uniform(0,1) < probability_of_link) and (data['kind'] != 'bot'):
                    new_link = link_prediction(G,node,similarity)
                    if len(new_link) > 0:
                        G.add_edges_from(new_link)

                # If probability right, add link to a mention
                if (np.random.uniform(0,1) < probability_of_link) and (len(data['mentioned_by']) > 0):
                    new_link = random.sample(data['mentioned_by'],1)
                    if len(new_link) > 0:
                        G.add_edge(node, new_link[0])
                # Bots try to add link every time
                if (data['kind'] == 'bot'):
                    potential = list(set(G.nodes) - set(successors))
                    if len(potential) > 0:
                        if strategy == 'targeted':
                            degree = dict(G.in_degree(potential))
                            new_link = max(degree.items(), key=operator.itemgetter(1))[0]
                        else:
                            new_link = random.sample(list(potential),1)[0]
                        G.add_edge(node,new_link)

  A = nx.adjacency_matrix(G).astype(bool)
100% (5 of 5) |##########################| Elapsed Time: 0:01:11 Time:  0:01:11


In [17]:
print(G.nodes(data=True)['1'].keys())
G.nodes(data=True)['1']

dict_keys(['SIZE', 'label', 'lambda', 'wake', 'inbox', 'mentioned_by', 'belief', 'kind'])


{'SIZE': 13520,
 'label': '1',
 'lambda': 0.4276488988668722,
 'wake': 14.0,
 'inbox': [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.

In [32]:
G.nodes()['1']['inbox']

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [18]:
print(metadata.keys())
metadata

dict_keys(['total_tweets', 'all_beliefs', 'influence_proportion', 'bucket1', 'bucket2', 'probability_of_link', 'dynamic_network', 'global_perception', 'retweet_perc', 'allowed_successors', 'strategy', 'last_step', 'perc_bots', 'bot_initial_links', 'A', 'similarity', 'prestige'])


{'total_tweets': [   tweets  time
  0       0     1
  1       0     1,
  Empty DataFrame
  Columns: [tweets, time]
  Index: [],
     tweets  time
  0       0     1
  1       0     1
  2       0     1
  3       0     1
  4       0     1
  5       0     1
  6       0     1,
     tweets  time
  0     0.0     1
  1     0.0     1,
     tweets  time
  0     0.0     1
  1     0.0     1
  2     0.0     1
  3     0.0     1
  4     0.0     1
  5     0.0     1
  6     0.0     1
  7     0.0     1,
     tweets  time
  0     0.0     1,
     tweets  time
  0     0.0     1
  1     0.0     1
  2     0.0     1
  3     0.0     1,
     tweets  time
  0       0     1
  1       0     1,
     tweets  time
  0     0.0     1
  1     0.0     1,
     tweets  time
  0     0.0     1
  1     0.0     1
  2     0.0     1
  3     0.0     1
  4     0.0     1
  5     0.0     1
  6     0.0     1
  7     0.0     1
  8     0.0     1,
     tweets  time
  0     0.0     1
  1     0.0     1
  2     0.0     1,
     tweets  time

In [19]:
%%time
# Save the post-simulation graph with plain pickle: nx.write_gpickle was
# removed in NetworkX 3.0 and was only a thin wrapper around pickle.dump.
with open("./data/nodes_post_simulation.gpickle", "wb") as f:
    pickle.dump(G, f, pickle.HIGHEST_PROTOCOL)

CPU times: total: 2.58 s
Wall time: 2.55 s


In [20]:
# Reload the saved graph with plain pickle (nx.read_gpickle was removed in
# NetworkX 3.0; the file is an ordinary pickle).
with open("./data/nodes_post_simulation.gpickle", "rb") as f:
    G1 = pickle.load(f)

In [157]:
# `sys` is not among the notebook's imports, so bring it in here.
import sys

# Make the project-local simulation sources importable.
sys.path.append('./4_simulation/src/')
import checkworthy

In [158]:
def _load_pickle(path):
    """Return the object unpickled from the file at path."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the artefacts written to the simulation output directory.
all_info = _load_pickle('./4_simulation/output/all_info.pickle')
all_claims = _load_pickle('./4_simulation/output/all_claims.pickle')
node_info = _load_pickle('./4_simulation/output/node_info.pickle')
# The .gpickle file is an ordinary pickle; nx.read_gpickle was removed in NetworkX 3.0.
G = _load_pickle('./4_simulation/output/node_metadata.gpickle')
check = _load_pickle('./4_simulation/output/checkworthy_data.pickle')
community_sentiment_through_time = _load_pickle('./4_simulation/output/community_sentiment.pickle')
node_read_tweets_by_time = _load_pickle('./4_simulation/output/node_time_info.pickle')

In [97]:
node_info[list(node_info)[0]]

['1-3291-55074-0',
 '0-2448-55074-0',
 '0-1023-55074-0',
 '0-5425-35637-0',
 '2-2064-3136-0',
 '1-489-1034-0',
 '1-293-1034-0',
 '2-3042-5600-0',
 '1-1994-5600-0',
 '1-3018-2740-0',
 '3-5042-2740-0',
 '1-2634-55080-0',
 '1-2558-55080-0',
 '0-3336-2740-0',
 '1-119-2740-0',
 '2-4348-3117-0',
 '1-2209-3117-0',
 '2-2998-3117-0',
 '2-5159-88780-0',
 '1-3641-140085-0',
 '1-1337-36861-2',
 '0-2196-55060-2',
 '1-2358-55060-2',
 '1-2322-2711-2',
 '2-2336-2711-2',
 '1-3209-120539-1',
 '3-5491-5571-1',
 '3-5292-105216-1',
 '3-4893-5578-1',
 '3-4204-5578-1',
 '2-4022-5638-0',
 '1-947-3136-0',
 '0-4755-5664-1',
 '1-371-5664-1',
 '1-415-5664-1',
 '0-2182-1034-2',
 '1-1808-5600-0',
 '3-2180-5638-3',
 '1-3155-5638-3',
 '2-3755-5602-3',
 '1-373-3149-3',
 '2-5257-5593-3',
 '1-501-36880-2',
 '3-5810-5589-2',
 '0-2588-5600-3',
 '2-2757-5600-3',
 '2-4304-5600-3',
 '1-1967-165348-5',
 '1-243-165348-5',
 '1-708-45895-5',
 '3-4901-28217-3',
 '2-4268-140615-3',
 '3-5957-5602-5',
 '0-3804-36910-5',
 '3-5389-368

In [27]:
all_info

{'1-2218-1034-0': {'topic': 1,
  'value': 0,
  'claim': 2218,
  'node-origin': '1034',
  'time-origin': 0},
 '1-2069-1034-0': {'topic': 1,
  'value': 0,
  'claim': 2069,
  'node-origin': '1034',
  'time-origin': 0},
 '1-489-1034-0': {'topic': 1,
  'value': -1,
  'claim': 489,
  'node-origin': '1034',
  'time-origin': 0},
 '3-2813-1034-0': {'topic': 3,
  'value': 0,
  'claim': 2813,
  'node-origin': '1034',
  'time-origin': 0},
 '1-293-1034-0': {'topic': 1,
  'value': -1,
  'claim': 293,
  'node-origin': '1034',
  'time-origin': 0},
 '2-1058-1034-0': {'topic': 2,
  'value': -1,
  'claim': 1058,
  'node-origin': '1034',
  'time-origin': 0},
 '1-2415-2239-0': {'topic': 1,
  'value': 0,
  'claim': 2415,
  'node-origin': '2239',
  'time-origin': 0},
 '0-3879-2716-0': {'topic': 0,
  'value': 0,
  'claim': 3879,
  'node-origin': '2716',
  'time-origin': 0},
 '2-3620-2716-0': {'topic': 2,
  'value': 0,
  'claim': 3620,
  'node-origin': '2716',
  'time-origin': 0},
 '1-990-2716-0': {'topic': 1,

In [159]:
check.checkworthy_data['1-676']

{'topic': 1,
 'value': -1,
 'claim': 676,
 'num_of_origins': 5,
 'avg_degree_of_origins': 15.0,
 'max_degree_of_origins': 53,
 'avg_centrality_of_origins': 8.235399262918608e-05,
 'max_centrality_of_origins': 0.00035449504470198785,
 'step1_nodes_visited': 7,
 'step1_avg_degree_visited': 4.428571428571429,
 'step1_avg_centrality_visited': 3.471497258179477e-06,
 'step1_max_degree_visited': 12,
 'step1_max_centrality_visited': 1.5990266282792133e-05,
 'step1_max_depth_from_origin': 2,
 'step1_nodes_at_depth2': 1,
 'step1_nodes_at_depth4': 0,
 'step1_nodes_at_depth6': 0,
 'outcome_nodes_at_t48': 6,
 'average_truth_perception_random': 0.8333333333333334,
 'average_truth_perception_stratified': 0.6666666666666666,
 'average_truth_perception_knowledgable_community': 0.16666666666666666}

In [164]:
check.__dict__

{'checkworthy_data': {'3-3661': {'topic': 3,
   'value': 0,
   'claim': 3661,
   'num_of_origins': 76,
   'avg_degree_of_origins': 52.02631578947369,
   'max_degree_of_origins': 642,
   'avg_centrality_of_origins': 0.00017747252826218033,
   'max_centrality_of_origins': 0.003179559123945327,
   'step1_nodes_visited': 133,
   'step1_avg_degree_visited': 30.390977443609003,
   'step1_avg_centrality_visited': 0.00026670360608031075,
   'step1_max_degree_visited': 450,
   'step1_max_centrality_visited': 0.007454914708620314,
   'step1_max_depth_from_origin': 3,
   'step1_nodes_at_depth2': 32,
   'step1_nodes_at_depth4': 0,
   'step1_nodes_at_depth6': 0,
   'outcome_nodes_at_t48': 132,
   'average_truth_perception_random': 0.0,
   'average_truth_perception_stratified': 0.0,
   'average_truth_perception_knowledgable_community': 0.0},
  '1-2806': {'topic': 1,
   'value': 0,
   'claim': 2806,
   'num_of_origins': 30,
   'avg_degree_of_origins': 25.966666666666665,
   'max_degree_of_origins': 1

In [29]:
# The claim table lives on the loaded `check` object; no bare
# `checkworthy_data` name is defined anywhere in this notebook.
len(check.checkworthy_data)

24000

In [153]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import confusion_matrix

# One row per record in checkworthy_data, one column per collected feature.
check_df = pd.DataFrame.from_dict(checkworthy_data).T
# astype(int) truncates toward zero, so any perception value in [0, 1) becomes class 0.
check_df['target'] = check_df['average_truth_perception_random'].astype(int)

# Any column whose name contains one of these substrings is kept out of the
# feature matrix; the same column list is used for both splits.
excluded_markers = ('truth', 'target', 'step', 'outcome', 'value')
feature_columns = [col for col in check_df.columns
                   if not any(marker in col for marker in excluded_markers)]

# NOTE: no random_state is passed, so the split differs between runs.
train, test = train_test_split(check_df, test_size=0.2)
train_x = train[feature_columns]
train_y = train[['target']]

test_x = test[feature_columns]
test_y = test[['target']]

# fit() expects a 1-d target; flattening the single-column frame avoids the
# DataConversionWarning raised by column_or_1d.
clf = HistGradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0) \
                            .fit(train_x, train_y.to_numpy().ravel())

print(confusion_matrix(test_y, clf.predict(test_x)))
# Predict a single row; keeping it as a one-row DataFrame preserves the column names.
print(clf.predict(test_x.iloc[[1]]))

[[4570    0]
 [ 229    1]]
[0]


  y = column_or_1d(y, warn=True)


In [147]:
# Per-class count of non-null values in every column of the test split.
test.groupby('target').count()

Unnamed: 0_level_0,topic,value,claim,num_of_origins,avg_degree_of_origins,max_degree_of_origins,avg_centrality_of_origins,max_centrality_of_origins,step1_nodes_visited,step1_avg_degree_visited,...,step2_nodes_at_depth6,step3_nodes_visited,step3_avg_degree_visited,step3_avg_centrality_visited,step3_max_degree_visited,step3_max_centrality_visited,step3_max_depth_from_origin,step3_nodes_at_depth2,step3_nodes_at_depth4,step3_nodes_at_depth6
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4540,4540,4540,4540,4540,4540,4540,4540,4470,4470,...,1276,283,283,283,283,283,283,283,283,283
1,260,260,260,260,260,260,260,260,237,237,...,42,4,4,4,4,4,4,4,4,4


In [146]:
# Display the feature matrix of the test split.
test_x

Unnamed: 0,topic,claim,num_of_origins,avg_degree_of_origins,max_degree_of_origins,avg_centrality_of_origins,max_centrality_of_origins
3-397,3.0,397.0,11.0,68.909091,441.0,0.000274,0.002503
3-1,3.0,1.0,17.0,141.941176,1973.0,0.000201,0.002035
1-1432,1.0,1432.0,11.0,20.181818,75.0,0.000055,0.000314
0-4725,0.0,4725.0,42.0,32.357143,378.0,0.000393,0.006190
2-5114,2.0,5114.0,14.0,73.642857,459.0,0.000236,0.001395
...,...,...,...,...,...,...,...
0-4274,0.0,4274.0,27.0,19.481481,163.0,0.000056,0.000429
3-5300,3.0,5300.0,43.0,48.418605,703.0,0.000103,0.001470
2-5400,2.0,5400.0,24.0,22.708333,349.0,0.000053,0.000286
0-1281,0.0,1281.0,10.0,21.400000,83.0,0.000081,0.000391


In [108]:
# Show the attribute dict of the first node in G's node iteration order.
G.nodes(data=True)[list(G.nodes())[0]]

{'Community': 3,
 'lambda': 0.7233193021357949,
 'wake': 52.0,
 'inbox': ['1-3259-2711-48',
  '0-3846-2711-48',
  '3-4055-5681-47',
  '3-5270-5568-47',
  '3-5758-5567-45',
  '2-5700-55071-47',
  '3-2362-36861-47',
  '1-767-36861-47',
  '2-5232-36861-47',
  '1-5320-55071-46',
  '0-3470-2725-48',
  '1-1295-2725-48',
  '3-2666-2725-48',
  '1-1452-2725-48',
  '1-3251-2725-48',
  '0-1635-2725-48',
  '1-1740-2725-48',
  '1-767-36861-47',
  '2-5232-36861-47',
  '3-4055-5681-47',
  '3-5270-5568-47',
  '3-5758-5567-45',
  '2-5700-55071-47',
  '3-2362-36861-47',
  '1-5856-2728-48',
  '1-2782-2728-48',
  '1-1801-2728-48',
  '3-5173-2728-48',
  '1-2811-2728-48',
  '1-1933-3151-46',
  '3-4921-36883-44',
  '0-3974-2720-48',
  '0-5975-55071-47',
  '1-767-36861-47',
  '2-5232-36861-47',
  '3-4055-5681-47',
  '3-5270-5568-47',
  '2-5700-55071-47',
  '1-412-3119-48',
  '0-3101-3119-48',
  '1-3037-3119-48',
  '0-3963-3119-48',
  '0-3954-3119-48',
  '0-5975-55071-47',
  '1-767-36861-47',
  '2-5232-36861-4

In [132]:
from scipy.stats import beta, rankdata

def percentile(x):
    """Return each element's rank divided by the number of elements.

    Ranks come from scipy's ``rankdata`` with its default settings, and the
    divisor is ``len(x)``.
    """
    values = np.array(x)
    element_ranks = rankdata(values)
    total = len(values)
    return element_ranks / total

def calculate_sentiment_rankings(G: nx.DiGraph, topics: list):

    '''
    Build a pandas DataFrame, indexed by node, holding for every topic each node's
    signed percentile rank of deviation from the median sentiment on that topic.

    For each topic three columns are added:
      - 'sentiment<topic>': the node's sentiment for the topic,
      - 'deviation<topic>': the absolute distance of that sentiment from the median,
      - 'rank<topic>': the deviation's rank (ties take the highest rank) divided by
        the number of nodes, negated when the node's sentiment is below the median.
    Rank values therefore fall in the range [-1, 1].

    The DataFrame is meant as an input for modifying the distribution from which agents
    draw their quality of information when tweeting: a higher rank value results in a
    higher probability of creating misinformation, a lower one in a higher probability
    of producing anti-misinformation.

    One potential issue: if sentiment is tightly clustered for all agents, the ranking
    still spreads nodes across the whole range, which artificially makes some agents
    produce more/less misinformation.

    Only nodes carrying a 'sentiment' attribute appear in the result.
    '''
    all_node_sentiments = nx.get_node_attributes(G, 'sentiment')
    rankings = pd.DataFrame(index = all_node_sentiments.keys())

    for topic in topics:
        sentiment_col = f'sentiment{topic}'
        deviation_col = f'deviation{topic}'
        rank_col = f'rank{topic}'

        # Sentiment of every node for this topic, in index order.
        node_sentiments = [per_node[topic] for per_node in all_node_sentiments.values()]
        median = np.median(node_sentiments)

        rankings[sentiment_col] = node_sentiments
        rankings[deviation_col] = np.absolute(np.asarray(node_sentiments) - median)

        # Percentile of the deviation; the sign encodes which side of the median the node is on.
        scaled_rank = rankings[deviation_col].rank(method='max') / len(rankings)
        below_median = rankings[sentiment_col] < median
        rankings[rank_col] = np.where(below_median, -1 * scaled_rank, scaled_rank)

    return rankings

In [165]:
# Simulation parameters.
runtime = 10
impactednesses = [{3: 0.5, 56: 0.5, 43: 0.5},
                  {3: 0.8, 56: 0.3, 43: 0.3},
                  {3: 0.3, 56: 0.8, 43: 0.3},
                  {3: 0.3, 56: 0.3, 43: 0.8}]
num_topics = 4
topics = list(range(num_topics))

# Prestige of a node = percentile rank of its in-degree.
# G.in_degree() and G.nodes() iterate in the same node order, so the two
# sequences can be paired positionally.
prestige_values = percentile(list(dict(G.in_degree()).values()))
nodes = list(G.nodes())
prestige = dict(zip(nodes, prestige_values))

node_read_tweets = node_info

# all_info, community_sentiment_through_time, node_read_tweets_by_time,
# all_claims and check are used under their existing names and must already
# be defined by earlier cells.


In [168]:
# Restore the previously pickled model into `clf`.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open('./4_simulation/output/stored_model.pickle', 'rb') as m:
    clf = pickle.load(m)

In [233]:
# Restore the previously pickled object into `check`.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open('./4_simulation/output/checkworthy_data.pickle', 'rb') as m:
    check = pickle.load(m)

In [237]:
# Look up the record stored under key '2-1116'; raises KeyError when that key is absent.
check.__dict__['checkworthy_data']['2-1116']

KeyError: '2-1116'

In [240]:
# NOTE(review): update() is called without arguments, which leaves the mapping
# unchanged — presumably an unfinished cell; confirm what was meant to be merged in.
check.__dict__['checkworthy_data'].update()