# Computing Graphs and initial conditions

This notebook will instatiate and save the necessities for conducting the dynamical analysis.
Firstly the dataset will be elaborated and then graphs and initial conditions will be computed.
Might be necessary to change the data path (`PATH_TO_DATA`) or the dataset filnames (`WORD_VECTORS_FILENAME` and `ARTCLES_DF_FILENAME`).

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import networkx as nx
from tqdm.notebook import tqdm


## These following have to be customized
PATH_TO_DATA = Path('../data')
# for joblib multithreading
N_THREADS = -1
# hyperparameters of the graph, get them from empirical data analysis
MIN_SIM = 0.75
TIME_THRESHOLD = 18*24 # in hours
TIME_MIN = 6 # in hours

These methods, used for building the graph, have been extracted and adapted from [this repo](https://github.com/elisamussumeci/modeling-news-spread).

In [None]:
def get_index(
    data, # DataFrame containing articles info
    timestamp, # time of current article
    similarities, # similarities of the current article
    max_dt, # max dt (in hours) for two articles to be linked
    min_dt, # min dt (in hours) for two articles to be linked
    min_similarity, # minimum cos sim distance for two articles to be linked
    outs, # current list of excluded articles
    ):
    while True:
        # get the maximum similarity w.r.t. older articles
        similarity = max(similarities)
        # get the index of such max similar article
        index = similarities.index(similarity)
        # get dt in terms of hours
        dt = (timestamp - data['timestamp'][index]).total_seconds() / 3600 / 24
        # similarity threshold
        if similarity < min_similarity:
            # return None to add index to outs
            return None
        # continue if article is in outs or its distant in time
        elif index in outs or dt > max_dt or dt <= min_dt:
            similarities[index] = 0
        # pass condition
        else:
            return index

def create_graph(
    dists_triu, # similarity matrix
    data, # DataFrame containing articles info
    time_max: int = 168, # max dt (in hours) for two articles to be linked
    time_min: int = 6, # min dt (in hours) for two articles to be linked
    sim_min: float = 0.8, # minimum cos sim distance for two articles to be linked
    ):
    # max number of nodes
    n_articles = dists_triu.shape[0]
    # instantiate the directed graph
    G = nx.Graph()
    # adding the first node
    G.add_node(0, step=0,
               date=data['timestamp'][0],
               domain=data['source'][0],
               _id=data['article_id'][0],
               children=[])
    # instatiating elimination list
    outs = []
    # loop on the other articles
    for i in range(1, n_articles):
        # get time of current article
        pub_i = data['timestamp'][i]
        # get similarities
        column = list(dists_triu[:, i])
        # get index of an article related to the current one
        index = get_index(data, pub_i, column, time_max, time_min, sim_min, outs)
        # if a relation was found
        if index != None:
            # if the related article has not already been inserted, insert it
            if index not in G.nodes():
                G.add_nodes_from([(index, {
                    'timestamp': data['timestamp'][index],
                    'source': data['source'][index],
                    'id': data['article_id'][index],
                })])
            # if the current article has not already been inserted, insert it
            if i not in G.nodes():
                G.add_nodes_from([(i, {
                    'timestamp': data['timestamp'][i],
                    'source': data['source'][i],
                    'id': data['article_id'][i]
                })])
            # linking the nodes
            G.add_edge(index, i)
        # if a relation wa not found
        else:
            # add current article to elimination listS
            outs.append(i)
    # return the graph
    return G

# TODO: describe method, add comments
def create_matrix_domain(graph):
    
    domain_list = []
    for_domain = tqdm(
        graph.nodes(),
        leave=False,
        unit='nodes_for_domain',
    )
    for pos in for_domain:
        node = graph.nodes()._nodes[pos]
        d = node['source']
        if d not in domain_list:
            domain_list.append(d)

    df = pd.DataFrame(0, index = domain_list, columns = domain_list)
    
    for_nodes = tqdm(
        graph.nodes(),
        leave=False,
        unit='nodes',
    )
    
    for pos in for_nodes:
        node = graph.nodes()._nodes[pos]
        d = node['source']
        successors = graph.successors(pos)
        for suc in successors:
            df[d][graph.nodes()._nodes[suc]['source']] += 1

    return [domain_list, df]

# TODO: describe method, add comments
def create_complete_adjacency(graph, matrix):
    df = pd.DataFrame(0, index=graph.nodes(), columns=graph.nodes())
    
    for_nodes = tqdm(
        graph.nodes(),
        leave=False,
        unit='nodes',
    )
    
    for column in for_nodes:
        i_domains_column = matrix[graph.nodes()._nodes[column]['source']]
        for row in graph.nodes():
            prob = i_domains_column[graph.nodes()._nodes[row]['source']]
            df[column][row] = prob

    return df

Methods for initializing the dynamic simulation and creating the initial state.

In [None]:
def create_first_pubs(original_graph):
    dates_list = [original_graph.nodes()[node]['timestamp'] for node in original_graph.nodes()]
    fs = []
    print(min(dates_list).date())
    for node in original_graph.nodes():
        if original_graph.nodes()[node]['timestamp'].date() == min(dates_list).date():
            fs.append(original_graph.nodes()[node]['source'])
    return fs


def create_i0(list_first_pubs, domains):
    i0 = np.zeros(len(domains))
    for pos, i in enumerate(i0):
        if domains[pos] in list_first_pubs:
            i0[pos] = 1
            list_first_pubs.remove(domains[pos])

    return i0

Load data.

In [None]:
files = [
    'dists_triu.csv',
    'info_df.csv',
    'empirical_graph.gpickle',
    'empirical_graph_nodes.csv',
    'graph_original_domains_each_node.txt',
    'graph_complete.csv',
    'i0.csv',
    ]
stories = [
    'world_russia',
    'world_norway',
    'world_capitol_hill',
]
story_to_elaborate = 0
with open(PATH_TO_DATA/stories[story_to_elaborate]/files[1]) as csv_file:
    # dropping autospawned 'Unnamed: 0' column, and unecessary (since they are ordered already) 'article_id' column
    info_df = pd.read_csv(csv_file).drop(['Unnamed: 0'], 1)
info_df = info_df.rename(columns={'id': 'article_id'})
info_df['timestamp'] = pd.to_datetime(info_df.timestamp)
print(info_df.head())
with open(PATH_TO_DATA/stories[story_to_elaborate]/files[0]) as csv_file:
    # dropping autospawned 'Unnamed: 0' column, and unecessary (since they are ordered already) 'article_id' column
    dists_triu = pd.read_csv(csv_file, sep=',', header=None)
dists_triu = dists_triu.values
print(dists_triu.shape)

In [None]:
dataframes = {
    'norway_attack': ['world_norway_word_matrix_df.csv', 'world_norway_info_df.csv', 'world_norway_dists_triu.csv'],
    'russia_shooting': ['world_russia_word_matrix_df.csv', 'world_russia_info_df.csv', 'world_russia_dists_triu.csv'],
    'capitol_hill': ['world_capitol_hill_word_matrix_df.csv', 'world_capitol_hill_info_df.csv', 'world_capitol_hill_dists_triu.csv'],
    'test': ['word_matrix_df.csv', 'info_df.csv', 'dists_triu.csv'],
}
story_to_elaborate = 'test'# 'norway_attack' # 'russia_shooting' # 'capitol_hill'
with open(PATH_TO_DATA/dataframes[story_to_elaborate][0]) as csv_file:
    # dropping autospawned 'Unnamed: 0' column, and unecessary (since they are ordered already) 'article_id' column
    word_matrix_df = pd.read_csv(csv_file).drop(['Unnamed: 0','article_id'], 1)
print(word_matrix_df.head())
with open(PATH_TO_DATA/dataframes[story_to_elaborate][1]) as csv_file:
    # dropping autospawned 'Unnamed: 0' column, and unecessary (since they are ordered already) 'article_id' column
    info_df = pd.read_csv(csv_file).drop(['Unnamed: 0'], 1)
info_df = info_df.rename(columns={'id': 'article_id'})
info_df['timestamp'] = pd.to_datetime(info_df.timestamp)
print(info_df.head())
with open(PATH_TO_DATA/dataframes[story_to_elaborate][2]) as csv_file:
    # dropping autospawned 'Unnamed: 0' column, and unecessary (since they are ordered already) 'article_id' column
    dists_triu = pd.read_csv(csv_file, sep=',', header=None)
dists_triu = dists_triu.values
print(dists_triu.shape)

Creating, saving and drawing the Graphs instatiated using _networkX_ library.

In [11]:
G = create_graph(dists_triu, info_df, time_max=TIME_THRESHOLD, time_min=TIME_MIN, sim_min=MIN_SIM)
nx.write_gpickle(G, PATH_TO_DATA/stories[story_to_elaborate]/files[2])
# nx.draw(G, with_labels=True, font_weight='bold')

In [12]:
pd.DataFrame(dict(G.nodes())).transpose().to_csv(PATH_TO_DATA/stories[story_to_elaborate]/files[3])
all_nodes_domains = []
for i in G.nodes():
    all_nodes_domains.append(G.nodes()._nodes[i]['source'])

with open(PATH_TO_DATA/stories[story_to_elaborate]/files[4], 'w') as file:
    for item in all_nodes_domains:
        file.write("%s\n" % item)

In [13]:
domain_list, domain_matrix = create_matrix_domain(G)
graph_complete = create_complete_adjacency(G, domain_matrix)
as_numpy = np.array(graph_complete)
np.fill_diagonal(as_numpy, 0)
np.savetxt(PATH_TO_DATA/stories[story_to_elaborate]/files[5], as_numpy, delimiter=',')
graph_complete

Unnamed: 0,0,4,5,7,9,10,12,15,16,17,...,2981,2982,2983,2984,2985,2986,2987,2989,2990,2991
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2986,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
2987,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2990,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Retrieve and save initial conditions.

In [15]:
list_first_pubs = create_first_pubs(G)
print('Length of first day pubs is {}'.format(len(list_first_pubs)))
I0 = create_i0(list_first_pubs, all_nodes_domains)
np.savetxt(PATH_TO_DATA/stories[story_to_elaborate]/files[6], I0, delimiter=',')
print('The number of inital infected is {}'.format(np.sum(I0)))
print(I0)

2021-01-06


127