# Computing Graphs and initial conditions

This notebook instantiates and saves everything needed to conduct the dynamical analysis.
First the dataset is processed, and then the graphs and initial conditions are computed.
It might be necessary to change the data path (`PATH_TO_DATA`) or the dataset filenames (`WORD_VECTORS_FILENAME` and `ARTICLES_DF_FILENAME`).

In [10]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import networkx as nx
from scipy import sparse
from sklearn.preprocessing import normalize
from gensim import corpora, models
from gensim.models.word2vec import Word2Vec, KeyedVectors
from tqdm import tqdm
from joblib import Parallel, delayed
# import cugraph ## this library works with CUDA-capable GPUs but has many issues

# Directory that holds the input CSV files read by this notebook.
PATH_TO_DATA = Path('../data')
# CSV with one column per word; loaded below as `word_vectors`.
WORD_VECTORS_FILENAME = 'words_dataframe.csv'
# CSV with the article information (timestamp, source, id); loaded below as `articles`.
ARTICLES_DF_FILENAME = 'info_dataframe.csv'

## Create Similarity Matrix

In order to create the **Similarity Matrix**, we should _load_ the 'WORD VECTORS' DataFrame and then we should _calculate the distances_ of all the articles (exploiting the simple dot product).

The 'WORD VECTORS' must be normalized before computing the distance.
The fastest normalization method is the one from _sklearn_; a _numpy_ version is also provided (but not used).

This is equivalent to using the _cosine similarity_ to compute these distances.

This step may take a while to run (the expected duration was not recorded).

In [2]:
# Load the dataframe and discard the two non-word columns.
# `columns=` is used because passing `axis` positionally to `DataFrame.drop`
# is no longer accepted by pandas >= 2.0.
word_vectors = (
    pd.read_csv(PATH_TO_DATA / WORD_VECTORS_FILENAME)
    .drop(columns=['Unnamed: 0', 'article_id'])
)
word_vectors

Unnamed: 0,stay,video,plaza,jan,juvenil,month,northeastern,nation,ireland,worri,...,amiri,flaccus,semi-retir,szelag,best-cas,cordoned-off,dearborn,blocked-off,cort,soul-search
0,0.0,4.0,0.0,0.0,0.0,2.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,7.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
993,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
995,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
def create_model_matrix(data, model):
    """Build one embedding vector per article as a weighted sum of word vectors.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per article and one column per word; each cell holds the
        weight of that word in that article.
    model : mapping-like
        Word-embedding lookup. ``model[word]`` must return the vector of
        ``word`` and raise ``KeyError`` for unknown words (gensim's
        ``KeyedVectors`` behaves this way). If the object exposes
        ``vector_size`` it is used as the embedding dimension, otherwise 300.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), dim)``; row ``n`` is
        ``sum(weight[n, word] * vector[word])`` over all columns of ``data``.
        Words missing from ``model`` contribute a zero vector.
    """
    dim = getattr(model, 'vector_size', 300)
    words = [str(word) for word in data.columns]

    # Look every word up exactly once. The lookup key is the word itself:
    # an id produced by a gensim `Dictionary` built from the column names is
    # unrelated to the embedding model's own vocabulary.
    embeddings = np.zeros((len(words), dim), dtype=float)
    for row, word in enumerate(words):
        try:
            word_vector = np.asarray(model[word], dtype=float).reshape(-1)
        except KeyError:
            # unknown word: keep the zero row so it adds nothing
            continue
        if not np.isfinite(word_vector).all():
            # report suspicious embeddings; they are still used as-is
            print(word)
        embeddings[row, :] = word_vector

    # (articles x words) @ (words x dim) -> weighted sum for every article at once
    weights = data.to_numpy(dtype=float)
    return weights @ embeddings

In [4]:
# Load the word embeddings stored in word2vec format in the data directory.
model = KeyedVectors.load_word2vec_format(PATH_TO_DATA/'conceptnet-numberbatch-17-06-300.gz')
model

<gensim.models.keyedvectors.KeyedVectors at 0x7f090f3fb1f0>

In [12]:
# Parallel variant: one task per article, each task returns that article's vector.

# get the list of words from data
texts = [[str(word) for word in word_vectors.columns]]

# corpora with all words from data, dictionary[id] returns the word linked to id
# (kept for inspection only: embeddings are looked up by word, not by id)
dictionary = corpora.Dictionary(texts)

articles_iterator = tqdm(
    range(len(word_vectors)),
    leave=True,
    unit='articles',
)

def fn(article):
    """Return the embedding of one article as the weighted sum of its word vectors.

    Parameters
    ----------
    article : int
        Row position of the article in ``word_vectors``.

    Returns
    -------
    numpy.ndarray
        Vector of length 300: ``sum(weight * model[word])`` over every column
        of ``word_vectors``. Words unknown to ``model`` are skipped.
    """
    doc_vector = np.zeros(300, dtype=float)
    for word, weight in word_vectors.iloc[article].items():
        try:
            # look the word itself up; a Dictionary id is not a valid key
            word_vector = np.asarray(model[word], dtype=float).reshape(-1)
        except KeyError:
            continue
        if not np.isfinite(word_vector).all():
            print(word)
        # accumulate every word, not only the last one visited
        doc_vector += word_vector * weight
    return doc_vector

list_of_docvs = Parallel(n_jobs=6)(delayed(fn)(i) for i in articles_iterator)



KeyboardInterrupt: 

In [None]:
# Build the article-embedding matrix from the word table and the loaded embedding model.
docvs = create_model_matrix(word_vectors, model)
docvs

In [None]:
# Alternative normalization procedure (NumPy only): divide every row by its L2 norm.
row_norms = np.linalg.norm(docvs, axis=1, keepdims=True)
# All-zero rows have norm 0; divide them by 1 so they stay zero instead of
# becoming NaN (scikit-learn's `normalize` treats zero rows the same way).
row_norms[row_norms == 0] = 1.0
np_docvs_norm = docvs / row_norms
print("Shape of normalized matrix is {}.".format(np_docvs_norm.shape))
print("Sum of normalized matrix is {}.".format(np.sum(np_docvs_norm)))
print("Max={}; Min={}.".format(np.max(np_docvs_norm), np.min(np_docvs_norm)))
np_docvs_norm

In [None]:
# Scikit-learn normalization procedure (axis=1)
# `normalize` is called with its defaults, which rescale each row of `docvs`.
sk_docvs_norm = normalize(docvs)

# Summary lines: shape, total, and value range of the normalized matrix.
report = (
    ("Shape of normalized matrix is {}.", (sk_docvs_norm.shape,)),
    ("Sum of normalized matrix is {}.", (np.sum(sk_docvs_norm),)),
    ("Max={}; Min={}.", (np.max(sk_docvs_norm), np.min(sk_docvs_norm))),
)
for template, values in report:
    print(template.format(*values))
sk_docvs_norm

In [None]:
# Scikit-learn normalization procedure (axis=0)
# `axis=0` rescales each column (embedding dimension) instead of each row.
sk_docvs_norm_0 = normalize(docvs, axis=0)
print("Shape of normalized matrix is {}.".format(sk_docvs_norm_0.shape))
print("Sum of normalized matrix is {}.".format(np.sum(sk_docvs_norm_0)))
print("Max={}; Min={}.".format(np.max(sk_docvs_norm_0), np.min(sk_docvs_norm_0)))
sk_docvs_norm_0

Computation of the distance matrix.
It is obtained as the dot product between the normalized matrix and its transpose.
`scipy.sparse` matrices are used for this step.

In [None]:
# Pairwise similarities: dot product of the row-normalized matrix with its transpose.
s = sparse.csr_matrix(sk_docvs_norm)
s_t = s.T  # reuse the sparse matrix instead of building it a second time
s_dist = s.dot(s_t)
# Keep only the strict upper triangle (k=1): each pair once, no self-similarity.
dists_triu = sparse.triu(s_dist, k=1)
dists_triu = dists_triu.toarray()
# Write next to the other data files so that changing PATH_TO_DATA is enough.
np.savetxt(PATH_TO_DATA / 'dists_triu.csv', dists_triu, delimiter=',')
dists_triu

These methods, used for building the graph, have been extracted and adapted from [this repo](https://github.com/elisamussumeci/modeling-news-spread).

In [None]:
def get_pos(data, pub_i, column_list, time_max, sim_min, outs):
    """Pick the most similar admissible candidate for an article published at ``pub_i``.

    Candidates are tried from the highest similarity downwards. A candidate is
    rejected when its position is listed in ``outs`` or when it was published
    more than ``time_max`` hours before ``pub_i``. Candidates published after
    ``pub_i`` are not rejected by the time check.

    Parameters
    ----------
    data : mapping
        ``data['timestamp'][pos]`` must give the publication time of the
        candidate at position ``pos``.
    pub_i : datetime-like
        Publication time of the article being linked.
    column_list : sequence of float
        Similarity between the article and every candidate, indexed by
        candidate position. It is not modified.
    time_max : float
        Maximum allowed age of a candidate, in hours.
    sim_min : float
        Minimum similarity a candidate must reach.
    outs : iterable of int
        Candidate positions that must never be selected.

    Returns
    -------
    int or None
        Position of the selected candidate (the first one on ties), or
        ``None`` when no candidate satisfies all constraints.
    """
    rejected = float('-inf')
    candidates = list(column_list)  # work on a copy: the caller's list stays intact
    excluded = set(outs)
    timestamps = data['timestamp']

    while candidates:
        sim = max(candidates)
        # Nothing left above the threshold, or every candidate was rejected.
        if sim == rejected or sim < sim_min:
            return None
        pos = candidates.index(sim)
        time_dif = (pub_i - timestamps[pos]).total_seconds() / 3600
        if pos in excluded or time_dif > time_max:
            # Mark with -inf (not 0) so a rejected candidate can never be
            # picked again, which also guarantees termination when sim_min <= 0.
            candidates[pos] = rejected
            continue
        return pos
    return None

def create_graph(dists_triu, data, time_max=168, sim_min=0.8):
    """Build the directed graph linking each article to the candidate chosen by ``get_pos``.

    Node 0 is always present. For every later article ``i`` the column ``i``
    of ``dists_triu`` is handed to ``get_pos``; when a position is returned an
    edge ``position -> i`` is added, otherwise ``i`` is recorded as excluded
    and passed to subsequent ``get_pos`` calls.

    Nodes carry the attributes ``date``, ``domain``, ``_id`` and an empty
    ``children`` list (node 0 additionally has ``step=0``).
    """
    timestamps = data['timestamp']
    sources = data['source']
    article_ids = data['article_id']

    graph = nx.DiGraph()
    graph.add_node(0, step=0, date=timestamps[0], domain=sources[0], _id=article_ids[0],
                   children=[])

    unlinked = []
    for child in range(1, dists_triu.shape[0]):
        similarities = list(dists_triu[:, child])
        parent = get_pos(data, timestamps[child], similarities, time_max, sim_min, unlinked)

        if parent is None:
            # no admissible candidate: remember the article as excluded
            unlinked.append(child)
            continue

        # make sure both endpoints exist before connecting them
        for node in (parent, child):
            if node not in graph.nodes():
                graph.add_node(node, date=timestamps[node], domain=sources[node],
                               _id=article_ids[node], children=[])
        graph.add_edge(parent, child)
    return graph

def create_date(pub1, pub2, s):
    """Return the time from ``pub1`` to ``pub2`` in units of ``s`` hours, rounded to an integer."""
    elapsed_hours = (pub2 - pub1).total_seconds() / 3600
    steps = elapsed_hours / s
    return round(steps)

def create_graphml(dists_triu, data, time_max=168, sim_min=0.8):
    """Build the same kind of graph as ``create_graph`` with GraphML-friendly attributes.

    Nodes only carry ``date`` and ``domain``. ``date`` is the number of 5-hour
    steps (as computed by ``create_date``) elapsed since the timestamp of
    article 0; node 0 itself has ``date=0`` and ``step=0``.

    Parameters
    ----------
    dists_triu : numpy.ndarray
        Square similarity matrix; column ``i`` holds the similarities between
        article ``i`` and the candidate articles.
    data : pandas.DataFrame
        Must provide the ``timestamp`` and ``source`` columns.
    time_max, sim_min
        Forwarded unchanged to ``get_pos``.

    Returns
    -------
    networkx.DiGraph
    """
    hours_per_step = 5
    size = dists_triu.shape[0]
    G = nx.DiGraph()
    G.add_node(0, step=0, date=0, domain=data['source'][0])
    date_init = data['timestamp'][0]
    outs = []
    for i in range(1, size):
        pub_i = data['timestamp'][i]
        column = list(dists_triu[:, i])
        pos = get_pos(data, pub_i, column, time_max, sim_min, outs)

        if pos is None:
            # no admissible candidate: exclude this article from later searches
            outs.append(i)
            continue

        if pos not in G.nodes():
            # use the timestamp of the selected article, not the whole column
            date_1 = create_date(date_init, data['timestamp'][pos], hours_per_step)
            G.add_node(pos, date=date_1, domain=data['source'][pos])
        if i not in G.nodes():
            date_2 = create_date(date_init, pub_i, hours_per_step)
            G.add_node(i, date=date_2, domain=data['source'][i])

        G.add_edge(pos, i)
    return G

def create_matrix_domain(graph):
    """Count the edges of ``graph`` between every pair of domains.

    Parameters
    ----------
    graph : networkx.DiGraph
        Every node must carry a ``domain`` attribute.

    Returns
    -------
    list
        ``[domain_list, df]`` where ``domain_list`` holds the distinct domains
        in order of first appearance and ``df`` is a square DataFrame indexed
        by them. ``df[parent_domain][child_domain]`` (column = domain of the
        edge source, row = domain of the edge target) is the number of edges
        going from a node of ``parent_domain`` to a node of ``child_domain``.
    """
    nodes = graph.nodes()

    # dict.fromkeys keeps first-seen order and removes duplicates in one pass
    domain_list = list(dict.fromkeys(nodes[pos]['domain'] for pos in nodes))

    df = pd.DataFrame(0, index=domain_list, columns=domain_list)

    for pos in nodes:
        d = nodes[pos]['domain']
        for suc in graph.successors(pos):
            # single .loc access: chained `df[d][row] += 1` may write to a
            # temporary copy and leave `df` unchanged
            df.loc[nodes[suc]['domain'], d] += 1

    return [domain_list, df]

def create_complete_adjacency(graph, matrix):
    """Expand a domain-by-domain matrix into a node-by-node matrix.

    Parameters
    ----------
    graph : networkx.DiGraph
        Every node must carry a ``domain`` attribute.
    matrix : pandas.DataFrame
        Square frame whose rows and columns are labelled by domain.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the graph nodes on both axes where the entry at
        (row node, column node) equals
        ``matrix[domain of column node][domain of row node]``.
    """
    nodes = graph.nodes()
    node_ids = list(nodes)
    domains = [nodes[node]['domain'] for node in node_ids]

    # One label-based selection (labels may repeat) replaces the cell-by-cell
    # chained assignment, which is slow and may write to a temporary copy.
    values = matrix.loc[domains, domains].to_numpy()
    return pd.DataFrame(values, index=node_ids, columns=node_ids)

Creating, saving and drawing the graphs instantiated with the _networkX_ library.
This step should take ~6min.

In [None]:
# Load the article information, parse the timestamps and align the id column name
# with the one used by the graph-building functions (`article_id`).
# `columns=` is used because passing `axis` positionally to `DataFrame.drop`
# is no longer accepted by pandas >= 2.0.
articles = (
    pd.read_csv(PATH_TO_DATA / ARTICLES_DF_FILENAME)
    .drop(columns='Unnamed: 0')
    .rename(columns={'id': 'article_id'})
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)
articles

In [None]:
G = create_graph(dists_triu, articles)
# `nx.write_gpickle` no longer exists in NetworkX >= 3.0; pickling the graph
# directly keeps the same file name for downstream readers.
with open(PATH_TO_DATA / 'empirical_graph.gpickle', 'wb') as gpickle_file:
    pickle.dump(G, gpickle_file, pickle.HIGHEST_PROTOCOL)
# nx.draw(G, with_labels=True, font_weight='bold')
# plt.show()
H = create_graphml(dists_triu, articles)
nx.write_graphml(H, str(PATH_TO_DATA / 'empirical_graph.graphml'))
# nx.draw(H, with_labels=True, font_weight='bold')

In [None]:
# Node attribute table: one row per node, one column per attribute.
pd.DataFrame(dict(G.nodes())).transpose().to_csv(PATH_TO_DATA / 'empirical_graph_nodes.csv')

# Domain of every node, in graph node order (reused later to build the initial state).
all_nodes_domains = [G.nodes()[i]['domain'] for i in G.nodes()]

# `with` guarantees the file is flushed and closed once all lines are written.
with open(PATH_TO_DATA / 'graph_original_domains_each_node.txt', 'w') as f:
    for item in all_nodes_domains:
        f.write("%s\n" % item)

In [None]:
# Domain-level edge counts expanded to a node-by-node matrix.
domain_list, domain_matrix = create_matrix_domain(G)
graph_complete = create_complete_adjacency(G, domain_matrix)
# np.array copies the values, so zeroing the diagonal (no self-links) only
# affects the saved array, not the `graph_complete` frame displayed below.
as_numpy = np.array(graph_complete)
np.fill_diagonal(as_numpy, 0)
# Write next to the other data files so that changing PATH_TO_DATA is enough.
np.savetxt(PATH_TO_DATA / 'graph_complete.csv', as_numpy, delimiter=',')
graph_complete

Methods for initializing the dynamic simulation and creating the initial state.

In [None]:
def create_first_pubs(original_graph):
    """Return the domains of the nodes dated on the earliest day found in the graph.

    Parameters
    ----------
    original_graph : networkx.DiGraph
        Every node must carry a datetime-like ``date`` attribute and a
        ``domain`` attribute.

    Returns
    -------
    list
        One domain per node whose ``date`` falls on the earliest calendar day,
        in graph node order (duplicates are kept). Empty for an empty graph.

    The earliest day is printed as a side effect.
    """
    nodes = original_graph.nodes()
    dates_list = [nodes[node]['date'] for node in nodes]
    if not dates_list:
        return []

    # computed once instead of once per node
    first_day = min(dates_list).date()
    print(first_day)
    return [nodes[node]['domain'] for node in nodes
            if nodes[node]['date'].date() == first_day]


def create_i0(list_first_pubs, domains):
    """Build the initial state vector: 1 for initially active nodes, 0 otherwise.

    Each entry of ``list_first_pubs`` activates at most one node: the first
    not-yet-activated position of ``domains`` holding the same domain.

    Parameters
    ----------
    list_first_pubs : list
        Domains of the first publications (duplicates allowed). Not modified.
    domains : sequence
        Domain of every node, in node order.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(domains)`` containing 0s and 1s.
    """
    # consume a private copy so the caller's list is left untouched
    remaining = list(list_first_pubs)
    i0 = np.zeros(len(domains))
    for pos, domain in enumerate(domains):
        if domain in remaining:
            i0[pos] = 1
            remaining.remove(domain)

    return i0

Initialization

In [None]:
# Initial state: nodes whose domain published on the first day start as infected.
list_first_pubs = create_first_pubs(G)
I0 = create_i0(list_first_pubs, all_nodes_domains)
# Write next to the other data files so that changing PATH_TO_DATA is enough.
np.savetxt(PATH_TO_DATA / 'i0.csv', I0, delimiter=',')
print('The number of inital infected is {}'.format(np.sum(I0)))
I0