In [594]:
import numpy as np
import scipy as sp
import pandas as pd
import networkx as nx
import ray
import parallel, propagation, read, simulation
from simulation import Simulation

In [605]:
import importlib
# Reload the module so that edits to simulation.py are picked up without a kernel restart.
simulation = importlib.reload(simulation)
# `from simulation import Simulation` bound the class object that existed *before* the
# reload; rebind the name, otherwise `Simulation` keeps pointing at the stale class.
Simulation = simulation.Simulation
#ray.init()
simulation  # keep the module repr as the cell output

<module 'simulation' from '/home/sarming/tmp/propagation/simulation.py'>

## Loading the anonymized follower graphs
Right now we read in the adjacency lists using NetworkX. This builds a dense representation which needs quadratic time and space.
So we want to do this only once and store the result as a sparse matrix together with the node labels.

In [454]:
# Parse the adjacency-list file (the slow step described in the text above this cell).
graph = read.adjlist('data/anonymized_outer_graph_neos_20200311.adjlist')
# Convert the graph to a SciPy sparse adjacency matrix.
# NOTE(review): `nx.to_scipy_sparse_matrix` was removed in NetworkX 3.0 in favour of
# `nx.to_scipy_sparse_array` — confirm the pinned NetworkX version before upgrading.
A = nx.to_scipy_sparse_matrix(graph)
# Node labels in the graph's node order, which is the default row/column order of A.
node_labels = graph.nodes()
# Persist matrix and labels together so later sessions can skip the parsing step.
read.save_labelled_graph('data/outer_neos.npz', A, node_labels, compressed=False)

Or, alternatively, in one line:

In [None]:
# One-call variant of the previous cell: `save_as` names the .npz file to write.
read.adjlist('data/anonymized_outer_graph_neos_20200311.adjlist', save_as='data/outer_neos.npz')

For the future we can now use the much faster:

In [3]:
# Load the sparse adjacency matrix and its node labels from the .npz file.
A, node_labels = read.labelled_graph('data/outer_neos.npz')

## Basic message propagation
The fundamental function of our model is `edge_propagate`.
In a single call node `source` sends a message to all its followers, who with (uniform) edge probability `p` retweet it in turn.
At every level, `p` gets smaller by multiplication with a discount factor `discount` $< 1$.
If a node receives a message a second time, it discards it, regardless of whether it retweeted it the first time.
This continues until either the maximum depth `depth` is reached or `max_nodes` nodes have retweeted the message.

The return value is the number of nodes that resent/retweeted the message.

The concrete values below are not realistic for our Twitter use case (`p` is much too high and `discount` too low), but they make it easy to play around and observe the stochastic nature of the call.

In [547]:
# A single stochastic propagation starting at node 0; the result varies from run to run.
propagation.edge_propagate(A, source=0, p=0.5, discount=0.2, depth=10, max_nodes=10000)

3300

We can run several propagations using the function `simulate`.
It takes a list of source nodes and runs `samples` many propagations for each start node.

It returns the mean number of retweets per tweet (i.e. the expected number of retweets), and the fraction of tweets that are retweeted at least once (i.e. the retweet probability).
These are the two main statistics we use to compare our model to the data.

In [585]:
# 1000 propagations for each of the three sources; the output is the pair
# (mean retweets, retweet probability) described above.
propagation.simulate(A, sources=[0,1,2], p=0.01, discount=0.9, depth=10, max_nodes=1000, samples=1000)

(2.1483333333333334, 0.024333333333333332)

When `return_stats=False` is passed, the `simulate` function yields the complete results instead: one sequence of retweet counts per source.
In the example below one can observe the crucial role the outdegree of the source node plays: $\delta^-(0)=7$, $\delta^-(1)=0$, $\delta^-(3)=232$.

In [600]:
# With return_stats=False the raw results are iterated instead of the two summary numbers.
for source in propagation.simulate(
    A,
    sources=[0, 1, 3],
    p=0.01,
    discount=0.9,
    depth=10,
    max_nodes=10000,
    samples=20,
    return_stats=False,
):
    # One printed row per source node: the retweet count of each sampled propagation.
    print(list(source))

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[5, 9850, 12, 7, 3, 8, 3, 0, 3, 1, 1, 30, 1, 1, 10000, 2, 2994, 4, 0, 2]


## Tweet data
For every source tweet in our dataset we extract several features of both the author and the tweet, as well as the number of retweets.

Below is the resulting csv file, albeit with a limited selection of features.



In [601]:
# Display the raw per-tweet feature table (inspection only; the result is not stored).
pd.read_csv('data/authors_tweets_features_neos.csv')

Unnamed: 0,author,verified,activity,defaultprofile,userurl,hashtag,tweeturl,mentions,media,retweets
0,8091533,0,0,1,0,1,0,1,0,0
1,283096,0,0,0,1,1,0,1,0,2
2,1087609,0,0,1,0,1,1,0,0,0
3,8400570,0,0,0,1,1,1,1,0,6
4,252015,0,1,0,1,0,0,0,0,21
...,...,...,...,...,...,...,...,...,...,...
7451,639211,1,0,1,1,0,0,1,0,5
7452,6244880,0,0,0,0,1,1,0,0,1
7453,1559022,0,0,0,0,0,0,1,0,14
7454,2632615,0,0,1,0,0,0,0,0,0


The function `read.tweets` reads such a file and aggregates the selected features we are interested in into a single `author_feature` and a single `tweet_feature`. In this case, since we only consider binary features, we represent them as binary strings. Additionally, the function translates the author id `author` (corresponding to the original graph) into a node id `source` (index into our sparse matrix).

In [592]:
# `node_labels` is passed so author ids can be translated into node ids (indices into A).
tweets = read.tweets('data/authors_tweets_features_neos.csv', node_labels)
tweets

Unnamed: 0,source,author_feature,tweet_feature,retweets
0,15193,0010,1010,0
1,83877,0001,1010,2
2,39676,0010,1100,0
3,912617,0001,1110,6
4,13477,0101,0000,21
...,...,...,...,...
7451,48182,1011,0010,5
7452,886098,0000,1100,1
7453,39347,0000,0010,14
7454,12281,0010,0000,0


From this we can now calculate statistics for each feature combination.
Of particular interest are `tweets` (the total number of tweets) and the aforementioned `retweet_probability` and `mean_retweets`.

In [602]:
# Aggregate the tweets per (author_feature, tweet_feature) combination.
stats=simulation.tweet_statistics(tweets)
stats

Unnamed: 0_level_0,Unnamed: 1_level_0,tweets,retweet_probability,mean_retweets,median_retweets,max_retweets
author_feature,tweet_feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000,0000,275,0.247273,1.094545,0.0,86
0000,0001,17,0.235294,1.823529,0.0,21
0000,0010,221,0.257919,1.687783,0.0,23
0000,0011,32,0.343750,0.968750,0.0,7
0000,0100,35,0.114286,1.085714,0.0,28
...,...,...,...,...,...,...
1101,0111,12,1.000000,8.833333,4.0,33
1101,1000,11,0.909091,6.090909,4.0,26
1101,1010,149,0.959732,4.281879,3.0,30
1101,1011,58,0.948276,5.931034,4.0,44


## Creating a Simulation object


The main class is `Simulation`.
Apart from the graph `A` and the tweet statistics `stats`, it also needs a map from each author feature to the sources with this feature.

In [610]:
# Map each author feature to the source nodes that have this feature.
source_map = simulation.tweet_sources(tweets)
source_map

author_feature
0000    [39347, 142661, 27740, 19276, 17965, 1166242, ...
0001    [83877, 912617, 48025, 48430, 332342, 39292, 1...
0010    [15193, 39676, 24551, 18773, 28548, 18332, 175...
0011    [14793, 14701, 121082, 11142, 13774, 142712, 2...
0100    [12791, 21164, 38686, 13229, 14857, 182814, 24...
0101    [13477, 332476, 20066, 329203, 210, 23261, 647...
0110    [14582, 12881, 10842, 15116, 14887, 39244, 130...
0111    [22981, 33044, 140136, 13047, 12204, 59632, 22...
1000    [1099157, 48150, 13027, 18412, 12843, 47965, 2...
1001    [115695, 28598, 47882, 318076, 142021, 11381, ...
1010                      [11370, 144662, 1511476, 20590]
1011    [2264839, 6600719, 2412296, 291185, 142193, 27...
1100                                             [883368]
1101    [47653, 305464, 11645, 20414, 27919, 2970372, ...
Name: source, dtype: object

In [611]:
# Build the simulation from the in-memory graph, feature statistics and source map.
sim = Simulation(A, stats, source_map)

We can also directly read a graph together with tweet data. This combines all of the setup we did step by step above.

In [612]:
# One-step setup from disk; this rebinds `sim`, replacing the object built in the previous cell.
sim = Simulation.from_files(graph_file='data/outer_neos.npz', tweet_file='data/authors_tweets_features_neos.csv')

## Finding the simulation parameters

Our current approach is to first find the `edge_probability` according to `retweet_probability` and then the `discount_factor` according to `mean_retweets`.


In [None]:
# Directory holding outer_neos.npz and the tweet feature CSV.
# NOTE(review): machine-specific absolute path — point this at your own copy of the data
# (the tutorial cells earlier in this notebook use the relative directory 'data').
datadir = '/Users/ian/Nextcloud'
# datadir = '/home/sarming'
#read.adjlist(f'{datadir}/anonymized_outer_graph_neos_20200311.adjlist',
#              save_as=f'{datadir}/outer_neos.npz')
# Use the `Simulation` class imported from the `simulation` module at the top of the
# notebook; the previous `propagation.Simulation` did not match that import.
sim = Simulation.from_files(f'{datadir}/outer_neos.npz', f'{datadir}/authors_tweets_features_neos.csv')
# Shorthand handles used by the scratch cells below.
features = sim.features
params = sim.params
stats = sim.stats

In [None]:
# Largest value of the `max_retweets` column over all rows of `stats`.
stats['max_retweets'].max()

In [None]:
# Split the first 10 entries of `features` into 32 chunks; with only 10 entries most chunks are empty.
np.array_split(features[0:10],32)


In [None]:
import itertools, operator
# For every author feature, flatten the per-row `sources` lists into one set,
# leaving out missing (NA) entries.
stats.groupby('author_feature')['sources'].apply(
    lambda ls: {src for src in itertools.chain.from_iterable(ls) if pd.notna(src)}
)
#agg(operator.concat)

In [None]:
# NOTE(review): this key uses a 5-character author feature and a 1-character tweet feature,
# unlike the 4-bit strings shown in the tables above — presumably written against a
# different feature set; confirm the key exists before running.
sim.sources[('01000','1')]

In [None]:
# Check the Python type of a single `tweets` entry, looked up by (author_feature, tweet_feature).
type(stats.loc[('0000','0000'),'tweets'])

In [None]:
# Summary of the statistics table: column dtypes, non-null counts and memory usage.
stats.info()

In [None]:
# All statistics for the combination author feature '0000' / tweet feature '0000'.
stats.loc[('0000','0000')]

In [None]:

# NOTE(review): the meaning of the two positional arguments is not visible in this
# notebook — confirm against Simulation.search_parameters.
sim.search_parameters(1,0.5)

In [None]:
# Same search with different settings (200, 0.0001).
# NOTE(review): presumably the long-running production run — confirm what the arguments control.
sim.search_parameters(200,0.0001)

In [None]:
# Save the feature table; the file name records the search_parameters(200, 0.0001) settings used above.
features.to_csv('features_200_0.0001.csv')

In [None]:
# Re-read the raw tweet table to rebuild the combined author feature by hand.
csv = pd.read_csv(f'{datadir}/authors_tweets_features_neos.csv')
# The four binary author columns, in the order in which they appear in the
# `author_feature` strings shown earlier (e.g. 0,0,1,0 -> '0010').
# The original `zip(csv.)` was an unfinished expression and a syntax error.
# NOTE(review): this yields tuples such as (0, 0, 1, 0), not the '0010' strings that
# `read.tweets` produces — convert if the two need to be compared.
csv['author_feature'] = list(zip(csv['verified'], csv['activity'],
                                 csv['defaultprofile'], csv['userurl']))

In [None]:
# Materialise `params` as a list to inspect its entries.
list(params)

In [None]:
# Keep only the first component of each pair in `params`.
[first for first, _ in params]

In [None]:
# Fetch the values behind `params` from Ray (blocks until they are available).
# NOTE(review): assumes `params` holds Ray object refs at this point — confirm.
p=ray.get(params)

In [None]:
 def helper(feature):
        sim.search_parameters(100,0.00001)
    sample_calls = [(A, coef, random_vector(n)) for _ in range(num_samples)]
s = sum(pool.starmap(chebyshev_sample, sample_calls))

In [None]:
# Feed the result of `sample_feature(10)` into `simulate` and materialise the output.
# NOTE(review): presumably 10 is the number of sampled features — confirm.
list(sim.simulate(sim.sample_feature(10)))

In [None]:
# Sample from the sources of author feature '0000'.
# NOTE(review): `size=None` presumably returns a single value rather than an array — confirm.
sim.sample_source('0000',size=None)

In [None]:
# Distinct source nodes per author feature, ignoring rows that contain missing values.
(
    tweets
    .dropna()
    .groupby('author_feature')['source']
    .unique()
)

In [None]:
# Sample for author feature '0000' with second argument 1 and take the first element of the result.
sim.sample_source('0000',1)[0]
# features.loc[('0000','0011')]

In [None]:
# Total number of tweets per author feature ...
af = features.groupby('author_feature')['tweets'].sum()
# ... normalised so the values sum to 1 and can serve as sampling probabilities.
af = af / af.sum()
# Draw one author feature according to that distribution.
np.random.choice(af.index, size=1, p=af)


In [None]:
# Show the feature table with pandas' row-display limit raised to 1000 for this cell only.
with pd.option_context("display.max_rows", 1000):
    display(sim.features)

In [None]:
# Sample from the sources of author feature '0000'.
# NOTE(review): `size=None` presumably returns a single value rather than an array — confirm.
sim.sample_source('0000',size=None)

In [None]:
# Distinct source nodes per author feature, ignoring rows that contain missing values.
(
    tweets
    .dropna()
    .groupby('author_feature')['source']
    .unique()
)

In [None]:
# Sample for author feature '0000' with second argument 1 and take the first element of the result.
sim.sample_source('0000',1)[0]
# features.loc[('0000','0011')]

In [None]:
# Total number of tweets per author feature ...
af = features.groupby('author_feature')['tweets'].sum()
# ... normalised so the values sum to 1 and can serve as sampling probabilities.
af = af / af.sum()
# Draw one author feature according to that distribution.
np.random.choice(af.index, size=1, p=af)


In [None]:
# Show the feature table with pandas' row-display limit raised to 1000 for this cell only.
with pd.option_context("display.max_rows", 1000):
    display(sim.features)

In [None]:
# Total number of tweets per author feature ...
af = features.groupby('author_feature')['tweets'].sum()
# ... normalised so the values sum to 1 and can serve as sampling probabilities.
af = af / af.sum()
# Draw one author feature according to that distribution.
np.random.choice(af.index, size=1, p=af)


In [None]:
# Show the feature table with pandas' row-display limit raised to 1000 for this cell only.
with pd.option_context("display.max_rows", 1000):
    display(sim.features)

In [None]:
# Sample from the sources of author feature '0000'.
# NOTE(review): `size=None` presumably returns a single value rather than an array — confirm.
sim.sample_source('0000',size=None)

In [None]:
# Distinct source nodes per author feature, ignoring rows that contain missing values.
(
    tweets
    .dropna()
    .groupby('author_feature')['source']
    .unique()
)

In [None]:
# Sample for author feature '0000' with second argument 1 and take the first element of the result.
sim.sample_source('0000',1)[0]
# features.loc[('0000','0011')]

In [None]:
# Total number of tweets per author feature ...
af = features.groupby('author_feature')['tweets'].sum()
# ... normalised so the values sum to 1 and can serve as sampling probabilities.
af = af / af.sum()
# Draw one author feature according to that distribution.
np.random.choice(af.index, size=1, p=af)


In [None]:
# Show the feature table with pandas' row-display limit raised to 1000 for this cell only.
with pd.option_context("display.max_rows", 1000):
    display(sim.features)