# Preamble

## Imports

In [36]:
import pickle
from collections import defaultdict, Counter
import numpy as np
import random
import subprocess
import re
import itertools
import pandas as pd

# Utility functions

In [13]:
def load_binned_network(filename):
    """Load a pickled object from 'data_source/<filename>.pkl'.

    Parameters
    ----------
    filename : str
        File name without directory and without the '.pkl' extension.

    Returns
    -------
    The unpickled object stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
    # Pickle data is binary, so the file must be opened in 'rb' mode (text mode fails on
    # Python 3 and can corrupt the stream on platforms that translate line endings).
    with open('data_source/' + filename + '.pkl', 'rb') as infile:
        return pickle.load(infile)

In [54]:
def Infomap(pajek_string, *args, **kwargs):
    """Run the native Infomap executable on a Pajek network string and parse its output.

    The network is written to 'community_detection/input/tmpnet<pid>.net', the executable
    './community_detection/infomap/Infomap' is called on that file, and the cluster file
    it leaves in 'community_detection/output/' is parsed. The temporary input and output
    files are removed afterwards, also when running or parsing fails.

    Requires two folders (1) 'input' and (2) 'output', in sister-directory of 'infomap' folder
    that contains 'Infomap' executable. To setup 'infomap' folder, clone Infomap from
    https://github.com/mapequation/infomap and run 'make' inside resulting folder.

    Parameters
    ----------
    pajek_string : str
        Pajek representation of the network.
    *args : str
        Infomap execution options (http://www.mapequation.org/code.html#Options), passed
        on verbatim after the input file path. Must contain 'multiplex' (the
        '<name>_expanded.clu' output is parsed) or 'pajek' (the '<name>.clu' output is
        parsed). If the last argument starts with "pid", the rest of it is used as suffix
        of the temporary file name (e.g. for multiprocessing) and it is not passed on.
    return_flow : bool, optional
        Keyword argument. If True, also return the node and community flows. Only
        available together with 'multiplex'. Default False.

    Returns
    -------
    communities : dict
        {cluster id: set of node labels}, pooled over all layers.
    layer_communities : dict
        {layer: {cluster id - 1: [node labels]}}; data structure in required format for
        d3 viz (json). Planar networks use the single layer key 0.
    node_flow : dict
        {(layer, node label): flow}. Only returned if return_flow is True.
    community_flow : dict
        {cluster id as str: flow}. Only returned if return_flow is True.

    Raises
    ------
    ValueError
        If args contains neither 'multiplex' nor 'pajek', or if return_flow is requested
        without 'multiplex'.
    """
    import os  # local import: only needed for removing the temporary files

    input_dir = 'community_detection/input/'
    output_dir = 'community_detection/output/'

    def _get_id_to_label(filename):
        """Map Pajek vertex ids to vertex labels (labels become int where possible)."""
        def _int_if_int(val):
            try:
                return int(val)
            except ValueError:
                return val
        with open(input_dir + filename + ".net", 'r') as fp:
            parsed_network = fp.read()
        # The text between the first and second '*...' header is the vertex list.
        vertex_lines = re.split(r"\*.+", parsed_network)[1].split("\n")[1:-1]
        return dict(
            (int(n.split()[0]), _int_if_int(n.split('"')[1]))
            for n in vertex_lines
        )

    def _parse_communities_multiplex(id_to_label, filename):
        """Parse '<filename>_expanded.clu' into per-layer communities and flows."""
        with open(output_dir + filename + "_expanded.clu", 'r') as infile:
            clusters = infile.read()

        # Get layers, nodes and clusters from _expanded.clu file
        la_no_clu_flow = re.findall(r'\d+ \d+ \d+ \d\.\d+', clusters)  # ["30 1 2 0.00800543",...]
        la_no_clu_flow = [tuple(i.split()) for i in la_no_clu_flow]

        node_flow_json = defaultdict(float)      # {layer_node: flow, ...}
        community_flow_json = defaultdict(float) # {community: flow, ...}
        communities_json = defaultdict(set)      # {layer: {(node, cluster), ...}, ...}
        for layer, node, cluster, flow in la_no_clu_flow:
            node_flow_json[(int(layer), id_to_label[int(node)])] += float(flow)
            community_flow_json[cluster] += float(flow)  # key stays the raw string id
            communities_json[int(layer)].add((id_to_label[int(node)], int(cluster)))

        return communities_json, node_flow_json, community_flow_json

    def _parse_communities_planar(id_to_label, filename):
        """Parse '<filename>.clu' into {0: set of (node label, cluster)}."""
        with open(output_dir + filename + ".clu", 'r') as infile:
            clusters = infile.read()

        # Get nodes and clusters from .clu file
        no_clu = [tuple(i.split()[:-1]) for i in re.findall(r"\d+ \d+ \d\.\d+", clusters)]  # [(node, cluster), ...]
        return {0: set([(id_to_label[int(no)], int(clu)) for no, clu in no_clu])}

    def _clean_up(filename):
        """Remove the temporary network and cluster files, ignoring missing ones."""
        for path in (
            input_dir + filename + '.net',
            output_dir + filename + '_expanded.clu',
            output_dir + filename + '.clu',
        ):
            try:
                os.remove(path)
            except OSError:
                pass  # best effort: not every run produces every file

    # Check for process id in args (for multiprocessing); guard against empty args
    if args and args[-1][:3] == "pid":
        pid = args[-1][3:]
        args = args[:-1]
    else:
        pid = ""
    args = list(args)

    # Set default kwarg params
    return_flow = kwargs.get("return_flow", False)

    # Fail early with a clear message instead of an unbound-variable error after the run
    if 'multiplex' not in args and 'pajek' not in args:
        raise ValueError("Infomap options must contain 'multiplex' or 'pajek'")
    if return_flow and 'multiplex' not in args:
        raise ValueError("return_flow is only available with the 'multiplex' option")

    filename = 'tmpnet' + pid

    # Store locally
    with open(input_dir + filename + ".net", 'w') as outfile:
        outfile.write(pajek_string)

    try:
        # Run Infomap
        subprocess.call(
            ['./community_detection/infomap/Infomap', input_dir + filename + ".net"] + args
        )

        # Parse communities from Infomap output
        id_to_label = _get_id_to_label(filename)

        if 'multiplex' in args:
            parsed_communities, node_flow, community_flow = _parse_communities_multiplex(id_to_label, filename)
        if 'pajek' in args:
            parsed_communities = _parse_communities_planar(id_to_label, filename)
    finally:
        _clean_up(filename)

    # Keep the pooled and the per-layer grouping in separate containers so that the
    # pooled result is not overwritten by the per-layer one.
    communities = defaultdict(set)
    layer_communities = {}
    for layer, members in parsed_communities.items():
        by_cluster = defaultdict(list)
        for node, cluster in members:
            communities[cluster].add(node)
            by_cluster[cluster - 1].append(node)
        layer_communities[layer] = dict(by_cluster)
    communities = dict(communities)

    if return_flow:
        return communities, layer_communities, node_flow, community_flow
    else:
        return communities, layer_communities
    
    
def build_adjacency_tensor(layers, index="zero"):
    """Build a sparse, symmetric adjacency tensor from a list of edge-list layers.

    Parameters
    ----------
    layers : list of pandas.DataFrame
        One edge list per layer. Each frame must have the columns 'user1' and 'user2';
        every row is one observed edge.
    index : str or None
        If "zero" (default), node ids are replaced by indices 0..N-1 (assigned in set
        iteration order, i.e. not sorted). Any other value keeps the original ids.

    Returns
    -------
    A : collections.defaultdict(int)
        {(node_i, node_j, layer): number of rows linking i and j in that layer}, where
        layer is the position of the frame in `layers`.
    """
    ind = None
    if index == "zero":
        nodes = set()
        for layer in layers:
            nodes.update(layer['user1'])
            nodes.update(layer['user2'])
        ind = dict((n, i) for i, n in enumerate(nodes))

    A = defaultdict(int)
    for l, layer in enumerate(layers):
        # Iterate the two columns directly: iterrows() builds a Series per row, which is
        # slow and upcasts the ids to float when the frame has a float column.
        for u, v in zip(layer['user1'], layer['user2']):
            if ind is not None:
                u, v = ind[u], ind[v]
            # Must add both ways if undirected so A becomes symmetrical. If only added
            # one-way, triu would only hold connections from 'user1' and tril only
            # connections from 'user2', or vice versa.
            A[(u, v, l)] += 1
            A[(v, u, l)] += 1
    return A


def write_pajek(A, node_labels=None, index_from=0):
    """Return a multiplex Pajek-format string for the sparse adjacency tensor A.

    The output consists of a '*Vertices' section listing every node that occurs in A
    (sorted, labelled with str(node) and weight 1.0) followed by an '*Intra' section with
    one 'layer node node weight' line per edge. A is treated as symmetric: only entries
    with j > i are written, so each undirected edge appears once and self-loops are
    dropped. Edge lines follow the iteration order of A.

    Parameters
    ----------
    A : dict
        {(i, j, k): weight} where i and j are node ids and k is an integer layer index,
        e.g. as returned by build_adjacency_tensor.
    node_labels : list
        Accepted for interface compatibility but currently ignored.
    index_from : int
        From which number to index nodes and layers in pajek format from (default=0)

    Returns
    -------
    out : string
        A network string in pajek format
    """
    nodes = sorted(set(n for i, j, _ in A for n in (i, j)))
    nodemap = dict((node, nid) for nid, node in enumerate(nodes))

    # Collect the lines and join once instead of growing a string with '+='.
    lines = ["*Vertices %d" % len(nodes)]
    lines.extend(
        '%d "%s" 1.0' % (nid + index_from, str(label))
        for nid, label in enumerate(nodes)
    )
    lines.append("*Intra")
    lines.append("# layer node node [weight]")
    for (i, j, k), w in A.items():
        if j > i:  # upper triangle only: skip the mirrored entry and self-loops
            lines.append('%d %d %d %f' % (
                k + index_from,           # layer
                nodemap[i] + index_from,  # node
                nodemap[j] + index_from,  # node
                w                         # weight
            ))
    return "\n".join(lines)

# Load and preprocess

In [15]:
# Load the binned network stored as 'data_source/10mins_short.pkl'.
network_sensible = load_binned_network('10mins_short')
# Divisor applied to the 288 slices-per-day figure when slicing the network by day.
fof = 2

In [49]:
# Make slices for a span of days: the loop below selects the two consecutive days at
# offsets dow+5 and dow+6 from the binned network.
# NOTE(review): this configuration was previously described as giving "exactly the
# three-week period in january 2014", but only two days are selected -- confirm the span.
spd = 288 // fof  # slices per day; floor division keeps it an int so it can be used in slices
smargin_start = 0  # alternative: spd // 3
smargin_end = 0  # alternative: spd // 3 - 6
dow = 0

network = [
    l
    for d in [0, 1]
    for l in network_sensible[spd*(dow+5+d)+smargin_start:spd*(dow+6+d)-smargin_end]
]

# Empty slices are dropped before building A, so layer k of A is the k-th non-empty slice;
# layer_indices[k] is the position of that slice in `network`.
layer_indices = [l for l, n in enumerate(network) if n.shape[0] > 0]
A = build_adjacency_tensor([n for n in network if n.shape[0] > 0], index=None)
network_pajek = write_pajek(A)

# Process

In [55]:
# Options handed to Infomap(). The trailing 'pid<number>' entry is not an Infomap flag:
# Infomap() strips it and uses the number as suffix of its temporary file names.
infomap_options = [
    'community_detection/output/',
    '-i',
    'multiplex',
    '--multiplex-js-relax-rate', '0.25',
    '--overlapping',
    '--expanded',
    '--clu',
    '--two-level',
    '-z',
    'pid%d' % random.randint(0, 1000000),
]

# Only the per-layer communities are used below; the first return value is discarded.
_, layer_commu = Infomap(network_pajek, *infomap_options)

# Analyse

In [56]:
# Every node that appears in at least one community of at least one layer.
all_nodes = set()
for layer_groups in layer_commu.values():
    for members in layer_groups.values():
        all_nodes.update(members)

In [57]:
# For every node, append it once per member of each community it belongs to, so its count
# below is the summed size of all its communities over all layers.
ds_out = []
for nn in all_nodes:
    for data in layer_commu.values():
        for nodes in data.values():
            if nn in nodes:
                ds_out.extend([nn] * len(nodes))

Counter(ds_out).most_common()

# Save data

## Node 300

In [73]:
# Collect [node, layer, community] rows for every community that contains node 300.
ds_out = []
for l, data in layer_commu.items():
    for c, nodes in data.items():
        if 300 in nodes:
            ds_out.extend([n, l, c] for n in nodes)

# Function-call form prints the same on Python 2 and is valid syntax on Python 3.
print(len(ds_out))

858


In [74]:
# Write the rows as a tab-separated file. The separator is passed by keyword because
# positional arguments after the path are deprecated in current pandas.
pd.DataFrame(ds_out, columns=['id', 'timestamp', 'group']) \
    .to_csv("data/ds_300_2_days.tsv", sep="\t", header=True, index=False)

## Infection starting at node 300

In [75]:
# Spread an "infection" from node 300 through the layers in increasing layer order: every
# community that contains an already infected node is recorded and all of its members
# become infected. Within a layer, communities are visited in dict order.
ds_out = []
infected = set([300])
# 'lambda (k, v): k' is a SyntaxError on Python 3; index the (layer, data) pair instead.
for l, data in sorted(layer_commu.items(), key=lambda item: item[0]):
    for c, nodes in data.items():
        if infected.intersection(nodes):
            for n in nodes:
                ds_out.append([n, l, c])
                infected.add(n)

# Function-call form prints the same on Python 2 and is valid syntax on Python 3.
print(len(ds_out))

17231


In [76]:
# Write the rows as a tab-separated file. The separator is passed by keyword because
# positional arguments after the path are deprecated in current pandas.
pd.DataFrame(ds_out, columns=['id', 'timestamp', 'group']) \
    .to_csv("data/ds_300_infection_2_days.tsv", sep="\t", header=True, index=False)

## Full network

In [77]:
# Collect [node, layer, community] rows for every community in every layer, in increasing
# layer order.
ds_out = []
# NOTE(review): `infected` is filled but never read in this cell -- it looks like a
# leftover from the infection cell; kept so the variable's final value is unchanged.
infected = set([300])
# 'lambda (k, v): k' is a SyntaxError on Python 3; index the (layer, data) pair instead.
for l, data in sorted(layer_commu.items(), key=lambda item: item[0]):
    for c, nodes in data.items():
        for n in nodes:
            ds_out.append([n, l, c])
            infected.add(n)

# Function-call form prints the same on Python 2 and is valid syntax on Python 3.
print(len(ds_out))

34148


In [78]:
# Write the rows as a tab-separated file. The separator is passed by keyword because
# positional arguments after the path are deprecated in current pandas.
pd.DataFrame(ds_out, columns=['id', 'timestamp', 'group']) \
    .to_csv("data/ds_full_2_days.tsv", sep="\t", header=True, index=False)

## Full network (minimum community size 4)

In [79]:
# Collect [node, layer, community] rows for every community with at least four members,
# in increasing layer order.
ds_out = []
# NOTE(review): `infected` is filled but never read in this cell -- it looks like a
# leftover from the infection cell; kept so the variable's final value is unchanged.
infected = set([300])
# 'lambda (k, v): k' is a SyntaxError on Python 3; index the (layer, data) pair instead.
for l, data in sorted(layer_commu.items(), key=lambda item: item[0]):
    for c, nodes in data.items():
        if len(nodes) > 3:
            for n in nodes:
                ds_out.append([n, l, c])
                infected.add(n)

# Function-call form prints the same on Python 2 and is valid syntax on Python 3.
print(len(ds_out))

18224


In [80]:
# Write the rows as a tab-separated file. The separator is passed by keyword because
# positional arguments after the path are deprecated in current pandas.
pd.DataFrame(ds_out, columns=['id', 'timestamp', 'group']) \
    .to_csv("data/ds_full_min_size_4_2_days.tsv", sep="\t", header=True, index=False)