In [20]:
import sys
sys.path.append('..')
import quarantines as Q
import quarantines_mongo as qm
import graph_generators as gg 
import networkx as nx 
import networkit as nk
import pymongo
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable
from tabulate import tabulate
from scipy import stats
from pprint import pprint
import random
sns.set()

In [74]:
# Part 0: Collect data by name and print all names...

def gather_data():
    data_file = open('../quarantine_by_props_1016A.pkl', 'rb')
    data = pickle.load(data_file)
    data_by_name = {}
    for datum in data:
        name = datum.get('name')
        if name not in data_by_name:
            data_by_name[name] = [] 
        data_by_name[name].append(datum)
    return data_by_name

def group_by_qprop(series):
    gather = {} 
    for datum in series:
        qprop = datum['quarantine_props']
        if qprop not in gather:
            gather[qprop] = [] 
        gather[qprop].append(datum)
        
    # and now modify to only collect final_R, max_I
    output = {}
    for qprop, datalist in gather.items():
        final_rs = [_['final_R'] for _ in datalist]
        max_is = [_['max_I'] for _ in datalist]
        output[qprop] = {'final_R': final_rs, 
                         'max_I': max_is}
    return output

def gather_by_name(data, name, i_or_r):
    # Gather triples of [(qprop, mean, std), ...]
    assert i_or_r in ['I', 'R']
    series = group_by_qprop(data[name])
    output = []
    def getter(doc):
        if i_or_r == 'I':
            return doc['max_I']
        else:
            return doc['final_R']
        
        
    for k, v in series.items():
        mean = np.mean(getter(v))
        std = np.std(getter(v))
        output.append((k, mean, std))
        
    return sorted(output, key=lambda trip: trip[0])
    

def size_lookup_by_name(name):
    if name in SYNTHETICS:
        return 10 ** 4 
    elif name.startswith('fb.'):
        return len(gg.load_gemsec_fb(name.split('.')[1]))
    elif name.startswith('deezer.'):
        return len(gg.load_gemsec_deezer(name.split('.')[1]))
    elif name.startswith('arxiv.'):
        return len(gg.load_arxiv_collab(name.split('.')[1]))
    elif name.startswith('hiv') or name.startswith('hs'):
        G = recreate_by_name(name)
        return len(G)
    
    
def recreate_by_name(name):
    # Don't worry too much about actual random seed, just gather parameters
    if name.startswith('ba'):
        return gg.ba_graph(10 ** 4, int(name[2:]))
    elif name.startswith('plc'):
        m, p = name[3:].split('.')
        return gg.plc_graph(10 **4, int(m), float(p) / 100)
    elif name.startswith('rw'):
        assert name == 'rw.91.94'
        return gg.random_walk_graph(10 ** 4, 0.91, 0.94)
    elif name.startswith('nn'):
        assert name == 'nn.886'
        return gg.nearestNeighbor_mod(10 **4, 0.88,6)
    elif name.startswith('ws'):
        assert name == 'ws10.05'
        return gg.watts_strogatz_graph(10 ** 4, 10, 0.05)
    elif name.startswith('fb.'):
        return gg.load_gemsec_fb(name.split('.')[1])
    elif name.startswith('deezer.'):
        return gg.load_gemsec_deezer(name.split('.')[1])
    elif name.startswith('arxiv.'):
        return gg.load_arxiv_collab(name.split('.')[1])
    elif name.startswith('hs'):
        params = name[2:].split('_')
        return gg.load_highschool(float(params[0]), int(params[1]))
    elif name.startswith('hiv'):
        param = int(name[3:].split('R')[0])
        return gg.load_hiv(param)
    else:
        raise Exception("wut")
    return
    
    
def parse_data_name(name):
    prefixes = ['ba', 'plc', 'nn', 'rw', 'ws' ]
    match = None
    for prefix in prefixes:
        if name.startswith(prefix):
            match = prefix 
    if match is None or len(name.split('_')) == 1:
        return recreate_by_name(name)
    params = name.split('_')
    R = params[-1]
    if name.startswith('ba'):
        return gg.ba_graph(10 ** 4, int(params[1]))
    elif name.startswith('plc'):
        return gg.plc_graph(10 **4, int(params[1]), float(params[2]))
    elif name.startswith('rw'):
        return gg.random_walk_graph(10 ** 4, float(params[1]), float(params[2]))
    elif name.startswith('nn'):
        return gg.nearestNeighbor_mod(10 **4, float(params[1]),int(params[2]))
    elif name.startswith('ws'):
        return gg.watts_strogatz_graph(10 ** 4, int(params[1]), float(params[2]))

    
def get_r_by_name(name):
    prefixes = ['ba', 'plc', 'nn', 'rw', 'ws' ]
    match = None
    for prefix in prefixes:
        if name.startswith(prefix):
            match = prefix 
    if match is None or len(name.split('_')) == 1:
        return 1
    params = name.split('_')
    return float(params[-1])



def get_minR_graph(data, name):
    # Gathers the graph (after minR optimal quarantine) 
    # 1) Get minR prop:
    minprop = min(gather_by_name(data, name, 'R'), key=lambda trip: trip[1])[0]
    
    
    # 2) Recreate graph and rerun 
    R = get_r_by_name(name)
    G = parse_data_name(name)
    outG = Q.run_until_prop_IR(G, R, 1, 10 / len(G), float('inf'), minprop)[0]
    return outG


#fig, ax = plt.subplots(figsize=(10,10)) #<---- general axis maker
def plot_vs(data, names, irs, ax=None):
    if ax is None:
        fig, ax = plt.subplots(figsize=(10,10))
    if not isinstance(names, list):
        names = [names]
    def plot_single_v(name, ir, ax=ax):
        try: 
            size = size_lookup[name]
        except:   
            size = 10 ** 4 # size_lookup[name]
        i_trips = gather_by_name(data, name, 'I')
        r_trips = gather_by_name(data, name, 'R')
        qprops = [_[0] for _ in i_trips]
        Is = [_[1] / size for _ in i_trips]
        qprops_r = [_[0] for _ in r_trips]
        Rs = [_[1] / size for _ in r_trips]
        if 'I' in ir:
            ax.plot(qprops, Is, label=name)
        if 'R' in ir:
            ax.plot(qprops, Rs, label=name)
    
    
    for name in names:
        plot_single_v(name, irs, ax=ax)
    ax.legend()
    

def degree_hist(G):
    pairs = {}
    for _, d in G.degree():
        pairs[d] = pairs.get(d, 0) + 1
    return pairs

def return_cbin(G):
    # Returns best-fit powerlaw exponent by using cumulative binning 
    items = sorted(degree_hist(G).items(), key=lambda p:-p[0])
    cdf = []
    runsum = 0
    for deg, num in items:
        runsum += num 
        cdf.append((deg, runsum))
    xform = [(np.log(_[0]), np.log(_[1] / len(G))) for _ in cdf]
    return xform 



def degree_hist(G):
    pairs = {}
    for _, d in G.degree():
        pairs[d] = pairs.get(d, 0) + 1
    return pairs


def get_powerlaw_exponent_cbin(G):
    # Returns best-fit powerlaw exponent by using cumulative binning 
    items = sorted(degree_hist(G).items(), key=lambda p:-p[0])
    cdf = []
    runsum = 0
    for deg, num in items:
        runsum += num 
        cdf.append((deg, runsum))
    xform = [(np.log(_[0]), np.log(_[1] / len(G))) for _ in cdf]
    #plt.scatter(*zip(*xform))
    slope, intercept, r_value, p_value, std_err = stats.linregress(*zip(*xform))
    return 1 -slope, r_value

        
def collect_graph_data_by_name(name, data=data):
    # Returns graph [name, nodes, average degree, cluster coefficient, powerlaw exponent, best_iTup, bestrTup] 
    G = recreate_by_name(name)
    nodes = len(G)
    avg_deg = sum(dict(G.degree()).values()) / len(G)
    cc = nx.average_clustering(G)
    powerlaw = get_powerlaw_exponent_cbin(G)
    no_q = gather_by_name(data, name, 'R')[0][1] /len(G)
    best_i = min(gather_by_name(data, name, 'I'), key=lambda tri: tri[1])
    best_i_prop = best_i[0]
    best_i_val = best_i[1] / nodes
    best_r = min(gather_by_name(data, name, 'R'), key=lambda tri: tri[1])
    best_r_prop = best_r[0]
    best_r_val = best_r[1] / nodes
    return [name, nodes, avg_deg, cc, powerlaw, no_q, best_r_prop, best_r_val]


def tabulate_by_name(names):
    headers=['name', 'nodes', 'avg_deg', 'cc', 'powerlaw', 'no_Q', 'best_r_prop', 'best_r_val']
    graph_data = [collect_graph_data_by_name(_) for _ in names]
    print(tabulate(graph_data, headers=headers, floatfmt='.2f'))
    return graph_data

In [96]:
def cluster(G):
    return nx.average_clustering(G)

def avg_deg(G):
    return G.number_of_edges()  / len(G)

def get_largest_cc(G):
    ccs = nx.connected_components(G)
    maxlen = 0
    max_ccset = None 
    for conn in ccs:
        if len(conn) > maxlen:
            maxlen = len(conn)
            max_ccset = conn
    return max_ccset

def avg_shortest_path(G, samples=10 * 1000):
    # Takes weighted average of connected component shortest paths? 
    # Randomly samples pairs of nodes in the largest connected component and computes average path length 
    ccset = get_largest_cc(G)
    sampleset = random.choices(list(ccset), k=samples * 2)
    len_sum = 0 
    count = 0
    for i in range(0, len(sampleset), 2):
        len_sum += nx.shortest_path_length(G, source=sampleset[i], target=sampleset[i+1])
        count += 1
    return len_sum / count
        
    
def get_change_summary(data, name):
    # Gets change of {cc, avg_deg, shortest path, powerlaw exponent}
    og_graph = parse_data_name(name)
    min_r_graph = get_minR_graph(data, name)
    
    og_cc = cluster(og_graph)
    og_avg = avg_deg(og_graph)
    og_sp = avg_shortest_path(og_graph)
    og_powerlaw = get_powerlaw_exponent_cbin(og_graph)
    
        
    cc = cluster(min_r_graph)
    avg = avg_deg(min_r_graph)
    sp = avg_shortest_path(min_r_graph)
    powerlaw = get_powerlaw_exponent_cbin(min_r_graph)
    
    return [name, og_cc, cc, og_avg, avg, og_sp, sp, og_powerlaw, powerlaw]
    
def tabulate_struct_change(rows):
    headers=['name', 'OGCC', 'CC', 'OGavgD', 'avgD', 'OGsp', 'sp', 'OGpl', 'pl']
    rows = sorted(rows, key=lambda p: p[0])
    print(tabulate(rows, headers=headers, floatfmt='.3f'))
    return rows

In [86]:
tabulate_struct_change([get_change_summary(data, NAME)])

  OGCC     CC    OGavgD    avgD    OGsp     sp  OGpl                                      pl
------  -----  --------  ------  ------  -----  ----------------------------------------  ----------
 0.060  0.042     9.981   2.598   2.969  5.089  (2.748073882813855, -0.9950026663030984)  (nan, nan)




[[0.0597557169546474,
  0.04238377760844903,
  9.9809,
  2.598333333333333,
  2.9692,
  5.0889,
  (2.748073882813855, -0.9950026663030984),
  (nan, nan)]]

In [98]:
quarters = [_ for _ in data.keys() if _.endswith('0.5')]
quarter_rows = [get_change_summary(data, _) for _ in quarters]




In [95]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))


In [104]:
REALS = [_ for _ in data.keys() if _.startswith('fb') or _.startswith('deezer') or _.startswith('arxiv')]

In [105]:
REALS

['fb.artist',
 'fb.athletes',
 'fb.company',
 'fb.new_sites',
 'fb.government',
 'deezer.RO',
 'deezer.HR',
 'deezer.HU',
 'arxiv.AstroPh',
 'arxiv.CondMat',
 'arxiv.HepPh',
 'arxiv.GrQc',
 'arxiv.HepTh']

In [111]:
real_rows = [get_change_summary(data, _) for _ in REALS]



In [112]:
tabulate_struct_change(real_rows)

name             OGCC     CC    OGavgD    avgD    OGsp      sp  OGpl                                       pl
-------------  ------  -----  --------  ------  ------  ------  -----------------------------------------  ----------
arxiv.AstroPh   0.631  0.435    10.553   1.582   4.202  11.083  (2.8341885079219566, -0.9250683427470897)  (nan, nan)
arxiv.CondMat   0.633  0.460     4.042   1.411   5.372  12.087  (3.105684718276472, -0.9633438670430492)   (nan, nan)
arxiv.GrQc      0.530  0.431     2.765   1.434   6.041   9.801  (2.8810900986395005, -0.9294668776290894)  (nan, nan)
arxiv.HepPh     0.611  0.381     9.870   1.402   4.676  12.138  (2.3100988004575402, -0.9075199300058858)  (nan, nan)
arxiv.HepTh     0.471  0.351     2.632   1.177   5.938  11.684  (3.276478575985644, -0.9175289535288472)   (nan, nan)
deezer.HR       0.136  0.048     9.129   1.032   4.500  10.160  (3.3908677535409915, -0.931590354383744)   (nan, nan)
deezer.HU       0.116  0.059     4.688   1.022   5.352  11.873  

[['arxiv.AstroPh',
  0.6305932411707943,
  0.43537441531421367,
  10.553483912209675,
  1.581805142915685,
  4.2017,
  11.0827,
  (2.8341885079219566, -0.9250683427470897),
  (nan, nan)],
 ['arxiv.CondMat',
  0.6334130270820665,
  0.460008176294107,
  4.041715298491333,
  1.4108282440129005,
  5.3722,
  12.0873,
  (3.105684718276472, -0.9633438670430492),
  (nan, nan)],
 ['arxiv.GrQc',
  0.529635811052136,
  0.4310050519444978,
  2.7653567340709655,
  1.4344422700587085,
  6.0414,
  9.8011,
  (2.8810900986395005, -0.9294668776290894),
  (nan, nan)],
 ['arxiv.HepPh',
  0.6114825236824339,
  0.3813557671569736,
  9.870169886742172,
  1.4016369902354968,
  4.6755,
  12.1382,
  (2.3100988004575402, -0.9075199300058858),
  (nan, nan)],
 ['arxiv.HepTh',
  0.4714390529669332,
  0.35110759950084963,
  2.6321757618710135,
  1.1771093176815848,
  5.9375,
  11.6842,
  (3.276478575985644, -0.9175289535288472),
  (nan, nan)],
 ['deezer.HR',
  0.13647174362435588,
  0.04846281117906052,
  9.12877691

In [99]:
tabulate_struct_change(quarter_rows)

name                OGCC     CC    OGavgD    avgD    OGsp      sp  OGpl                                       pl
----------------  ------  -----  --------  ------  ------  ------  -----------------------------------------  ----------
ba_10_0.5          0.011  0.001     9.990   1.745   3.063   6.209  (2.9678808406844963, -0.9957199022028757)  (nan, nan)
ba_5_0.5           0.007  0.001     4.997   1.430   3.657   7.493  (2.9195720961468057, -0.9963688228495106)  (nan, nan)
nn_0.88_6_0.5      0.125  0.080    13.023   1.163   3.400   7.885  (2.64825693807496, -0.8766132345933745)    (nan, nan)
plc_10_0.25_0.5    0.059  0.035     9.981   1.476   2.966   6.984  (2.7559722136942604, -0.996157286774496)   (nan, nan)
plc_5_0.5_0.5      0.177  0.122     4.995   1.566   3.554   7.458  (2.7031718951762675, -0.9970077720626407)  (nan, nan)
rw_0.91_0.94_0.5   0.290  0.190     9.562   1.375   3.468   8.239  (2.710887172150499, -0.9491624082709955)   (nan, nan)
ws_10_0.05_0.5     0.573  0.605     5.00

[['ba_10_0.5',
  0.01089538999656295,
  0.0013538751614662432,
  9.99,
  1.744920287589872,
  3.0632,
  6.2087,
  (2.9678808406844963, -0.9957199022028757),
  (nan, nan)],
 ['ba_5_0.5',
  0.007057165436221439,
  0.0005926602332852334,
  4.9975,
  1.4304166666666667,
  3.6574,
  7.4926,
  (2.9195720961468057, -0.9963688228495106),
  (nan, nan)],
 ['nn_0.88_6_0.5',
  0.12481417792396036,
  0.07995491853825179,
  13.0225,
  1.1626666666666667,
  3.4002,
  7.8848,
  (2.64825693807496, -0.8766132345933745),
  (nan, nan)],
 ['plc_10_0.25_0.5',
  0.05949136632417576,
  0.03473973696554346,
  9.9807,
  1.476451612903226,
  2.9657,
  6.9842,
  (2.7559722136942604, -0.996157286774496),
  (nan, nan)],
 ['plc_5_0.5_0.5',
  0.17659244664030213,
  0.12151047620305208,
  4.9953,
  1.565576923076923,
  3.5543,
  7.4584,
  (2.7031718951762675, -0.9970077720626407),
  (nan, nan)],
 ['rw_0.91_0.94_0.5',
  0.29012148727441683,
  0.18983512889071766,
  9.5616,
  1.3754,
  3.4681,
  8.2393,
  (2.71088717215

In [4]:
data = gather_data()
SYNTHETICS = ['ba10', 'ba5', 'plc10.25', 'plc5.5', 'rw.91.94', 'nn.886', 'ws10.05',]
EPIS = [_ for _ in data if (_.startswith('hs') or _.startswith('hiv'))]
REALS = [_ for _ in data if _ not in SYNTHETICS]
#size_lookup = {k: size_lookup_by_name(k) for k in data}

In [38]:
sorted(data.keys())

['arxiv.AstroPh',
 'arxiv.CondMat',
 'arxiv.GrQc',
 'arxiv.HepPh',
 'arxiv.HepTh',
 'ba10',
 'ba5',
 'ba_10_0.03125',
 'ba_10_0.0625',
 'ba_10_0.125',
 'ba_10_0.25',
 'ba_10_0.5',
 'ba_10_1',
 'ba_10_2',
 'ba_10_4',
 'ba_10_8',
 'ba_5_0.03125',
 'ba_5_0.0625',
 'ba_5_0.125',
 'ba_5_0.25',
 'ba_5_0.5',
 'ba_5_1',
 'ba_5_2',
 'ba_5_4',
 'ba_5_8',
 'deezer.HR',
 'deezer.HU',
 'deezer.RO',
 'fb.artist',
 'fb.athletes',
 'fb.company',
 'fb.government',
 'fb.new_sites',
 'hiv50R1',
 'hiv50R3',
 'hiv50R5',
 'hs15_10',
 'hs30_10',
 'hs5_10',
 'hs60_10',
 'nn.886',
 'nn_0.88_6_0.03125',
 'nn_0.88_6_0.0625',
 'nn_0.88_6_0.125',
 'nn_0.88_6_0.25',
 'nn_0.88_6_0.5',
 'nn_0.88_6_1',
 'nn_0.88_6_2',
 'nn_0.88_6_4',
 'nn_0.88_6_8',
 'plc10.25',
 'plc5.5',
 'plc_10_0.25_0.03125',
 'plc_10_0.25_0.0625',
 'plc_10_0.25_0.125',
 'plc_10_0.25_0.25',
 'plc_10_0.25_0.5',
 'plc_10_0.25_1',
 'plc_10_0.25_2',
 'plc_10_0.25_4',
 'plc_10_0.25_8',
 'plc_5_0.5_0.03125',
 'plc_5_0.5_0.0625',
 'plc_5_0.5_0.125',
 'pl

In [41]:
plc = parse_data_name('plc_10_0.25_0.25')
NAME = 'plc_10_0.25_0.25'
NAME2 = 'rw_0.91_0.94_0.25'

In [72]:
out_plc = get_minR_graph(data, NAME)


def 

In [73]:
out_rw = get_minR_graph(data, NAME2)
rw = 