In [None]:
%matplotlib inline

import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from datetime import datetime
from graph_tool.all import * 

In [None]:
#### Uncomment this block when running the code for the first time (and comment it out again afterwards)
#####################################################################################
#G = load_graph_from_csv("data/data.txt", directed=True, eprop_types=['int', 'int'], eprop_names=['start', 'end'], string_vals=True, csv_options={'delimiter':' '})
#G.save('data/original.gt')

#graphs = []
#for name in os.listdir('data/snapshots/csv'):
#    if not name.endswith('.csv'):
#        continue
#        
#    g = load_graph_from_csv(os.path.join('data/snapshots/csv', name), directed=True, string_vals=True)
#    timestamp = g.new_graph_property('python::object')
#    timestamp[g] = datetime.fromtimestamp(int(name[:-4]))
#    g.graph_properties['timestamp'] = timestamp
#    graphs.append(g)
    
#    g.save(os.path.join('data/snapshots/gt', name + '.gt'))

In [None]:
# Full graph, as saved by the one-off conversion cell.
G = load_graph('data/original.gt')

# One graph per '.gt' file in the snapshot directory; other files are ignored.
# os.listdir gives no ordering guarantee, so the list is unsorted at this point.
graphs = [
    load_graph(os.path.join('data/snapshots/gt', name))
    for name in os.listdir('data/snapshots/gt')
    if name.endswith('.gt')
]

In [None]:
# Order the snapshots chronologically and keep the full graph G as the last entry.
# G is filtered out before sorting so that re-running this cell neither appends it
# a second time nor tries to read a 'timestamp' property from it.
# NOTE(review): G is assumed to carry no 'timestamp' graph property — confirm.
graphs = sorted(
    (g for g in graphs if g is not G),
    key=lambda g: g.graph_properties['timestamp'],
)
graphs.append(G)

In [None]:
# Store the vertex and edge counts of every graph as 'float' graph properties
# named 'vertices' and 'edges'.
for graph in graphs:
    sizes = (
        ('vertices', len(graph.get_vertices())),
        ('edges', len(graph.get_edges())),
    )
    for key, count in sizes:
        size_prop = graph.new_graph_property('float')
        size_prop[graph] = count
        graph.graph_properties[key] = size_prop

In [None]:
# For each degree kind, store the two values returned by vertex_average as the
# graph properties 'degree_<kind>' and 'degree_<kind>_std'.
for graph in graphs:
    for kind in ('in', 'out', 'total'):
        avg_prop = graph.new_graph_property('float')
        std_prop = graph.new_graph_property('float')
        avg_prop[graph], std_prop[graph] = vertex_average(graph, kind)
        graph.graph_properties['degree_{0}'.format(kind)] = avg_prop
        graph.graph_properties['degree_{0}_std'.format(kind)] = std_prop

In [None]:
# Store the pair returned by global_clustering as the graph properties
# 'clustering' and 'clustering_std'.
for graph in graphs:
    coefficient, deviation = global_clustering(graph)
    for key, value in (('clustering', coefficient), ('clustering_std', deviation)):
        prop = graph.new_graph_property('float')
        prop[graph] = value
        graph.graph_properties[key] = prop

In [None]:
# Attach the result of pagerank() to every graph as the 'pagerank' vertex property.
for graph in graphs:
    scores = pagerank(graph)
    graph.vertex_properties['pagerank'] = scores

In [None]:
# Store per-vertex in/out/total degrees as 'int' vertex properties.
# The degree arrays are written in one vectorised assignment keyed by the real
# vertex indices in `vs`, instead of a Python loop that used the enumerate
# position as the vertex index and iterated a property map element by element.
for g in graphs:
    vs = g.get_vertices()
    in_degrees = g.get_in_degrees(vs)
    out_degrees = g.get_out_degrees(vs)

    per_vertex = (
        ('degree_in', in_degrees),
        ('degree_out', out_degrees),
        ('degree_total', in_degrees + out_degrees),
    )
    for prop_name, values in per_vertex:
        prop = g.new_vertex_property('int')
        # `.a` exposes the property map's underlying array, indexed by vertex index.
        prop.a[vs] = values
        g.vertex_properties[prop_name] = prop

In [None]:
# TODO
# Modularity of the partition found by fitting a stochastic block model.
# NOTE(review): minimize_blockmodel_dl is not seeded here, so results may vary
# between runs — confirm whether that matters for this analysis.
for g in graphs:
    # minimize_blockmodel_dl returns a block-state object; modularity() needs the
    # vertex -> block membership property map, which get_blocks() provides.
    state = minimize_blockmodel_dl(g)
    b = state.get_blocks()

    modl = g.new_graph_property('float')
    modl[g] = modularity(g, b)
    g.graph_properties['modularity'] = modl

In [None]:
# Run hits() once per graph: the first returned value becomes the 'hits_eigen'
# graph property, the second and third become the 'hits_authority' and
# 'hits_hub' vertex properties.
for graph in graphs:
    eigenvalue, authority, hub = hits(graph)

    eigen_prop = graph.new_graph_property('float')
    eigen_prop[graph] = eigenvalue

    graph.vertex_properties['hits_authority'] = authority
    graph.vertex_properties['hits_hub'] = hub
    graph.graph_properties['hits_eigen'] = eigen_prop

In [None]:
# Keep only the first element returned by pseudo_diameter() and store it as the
# 'diameter' graph property; the second element is discarded.
for graph in graphs:
    distance = pseudo_diameter(graph)[0]
    diameter_prop = graph.new_graph_property('float')
    diameter_prop[graph] = distance
    graph.graph_properties['diameter'] = diameter_prop

In [None]:
# Attach the result of label_largest_component() to every graph as the
# 'largest_connected_component' vertex property.
# NOTE(review): the graphs are loaded as directed; confirm whether the label is
# meant to follow strongly or weakly connected components.
for graph in graphs:
    component_label = label_largest_component(graph)
    graph.vertex_properties['largest_connected_component'] = component_label

In [None]:
#for (i,g) in enumerate(graphs):
    #print(i)
   # g.vertex_properties['shortest_distances'] = shortest_distance(g)

In [None]:
#for g in graphs:
    # TODO: similarity(g1, g2) or vertex_similarity(g1, g2) ??

In [None]:
# One column per graph property seen on any graph, one row per graph.
# Keys keep first-seen order, which fixes the column order of the frame.
d = {p: [] for g in graphs for (p, _) in g.graph_properties.items()}

for g in graphs:
    graph_props = g.graph_properties
    for p, column in d.items():
        # Graphs lacking a property (e.g. no 'timestamp') get NaN in that column.
        column.append(graph_props[p] if p in graph_props else np.nan)

df = pd.DataFrame(d)
df = df.sort_values('timestamp')
df = df.reset_index(drop=True)
display(df)

In [None]:
# Build one row per vertex name and one column per (vertex property, graph index)
# pair, named '<property>_<i>' where i is the position of the graph in `graphs`.
d = {}
# Seed the rows from the last graph in `graphs` so that they come first, in its order.
for name in graphs[-1].vertex_properties['name'].get_2d_array([0])[0]:
    d[name] = {}

for (i, g) in enumerate(graphs):
    print(i)  # progress indicator: index of the graph being processed
    # Hoisted out of the per-vertex loop: neither changes while iterating.
    names = g.vertex_properties['name']
    props = list(g.vertex_properties.items())
    for j in g.get_vertices():
        # setdefault avoids a KeyError for a vertex that is missing from the last graph.
        row = d.setdefault(names[j], {})
        for (p, v) in props:
            row['{0}_{1}'.format(p, i)] = v[j]

# The original passed the undefined name `dx` here, which raised NameError.
df = pd.DataFrame.from_dict(d, orient='index')
display(df)

In [None]:
# Columns are named '<property>_<i>' with i the position of the graph in `graphs`;
# the full graph is the last entry, so its suffix is derived from len(graphs)
# instead of being hard-coded as '_15' (which is only right for 16 graphs).
full_suffix = '_{0}'.format(len(graphs) - 1)
cols = df.columns
# Single rename call with a complete mapping instead of one rename per column.
df = df.rename(columns={
    c: c[:-len(full_suffix)] + '_full'
    for c in cols
    if c.endswith(full_suffix)
})

In [None]:
# Add a 'rank_<column>' column for every existing column: 1 for the largest
# value, NaN values ranked last.
df = df.sort_index()
# Skip columns that are already ranks so that re-running the cell does not
# create 'rank_rank_*' columns.
cols = [c for c in df.columns if not c.startswith('rank_')]
for c in cols:
    display(c)
    # Sort only the column being ranked (not the whole frame). The stable sort
    # breaks ties by the current, sorted index order.
    ordered = df[c].sort_values(ascending=False, kind='mergesort').index
    # Assignment aligns on the index labels, so this no longer depends on the
    # index being named 'id' (nothing in the notebook sets that name).
    df["rank_{0}".format(c)] = pd.Series(np.arange(1, len(ordered) + 1), index=ordered)

In [None]:
# Persist the per-vertex table so that it can be reloaded with pd.read_pickle
# without recomputing the graph metrics.
# NOTE(review): pandas presumably infers xz compression from the '.xz' suffix — confirm.
df.to_pickle('data/df.xz')

In [None]:
### Start execution from here to avoid recomputations
# Reloads the table from 'data/df.xz' instead of rebuilding it from the graphs.
# NOTE(review): unpickling can execute arbitrary code; only load a file that
# this notebook produced itself.
df = pd.read_pickle('data/df.xz')

In [None]:
# Every column whose name mentions 'pagerank', rows ordered by 'rank_pagerank_full'.
pr = df.filter(regex='.*pagerank.*').sort_values(by='rank_pagerank_full')

In [None]:
# First 20 rows of the columns whose name starts with 'pagerank'.
display(pr.filter(regex='^pagerank.*').head(20))

In [None]:
# First 20 rows of the columns whose name starts with 'rank_pagerank'.
display(pr.filter(regex='^rank_pagerank.*').head(20))

In [None]:
# Every column whose name mentions 'hits', rows ordered by 'rank_hits_authority_full'.
hit = df.filter(regex='.*hits.*').sort_values(by='rank_hits_authority_full')

In [None]:
# First 20 rows of the columns whose name starts with 'hits_authority'.
display(hit.filter(regex='^hits_authority.*').head(20))

In [None]:
# First 20 rows of the columns whose name starts with 'rank_hits_authority'.
display(hit.filter(regex='^rank_hits_authority.*').head(20))

In [None]:
# Every column whose name mentions 'hits', rows ordered by 'rank_hits_hub_full'.
hit = df.filter(regex='.*hits.*').sort_values(by='rank_hits_hub_full')

In [None]:
# First 20 rows of the columns whose name starts with 'hits_hub'.
display(hit.filter(regex='^hits_hub.*').head(20))

In [None]:
# First 20 rows of the columns whose name starts with 'rank_hits_hub'.
display(hit.filter(regex='^rank_hits_hub.*').head(20))

In [None]:
# Every column whose name mentions 'largest_connected_component', rows ordered
# by 'rank_largest_connected_component_full'.
lcc = (
    df.filter(regex='.*largest_connected_component.*')
    .sort_values(by='rank_largest_connected_component_full')
)

In [None]:
# First 30 rows of the columns whose name starts with 'largest_connected_component'.
display(lcc.filter(regex='^largest_connected_component.*').head(30))