## Analysis

In [4]:
import os
from tex_funcs import set_tex_var as set_tex_var_import
from tex_funcs import df_to_tex as df_to_tex_import

LATEX_DAT_PATH = './data/harvey_user_location/latex/network-analysis.dat'
LATEX_TABLE_PATH = './data/harvey_user_location/latex/network-analysis-tabs.dat'

for file in [LATEX_DAT_PATH, LATEX_TABLE_PATH]:
    try:    
        os.remove(file)
    except:
        pass

def set_tex_var(name, val, dec=3):
    set_tex_var_import(name, val, LATEX_DAT_PATH, dec)

def df_to_tex(df, cols=None, label='UNNAMED', caption='UNDEFINED', 
    width=0.8, row_sep=1, col_widths=None, dec_dict=None,sum_row=False):
    df_to_tex_import(df, LATEX_TABLE_PATH, cols, label, caption, width, row_sep, col_widths, dec_dict,sum_row)

In [5]:
DIR = './data/harvey_user_location/'
with open(DIR + 'locality_dict_hrv.txt') as json_file:
    hrv_locs = json.load(json_file)
print('Unique Harvey Locs:', len(hrv_locs))

DIR = './data/florence_user_location/'
with open(DIR + 'flr_locs.txt') as json_file:
    flr_locs = json.load(json_file)
print('Unique Florence Locs:', len(flr_locs))

Unique Harvey Locs: 5056
Unique Florence Locs: 29507


In [6]:
set_tex_var('hrvUniqLocs', len(hrv_locs))
set_tex_var('flrUniqLocs', len(flr_locs))

### Harvey Network

In [66]:
import networkx as nx

DIR = './data/harvey_user_location/graph_objs/'
GEXF_FILE = 'network_data_HurricaneHarvey_v1.gexf'
G = nx.read_gexf(DIR + GEXF_FILE)
# Make undirected:
G2 = nx.Graph(G)

In [67]:
Gcc = max(nx.connected_components(G2), key=len)
G_lcc = G2.subgraph(Gcc)

lcc_nodes = '{:,.0f} ({:.2f})'.format(G_lcc.number_of_nodes(), G_lcc.number_of_nodes()/G.number_of_nodes())
lcc_edges = '{:,.0f} ({:.2f})'.format(G_lcc.number_of_edges(), G_lcc.number_of_edges()/G.number_of_edges())

hrv_g_row = {'Event':'Harvey', 'Nodes':'{:,.0f}'.format(G.number_of_nodes()), 'Edges':'{:,.0f}'.format(G.number_of_edges())}
hrv_lcc_row = {'Event':'Harvey$_{LCC}$', 'Nodes':lcc_nodes, 'Edges':lcc_edges}

Gcc = None
G_lcc = None

In [68]:
assort_coef = nx.attribute_assortativity_coefficient(G2, 'lcl_profile')
print('Harvey assortativity for undirected graph:', assort_coef)
assort_coef = nx.attribute_assortativity_coefficient(G, 'lcl_profile')
print('Harvey assortativity for directed graph:', assort_coef)

Harvey assortativity for undirected graph: 0.37909172708250594
Harvey assortativity for directed graph: 0.385031221987116


In [8]:
# Excluding non-geocoded nodes:
partition = [node for node, data in G2.nodes(data=True) if data.get('lcl_profile') != '']
G2 = G2.subgraph(partition)
print('{} nodes reduced to {}'.format(G.number_of_nodes(), G2.number_of_nodes()))

assort_coef_1 = nx.attribute_assortativity_coefficient(G2, 'lcl_profile')
print('Harvey assortativity for non-null profile location nodes, undirected:', assort_coef_1)

G3 = G.subgraph(partition)
assort_coef_2 = nx.attribute_assortativity_coefficient(G3, 'lcl_profile')
print('Harvey assortativity for non-null profile location nodes, directed:', assort_coef_2)

31932 nodes reduced to 21157
Harvey assortativity for non-null profile location nodes, undirected: 0.6165076429568769
Harvey assortativity for non-null profile location nodes, directed: 0.6265234987134032


In [9]:
set_tex_var('hrvAssortCoef', assort_coef_2, 3)

hrv_g_row = {'Event':'Harvey', 'Nodes':orig_nodes, 'Edges':orig_edges}
hrv_g2_row = {'Event':'Harvey$_{LCC}$', 'Nodes':G2.number_of_nodes(), 'Edges':G2.number_of_edges()}
hrv_assort_row = {'Event': 'Harvey', 'Nodes$_{total}$':orig_nodes, 'Nodes$_{filtered}$':G2.number_of_nodes(), 'Assortativity':assort_coef_2}

Assortativity based on the coded label is also calculated. This uses a subgraph of only nodes which have been coded.

In [10]:
G3 = G.copy()

nodes = [
    node
    for node, data
    in G3.nodes(data=True)
    if data.get("user_code") != ""
]
G3 = G3.subgraph(nodes)

attrs = {}
for node, data in G3.nodes(data=True):
    if data.get("user_code") != "Non-Witness":
        attrs[node] = {"user_code":True}
    else:
        attrs[node] = {"user_code":False}
nx.set_node_attributes(G3, attrs)

assort_coef_1 = nx.attribute_assortativity_coefficient(G3, 'user_code')
print('Harvey coded dataset assortativity, directed:', assort_coef)
assort_coef_2 = nx.attribute_assortativity_coefficient(nx.Graph(G3), 'user_code')
print('Harvey coded dataset assortativity, undirected:', assort_coef)


# Check LCC (of full graph)
Gcc = max(nx.connected_components(G2), key=len)
G4 = G3.subgraph(Gcc)

assort_coef_3 = nx.attribute_assortativity_coefficient(G4, 'user_code')
print('Harvey LCC coded dataset assortativity, directed:', assort_coef)
assort_coef_4 = nx.attribute_assortativity_coefficient(nx.Graph(G4), 'user_code')
print('Harvey LCC coded dataset assortativity, undirected:', assort_coef)

Harvey coded dataset assortativity, directed: 0.385031221987116
Harvey coded dataset assortativity, undirected: 0.385031221987116
Harvey LCC coded dataset assortativity, directed: 0.385031221987116
Harvey LCC coded dataset assortativity, undirected: 0.385031221987116


In [11]:
val = assort_coef_1

set_tex_var('hrvAssortCoefCoded', val, 3)

if val >= 0.5:
    interp = 'strong'
elif val >= 0.3:
    interp = 'moderate'
elif val > 0:
    interp = 'low'
else:
    raise Exception('No correlation, check how value is used in document.')
set_tex_var('hrvAssortCoefCodedInterpretation', interp)

### Florence Network
Now the same calculations are repeated for the Florence dataset (excluding the coded subgraph)

In [1]:
import networkx as nx

DIR = './data/florence_user_location/'
# LCC of users detected within first week of event:
GEXF_FILE = 'Florence_network_data_20210721.gexf'
G = nx.read_gexf(DIR + GEXF_FILE)
# Make undirected:
G2 = nx.Graph(G)

In [23]:
########## NOTE: vals hardcoded as file is already the LCC subgraph.
orig_nodes = 124558
orig_edges = 3428659
##########

# Gcc = max(nx.connected_components(G2), key=len)
# G_lcc = G2.subgraph(Gcc)

# lcc_nodes = '{:,.0f} ({:.3f})'.format(G_lcc.number_of_nodes(), G_lcc.number_of_nodes()/G.number_of_nodes())
# lcc_edges = '{:,.0f} ({:.3f})'.format(G_lcc.number_of_edges(), G_lcc.number_of_edges()/G.number_of_edges())

lcc_nodes = '{:,.0f} ({:.2f})'.format(G.number_of_nodes(), G.number_of_nodes()/orig_nodes)
lcc_edges = '{:,.0f} ({:.2f})'.format(G.number_of_edges(), G.number_of_edges()/orig_edges)

flr_g_row = {'Event':'Florence', 'Nodes':'{:,.0f}'.format(orig_nodes), 'Edges':'{:,.0f}'.format(orig_edges)}
flr_lcc_row = {'Event':'Florence$_{LCC}$', 'Nodes':lcc_nodes, 'Edges':lcc_edges}

# Gcc = None
# G_lcc = None

In [None]:
assort_coef = nx.attribute_assortativity_coefficient(G2, 'lcl_profile')
print('Florence assortativity for undirected graph:', assort_coef)
assort_coef = nx.attribute_assortativity_coefficient(G, 'lcl_profile')
print('Florence assortativity for directed graph:', assort_coef)

In [13]:
# Excluding non-geocoded nodes:
orig_nodes = G2.number_of_nodes()
partition = [node for node, data in G2.nodes(data=True) if data.get('lcl_profile') != None]
G2 = G2.subgraph(partition)
print('{} nodes reduced to {}'.format(orig_nodes, G2.number_of_nodes()))

assort_coef_1 = nx.attribute_assortativity_coefficient(G2, 'lcl_profile')
print('Florence assortativity for non-null profile location nodes, undirected:', assort_coef_1)

G3 = G.subgraph(partition)
assort_coef_2 = nx.attribute_assortativity_coefficient(G3, 'lcl_profile')
print('Florence assortativity for non-null profile location nodes, directed:', assort_coef_2)

106732 nodes reduced to 79807
Florence assortativity for non-null profile location nodes, undirected: 0.560285625242073
Florence assortativity for non-null profile location nodes, directed: 0.5417783804383417


In [14]:
set_tex_var('flrAssortCoef', assort_coef_2, 3)

flr_assort_row = {'Event': 'Florence', 'Nodes$_{total}$':orig_nodes, 'Nodes$_{filtered}$':G2.number_of_nodes(), 'Assortativity':assort_coef_2}

In [19]:
###########################
######### Florence network after exclusion box applied
###########################
import networkx as nx

DIR = './data/florence_user_location/'
# Network of users detected within first week of event:
GEXF_FILE = 'Florence_network_data_exclusion_20210729.gexf'
G = nx.read_gexf(DIR + GEXF_FILE)
# Make undirected:
G2 = nx.Graph(G)


# Gcc = max(nx.connected_components(G2), key=len)
# G_lcc = G2.subgraph(Gcc)

# lcc_nodes = '{:,.0f} ({:.2f})'.format(G_lcc.number_of_nodes(), G_lcc.number_of_nodes()/G.number_of_nodes())
# lcc_edges = '{:,.0f} ({:.2f})'.format(G_lcc.number_of_edges(), G_lcc.number_of_edges()/G.number_of_edges())

# flr_g_row = {'Event':'Florence', 'Nodes':'{:,.0f}'.format(G.number_of_nodes()), 'Edges':'{:,.0f}'.format(G.number_of_edges())}
# flr_lcc_row = {'Event':'Florence$_{LCC}$', 'Nodes':lcc_nodes, 'Edges':lcc_edges}

# Gcc = None
# G_lcc = None

assort_coef = nx.attribute_assortativity_coefficient(G2, 'lcl_prf')
print('Florence (exclusion) assortativity for undirected graph:', assort_coef)
assort_coef = nx.attribute_assortativity_coefficient(G, 'lcl_prf')
print('Florence (exclusion) assortativity for directed graph:', assort_coef)

# Excluding non-geocoded nodes:
orig_nodes = G2.number_of_nodes()
partition = [node for node, data in G2.nodes(data=True) if data.get('lcl_prf') != None]
G2 = G2.subgraph(partition)
print('{} nodes reduced to {}'.format(orig_nodes, G2.number_of_nodes()))

assort_coef_1 = nx.attribute_assortativity_coefficient(G2, 'lcl_prf')
print('Florence assortativity for non-null profile location nodes, undirected:', assort_coef_1)

G3 = G.subgraph(partition)
assort_coef_2 = nx.attribute_assortativity_coefficient(G3, 'lcl_prf')
print('Florence assortativity for non-null profile location nodes, directed:', assort_coef_2)

Florence (exclusion) assortativity for undirected graph: 0.2095158880510173
Florence (exclusion) assortativity for directed graph: 0.18983880059377306
100343 nodes reduced to 68897
Florence assortativity for non-null profile location nodes, undirected: 0.5364163672797281
Florence assortativity for non-null profile location nodes, directed: 0.5130369248901543


In [21]:
#Run again on LCC:
print(G.number_of_nodes(), G.number_of_edges())

G2 = nx.Graph(G)
Gcc = max(nx.connected_components(G2), key=len)
G2 = G2.subgraph(Gcc)
G = G.subgraph(Gcc)

print(G2.number_of_nodes(), G2.number_of_edges())

assort_coef = nx.attribute_assortativity_coefficient(G2, 'lcl_prf')
print('Florence (exclusion) assortativity for undirected graph:', assort_coef)
assort_coef = nx.attribute_assortativity_coefficient(G, 'lcl_prf')
print('Florence (exclusion) assortativity for directed graph:', assort_coef)

# Excluding non-geocoded nodes:
orig_nodes = G2.number_of_nodes()
partition = [node for node, data in G2.nodes(data=True) if data.get('lcl_prf') != None]
G2 = G2.subgraph(partition)
print('{} nodes reduced to {}'.format(orig_nodes, G2.number_of_nodes()))

assort_coef_1 = nx.attribute_assortativity_coefficient(G2, 'lcl_prf')
print('Florence assortativity for non-null profile location nodes, undirected:', assort_coef_1)

G3 = G.subgraph(partition)
assort_coef_2 = nx.attribute_assortativity_coefficient(G3, 'lcl_prf')
print('Florence assortativity for non-null profile location nodes, directed:', assort_coef_2)

100343 2523974
81274 1747636
Florence (exclusion) assortativity for undirected graph: 0.20943478414306793
Florence (exclusion) assortativity for directed graph: 0.18973403303935382
81274 nodes reduced to 60917
Florence assortativity for non-null profile location nodes, undirected: 0.5364150356059781
Florence assortativity for non-null profile location nodes, directed: 0.5130346105509414


In [None]:
############################
import networkx as nx
DIR = './data/florence_user_location/'

GEXF_FILE = 'Florence_network_data_exclusion_wronglclprf_20210729.gexf'
G = nx.read_gexf(DIR + GEXF_FILE)
print(G.number_of_nodes(), G.number_of_edges())

GEXF_FILE = 'Florence_network_data_exclusion_20210729.gexf'
G = nx.read_gexf(DIR + GEXF_FILE)
print(G.number_of_nodes(), G.number_of_edges())

### Exporting Tables

In [69]:
import pandas as pd

df = pd.DataFrame([hrv_g_row, hrv_lcc_row, {}, flr_g_row, flr_lcc_row])
df = df[['Event', 'Nodes', 'Edges']]
df = df.fillna('')
df

Unnamed: 0,Event,Nodes,Edges
0,Harvey,31932,101096
1,Harvey$_{LCC}$,"18,410 (0.58)","76,341 (0.76)"
2,,,
3,Florence,124558,3428659
4,Florence$_{LCC}$,"106,732 (0.86)","3,428,033 (1.00)"


In [70]:
df_to_tex(df, label='network-size', caption='Network Sizes', width=0.8, row_sep=0)

Writing df to file as tex...


In [15]:
df = pd.DataFrame([hrv_row, flr_row])
df = df[['Event', 'Nodes$_{total}$', 'Nodes$_{filtered}$', 'Assortativity']]
df

Unnamed: 0,Event,Nodes$_{total}$,Nodes$_{filtered}$,Assortativity
0,Harvey,31932,21157,0.626523
1,Florence,106732,79807,0.541778


In [16]:
dec_d = {'Nodes$_{total}$':0, 'Nodes$_{filtered}$':0, 'Assortativity':3}
#col_widths = 'X[-1,lm] X[-1,rm] X[-1,rm] X[-1,rm]'
col_widths = None
df_to_tex(df, label='assortativity', caption='Network Assortativity Coefficient', width=0.8, row_sep=0, dec_dict=dec_d, col_widths=col_widths)

Writing df to file as tex...


### Monte Carlo Testing
Testing the significance of findings above using monte carlo testing and configuration networks.

In [75]:
import networkx as nx

DIR = './data/harvey_user_location/graph_objs/'
GEXF_FILE = 'network_data_HurricaneHarvey_v1.gexf'
G = nx.read_gexf(DIR + GEXF_FILE)

nodes = [
    node
    for node, data
    in G.nodes(data=True)
    if data.get("lcl_profile") != ""
]
G = G.subgraph(nodes)

In [76]:
seq_lbl = [(G.degree(n[0]), n[1]['lcl_profile']) for n in G.nodes(data=True)]

seq = [x[0] for x in seq_lbl]

labels = [x[1] for x in seq_lbl]
index = [i for i in range(len(labels))]
attrs = dict(zip(index, labels))

In [77]:
assort_coef = nx.attribute_assortativity_coefficient(G, 'lcl_profile')
print('Original assort: {:.5f}\n'.format(assort_coef))

vals = []
for i in range(99):
    G2 = nx.configuration_model(seq, seed=i**i)
    nx.set_node_attributes(G2, attrs, name='lcl_profile')
    
    assort_coef = nx.attribute_assortativity_coefficient(G2, 'lcl_profile')
    vals.append(assort_coef)
    #print('Assort for configuration model {}: {:.5f}'.format(i, assort_coef))

print('Avg config assort: {:.4f}'.format(sum(vals)/len(vals)))

Original assort: 0.62652



Avg assort: -0.0005
