In [8]:
import networkx as nx
import pandas as pd
from graph_tool.all import *

from pathlib import Path
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import re
import matplotlib
import numpy as np

import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Seaborn "white" theme, keeping tick marks on the bottom and left axes.
sns.set_style('white', rc={'xtick.bottom': True, 'ytick.left': True})

# Global matplotlib defaults: 7 pt Helvetica text, Type 42 fonts in PDF
# output, no LaTeX rendering, ASCII minus signs.
_BASE_FONT_SIZE = 7
matplotlib.rcParams.update({
    **{
        size_key: _BASE_FONT_SIZE
        for size_key in (
            "axes.labelsize",
            "xtick.labelsize",
            "ytick.labelsize",
            "legend.fontsize",
            "font.size",
        )
    },
    "font.family": 'Helvetica',
    "pdf.fonttype": 42,
    "text.usetex": 'false',
    "axes.unicode_minus": False,
})

# Short, thin tick marks on both axes, for major and minor ticks alike.
for _axis in ("xtick", "ytick"):
    for _level in ("major", "minor"):
        matplotlib.rcParams[f"{_axis}.{_level}.size"] = 2
        matplotlib.rcParams[f"{_axis}.{_level}.width"] = 0.5

import sys
sys.path.append('/data/spencer/manuscript/src/')

from utils_tiramisu import *

`../cache/pdfs_word_excel_powerpoint_010924.parquet` is a pandas DataFrame containing the combined text of the scanned/electronic PDFs and the MS Office documents. It has two columns: `text`, the raw text, and `nodeID`, the node ID of either a single-page PDF (split from its source PDF) or an MS Office document.

In [None]:
# Combined text table: one row per nodeID (a split PDF page or an MS document).
together = pd.read_parquet(
    "../cache/pdfs_word_excel_powerpoint_010924.parquet"
)

# Split PDF pages: page nodeID -> page number, owning Document node, source path.
pdf_page_index = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:PART_OF] -> (d:Document) 
where e.fileExtension = 'pdf' 
return c.nodeID as nodeID, c.page as page, d.nodeID as documentID, e.originalPath as path
""")

# Split PDF pages that were converted to PNG, with the path of the source PDF.
all_pdfs = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:CONVERT_TO] -> (f:File) 
where e.fileExtension = 'pdf' and f.fileExtension = 'png' 
return c.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

# Word and PowerPoint files.
all_ms = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['doc', 'docx', 'ppt', 'pptx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

# Excel files; they are removed from the text table below.
all_excel = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) 
where e.fileExtension in ['xls', 'xlsx'] 
return e.nodeID as nodeID, e.originalPath as path, e.fileExtension as fileExtension
""")

folder_structure = pd.concat([all_pdfs, all_ms])

_pages_by_node = pdf_page_index.set_index('nodeID')
map_nodeID_to_page = _pages_by_node['page'].to_dict()
map_nodeID_to_docID = _pages_by_node['documentID'].to_dict()

# Rows without a Document node keep their own nodeID as docID and get page 0.
together['docID'] = together['nodeID'].apply(
    lambda node_id: map_nodeID_to_docID.get(node_id, node_id)
)
together['page'] = together['nodeID'].apply(
    lambda node_id: map_nodeID_to_page.get(node_id, 0)
)
together = pd.merge(together, folder_structure, left_on='nodeID', right_on='nodeID')

# .copy() so the column assignment below does not write into a slice.
together = together.loc[~together['nodeID'].isin(all_excel['nodeID'])].copy()

# Trailing space so that words on adjacent pages do not fuse when concatenated.
together['text'] = together['text'].apply(lambda page_text: page_text + " ")

# One row per document: page texts concatenated in page order. The path is
# taken from the first page; the previous `list(set(...))[0]` picked an
# arbitrary element whose choice could change between kernel sessions.
together = (
    together.sort_values(['docID', 'page'])
    .groupby('docID')
    .agg({"text": lambda pages: "".join(pages), "path": "first"})
    .reset_index()
)
together['text'] = together['text'].str.lower()

# Project name -> folders or single files that belong to it. Entries are
# joined onto the "/tiramisu/" prefix when documents are matched against them.
project_folders = {
    "ENCODE": [
        "ENCODE/Participants",
        "ENCODE/MS",
        "ENCODE/SAP",
        "ENCODE/OC Information",
        "ENCODE/PressRelease",
        "ENCODE/ENCODE_2004",
        "ENCODE/publications",
        "ENCODE/Drafts",
        "ENCODE/Data Standards",
        "ENCODE/encode_align_sop.pdf",
        "ENCODE/ENCODE-PublicationGuidelines 3-29-06.doc",
        "ENCODE/Minutes",
        "ENCODE/CACR",
        "ENCODE/SAP call minutes 3-15-06.doc",
        "ENCODE/Data release",
        "ENCODE/Abstracts",
        "ENCODE/Presentations",
        "ENCODE/Scaling",
        "ENCODE/Meeting",
        "ENCODE/MS2",
        "ENCODE/WorkingGroups",
        "ENCODE/Documents",
        "ENCODE/criteria",
        "ENCODE/Web_site",
        "ENCODE/Hox.doc",
        "ENCODE/Policy",
    ],
    "modENCODE": ["ENCODE/modENCODE", "modENCODE"],
    "HapMap": ['Haplotype Map Project'],
    "HGP": [
        "Large scale sequence/human sequence",
        "Celera",
        "HGP History Summer 2011",
        "sequencingrampupfiles",
    ],
    "sequence": ["Large scale sequence/Box026-010.pdf", "Sequence target files"],
    "ELSI": ["ELSI"],
}

# Resolve each project's folder/file entries to Paths once, not once per row.
project_roots = {
    project: [Path("/tiramisu/" + subfolder) for subfolder in subfolders]
    for project, subfolders in project_folders.items()
}

# One (belongs?, project, docID, path) record per document/project pair.
list_of_entities = []
for doc_id, doc_path in tqdm(
    zip(together['docID'], together['path']), total=together.shape[0]
):
    resolved = Path(doc_path)
    # A document belongs to a project when its path equals a listed entry
    # (single files) or lies anywhere below one (folders).
    lineage = {resolved, *resolved.parents}
    for folder, roots in project_roots.items():
        belongs = any(root in lineage for root in roots)
        list_of_entities.append((belongs, folder, doc_id, doc_path))

projects_df = pd.DataFrame(list_of_entities, columns=["entity", "text", 'docID', 'path'])


In [4]:
# Only the (document, project) rows flagged as belonging to the project.
_assigned = projects_df.loc[projects_df.entity]


def _docs_in(project_label):
    """Return the unique docIDs assigned to ``project_label``."""
    return _assigned.loc[_assigned.text == project_label].docID.unique()


hgp = _docs_in("HGP")
hapmap = _docs_in("HapMap")
lsac = _docs_in("sequence")
encode = _docs_in("ENCODE")
modencode = _docs_in("modENCODE")
elsi = _docs_in("ELSI")

In [9]:
# Load the cached table of email pairs. Later cells read its From, To,
# documentID, date and conversation columns.
network_df = pd.read_parquet("../cache/all_email_pairs_240520.parquet")

In [None]:
# Display the loaded email-pair table for a quick visual check.
network_df

# which emails fall under which projects

In [None]:
# Flag each email by the project(s) its document belongs to. `isin` performs
# the membership test column-wise instead of scanning a NumPy array per row.
_project_doc_ids = {
    "hgp": hgp,
    "hapmap": hapmap,
    "lsac": lsac,
    "encode": encode,
    "modencode": modencode,
    "elsi": elsi,
}
for _flag_column, _doc_ids in _project_doc_ids.items():
    network_df[_flag_column] = network_df['documentID'].isin(_doc_ids)

In [11]:
# Calendar year of each email, taken from the datetime `date` column.
network_df['year'] = network_df['date'].dt.year

In [13]:
# Directed multigraph with one edge per email row (sender -> recipient),
# carrying the three project flags as edge attributes.
_edge_flags = ['hgp', 'hapmap', 'lsac']
G = nx.from_pandas_edgelist(
    network_df,
    source='From',
    target="To",
    edge_attr=_edge_flags,
    create_using=nx.MultiDiGraph(),
)

# Drop emails that people sent to themselves.
_self_loops = list(nx.selfloop_edges(G))
G.remove_edges_from(_self_loops)

In [None]:
# Number of distinct nodes in the email graph.
len(G.nodes())

In [53]:
node_list = pd.read_csv('../models/email_clean_manual/nodes_240520_not_randomized.csv')
category_dict = node_list.set_index("ID")['category'].to_dict()

# Per-node out-degree and in-degree of the email multigraph.
out_degrees_map = dict(G.out_degree())
degrees_map = dict(G.in_degree())

# Affiliation code per node: '1' = NIH, '2' = academic or private non-profit,
# '3' = everything else. Nodes absent from the node list also get '3' instead
# of raising KeyError, matching how later cells treat unknown IDs.
nih_map = {}
for n in G.nodes():
    category = category_dict.get(int(n))
    if category == "nih":
        nih_map[n] = '1'
    elif category in ("private-nonprofit", "academic"):
        nih_map[n] = "2"
    else:
        nih_map[n] = "3"
# deg_central= pd.DataFrame([g for g in G.nodes()], columns = ["node"])
# deg_central['indegrees'] = deg_central['node'].map(degrees_map)
# deg_central['outdegrees'] = deg_central['node'].map(out_degrees_map)
# deg_central['affiliation'] = deg_central['node'].map(nih_map)
# deg_central['indegrees'] = deg_central['indegrees'].astype(int)
# deg_central['outdegrees'] = deg_central['outdegrees'].astype(int)

### venn diagram of Figure 3

In [54]:
def _participants(flag_column):
    """Senders and recipients of every email whose ``flag_column`` is truthy."""
    flagged = network_df.loc[network_df[flag_column].astype(bool)]
    return set(flagged['From']) | set(flagged['To'])


def _with_category(nodes, categories):
    """Subset of ``nodes`` whose entry in ``category_dict`` is in ``categories``."""
    return {node for node in nodes if category_dict.get(int(node)) in categories}


sequencing_nodes = _participants('hgp')
hapmap_nodes = _participants('hapmap')
# Collected as well, although the two-set diagrams only use HGP and HapMap.
lsac_nodes = _participants('lsac')

_NIH = ("nih",)
_EXTRAMURAL = ("private-nonprofit", "academic")

# NIH participants of each project, and their combined head count.
a_two = _with_category(sequencing_nodes, _NIH)
c_two = _with_category(hapmap_nodes, _NIH)
second_set = a_two | c_two
total_second = len(second_set)

# Academic / private non-profit participants, and their combined head count.
a = _with_category(sequencing_nodes, _EXTRAMURAL)
c = _with_category(hapmap_nodes, _EXTRAMURAL)
third_set = a | c
total_third = len(third_set)

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

sns.set_style('white', rc={
    'xtick.bottom': False,
    'ytick.left': True,
})

lsaccolor = "green"
hapmapcolor = "#f1cc32"
sequencingcolor = "gray"

# Large fonts: the figure is drawn on a 40 x 40 inch canvas.
matplotlib.rc('font', family='Helvetica')
matplotlib.rc('font', serif='Helvetica')
matplotlib.rc('text', usetex='false')
matplotlib.rcParams.update({"axes.labelsize": 30,
"xtick.labelsize": 30,
"ytick.labelsize": 30,
"legend.fontsize": 20,
"font.size": 50})


def _count_with_share(total):
    """Build a label formatter showing a count and its share of ``total``."""
    def _format(count):
        share = count / total if total else 0
        return f"{count} ({share:1.0%})"
    return _format


def _style_two_set_venn(diagram, left_only_color, right_only_color):
    """Colour the exclusive regions, outline every region in black, alpha 0.5.

    A region with no members has no patch (``get_patch_by_id`` returns None),
    so such regions are skipped instead of raising AttributeError.
    """
    region_fills = (('01', right_only_color), ('10', left_only_color), ('11', None))
    for region_id, fill in region_fills:
        patch = diagram.get_patch_by_id(region_id)
        if patch is None:
            continue
        if fill is not None:
            patch.set_color(fill)
        patch.set_edgecolor('black')
        patch.set_alpha(0.5)


figure, axes = plt.subplots(nrows=1, ncols=2, figsize=(40, 40), dpi=300)
axes[0].spines['right'].set_linewidth(0)
axes[0].spines['top'].set_linewidth(0)

# Left panel: NIH participants of HGP vs. HapMap.
venntwo = venn2([a_two, c_two], ('HGP', 'HapMap'), ax=axes[0],
                subset_label_formatter=_count_with_share(total_second))
_style_two_set_venn(venntwo, sequencingcolor, hapmapcolor)

# Right panel: academic / private non-profit participants.
vennthree = venn2([a, c], ('HGP', 'HapMap'), ax=axes[1],
                  subset_label_formatter=_count_with_share(total_third))
_style_two_set_venn(vennthree, sequencingcolor, hapmapcolor)

plt.savefig('../figures/figure_3_venndiagrams.pdf')

In [58]:
from scipy.stats import fisher_exact
from collections import Counter

In [None]:
# Fisher's exact test on a hard-coded 2x2 contingency table.
# NOTE(review): the cell counts are presumably read off the Venn diagrams
# above — TODO confirm and derive them from the node sets instead.
fisher_exact([[54, (31 + 29)], [41, (50 + 100)]])

In [None]:
# Non-null row counts per column for every (From_category, To_category) pair.
network_df.groupby(["From_category", "To_category"]).count()

In [None]:
# Share of all email pairs.
# NOTE(review): 16235 is hard-coded, presumably copied from the
# category-pair table above — TODO confirm / compute from network_df.
16235/network_df.shape[0]

In [None]:
# NOTE(review): hard-coded counts, presumably two rows of the category-pair
# table above — TODO confirm / compute from network_df.
(5493 + 2520)/network_df.shape[0]

In [None]:
# NOTE(review): hard-coded counts, presumably two rows of the category-pair
# table above — TODO confirm / compute from network_df.
(1862 + 1535)/network_df.shape[0]

In [64]:
# adapted from https://bbengfort.github.io/2016/06/graph-tool-from-networkx/
def get_prop_type(value, key=None):
    """
    Choose the graph-tool PropertyMap type for ``value`` and convert the value.

    Booleans map to 'bool', integers and floats to 'float' (integers are
    converted to float), dicts to 'object', and anything else to 'string'
    (converted with ``str``). NumPy scalar types are treated like their Python
    counterparts; previously they fell through to 'string'.

    Returns a tuple ``(type name, converted value, key)``; ``key`` is passed
    through unchanged.
    """
    # bool is tested first because bool is a subclass of int.
    if isinstance(value, (bool, np.bool_)):
        return 'bool', bool(value), key

    if isinstance(value, (int, np.integer, float, np.floating)):
        return 'float', float(value), key

    if isinstance(value, dict):
        return 'object', value, key

    return 'string', str(value), key


def nx2gt(nxG):
    """
    Convert a networkx graph to a graph-tool graph.

    Graph, node and edge attributes become graph-tool property maps whose
    types are chosen by ``get_prop_type`` from the first value seen for each
    key. Every vertex also gets a string property ``'id'`` holding
    ``str(node)``, because graph-tool identifies vertices by index.

    The input graph is left untouched: the ``'id'`` value is written directly
    to the property map instead of being inserted into the node's attribute
    dict first.
    """
    # Phase 0: create a directed or undirected graph-tool Graph.
    gtG = graph_tool.Graph(directed=nxG.is_directed())

    # Graph-level attributes become internal graph properties.
    for key, value in nxG.graph.items():
        tname, value, key = get_prop_type(value, key)
        gtG.graph_properties[key] = gtG.new_graph_property(tname)  # create the map
        gtG.graph_properties[key] = value                          # set its value

    # Phase 1: declare one vertex property map per distinct node attribute key.
    nprops = set()
    for _node, data in nxG.nodes(data=True):
        for key, val in data.items():
            if key in nprops:
                continue
            tname, _unused, key = get_prop_type(val, key)
            gtG.vertex_properties[key] = gtG.new_vertex_property(tname)
            nprops.add(key)

    # The node label is always kept as a string under 'id'. This replaces any
    # 'id' map declared above from node attributes.
    gtG.vertex_properties['id'] = gtG.new_vertex_property('string')

    # One edge property map per distinct edge attribute key.
    eprops = set()
    for _src, _dst, data in nxG.edges(data=True):
        for key, val in data.items():
            if key in eprops:
                continue
            tname, _unused, key = get_prop_type(val, key)
            gtG.edge_properties[key] = gtG.new_edge_property(tname)
            eprops.add(key)

    # Phase 2: add the vertices and edges together with their property values.
    vertices = {}  # networkx node -> graph-tool vertex, used to wire up edges
    for node, data in nxG.nodes(data=True):
        v = gtG.add_vertex()
        vertices[node] = v

        for key, value in data.items():
            if key == 'id':
                continue  # 'id' always holds the node label, set below
            # Convert the same way the property type was chosen.
            _tname, value, key = get_prop_type(value, key)
            gtG.vp[key][v] = value
        gtG.vp['id'][v] = str(node)

    for src, dst, data in nxG.edges(data=True):
        e = gtG.add_edge(vertices[src], vertices[dst])
        for key, value in data.items():
            _tname, value, key = get_prop_type(value, key)
            gtG.ep[key][e] = value

    return gtG


In [None]:


# Recompute the HGP / HapMap flags column-wise; `isin` avoids a per-row scan
# of the docID arrays.
network_df["hgp"] = network_df['documentID'].isin(hgp)
network_df["hapmap"] = network_df['documentID'].isin(hapmap)


In [67]:
# Directed (sender, recipient) pair for every email row.
network_df['tuple'] = list(zip(network_df['From'], network_df['To']))

In [68]:
# Weighted HapMap email graph: one directed edge per (sender, recipient)
# pair, weighted by the number of non-null `conversation` entries.
network_df['tuple'] = list(zip(network_df['From'], network_df['To']))

_hapmap_emails = network_df.loc[network_df.hapmap]
temp = _hapmap_emails.groupby('tuple')['conversation'].count().reset_index()
temp.columns = ["tuple", "weight"]
temp = temp.loc[temp.weight > 0]
temp['To'] = [pair[1] for pair in temp['tuple']]
temp['From'] = [pair[0] for pair in temp['tuple']]

G = nx.from_pandas_edgelist(
    temp,
    source="From",
    target='To',
    edge_attr=["weight"],
    create_using=nx.DiGraph(),
)
# Self-addressed emails are not part of the communication network.
G.remove_edges_from(list(nx.selfloop_edges(G)))

In [69]:
# Convert the NetworkX HapMap graph to a graph-tool Graph; node labels are
# stored in its 'id' vertex property.
network = nx2gt(G)

In [None]:
# Hard-coded node IDs of two HapMap groups. Every ID in `kitchen` also appears
# at the end of `steering`.
kitchen = [129, 224, 258, 260, 440, 488, 503, 505, 590, 67, 74]
steering = [
    8175, 1491, 508, 128, 132, 13462, 14505, 146, 14825, 15958, 188, 19,
    19960, 20, 20420, 21759, 22, 22345, 23541, 24, 247, 25, 253, 272, 28,
    29182, 294, 30, 3036, 32, 324, 327, 331, 339, 383, 388, 429, 444, 448,
    478, 495, 546, 560, 566, 568, 5857, 68, 6903, 7, 76, 7899, 8, 9537,
    129, 148, 224, 258, 260, 440, 488, 503, 505, 590, 67, 74,
]

# HapMap graph nodes that belong to neither group.
rest_of_hapmap = {int(node) for node in G.nodes()}.difference(kitchen, steering)

# NOTE(review): because kitchen is contained in steering, this sum counts each
# kitchen ID twice — TODO confirm that is the intended total.
len(rest_of_hapmap) + len(set(steering)) + len(set(kitchen))

In [71]:
node_list = pd.read_csv('../models/email_clean_manual/nodes_240520_not_randomized.csv')

# NetworkX node label -> graph-tool vertex whose 'id' property equals it.
ID_to_gtID = {
    node: graph_tool.util.find_vertex(network, network.vertex_properties['id'], node)[0]
    for node in G.nodes()
}

category_dict = node_list.set_index("ID")['category'].to_dict()

# RGBA fill and outline colours per vertex: dark green for `kitchen`, pink
# for `steering`, cyan for everyone else. Kitchen is tested first, so IDs in
# both lists are drawn as kitchen.
v_prop = network.new_vertex_property("vector<double>")
v_prop_edge = network.new_vertex_property("vector<double>")
for i, vertex in ID_to_gtID.items():
    member_id = int(i)
    if member_id in kitchen:
        fill, outline = [65/255, 82/255, 31/255, 1], [65/255, 82/255, 31/255, 1]
    elif member_id in steering:
        fill, outline = [226/255, 121/255, 130/255, 1], [226/255, 121/255, 130/255, 0.5]
    else:
        fill, outline = [0, 1, 1, 1], [0, 1, 1, 0.5]
    v_prop[vertex] = fill
    v_prop_edge[vertex] = outline

In [None]:
distributions = ["real-exponential", "discrete-geometric", "discrete-poisson", "discrete-binomial"]
data_hapmap = []

# For every edge-weight distribution, with and without degree correction,
# keep the lowest entropy reached over 10 annealed fits.
# NOTE(review): no RNG seed is set in this cell, so the numbers presumably
# differ between runs — TODO confirm whether a seed should be fixed.
for distribution in tqdm(distributions):
    for degree_corrected in (False, True):

        state_to_save, entropy = None, np.inf
        for i in tqdm(range(10)):
            block_state_args = dict(
                eweight=network.ep.weight,
                deg_corr=degree_corrected,
                recs=[network.ep.weight],
                rec_types=[distribution],
            )
            state = NestedBlockState(network, base_type=RankedBlockState, state_args=block_state_args)
            mcmc_anneal(state, beta_range=(1, 10), niter=1000, mcmc_equilibrate_args=dict(force_niter=10))

            candidate_entropy = state.entropy()
            if candidate_entropy < entropy:
                state_to_save, entropy = state, candidate_entropy

        data_hapmap.append((distribution, degree_corrected, entropy))

In [None]:
# Best entropy found for each (distribution, degree-correction) configuration.
data_hapmap_df = pd.DataFrame(data_hapmap, columns = ["distribution", "degree corrected", "entropy"])
data_hapmap_df

# find the lowest entropy configuration of HapMap emails

In [86]:
# Rebuild the weighted HapMap email graph (edge weight = non-null
# `conversation` count per directed pair) and convert it to graph-tool.
network_df['tuple'] = list(zip(network_df['From'], network_df['To']))

_hapmap_rows = network_df.loc[network_df.hapmap]
temp = _hapmap_rows.groupby('tuple')['conversation'].count().reset_index()
temp.columns = ["tuple", "weight"]
temp = temp.loc[temp.weight > 0]
temp['To'] = [recipient for _sender, recipient in temp['tuple']]
temp['From'] = [sender for sender, _recipient in temp['tuple']]

G = nx.from_pandas_edgelist(
    temp, source="From", target='To', edge_attr=["weight"], create_using=nx.DiGraph()
)
G.remove_edges_from(list(nx.selfloop_edges(G)))

network = nx2gt(G)



### finds the lowest entropy state of the exponential model

In [None]:
# Keep the lowest-entropy state out of 20 fits of the degree-corrected model
# with exponentially distributed edge weights.
# NOTE(review): no RNG seed is set here, so the selected state presumably
# varies between runs — TODO confirm whether a seed should be fixed.
state_to_save, entropy = None, np.inf
for i in tqdm(range(20)):
    state = minimize_nested_blockmodel_dl(
        network,
        state_args=dict(deg_corr=True, recs=[network.ep.weight], rec_types=["real-exponential"]),
    )
    mcmc_anneal(state, beta_range=(1, 10), niter=1000, mcmc_equilibrate_args=dict(force_niter=10))

    candidate_entropy = state.entropy()
    if candidate_entropy < entropy:
        state_to_save, entropy = state, candidate_entropy

In [194]:
# Committee colours (dark green = kitchen, pink = steering, cyan = other)
# plus a text label holding each vertex's node ID.
# NOTE(review): ID_to_gtID is reused from an earlier cell and is only rebuilt
# for the current `network` further down — TODO confirm it matches.
v_prop = network.new_vertex_property("vector<double>")
v_prop_edge = network.new_vertex_property("vector<double>")
v_prop_names = network.new_vertex_property("string")
for i, vertex in ID_to_gtID.items():
    member_id = int(i)
    if member_id in kitchen:
        fill, outline = [65/255, 82/255, 31/255, 1], [65/255, 82/255, 31/255, 1]
    elif member_id in steering:
        fill, outline = [226/255, 121/255, 130/255, 1], [226/255, 121/255, 130/255, 0.5]
    else:
        fill, outline = [0, 1, 1, 1], [0, 1, 1, 0.5]
    v_prop[vertex] = fill
    v_prop_edge[vertex] = outline
    v_prop_names[vertex] = str(i)

In [None]:
# Entropy of the best state kept by the search loop.
entropy

In [None]:
# Draw the selected nested block state: fill/outline by committee colour,
# each vertex labelled with its node ID.
state_to_save.draw( vertex_fill_color=v_prop, vertex_color=v_prop_edge , vertex_text = v_prop_names,  vertex_font_size=4,)

## determine how many conversations happen between two large communities

In [181]:
# Map each NetworkX node label to the graph-tool vertex carrying it in the
# 'id' property. The vertices are indexed in one pass instead of running a
# full `find_vertex` scan per node; the first vertex wins on a duplicate id,
# as `find_vertex(...)[0]` did.
_vertex_by_id = {}
for _vertex in network.vertices():
    _vertex_by_id.setdefault(network.vertex_properties['id'][_vertex], _vertex)

ID_to_gtID = {node: _vertex_by_id[node] for node in G.nodes()}

In [182]:
# Block membership of every HapMap node at the two lowest hierarchy levels.
# The vertices of level 1 are the blocks of level 0, so the level-1 lookup is
# keyed by the node's level-0 block label, not by the node's own vertex.
_level0_blocks = state_to_save.levels[0].get_blocks()
_level1_blocks = state_to_save.levels[1].get_blocks()
_hapmap_nodes = list(G.nodes())

blocks_hapmap = [_level0_blocks[ID_to_gtID[i]] for i in _hapmap_nodes]
blocks_hapmap_df = pd.DataFrame({
    "blocks": blocks_hapmap,
    "level2": [_level1_blocks[block] for block in blocks_hapmap],
    "ID": _hapmap_nodes,
})
blocks_hapmap_df['ID'] = blocks_hapmap_df['ID'].astype(int)

In [186]:
def kitchen_or_steering_or_hapmap(x):
    """Return "kitchen" or "steering" for IDs in those lists, else None.

    ``kitchen`` is checked first, so an ID present in both lists is "kitchen".
    """
    for label, members in (("kitchen", kitchen), ("steering", steering)):
        if x in members:
            return label
    return None
blocks_hapmap_df['type'] = blocks_hapmap_df["ID"].apply(kitchen_or_steering_or_hapmap)

In [None]:
# Non-null row counts per column for each level-0 block.
blocks_hapmap_df.groupby("blocks").count()

In [None]:
# NOTE(review): hard-coded counts, presumably block sizes read off the table
# above; block labels and sizes come from a stochastic fit — TODO compute
# from blocks_hapmap_df instead.
(2 + 17 + 22)/(2 + 4 + 17 + 153 + 101 + 22)

In [None]:
# Block assignment of the node with ID 312.
blocks_hapmap_df.loc[blocks_hapmap_df['ID'] == 312]

In [None]:
# Block assignments of the nodes labelled "kitchen".
blocks_hapmap_df.loc[blocks_hapmap_df['type'] == "kitchen"]

In [157]:
from collections import defaultdict

In [191]:
# Total email weight exchanged between every pair of level-0 blocks, summed
# over both directions, so total[(a, b)] == total[(b, a)]. Within a block
# (a == b) each edge is counted twice, as in the original pairwise version.
#
# Each edge is visited once instead of probing every ordered node pair of
# every block pair with `get_edge_data`.
total = defaultdict(int)

# Node labels in G are the string form of the integer IDs in blocks_hapmap_df.
_block_of = dict(zip(blocks_hapmap_df['ID'].astype(str), blocks_hapmap_df['blocks']))

for _sender, _recipient, _weight in G.edges(data='weight'):
    _sender_block = _block_of.get(_sender)
    _recipient_block = _block_of.get(_recipient)
    if _sender_block is None or _recipient_block is None:
        continue
    total[(_sender_block, _recipient_block)] += _weight
    total[(_recipient_block, _sender_block)] += _weight

In [98]:
# Level-0 block of every node, as a two-column table.
_level0 = state_to_save.levels[0].get_blocks()
level_data = pd.DataFrame(
    [(_level0[vertex], node) for node, vertex in ID_to_gtID.items()],
    columns=["block", "ID"],
)

In [99]:
# Re-wraps level_data, selecting the same two columns it already has.
level_data = pd.DataFrame(level_data, columns = ["block", "ID"])

In [None]:
# NOTE(review): `merged` is first assigned in a later cell (the pd.merge of
# hgp_sankey and blocks_hapmap_df); on a top-to-bottom run this presumably
# raises NameError — TODO move below that merge.
merged.groupby("blocks").count()

In [None]:
# NOTE(review): depends on `merged`, which is only created further down in
# the notebook — TODO move below that cell.
merged.loc[merged['type'] == "kitchen"]

### Now find the lowest-entropy state for HGP emails

In [None]:
# Weighted HGP email graph: one directed edge per (sender, recipient) pair,
# weighted by the number of non-null `conversation` entries.
network_df['tuple'] = list(zip(network_df['From'], network_df['To']))

_hgp_emails = network_df.loc[network_df.hgp]
temp = _hgp_emails.groupby('tuple')['conversation'].count().reset_index()
temp.columns = ["tuple", "weight"]
temp = temp.loc[temp.weight > 0]
temp['To'] = [pair[1] for pair in temp['tuple']]
temp['From'] = [pair[0] for pair in temp['tuple']]

G_hgp = nx.from_pandas_edgelist(
    temp, source="From", target='To', edge_attr=["weight"], create_using=nx.DiGraph()
)
G_hgp.remove_edges_from(list(nx.selfloop_edges(G_hgp)))

In [None]:
# Fit a degree-corrected nested block model with exponential edge weights,
# then run further MCMC sweeps via mcmc_equilibrate.
# NOTE(review): this fits `network`, which was last built from the HapMap
# graph G; the G_hgp graph built just above is not converted or used here —
# TODO confirm whether nx2gt(G_hgp) was intended.
state  = minimize_nested_blockmodel_dl(network, state_args=dict(deg_corr=True, recs =[network.ep.weight], rec_types = ['real-exponential']))
mcmc_equilibrate(state, wait=1000, mcmc_args=dict(niter=10))

In [None]:
# Draw the fitted state with the committee fill and outline colours.
state.draw( vertex_fill_color=v_prop, vertex_color=v_prop_edge )

# create sankey diagram of HGP to HapMap

In [None]:
# HGP email graph used to decide which HapMap nodes also took part in HGP
# correspondence. Edge weight = non-null `conversation` count per pair.
network_df['tuple'] = list(zip(network_df['From'], network_df['To']))

_hgp_rows = network_df.loc[network_df.hgp]
temp = _hgp_rows.groupby('tuple')['conversation'].count().reset_index()
temp.columns = ["tuple", "weight"]
temp = temp.loc[temp.weight > 0]
temp['To'] = [recipient for _sender, recipient in temp['tuple']]
temp['From'] = [sender for sender, _recipient in temp['tuple']]

G_hgp = nx.from_pandas_edgelist(
    temp,
    source="From",
    target='To',
    edge_attr=["weight"],
    create_using=nx.DiGraph(),
)
G_hgp.remove_edges_from(list(nx.selfloop_edges(G_hgp)))

In [None]:
# Integer IDs of every node in the HGP email graph, as a one-column table.
blocks_hgp_df = pd.DataFrame({"ID": [int(node) for node in G_hgp.nodes()]})

In [None]:
# Split the HapMap-related IDs by whether they also appear in the HGP graph.
# The HGP IDs are materialised once as a set; previously the ID column was
# converted to a list on every loop iteration.
_hgp_ids = set(blocks_hgp_df['ID'])

not_in_hgp = []
in_hgp = []
rest_of_hapmap_in_hgp = []
rest_of_hapmap_not_in_hgp = []

for i in [*kitchen, *steering]:
    (in_hgp if i in _hgp_ids else not_in_hgp).append(i)

# Non-committee HapMap nodes are additionally tracked in their own lists.
for i in rest_of_hapmap:
    if i in _hgp_ids:
        in_hgp.append(i)
        rest_of_hapmap_in_hgp.append(i)
    else:
        not_in_hgp.append(i)
        rest_of_hapmap_not_in_hgp.append(i)

# kitchen and steering overlap, so drop the duplicates.
not_in_hgp = list(set(not_in_hgp))
in_hgp = list(set(in_hgp))

In [None]:
# One row per ID with its HGP status ("In HGP" -> block -2, "Not in HGP" -> block -1).
# NOTE(review): `IDs_inverted` is not defined in the visible notebook;
# presumably it comes from `from utils_tiramisu import *` — verify.
hgp_sankey = pd.concat([pd.DataFrame({"ID": in_hgp, "rank": "In HGP", "name": [IDs_inverted[i] for i in in_hgp], "block": -2}),
           pd.DataFrame({"ID": not_in_hgp, "rank": "Not in HGP", "name": [IDs_inverted[i] for i in not_in_hgp], 
                         "block": -1})])

In [None]:
# Right join on ID: keeps every row of blocks_hapmap_df and attaches the HGP
# status columns where the ID is present in hgp_sankey.
merged = pd.merge(hgp_sankey, blocks_hapmap_df, on = "ID", how = 'right')

In [None]:
def kitchen_or_steering_or_hapmap(x):
    """Classify a node ID for the Sankey diagram.

    Groups are tested in order and the first match wins: "kitchen",
    "steering", "hapmap_in_hgp", "hapmap_not_in_hgp". Returns None when the
    ID is in none of them.
    """
    ordered_groups = (
        ("kitchen", kitchen),
        ("steering", steering),
        ("hapmap_in_hgp", rest_of_hapmap_in_hgp),
        ("hapmap_not_in_hgp", rest_of_hapmap_not_in_hgp),
    )
    for label, members in ordered_groups:
        if x in members:
            return label
    return None
merged['type'] = merged["ID"].apply(kitchen_or_steering_or_hapmap)

In [None]:
# Relabel the fitted block IDs to 1..6 for display.
# NOTE(review): the keys are block labels from one particular fit; a new fit
# presumably yields different labels, and labels missing from this dict map
# to NaN (so re-running this cell on the already-mapped column blanks most
# rows) — TODO derive the mapping from the data.
merged['blocks'] = merged['blocks'].map({ 13: 2,188:1, 23:4, 190:3, 76:5,  136:6,})
# NOTE(review): Partition, ProcessGroup, Waypoint, Bundle, SankeyDefinition
# and weave are not imported in the visible notebook; presumably they come
# from floweaver via a star import — verify.
committee_by_type = Partition.Simple('type', ['kitchen', 'steering', "hapmap_in_hgp", "hapmap_not_in_hgp"])

# One colour per node type.
palette = {'kitchen': '#41521f', 'steering': '#e27982', "hapmap_in_hgp": "#00ffff", "hapmap_not_in_hgp": "gray"}
# Number of IDs for every (HGP status, block, type) combination.
flows = (
    merged.groupby(["rank", "blocks", "type"])
    .agg({"ID": "count"})
    .dropna()
    .reset_index()
    .sort_values(["rank", "blocks"], ascending = [True, False])
)


# Left column: HGP status; right column: HapMap block.
nodes = {
    "rank": ProcessGroup(flows["rank"].unique().tolist()),
    "blocks": ProcessGroup(flows["blocks"].unique().tolist()),
}

embark_port = Partition.Simple("process", flows["rank"].unique().tolist())
disembark_port = Partition.Simple("process", flows["blocks"].unique().tolist())

# Rename to the source / target / value columns used for the flow table.
flows = (
    flows.rename(
        columns={
            "rank": "source",
            "blocks": "target",
            "ID": "value",
        }
    )
)

# Layout order, left to right: status, waypoint split by type, blocks.
ordering = [["rank"], ["waypoint"], ["blocks"]]

nodes["rank"].partition = embark_port
nodes["blocks"].partition = disembark_port
nodes['waypoint'] = Waypoint(committee_by_type)
bundles = [Bundle("rank", "blocks",  waypoints = ["waypoint"])]
sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=committee_by_type)





# Create Sankey diagram
weave(
    sdd, flows, palette=palette
).to_widget().auto_save_svg('sankey_sbm_with_distinction.svg')


# visualize by count, membership, etc.

In [None]:
# Colour vertices by committee (dark green = kitchen, pink = steering,
# cyan = other) and write the hierarchy drawing to PDF.
v_prop = network.new_vertex_property("vector<double>")
v_prop_edge = network.new_vertex_property("vector<double>")
for i, vertex in ID_to_gtID.items():
    member_id = int(i)
    if member_id in kitchen:
        fill, outline = [65/255, 82/255, 31/255, 1], [65/255, 82/255, 31/255, 1]
    elif member_id in steering:
        fill, outline = [226/255, 121/255, 130/255, 1], [226/255, 121/255, 130/255, 0.5]
    else:
        fill, outline = [0, 1, 1, 1], [0, 1, 1, 0.5]
    v_prop[vertex] = fill
    v_prop_edge[vertex] = outline

state.draw(
    vertex_fill_color=v_prop,
    vertex_color=v_prop_edge,
    beta=0.8,
    hedge_pen_width=2,
    hvertex_fill_color=np.array([0., 0., 0., .5]),
    hedge_color=np.array([0., 0., 0., .5]),
    hedge_marker_size=15,
    hvertex_size=15,
    output="../models/sbm/hapmap_communities_by_committee.pdf",
)

import matplotlib.cm as cm
import matplotlib as mpl

# Colour vertices by how often each node appears as sender or recipient of a
# HapMap email, on the blue-to-red "coolwarm" scale.
v_prop = network.new_vertex_property("vector<double>")
v_prop_edge = network.new_vertex_property("vector<double>")

_hapmap_emails = network_df.loc[network_df.hapmap]
counter = Counter(np.vstack([_hapmap_emails['To'], _hapmap_emails['From']]).flatten())

min_val, max_val = min(counter.values()), max(counter.values())

cmap = mpl.cm.coolwarm
norm = mpl.colors.Normalize(vmin=min_val, vmax=max_val)

# Counts go through `norm` so they land in [0, 1] before the colormap lookup.
# Passing the raw integer count made the colormap treat it as a lookup-table
# index, which left `norm` unused.
for i, vertex in ID_to_gtID.items():
    rgba = cmap(norm(counter[i]))
    v_prop[vertex] = rgba
    v_prop_edge[vertex] = rgba

state.draw(vertex_fill_color=v_prop, vertex_color=v_prop_edge, beta = 0.8, hedge_pen_width= 2,
    hvertex_fill_color= np.array([0., 0., 0., .5]),
    hedge_color= np.array([0., 0., 0., .5]),
    hedge_marker_size= 15,
    hvertex_size=15, output = "../models/sbm/hapmap_communities_by_count.pdf")

# Colour vertices by affiliation: blue = NIH, red = academic or private
# non-profit, green = any other category or an ID missing from the node list.
category_dict = node_list.set_index("ID")['category'].to_dict()
v_prop = network.new_vertex_property("vector<double>")
v_prop_edge = network.new_vertex_property("vector<double>")
for i, vertex in ID_to_gtID.items():
    category = category_dict.get(int(i))
    if category == "nih":
        rgba = [0, 0, 1, 1]
    elif category in ("private-nonprofit", "academic"):
        rgba = [1, 0, 0, 1]
    else:
        rgba = [0, 1, 0, 1]
    v_prop[vertex] = rgba
    v_prop_edge[vertex] = rgba

state.draw(
    vertex_fill_color=v_prop,
    vertex_color=v_prop_edge,
    beta=0.9,
    hedge_pen_width=2,
    hvertex_fill_color=np.array([0., 0., 0., .5]),
    hedge_color=np.array([0., 0., 0., .5]),
    hedge_marker_size=15,
    hvertex_size=15,
    output="../models/sbm/hapmap_communities_by_nih.pdf",
)

# brokerage role analysis

In [None]:
'''
This module implements the disparity filter to compute a significance score of edge weights in networks

taken from https://github.com/aekpalakorn/python-backbone-network
'''

import networkx as nx
import numpy as np
from scipy import integrate


def _disparity_alpha(p_ij, k):
    ''' Significance score of one edge under the disparity-filter null model, rounded to 4 decimals.

        The score is 1 - (k-1) * integral_0^p_ij (1-x)^(k-2) dx, which evaluates exactly
        to (1 - p_ij)^(k-1); the closed form is used instead of numerical quadrature.
    '''
    return float('%.4f' % ((1 - p_ij) ** (k - 1)))


def disparity_filter(G, weight='weight'):
    ''' Compute significance scores (alpha) for weighted edges in G as defined in Serrano et al. 2009
        Args
            G: Weighted NetworkX graph (directed or undirected)
            weight: Key of the edge attribute holding the edge weight (default 'weight')
        Returns
            A new graph (DiGraph if G is directed, Graph otherwise) with the scored edges.
            Directed edges carry `alpha_out` and/or `alpha_in`; undirected edges carry `alpha`.
            Edges whose endpoints all have degree 1 are not scored, except for the directed
            out-degree-1 -> in-degree-1 case, which is kept with alpha_out = alpha_in = 0.
        References
            M. A. Serrano et al. (2009) Extracting the Multiscale backbone of complex weighted networks. PNAS, 106:16, pp. 6483-6488.
    '''

    if nx.is_directed(G): #directed case
        N = nx.DiGraph()
        for u in G:

            k_out = G.out_degree(u)
            k_in = G.in_degree(u)

            if k_out > 1:
                # score each outgoing edge by its share of u's total outgoing |weight|
                sum_w_out = sum(np.absolute(G[u][v][weight]) for v in G.successors(u))
                for v in G.successors(u):
                    w = G[u][v][weight]
                    p_ij_out = float(np.absolute(w))/sum_w_out
                    N.add_edge(u, v, weight = w, alpha_out=_disparity_alpha(p_ij_out, k_out))

            elif k_out == 1:
                v = next(iter(G.successors(u)))
                if G.in_degree(v) == 1:
                    #we need to keep the connection as it is the only way to maintain the connectivity of the network
                    N.add_edge(u, v, weight = G[u][v][weight], alpha_out=0., alpha_in=0.)
                    #there is no need to do the same for the k_in, since the link is built already from the tail

            if k_in > 1:
                # same computation from the receiving side; add_edge merges alpha_in
                # into an edge that was already created with alpha_out
                sum_w_in = sum(np.absolute(G[v][u][weight]) for v in G.predecessors(u))
                for v in G.predecessors(u):
                    w = G[v][u][weight]
                    p_ij_in = float(np.absolute(w))/sum_w_in
                    N.add_edge(v, u, weight = w, alpha_in=_disparity_alpha(p_ij_in, k_in))
        return N

    else: #undirected case
        B = nx.Graph()
        for u in G:
            k = len(G[u])
            if k > 1:
                sum_w = sum(np.absolute(G[u][v][weight]) for v in G[u])
                for v in G[u]:
                    w = G[u][v][weight]
                    p_ij = float(np.absolute(w))/sum_w
                    # an edge visited from both endpoints keeps the score written last
                    B.add_edge(u, v, weight = w, alpha=_disparity_alpha(p_ij, k))
        return B

def disparity_filter_alpha_cut(G,weight='weight',alpha_t=0.4, cut_mode='or'):
    ''' Keep only the edges of a disparity_filter-scored graph whose score is below a threshold.

        Args
        ----
        G: Weighted NetworkX graph
            Output of disparity_filter (edges carry `alpha`, or `alpha_in` / `alpha_out`).

        weight: string (default='weight')
            Key for edge data used as the edge weight w_ij.

        alpha_t: double (default='0.4')
            Threshold on alpha; an edge survives only if its score is strictly below it.
            It has to be a number between 0 and 1.

        cut_mode: string (default='or')
            Possible strings: 'or', 'and'. Only used for directed graphs: 'or' keeps an edge
            when either alpha_in or alpha_out is below the threshold, 'and' requires both.
            Any other value keeps no edge.

        Returns
        -------
        B: Weighted NetworkX graph
            A new graph of the same kind as G holding the surviving edges with their weight.

        References
        ---------
        .. M. A. Serrano et al. (2009) Extracting the Multiscale backbone of complex weighted networks. PNAS, 106:16, pp. 6483-6488.
    '''
    directed = nx.is_directed(G)
    B = nx.DiGraph() if directed else nx.Graph()

    for u, v, attrs in G.edges(data=True):
        # a missing score defaults to 1, which never passes a threshold <= 1
        if directed:
            alpha_in = attrs.get('alpha_in', 1)
            alpha_out = attrs.get('alpha_out', 1)
            if cut_mode == 'or':
                survives = alpha_in < alpha_t or alpha_out < alpha_t
            elif cut_mode == 'and':
                survives = alpha_in < alpha_t and alpha_out < alpha_t
            else:
                survives = False
        else:
            survives = attrs.get('alpha', 1) < alpha_t

        if survives:
            B.add_edge(u, v, weight=attrs[weight])

    return B



In [None]:
"""
Alex Levenson
alex@isnotinvain.com	| www.isnotinvain.com
(c) Reya Group 			| http://www.reyagroup.com
Friday July 23rd 2010

Calculates brokerage roles, as described by Steven Borgatti in http://www.analytictech.com/essex/Lectures/Brokerage.pdf
"""

import networkx as nx
import itertools

class _RoleClassifier(object):
	roleTypes = { \
				 "coordinator"		: lambda pred,broker,succ: pred == broker == succ, \
				 "gatekeeper" 	 	: lambda pred,broker,succ: pred != broker == succ, \
				 "representative"	: lambda pred,broker,succ: pred == broker != succ, \
				 "consultant"		: lambda pred,broker,succ: pred == succ != broker, \
				 "liaison"			: lambda pred,broker,succ: pred != succ and pred != broker and broker != succ, \
				}
				
	@classmethod
	def classify(cls,predecessor_group,broker_group,successor_group):
		for role,predicate in cls.roleTypes.items():
			if predicate(predecessor_group,broker_group,successor_group):
				return role
		raise Exception("Could not classify... this should never happen")
	
def getBrokerageRoles(graph,partition):
	"""
	Counts how many times each node in graph acts as one of the five brokerage roles described by Steven Borgatti in
	http://www.analytictech.com/essex/Lectures/Brokerage.pdf

	A node brokers a (predecessor, successor) pair when predecessor -> node -> successor exists,
	the three nodes are distinct, and there is no direct predecessor -> successor edge.

	graph: a networkx DiGraph
	partition: a dictionary mapping node -> group, must map every node. If a node has no group associated then put it by itself in a new group

	returns: {node -> {"coordinator": n, "gatekeeper": n, "representative": n, "consultant": n, "liaison": n}} where n is the number of times
	node acted as that role
	"""

	roleClassifier = _RoleClassifier()

	roles = dict((node, dict((role,0) for role in roleClassifier.roleTypes)) for node in graph)
	for node in graph:
		# the predecessor set is the same for every successor of this node
		predecessors = list(graph.predecessors(node))

		for successor in graph.successors(node):
			for predecessor in predecessors:

				# skip self-loops and two-node back-and-forth paths
				if successor == predecessor or successor == node or predecessor == node: continue

				if not (graph.has_edge(predecessor, successor)):
					# found a broker!
					# now which kind depends on who is in which group
					roles[node][roleClassifier.classify(partition[predecessor],partition[node],partition[successor])] += 1
	return roles


def get_brokerage(G, partition):
    """Return each node's brokerage-role shares in long format.

    G: directed graph handed to getBrokerageRoles.
    partition: dictionary node -> group; also used to fill the `committee` column.

    For every node the five role counts are divided by the node's total count.
    Only nodes with a total above 10 are kept. The result has one row per
    (node, role) with columns `id`, `committee`, `variable` (role name) and
    `value` (share of that role).
    """
    role_columns = ['coordinator', 'gatekeeper', 'representative', 'consultant', 'liaison']

    brokers = getBrokerageRoles(G, partition)

    totals = [(node, sum(counts.values())) for node, counts in brokers.items()]
    totalbrokers = pd.DataFrame(totals, columns = ['id', 'sum'])

    for role in role_columns:
        # raw count first, then its share of the node's total
        totalbrokers[role] = [counts[role] for counts in brokers.values()]
        totalbrokers[role] = totalbrokers[role] / totalbrokers['sum']

    totalbrokers['committee'] = totalbrokers['id'].map(partition)

    frequent = totalbrokers.loc[totalbrokers['sum'] > 10]
    return frequent.melt(
        id_vars = ['id',  'committee'],
        value_vars = ['coordinator', 'representative', 'liaison', 'gatekeeper', 'consultant'],
    )


In [None]:
# Tag every row with its directed (From, To) pair.
network_df['tuple'] = network_df.apply(lambda row: (row['From'], row['To']), axis = 1)

# Edge weight = number of rows flagged `hapmap` with a non-null `conversation` for that pair.
pair_counts = network_df.loc[network_df.hapmap].groupby('tuple').count().reset_index()
temp = pair_counts[["tuple", "conversation"]]
temp.columns = ["tuple", "weight"]
temp = temp.loc[temp.weight >0].assign(
    To=lambda frame: frame['tuple'].apply(lambda pair: pair[1]),
    From=lambda frame: frame['tuple'].apply(lambda pair: pair[0]),
)

# Weighted directed graph From -> To, with self-loops removed.
G = nx.from_pandas_edgelist(
    temp,
    source = "From",
    target = 'To',
    edge_attr = ["weight"],
    create_using = nx.DiGraph(),
)
self_loops = list(nx.selfloop_edges(G))
G.remove_edges_from(self_loops)

In [None]:
# Sector partition: "nih" -> 'NIH', "academic" / "private-nonprofit" -> 'external academia',
# every other category -> 'other'.  A node missing from category_dict raises KeyError.
sector_labels = {
    "nih": 'NIH',
    "academic": 'external academia',
    "private-nonprofit": 'external academia',
}
partition = {
    g: sector_labels.get(category_dict[int(g)], 'other')
    for g in G.nodes()
}

# Committee partition: `kitchen` membership is checked before `steering`.
committee_partition = {
    g: "kitchen" if int(g) in kitchen
    else "steering" if int(g) in steering
    else "rest of hapmap"
    for g in G.nodes()
}

# Two-way split based on `steering` membership only.
test_partition = {
    g: "steering & kitchen cabinet" if int(g) in steering else "rest of hapmap"
    for g in G.nodes()
}



In [None]:
# Sensitivity check: redraw the committee brokerage figure on the disparity-filter
# backbone of G for each threshold alpha in np.arange(0, 1.1, 0.1).

# The plot styling does not depend on alpha, so it is configured once.
sns.set_style('white', rc={
    'xtick.bottom': True,
    'ytick.left': True,
})

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex='false')
matplotlib.rcParams['axes.unicode_minus'] = False

matplotlib.rcParams['xtick.major.size'] = 2
matplotlib.rcParams['xtick.major.width'] = 0.5
matplotlib.rcParams['xtick.minor.size'] = 2
matplotlib.rcParams['xtick.minor.width'] = 0.5

matplotlib.rcParams['ytick.major.size'] = 2
matplotlib.rcParams['ytick.major.width'] = 0.5
matplotlib.rcParams['ytick.minor.size'] = 2
matplotlib.rcParams['ytick.minor.width'] = 0.5

matplotlib.rcParams.update({"axes.labelsize": 10,
"xtick.labelsize": 7,
"ytick.labelsize": 7,
"legend.fontsize": 7,
"font.size":7})

PROPS = {
    'boxprops':{ 'edgecolor':'k'},
    'medianprops':{'color':'k'},
    'whiskerprops':{'color':'k'},
    'capprops':{'color':'k'},
    'flierprops': {'markersize': 2, 'markeredgewidth': 0.5}}

# Sort ranks: committees first (kitchen, steering, rest), then roles alphabetically.
plot_order = {"kitchen": 0, "steering": 1, "rest of hapmap": 2, "consultant":3,
              "coordinator": 4, "gatekeeper": 5, "liaison": 6, "representative": 7}
committee_palette = ["#41521F", "#e27982","#00FFFF"  ]

# The significance scores do not depend on the threshold; compute them once
# (disparity_filter_alpha_cut builds a new graph and leaves its input untouched).
scored = disparity_filter(G)

for alpha in tqdm(np.arange(0, 1.1, 0.1)):
    transformed = disparity_filter_alpha_cut(scored, alpha_t = alpha, cut_mode="or")
    print("alpha: ", alpha)
    print("nodes: ", len(transformed.nodes()))
    print("edges: ", len(transformed.edges()))

    # get_brokerage requires the partition; the figure groups by committee membership.
    melted = get_brokerage(transformed, committee_partition)

    # No edge can pass alpha = 0, and sparse backbones may leave no node above
    # get_brokerage's minimum count, so there may be nothing to draw.
    if melted.empty:
        print("no brokers to plot at this alpha, skipping figure")
        continue

    print(melted.groupby(["committee", "variable"]).count())

    figure, axes = plt.subplots(nrows = 1, ncols = 1, figsize=(2,2), dpi = 300)

    ordered = melted.sort_values(['committee', 'variable'], key = lambda x: x.map(plot_order))

    sns.stripplot(ordered, x = 'variable', y = 'value',
                palette = committee_palette, hue = 'committee', ax = axes, size = 3, dodge = True, alpha = 0.5)
    sns.boxplot(ordered, x = 'variable', y = 'value',
                palette = committee_palette, hue = 'committee', ax = axes, linewidth =0.5,showfliers=False, **PROPS)
    axes.set_xlabel("")
    axes.set_ylabel("Brokerage roles", color = "k")

    axes.legend().remove()
    for line in axes.get_lines():
        line.set_color('k')

    axes.tick_params(axis='x', colors='black')
    axes.set_xticklabels(axes.get_xticklabels(), rotation=45, ha='right')
    axes.yaxis.label.set_color('black')
    axes.xaxis.label.set_color('black')
    axes.tick_params(axis='y', colors='black')
    axes.spines['bottom'].set_linewidth(0.5)
    axes.spines['left'].set_linewidth(0.5)

    sns.despine()
    plt.savefig(f"../figures/triad_analysis_by_committee_{alpha}.pdf", dpi = 300, bbox_inches = "tight")
    # Render this threshold's figure now and release it before the next iteration.
    plt.show()

    

In [None]:
# Brokerage-role shares on the full (unfiltered) graph, grouped by committee membership.
melted = get_brokerage(G, committee_partition)

sns.set_style('white', rc={'xtick.bottom': True, 'ytick.left': True})

# NOTE(review): the returned palette is discarded, so this call does not affect the plot below.
sns.color_palette("Set1")

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex='false')

# Short, thin ticks (major and minor, both axes) plus the font sizes for a small panel.
tick_rc = {
    f"{axis}tick.{level}.{prop}": value
    for axis in ("x", "y")
    for level in ("major", "minor")
    for prop, value in (("size", 2), ("width", 0.5))
}
matplotlib.rcParams.update({
    'axes.unicode_minus': False,
    **tick_rc,
    "axes.labelsize": 10,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7,
    "legend.fontsize": 7,
    "font.size": 7,
})

figure, axes = plt.subplots(nrows = 1, ncols = 1, figsize=(2,2), dpi = 300)

# Black outlines for every box element.
PROPS = {
    'boxprops': {'edgecolor': 'k'},
    'medianprops': {'color': 'k'},
    'whiskerprops': {'color': 'k'},
    'capprops': {'color': 'k'},
    'flierprops': {'markersize': 2, 'markeredgewidth': 0.5},
}

# Rows are ordered by committee (kitchen, steering, rest of hapmap), then by role name.
sort_rank = {"kitchen": 0, "steering": 1, "rest of hapmap": 2, "consultant":3,
             "coordinator": 4, "gatekeeper": 5, "liaison": 6, "representative": 7}
ordered = melted.sort_values(['committee', 'variable'], key = lambda column: column.map(sort_rank))

# Palette colours are assigned to the hue levels in the order they are encountered
# (presumably kitchen, steering, rest of hapmap when all three are present).
shared_kwargs = dict(x = 'variable', y = 'value', hue = 'committee', ax = axes,
                     palette = ["#41521F", "#e27982","#00FFFF"  ])
sns.stripplot(ordered, size = 3, dodge = True, alpha = 0.5, **shared_kwargs)
sns.boxplot(ordered, linewidth =0.5, showfliers=False, **shared_kwargs, **PROPS)

axes.set_xlabel("")
axes.set_ylabel("Brokerage roles", color = "k")

# No legend on the panel; every line artist is forced to black.
axes.legend().remove()
for artist in axes.get_lines():
    artist.set_color('k')

axes.tick_params(axis='x', colors='black')
axes.set_xticklabels(axes.get_xticklabels(), rotation=45, ha='right')
axes.yaxis.label.set_color('black')
axes.xaxis.label.set_color('black')
axes.tick_params(axis='y', colors='black')
for side in ('bottom', 'left'):
    axes.spines[side].set_linewidth(0.5)

sns.despine()
plt.savefig("../figures/triad_analysis_by_committee.pdf", dpi = 300, bbox_inches = "tight")
# plt.show()

In [None]:
# For every brokerage role, compare the role shares between committees with
# mannwhitneyu: first kitchen vs steering, then steering vs rest of hapmap.
# One result is printed per comparison, role by role.
for role in ("consultant", "coordinator", "gatekeeper", "liaison", "representative"):
    role_rows = melted.loc[melted.variable == role]
    for first_group, second_group in (("kitchen", "steering"), ("steering", "rest of hapmap")):
        first_values = role_rows.loc[role_rows.committee == first_group, 'value']
        second_values = role_rows.loc[role_rows.committee == second_group, 'value']
        print(mannwhitneyu(first_values, second_values))

# do the same for HGP

In [None]:
# Tag every row with its directed (From, To) pair.
network_df['tuple'] = network_df.apply(lambda row: (row['From'], row['To']), axis = 1)

# Edge weight = number of rows flagged `hgp` with a non-null `conversation` for that pair.
pair_counts = network_df.loc[network_df.hgp].groupby('tuple').count().reset_index()
temp = pair_counts[["tuple", "conversation"]]
temp.columns = ["tuple", "weight"]
temp = temp.loc[temp.weight >0].assign(
    To=lambda frame: frame['tuple'].apply(lambda pair: pair[1]),
    From=lambda frame: frame['tuple'].apply(lambda pair: pair[0]),
)

# Weighted directed graph From -> To, with self-loops removed.
G = nx.from_pandas_edgelist(
    temp,
    source = "From",
    target = 'To',
    edge_attr = ["weight"],
    create_using = nx.DiGraph(),
)
self_loops = list(nx.selfloop_edges(G))
G.remove_edges_from(self_loops)


# Convert the networkx graph to a graph-tool graph for block-model inference.
network = nx2gt(G)


node_list = pd.read_csv('../models/email_clean_manual/nodes_all_240417.csv')

# networkx node -> graph-tool vertex, looked up through the 'id' vertex property.
ID_to_gtID = {}

for node in G.nodes():
    ID_to_gtID[node] = graph_tool.util.find_vertex(network, network.vertex_properties['id'], node)[0]

# node ID -> category; used further down to colour vertices by sector.
# (The colour property maps are created right before each draw call, so none are built here.)
category_dict = node_list.set_index("ID")['category'].to_dict()

# Fit a degree-corrected nested block model that treats the edge weight as a
# 'real-exponential' covariate, then refine the fit with mcmc_anneal.
state_hgp  = minimize_nested_blockmodel_dl(network, state_args=dict(deg_corr=True, recs =[network.ep.weight], rec_types = ['real-exponential']))
mcmc_anneal(state_hgp, beta_range=(1, 10), niter=1000, mcmc_equilibrate_args=dict(force_niter=10))

# All three figures are drawn from `state_hgp`, the state fitted on this `network`:
# the vertex property maps below belong to `network`, so they must be paired with
# a state that was built on the same graph.

# --- Figure 1: vertices coloured by committee membership -----------------------
v_prop = network.new_vertex_property("vector<double>")
v_prop_edge = network.new_vertex_property("vector<double>")
for i in ID_to_gtID:
    if int(i) in kitchen:
        v_prop[ID_to_gtID[i]] = [65/255, 82/255, 31/255, 1]
        v_prop_edge[ID_to_gtID[i]] = [65/255, 82/255, 31/255, 1]
    elif int(i) in steering:
        v_prop[ID_to_gtID[i]] = [226/255, 121/255, 130/255, 1]
        v_prop_edge[ID_to_gtID[i]] = [226/255, 121/255, 130/255, 0.5]
    else:
        v_prop[ID_to_gtID[i]] = [0, 1, 1, 1]
        v_prop_edge[ID_to_gtID[i]] = [0, 1, 1, 0.5]

state_hgp.draw(vertex_fill_color=v_prop, vertex_color=v_prop_edge, beta = 0.8, hedge_pen_width= 2,
hvertex_fill_color= np.array([0., 0., 0., .5]),
hedge_color= np.array([0., 0., 0., .5]),
hedge_marker_size= 15,
hvertex_size=15 , output = "../models/sbm/hgp_communities_by_committee.pdf")


# --- Figure 2: vertices coloured by how often they appear as sender or recipient ---
counter = Counter(np.vstack([network_df.loc[network_df.hgp]['To'], network_df.loc[network_df.hgp]['From']]).flatten())

v_prop = network.new_vertex_property("vector<double>")
v_prop_edge = network.new_vertex_property("vector<double>")

ranked_counts = counter.most_common()
min_val, max_val = ranked_counts[-1][-1], ranked_counts[0][-1]
print(min_val, max_val)
# use the coolwarm colormap that is built-in, and goes from blue to red
cmap = mpl.cm.coolwarm
# Rescales a raw count from [min_val, max_val] onto the [0, 1] range the colormap expects.
norm = mpl.colors.Normalize(vmin=min_val, vmax=max_val)

# Convert each node's count to an RGBA colour.  The count has to pass through `norm`:
# a bare integer handed to a colormap is interpreted as a lookup-table index.
# Every node gets the same treatment regardless of committee membership.
for i in ID_to_gtID:
    rgba = cmap(norm(counter[i]))
    v_prop[ID_to_gtID[i]] = rgba
    v_prop_edge[ID_to_gtID[i]] = rgba

# Text labels for committee members.
# NOTE(review): v_prop_label is built but not passed to any draw call below - confirm
# whether the figure was meant to show these labels.
v_prop_label = network.new_vertex_property("string")
for i in kitchen:
    if str(i) in ID_to_gtID:
        v_prop_label[ID_to_gtID[str(i)]] = str(i)
for i in steering:
    if str(i) in ID_to_gtID:
        v_prop_label[ID_to_gtID[str(i)]] = str(i)
state_hgp.draw(vertex_fill_color=v_prop, vertex_color=v_prop_edge, beta = 0.8, hedge_pen_width= 2,
    hvertex_fill_color= np.array([0., 0., 0., .5]),
    hedge_color= np.array([0., 0., 0., .5]),
    hedge_marker_size= 15,
    hvertex_size=15, output = "../models/sbm/hgp_communities_by_count.pdf")



# --- Figure 3: vertices coloured by sector --------------------------------------
#   "nih" -> blue, "academic" / "private-nonprofit" -> red,
#   any other category, or a node missing from the node list -> green.
v_prop = network.new_vertex_property("vector<double>")
v_prop_edge = network.new_vertex_property("vector<double>")
for i in ID_to_gtID:
    if int(i) in category_dict:
        if category_dict[int(i)] == "nih":
            v_prop[ID_to_gtID[i]] = [0, 0, 1, 1]
            v_prop_edge[ID_to_gtID[i]] = [0, 0, 1, 1]
        elif category_dict[int(i)] == "private-nonprofit" or  category_dict[int(i)] == "academic":
            v_prop[ID_to_gtID[i]] = [1, 0, 0, 1]
            v_prop_edge[ID_to_gtID[i]] = [1, 0, 0, 1]
        else:
            v_prop[ID_to_gtID[i]] = [0, 1, 0, 1]
            v_prop_edge[ID_to_gtID[i]] = [0, 1, 0, 1]
    else:
        v_prop[ID_to_gtID[i]] = [0, 1, 0, 1]
        v_prop_edge[ID_to_gtID[i]] = [0, 1, 0, 1]
state_hgp.draw(vertex_fill_color=v_prop, vertex_color=v_prop_edge, beta = 0.9, hedge_pen_width= 2,
    hvertex_fill_color= np.array([0., 0., 0., .5]),
    hedge_color= np.array([0., 0., 0., .5]),
    hedge_marker_size= 20,
    hvertex_size=20, output = "../models/sbm/hgp_communities_by_nih.pdf" )