# Create Network Graph

Basic setup

In [24]:
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from pyvis.network import Network


Loading previously calculated dataframes

In [25]:
# Name of the sub-folder under ./data_output that holds the saved dataframes.
dataframe_dir = 'OrfPathHealth'

# Both CSV files are pipe-separated, hence sep="|".
chunks_csv = f"./data_output/{dataframe_dir}/chunks.csv"
concepts_csv = f"./data_output/{dataframe_dir}/concepts.csv"

df = pd.read_csv(chunks_csv, sep="|")
df_concepts = pd.read_csv(concepts_csv, sep="|")

# Preview the chunks dataframe.
df.head()


Unnamed: 0,text,source,page,chunk_id
0,147\n147Mental Health as a \nPathway to Health...,data/MedicalDocuments/OrfPathHealth/MentalHeal...,0,83d4d0367bb0467e811782a4ada3bbb9
1,"sense, mental health encompasses promotive and...",data/MedicalDocuments/OrfPathHealth/MentalHeal...,0,80ab9a7c29b44098961576ccd53fb7f6
2,148Accelerating Global Health: Pathways to Hea...,data/MedicalDocuments/OrfPathHealth/MentalHeal...,1,1d5cc8cbbb074190b824e1fbc0d644aa
3,adversity is a recognised risk factor in condi...,data/MedicalDocuments/OrfPathHealth/MentalHeal...,1,1f02a81a02054efaacb0ace0c5eadd14
4,both in healthcare service delivery.\nThe chal...,data/MedicalDocuments/OrfPathHealth/MentalHeal...,1,33e4526e998e4865bd5b0dde036c2a20


## Calculate Graph Dataframe

Graph dataframe is a dataframe where every row is a connection between two nodes. 

It is basically an inner self-join of the concepts dataframe on `chunk_id`: every pair of concepts that occur in the same chunk becomes one row.

In [26]:
# Self-join the concepts on chunk_id: every ordered pair of concepts found in
# the same chunk becomes one row, i.e. one candidate edge (an entity paired with
# itself included). Columns of the two sides are told apart by the _L / _R suffixes.
dfne_join = pd.merge(df_concepts, df_concepts, how='inner', on='chunk_id', suffixes=('_L', '_R'))
# Each row is a pair of nodes (an edge), not a node, so label the count accordingly.
print("Total number of edges = ", dfne_join.shape[0])

Total number of nodes =  16370


#### Clean the graph dataframe

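Clean the graph dataframe to make it smaller for visualisation purposes.

- Remove self loops (rows where both entities are the same)
- Remove the less important edges, i.e. rows where both entities have an importance below 5 (for performance)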

In [27]:
# An edge is dropped as "less important" only when BOTH endpoints are below this value.
IMPORTANCE_THRESHOLD = 5

# Rows that pair an entity with itself.
df_join_selfloops = dfne_join[dfne_join['entity_L'] == dfne_join['entity_R']]
# Rows where neither endpoint reaches the importance threshold.
df_low_importance = dfne_join[
    (dfne_join['importance_L'] < IMPORTANCE_THRESHOLD)
    & (dfne_join['importance_R'] < IMPORTANCE_THRESHOLD)
]
# A row can belong to both sets, so combine the row labels with a union
# instead of adding the two counts.
drops = df_join_selfloops.index.union(df_low_importance.index)
print("Self Loops", df_join_selfloops.index.shape[0])
print("less important edges", df_low_importance.index.shape[0])
print("Total Removable Edges = ", drops.shape[0])

## Remove these rows from the graph dataframe
# reset_index() keeps the original row label of the join in an 'index' column.
df_graph = dfne_join.drop(index=drops).reset_index()
# Edge weight = mean importance of the two endpoints. Plain column arithmetic
# gives the same values as a row-wise apply, without the per-row Python call,
# and stays well-defined when no rows are left.
df_graph['weight'] = (df_graph['importance_L'] + df_graph['importance_R']) / 2
print("Final Number of Edges  = ", df_graph.shape[0])
df_graph.head()

Self Loops 1536
less important edges 13577
Total Removable Edges =  13748
Final Number of Edges  =  2622


Unnamed: 0,index,entity_L,importance_L,category_L,chunk_id,type_L,entity_R,importance_R,category_R,type_R,weight
0,1,Mental Health,5,concept,83d4d0367bb0467e811782a4ada3bbb9,concept,Health Equity,4,concept,concept,4.5
1,2,Mental Health,5,concept,83d4d0367bb0467e811782a4ada3bbb9,concept,World Health Organization (WHO),3,organisation,concept,4.0
2,3,Mental Health,5,concept,83d4d0367bb0467e811782a4ada3bbb9,concept,United Nations (UN),3,organisation,concept,4.0
3,4,Mental Health,5,concept,83d4d0367bb0467e811782a4ada3bbb9,concept,Sustainable Development Goals (SDGs),4,document,concept,4.5
4,5,Health Equity,4,concept,83d4d0367bb0467e811782a4ada3bbb9,concept,Mental Health,5,concept,concept,4.5


## Creating a NetworkX Graph

Calculate nodes

Here I am grouping the graph dataframe by left node and calculating the mean importance. This way we will end up with only the unique nodes from the graph dataframe along with their weights. 

In [36]:
# One row per distinct left-hand entity, carrying the mean of the importance
# values that entity has across its rows in df_graph.
nodes = df_graph.groupby('entity_L')['importance_L'].mean().reset_index()
nodes.head()

Unnamed: 0,entity_L,importance_L
0,Member states,4.0
1,10 million deaths,3.0
2,100 Days Mission,3.0
3,"150,000 Health and Wellness Centres (HWC)",3.0
4,191 countries,2.0


Build a NetworkX object with nodes and edges

In [37]:
# Undirected graph used for community detection.
G = nx.Graph()

# Node keys are str()-coerced exactly like the edge endpoints below, so a
# non-string entity cannot end up in the graph under two different keys.
G.add_nodes_from(map(str, nodes['entity_L']))

# Add all edges in one call. The graph is undirected and holds one edge per
# pair, so when a pair occurs in several rows the weight of the last row wins.
G.add_weighted_edges_from(
    zip(
        map(str, df_graph['entity_L']),
        map(str, df_graph['entity_R']),
        df_graph['weight'],
    )
)

### Community Detection

Detect communities using the Girvan Newman algorithm 

In [38]:
# girvan_newman returns an iterator; each next() call produces one partition of
# the nodes into communities.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)

# Work with the second partition: turn every community into a sorted list of
# node names, then sort the list of communities itself.
communities = [sorted(members) for members in next_level_communities]
communities.sort()



Add colors to nodes based on community

In [74]:

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Assign one palette colour to each community and list it per node.

    Args:
        communities: A sized sequence of communities, each one an iterable of
            node names.

    Returns:
        A DataFrame with one row per node and the columns ``entity_L`` (node
        name) and ``color`` (hex string shared by all nodes of one community).
        Both columns are present even when there are no communities, so the
        result can always be merged on ``entity_L``.
    """
    columns = ['entity_L', 'color']
    if len(communities) == 0:
        return pd.DataFrame(columns=columns)

    ## Define a color palette
    p = sns.color_palette("hls", len(communities)).as_hex()
    # Colours are handed out from the end of the palette: the first community
    # receives the last colour, the second the one before it, and so on.
    rows = [
        {'entity_L': node, 'color': color}
        for community, color in zip(communities, reversed(p))
        for node in community
    ]
    return pd.DataFrame(rows, columns=columns)

# Colour of every node, derived from the community it belongs to.
colors = colors2Community(communities)

# Left join keeps every row of `nodes`, adding its colour where one was assigned.
df_nodes_colors = nodes.merge(colors, how='left', on='entity_L', suffixes=('_N', '_C'))
df_nodes_colors.head()

Unnamed: 0,entity_L,importance_L,color
0,Member states,4.0,#db5769
1,10 million deaths,3.0,#db5784
2,100 Days Mission,3.0,#db5784
3,"150,000 Health and Wellness Centres (HWC)",3.0,#db5784
4,191 countries,2.0,#db579e


So now we have a nodes dataframe with colors and sizes of each node. 

Let's recreate our graph, this time attaching these attributes to the nodes.

In [75]:
# Rebuild the graph, this time storing display attributes on every node.
G = nx.Graph()

for entity, importance, color in zip(
    df_nodes_colors['entity_L'],
    df_nodes_colors['importance_L'],
    df_nodes_colors['color'],
):
    # Use the same str() coercion as the edge endpoints below; otherwise a
    # non-string entity would appear twice, once with and once without attributes.
    name = str(entity)
    # Node size is the mean importance scaled by 8; the title repeats the name.
    G.add_node(name, size=importance * 8, title=name, color=color)

# Add all edges in one call. The graph is undirected and holds one edge per
# pair, so when a pair occurs in several rows the weight of the last row wins.
G.add_weighted_edges_from(
    zip(
        map(str, df_graph['entity_L']),
        map(str, df_graph['entity_R']),
        df_graph['weight'],
    )
)

## Visualisation

In [76]:
# Despite its name, this is the path of the HTML file that gets written.
graph_output_directory = './graph/nodes.html'
# Create the target folder up front so writing the file does not fail on a
# fresh checkout where ./graph does not exist yet.
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    cdn_resources="remote",
    bgcolor="#111212",
    height="900px",
    width="100%",
    select_menu=True,
    font_color='#dbdbdb',
    # filter_menu=True,
)
# Copy the nodes and edges, with their attributes, from the NetworkX graph.
net.from_nx(G)
net.repulsion(node_distance=150, spring_length=400)
# Alternative layout solver, kept for experimentation:
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
# Show only the physics controls in the generated page.
net.show_buttons(filter_='physics')
net.show(graph_output_directory, notebook=False)


./graph/nodes.html
