# Make a graph network of communities connected through records


In [None]:
# To produce a visual output, you will need to install pyvis and networkx
!pip install networkx
!pip install pyvis
!pip install tqdm

In [None]:
import requests
import pandas as pd
from pathlib import Path
import base64
import numpy as np

from eossr.api.zenodo import get_community, search_records

In [None]:
def encode_image(url, timeout=20):
    """
    Download an image and return it encoded as a base64 data URI.

    Necessary for Zenodo as the url for logos is a download content only,
    so the picture has to be embedded instead of linked.

    Parameters
    ----------
    url : str
        Address of the image to download.
    timeout : float
        Number of seconds to wait for the server before giving up.

    Returns
    -------
    str
        Data URI (``data:<mime>;base64,<content>``) embedding the image.

    Raises
    ------
    FileNotFoundError
        If the server does not answer with HTTP status 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise FileNotFoundError("Failed to fetch the image")

    # Trust the advertised MIME type only when it is an image type; any other
    # value (e.g. a generic download content type) keeps the former
    # "image/png" default.
    content_type = response.headers.get('Content-Type', '').split(';')[0].strip().lower()
    mime_type = content_type if content_type.startswith('image/') else 'image/png'

    # Encode the image content in base64 and create a Data URI
    encoded_image = base64.b64encode(response.content).decode('utf-8')
    return f"data:{mime_type};base64,{encoded_image}"
        
def extract_community_info(community_slug):
    """
    Extract info about a single community.
    Will be used to create nodes.

    Parameters
    ----------
    community_slug : str
        Slug of the community to look up.

    Returns
    -------
    pandas.DataFrame
        A single-row table with the columns ``id``, ``slug``,
        ``access_visibility``, ``title``, ``num_records`` and ``logo_link``.
        When the community cannot be retrieved, a placeholder row is returned
        ('unknown' fields, 0 records, empty logo) so that the graph can still
        contain the node.
    """
    try:
        community = get_community(community_slug)
        records = requests.get(community['links']['records'], timeout=20).json()
        # NOTE(review): only the first access-status bucket is counted here;
        # confirm that this is the intended number of records.
        num_records = records['aggregations']['access_status']['buckets'][0]['doc_count']

        # Download the logo only once; a logo that cannot be fetched simply
        # leaves the link empty instead of discarding the whole community.
        try:
            logo_link = encode_image(community['links']['logo'])
        except FileNotFoundError:
            logo_link = ''

        community_info = {
            'id': community['id'],
            'slug': community['slug'],
            'access_visibility': community['access']['visibility'],
            'title': community['metadata']['title'],
            'num_records': num_records,
            'logo_link': logo_link,
        }
    except Exception:
        # Best effort: any lookup failure yields a placeholder row.
        # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
        community_info = {
            'id': 'unknown',
            'slug': community_slug,
            'access_visibility': 'unknown',
            'title': 'unknown',
            'num_records': 0,
            'logo_link': '',
        }
    return pd.DataFrame([community_info])

In [None]:
def extract_connections_community(community_slug):
    """
    Extract connections data between communities based on shared records
    Will be used to create edges

    Parameters
    ----------
    community_slug : str
        Slug of the community whose records are inspected.

    Returns
    -------
    pandas.DataFrame
        One row per other community sharing at least one record, with the
        columns ``source`` (always `community_slug`), ``target`` and
        ``shared_records``. The three columns are present even when no
        record is shared, so callers can always select them.
    """
    connections = {}
    for record in search_records(communities=community_slug):
        for com in record.metadata['communities']:
            cid = com['id']
            if cid != community_slug:
                connections[cid] = connections.get(cid, 0) + 1

    rows = [
        {'source': community_slug, 'target': target, 'shared_records': count}
        for target, count in connections.items()
    ]
    # Explicit columns: an empty result would otherwise have no columns at all
    # and break any later `connections['source']` lookup.
    frame = pd.DataFrame(rows, columns=['source', 'target', 'shared_records'])
    return frame.astype({'shared_records': 'int64'})

In [None]:
from tqdm.auto import tqdm

def extract_graph_data(starting_community_slug,
                       communities_csv_filename,
                       connections_csv_filename,
                       number_of_iterations=2,
                      ):
    """
    Extract all data required to produce the graph and save them into two files.
    If the files already exist, they are loaded first and the existing data is used
    instead of requesting Zenodo.
    If you want to re-extract the data, provide a non-existing filename.

    Parameters
    ----------
    starting_community_slug : str
        Slug of the community the exploration starts from.
    communities_csv_filename : str or Path
        CSV file caching the communities (nodes).
    connections_csv_filename : str or Path
        CSV file caching the connections (edges).
    number_of_iterations : int
        Number of exploration rounds; each round follows the connections of
        the communities discovered during the previous one.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The communities table and the connections table.
    """
    if Path(communities_csv_filename).exists():
        communities = pd.read_csv(communities_csv_filename)
    else:
        communities = pd.DataFrame()

    if Path(connections_csv_filename).exists():
        connections = pd.read_csv(connections_csv_filename)
    else:
        connections = pd.DataFrame()

    # `x in series` tests the index labels, not the content, so membership
    # has to be checked against the values to detect an already cached slug.
    if ('slug' not in communities.columns
            or starting_community_slug not in communities['slug'].values):
        communities = pd.concat([communities,
                                 extract_community_info(starting_community_slug)],
                                ignore_index=True)

    nodes = [starting_community_slug]

    for i in range(number_of_iterations):
        new_nodes = []
        print(f'iteration {i} - extracting connections')
        for node in tqdm(nodes):
            if 'source' not in connections.columns or node not in connections['source'].values:
                new_connections = extract_connections_community(node)
                connections = pd.concat([connections, new_connections],
                                        ignore_index=True)
            # The table may still have no column at all (no cached file and no
            # shared record found so far); there is then nothing to follow.
            if 'source' in connections.columns:
                new_nodes.extend(
                    connections.loc[connections['source'] == node, 'target'].tolist()
                )

        print(f'iteration {i} - extracting communities data')
        for node in tqdm(nodes + new_nodes):
            if node not in communities['slug'].values:
                communities = pd.concat([communities, extract_community_info(node)],
                                        ignore_index=True)

        # The next round explores the newly discovered communities only.
        nodes = list(set(new_nodes))

        # Save after every round so an interrupted run keeps its progress.
        communities.to_csv(communities_csv_filename, index=False)
        connections.to_csv(connections_csv_filename, index=False)

    return communities, connections


In [None]:
# CSV files used to cache the extracted data between runs
communities_csv_filename = 'communities.csv'  # communities table (graph nodes)
connections_csv_filename = 'connections.csv'  # connections table (graph edges)

In [None]:
# Query Zenodo (starting from the 'escape2020' community) unless both cached
# CSV files are already available, in which case they are simply reloaded.
if not Path(communities_csv_filename).exists() or not Path(connections_csv_filename).exists():
    communities, connections = extract_graph_data(
        'escape2020',
        communities_csv_filename,
        connections_csv_filename,
        number_of_iterations=2,
    )
else:
    communities = pd.read_csv(communities_csv_filename)
    connections = pd.read_csv(connections_csv_filename)

In [None]:
# Display the communities table (nodes of the graph)
communities

In [None]:
# Display the connections table (edges of the graph)
connections

## Add clustering

Make clusters of communities using the Louvain algorithm.     
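Note: strictly speaking, clusters belong to communities (the nodes), not to edges. However, the nodes are drawn as logos, so a cluster color would not display well on them; the cluster of the source community is therefore attached to each edge and shown as the edge color.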

In [None]:
import networkx as nx
from networkx.algorithms.community import louvain_communities

import networkx as nx

# Build an undirected graph: one node per community, one weighted edge per
# connection (the weight is the number of shared records).
G = nx.Graph()

# iterrows() yields (index, row); only the row is needed, nodes are keyed by slug.
for _, community_row in communities.iterrows():
    G.add_node(community_row['slug'], **community_row.to_dict())

for _, connection in connections.iterrows():
    G.add_edge(connection['source'], connection['target'], weight=connection['shared_records'])

# Detect communities (clusters)
# The Louvain algorithm is randomized: a fixed seed keeps the cluster numbers
# (and therefore the edge colors) identical from one run to the next.
partition = louvain_communities(G, weight='weight', seed=42)

# Cluster number of every node of the graph
node_to_community = {
    node: cluster_id
    for cluster_id, members in enumerate(partition)
    for node in members
}

# Map the community number to each row in the DataFrame
connections['community'] = connections['source'].map(node_to_community)

In [None]:
# Display the connections table, now including the 'community' (cluster) column
connections

## Display graph

In [None]:
import matplotlib.pyplot as plt

# Colors of the default matplotlib cycle, reused cyclically for the clusters
color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Key the colors by the cluster ids actually present in the table: the ids are
# not guaranteed to be the consecutive numbers 0..num_clusters-1, and an id
# missing from the mapping would silently be drawn with the fallback color.
cluster_ids = sorted(int(cid) for cid in connections['community'].dropna().unique())
num_clusters = len(cluster_ids)
cluster_colors = {cid: color_cycle[cid % len(color_cycle)] for cid in cluster_ids}

In [None]:
from pyvis.network import Network
import os

# Create a Pyvis network
net = Network(notebook=True, cdn_resources='local')

# Picture displayed when the image of a node cannot be loaded
brokenImage = 'https://escape-ossr.gitlab.io/ossr-pages/img/Services_ESCAPE_OSSR-01.png'

# Add nodes with images
for community_idx, community in communities.iterrows():
    logo = community['logo_link']
    # A logo is usable only when it is a non-empty string: a missing logo is
    # '' right after extraction and NaN once the table is read back from CSV.
    has_logo = isinstance(logo, str) and logo != ''

    shape = 'image' if has_logo else 'ellipse'
    image = logo if has_logo else brokenImage
    # The starting community is heavier than the other nodes in the physics layout
    mass = 30 if community['slug'] == 'escape2020' else 12
    # The logo speaks for itself; communities without logo show their title
    label = ' ' if has_logo else community['title']

    net.add_node(community['slug'],
                 label=label,
                 title=community['slug'],
                 image=image,
                 brokenImage=brokenImage,
                 shape=shape,
                 physics=True,
                 mass=mass,
                 font={"color": 'white'},
                )

# Add edges, colored by the cluster of their source community
# ('grey' when the cluster has no assigned color)
for cname, connection in connections.iterrows():
    net.add_edge(connection['source'],
                 connection['target'],
                 value=connection['shared_records'],
                 color=cluster_colors.get(connection['community'], 'grey'),
                 smooth={'type': 'curved', 'forceDirection': 'none', 'roundness': 0.3}
                )


html_output_path = os.path.join(os.getcwd(), "community_network.html")

net.show(html_output_path)
