In [1]:
import os

import pandas as pd
import seaborn as sns
from graphdatascience import GraphDataScience
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

This notebook was run with a free Neo4j sandbox. To run it yourself, log in to https://sandbox.neo4j.com and launch the "Data Science" project. Then, copy the connection information from your sandbox into the cell below.

In [2]:
# Connection details are read from the environment so that real credentials
# never have to be saved in the notebook. The fallbacks are the throwaway
# sandbox values this notebook was originally run against; set NEO4J_URI,
# NEO4J_USER and NEO4J_PASSWORD (or edit the fallbacks) for your own sandbox.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://54.166.68.113:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "morals-graphs-probes")

gds = GraphDataScience(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
gds.set_database("neo4j")

# Discover communities

*This section of the notebook was run in advance of the GraphConnect presentation.*

Project a graph of airports and HAS_ROUTE relationships.

In [3]:
# Project every Airport node; HAS_ROUTE is loaded as UNDIRECTED.
route_projection = {"HAS_ROUTE": {"orientation": "UNDIRECTED"}}
G_air, results = gds.graph.project("air-routes", "Airport", route_projection)

In [4]:
results

nodeProjection            {'Airport': {'label': 'Airport', 'properties':...
relationshipProjection    {'HAS_ROUTE': {'orientation': 'UNDIRECTED', 'a...
graphName                                                        air-routes
nodeCount                                                              3503
relationshipCount                                                     92778
projectMillis                                                            27
Name: 0, dtype: object

Check the count of weakly connected components. Weakly connected components are communities within the graph in which all nodes in the same component are connected to each other by paths.

In [5]:
# Keep the full stats row, then display only the component-size distribution.
wcc_summary = gds.wcc.stats(G_air)
wcc_summary['componentDistribution']

{'p99': 1,
 'min': 1,
 'max': 3292,
 'mean': 16.52358490566038,
 'p90': 1,
 'p50': 1,
 'p999': 3292,
 'p95': 1,
 'p75': 1}

It looks like there are several components of size 1. They aren't very interesting for community detection. Let's make a subgraph that just has the giant connected component.

In [6]:
# Run WCC in mutate mode: each node of the G_air projection gets its component
# id under the "wccComponent" property (written to the database in a later
# cell via writeNodeProperties).
gds.wcc.mutate(G_air, mutateProperty = "wccComponent")

mutateMillis                                                             0
nodePropertiesWritten                                                 3503
componentCount                                                         212
componentDistribution    {'p99': 1, 'min': 1, 'max': 3292, 'mean': 16.5...
postProcessingMillis                                                     6
preProcessingMillis                                                      0
computeMillis                                                            3
configuration            {'seedProperty': None, 'consecutiveIds': False...
Name: 0, dtype: object

In [7]:
# Pull the per-node "wccComponent" values out of the projection into a
# dataframe; the next cell counts nodes per 'propertyValue'.
component_df = gds.graph.streamNodeProperty(G_air, "wccComponent")

Find the component that has the most nodes.

In [8]:
# Count nodes per component id, then list the components largest-first.
component_sizes = component_df.value_counts('propertyValue')
component_sizes.sort_values(ascending=False)

propertyValue
0       3292
3013       1
2993       1
472        1
941        1
        ... 
3479       1
3480       1
3481       1
3483       1
3490       1
Length: 212, dtype: int64

Record the component ids in the persistent graph on disk.

In [9]:
# Persist the "wccComponent" property from the projection back to the
# database; a later Cypher cell matches on Airport.wccComponent.
gds.graph.writeNodeProperties(G_air, ["wccComponent"])

writeMillis                      10
graphName                air-routes
nodeProperties       [wccComponent]
propertiesWritten              3503
Name: 0, dtype: object

Create a subgraph that has only the connected airports.

In [10]:
# Keep only nodes in component 0, the 3292-node component at the top of the
# component size listing; "*" keeps every relationship between retained nodes.
# The node filter is a plain string: there is nothing to interpolate, so no
# f-prefix is needed.
G_connected, result = gds.beta.graph.project.subgraph(
    "connected-airports",
    G_air,
    "n.wccComponent = 0",
    "*",
)

In [11]:
result

fromGraphName                 air-routes
nodeFilter            n.wccComponent = 0
relationshipFilter                     *
graphName             connected-airports
nodeCount                           3292
relationshipCount                  92778
projectMillis                         57
Name: 0, dtype: object

Create communities with the Louvain algorithm.

In [12]:
# Run Louvain on the connected subgraph and store each node's community id
# under "louvainCommunity" (write mode).
gds.louvain.write(G_connected, writeProperty = "louvainCommunity")

writeMillis                                                            264
nodePropertiesWritten                                                 3292
modularity                                                        0.595042
modularities             [0.572640785441552, 0.5945237672433423, 0.5950...
ranLevels                                                                3
communityCount                                                          22
communityDistribution    {'p99': 799, 'min': 2, 'max': 799, 'mean': 149...
postProcessingMillis                                                     3
preProcessingMillis                                                      0
computeMillis                                                         1934
configuration            {'maxIterations': 10, 'writeConcurrency': 4, '...
Name: 0, dtype: object

Create communities with the label propagation algorithm.

In [13]:
# Run label propagation on the connected subgraph and store each node's
# community id under "labelPropCommunity" (write mode).
gds.labelPropagation.write(G_connected, writeProperty = "labelPropCommunity")

writeMillis                                                             10
nodePropertiesWritten                                                 3292
ranIterations                                                            3
didConverge                                                           True
communityCount                                                           2
communityDistribution    {'p99': 3193, 'min': 99, 'max': 3193, 'mean': ...
postProcessingMillis                                                     4
preProcessingMillis                                                      0
computeMillis                                                           21
configuration            {'maxIterations': 10, 'writeConcurrency': 4, '...
Name: 0, dtype: object

K-Means isn't available in the Neo4j sandboxes yet, so let's stream embeddings and calculate K-Means in Python.

In [14]:
# randomSeed pins FastRP's random projection so that the embeddings -- and the
# K-Means clusters computed from them below -- are reproducible across runs.
embedding_df = gds.fastRP.stream(G_connected, embeddingDimension=64, randomSeed=42)

We're using the K-Means implementation from scikit-learn here.

In [15]:
# random_state makes the cluster assignments repeatable on every
# Restart & Run All; n_init is pinned explicitly because its default differs
# between scikit-learn releases.
kmeans = KMeans(n_clusters = 6, n_init = 10, random_state = 42)

In [16]:
# Fit K-Means on the embedding vectors. The cluster labels used later are read
# from kmeans.labels_, not from this return value.
embedding_matrix = list(embedding_df['embedding'])
kmeans_clusters = kmeans.fit_transform(embedding_matrix)

In [17]:
# Attach the fitted model's label for each embedding row as a 'cluster' column.
embedding_df['cluster'] = kmeans.labels_

How many nodes ended up in each cluster?

In [18]:
# Number of nodes assigned to each K-Means cluster.
embedding_df['cluster'].value_counts()

1    762
3    749
0    717
2    506
4    366
5    192
Name: cluster, dtype: int64

What is the silhouette score for the k-means clusters?

In [19]:
# Score the clustering using the same embedding vectors it was fitted on.
embedding_vectors = list(embedding_df['embedding'])
cluster_labels = embedding_df['cluster']
silhouette_score(embedding_vectors, cluster_labels)

0.2521656786202897

Now we'll write the K-Means clusters back to Neo4j. To do that, turn the dataframe into a list of dictionaries.

In [20]:
# One {'nodeId': ..., 'cluster': ...} dict per node, ready to pass as a Cypher
# parameter.
node_cluster_columns = ['nodeId', 'cluster']
kmeans_dict = embedding_df[node_cluster_columns].to_dict(orient="records")

Pass the list of dictionaries as a parameter to a Cypher query.

In [21]:
# For each {nodeId, cluster} record, look the node up by its internal id and
# store the cluster as the kMeansCommunity property.
write_clusters_query = """
UNWIND $kmeansAssigments as row
MATCH (n) where id(n) = row['nodeId']
SET n.kMeansCommunity = row['cluster']
"""
write_clusters_params = {"kmeansAssigments": kmeans_dict}
gds.run_cypher(write_clusters_query, write_clusters_params)

Drop the old graph projections.

In [22]:
# Drop both in-memory projections. The name "air-routes" is projected again
# below with the community properties attached. The cell displays the result
# of the last drop (connected-airports).
gds.graph.drop(G_air)
gds.graph.drop(G_connected)

graphName                                           connected-airports
database                                                         neo4j
memoryUsage                                                           
sizeInBytes                                                         -1
nodeCount                                                         3292
relationshipCount                                                92778
configuration        {'relationshipProjection': {'HAS_ROUTE': {'ori...
density                                                       0.008564
creationTime                       2022-06-08T16:20:55.460720000+00:00
modificationTime                   2022-06-08T16:20:55.519346000+00:00
schema               {'relationships': {'HAS_ROUTE': {}}, 'nodes': ...
Name: 0, dtype: object

Create a new graph projection that contains only connected airports and includes all three of our community ids as node properties.

Start by adding a new ConnectedAirport label to airports in component 0.

In [23]:
# Tag every airport in WCC component 0 with an extra ConnectedAirport label and
# report how many nodes were tagged.
label_connected_query = """MATCH (a:Airport {wccComponent:0}) SET a:ConnectedAirport RETURN count(a) as airportCount"""
gds.run_cypher(label_connected_query)

Unnamed: 0,airportCount
0,3292


Create the new projection.

In [24]:
# Re-project "air-routes", this time restricted to ConnectedAirport nodes and
# carrying the three community ids as node properties.
connected_airport_projection = {
    "ConnectedAirport": {
        "properties": ["louvainCommunity", "labelPropCommunity", "kMeansCommunity"],
    },
}
undirected_route_projection = {"HAS_ROUTE": {"orientation": "UNDIRECTED"}}
G_air, results = gds.graph.project("air-routes",
                                   connected_airport_projection,
                                   undirected_route_projection)

Create a dataframe with the node counts for each community.

In [25]:
# Stream all three community properties, count nodes per (property, value)
# pair, and give the columns analysis-friendly names.
community_df = (
    gds.graph.streamNodeProperties(G_air, ["louvainCommunity", "labelPropCommunity", "kMeansCommunity"])
    .groupby(['nodeProperty', 'propertyValue'])
    .count()
    .reset_index()
    .rename(columns={"nodeProperty": "community_property",
                     "propertyValue": "community",
                     "nodeId": "node_count"})
)

Write a function that creates a subgraph for a single community of a given community property.

In [26]:
def create_subgraph(community_property, community_id):
    """Project one community of ``G_air`` as its own in-memory subgraph.

    The new graph is named ``"<community_property>_<community_id>"`` and keeps
    only the nodes whose ``community_property`` equals ``community_id``; the
    relationship filter ``"*"`` is passed through as-is.

    Returns the graph object produced by ``gds.beta.graph.project.subgraph``;
    the accompanying result row is discarded.
    """
    community_graph, _ = gds.beta.graph.project.subgraph(
        f"{community_property}_{community_id}",
        G_air,
        f"n.{community_property} = {community_id}",
        "*",
    )
    return community_graph

Run the function for each row in the community_df data frame.

In [27]:
# Build one subgraph per (community_property, community) row, in row order.
community_df['subgraph'] = [
    create_subgraph(property_name, community_value)
    for property_name, community_value in zip(community_df['community_property'],
                                              community_df['community'])
]

# Setup complete
Before the session, I created communities from a graph of airports and air routes. The dataframe below contains the community property, community ID, and a node count for each community in the graph. It also contains a column which holds a python graph object for each community.

In [28]:
community_df

Unnamed: 0,community_property,community,node_count,subgraph
0,kMeansCommunity,0,717,<graphdatascience.graph.graph_object.Graph obj...
1,kMeansCommunity,1,762,<graphdatascience.graph.graph_object.Graph obj...
2,kMeansCommunity,2,506,<graphdatascience.graph.graph_object.Graph obj...
3,kMeansCommunity,3,749,<graphdatascience.graph.graph_object.Graph obj...
4,kMeansCommunity,4,366,<graphdatascience.graph.graph_object.Graph obj...
5,kMeansCommunity,5,192,<graphdatascience.graph.graph_object.Graph obj...
6,labelPropCommunity,8629,3193,<graphdatascience.graph.graph_object.Graph obj...
7,labelPropCommunity,8634,99,<graphdatascience.graph.graph_object.Graph obj...
8,louvainCommunity,65,271,<graphdatascience.graph.graph_object.Graph obj...
9,louvainCommunity,117,478,<graphdatascience.graph.graph_object.Graph obj...


# Conductance
What percentage of relationships that start in one cluster end in a different cluster?

Call the conductance algorithm for each community property in G_air and collect the results.

In [None]:
# Node property names that hold the community ids; the conductance and
# continent-summary cells below loop over this list.
community_properties = ['louvainCommunity', 'labelPropCommunity', 'kMeansCommunity']

In [None]:
# One conductance frame per community property, each tagged with the property
# it was computed for, then stacked into a single frame.
conductance_dfs = [
    gds.alpha.conductance.stream(G_air, communityProperty=prop)
    .assign(community_property=prop)
    for prop in community_properties
]
conductance_df = pd.concat(conductance_dfs)

In [None]:
conductance_df

In [None]:
# Attach the conductance columns to the matching (property, community) rows.
community_key = ['community_property', 'community']
community_df = pd.merge(community_df, conductance_df, on=community_key)

Plot the conductance versus the node count for each community. It seems that the largest communities are big enough to be mostly self-contained, while the smaller communities have a range of conductance scores.

In [None]:
# Conductance against community size, coloured by detection method.
sns.scatterplot(
    data=community_df,
    x="node_count",
    y="conductance",
    hue="community_property",
)

# Modularity
What is the difference between the fraction of within-cluster relationships that we observe versus what we would see if nodes kept the same degree, but the relationships were connected randomly? We can calculate this for the Louvain communities, but there's not a quick way to calculate it for the others.

In [None]:
# NOTE(review): this runs Louvain again on G_air instead of scoring the
# stored louvainCommunity property, so the modularity it reports may differ
# from the earlier louvain.write run -- confirm that is acceptable.
gds.louvain.stats(G_air)

# Clustering Coefficient
What is the probability that two neighbors of a node also have a relationship between them?

Call the localClusteringCoefficient algorithm for each subgraph in the community_df data frame.

In [None]:
# Run localClusteringCoefficient in stats mode on every community subgraph and
# keep only the average coefficient for each one.
clustering_stats = [
    gds.localClusteringCoefficient.stats(community_graph)
    for community_graph in community_df['subgraph']
]
community_df['average_clustering_coefficient'] = pd.Series(
    [stats['averageClusteringCoefficient'] for stats in clustering_stats],
    index=community_df.index,
)

In [None]:
community_df

Plot the clustering coefficient versus cluster size. In general, it seems that larger clusters tend to also have a higher clustering coefficient.

In [None]:
# Average clustering coefficient against community size, coloured by method.
sns.scatterplot(
    data=community_df,
    x="node_count",
    y="average_clustering_coefficient",
    hue="community_property",
)

# What is a good example of a member of each cluster?

Write a function that runs degree centrality on a subgraph and returns the descriptions of the three highest-degree airports, along with the average degree centrality for the cluster.

In [None]:
def get_central_airports(row):
    """Summarise degree centrality for the subgraph stored in ``row['subgraph']``.

    Streams ``gds.degree`` over the named subgraph, orders airports by score
    (highest first) and returns a two-element list: the ``descr`` values of the
    first three airports, and the average score over the whole subgraph.
    """
    degree_query = """
    call gds.degree.stream($subgraphName) yield nodeId, score
    WITH gds.util.asNode(nodeId) as a, score
    ORDER BY score desc
    RETURN collect(a.descr)[..3] as airportDescription, avg(score) as averageDegree
    """
    query_params = {"subgraphName": row['subgraph'].name()}
    summary_df = gds.run_cypher(degree_query, query_params)
    # The query aggregates everything, so only the first row is read.
    first_row = summary_df.iloc[0]
    return first_row.tolist()

In [None]:
# Call get_central_airports once per community row; result_type='expand'
# spreads the returned two-element list into two columns.
centrality_df = community_df.apply(get_central_airports, axis=1, result_type='expand')

In [None]:
# Name the two expanded columns in the order get_central_airports returns them.
centrality_df.columns = ['central_airports', 'average_degree']

In [None]:
# Append the centrality columns side by side with the community rows.
# NOTE(review): community_df is rebuilt from itself here, so re-running this
# cell would presumably append the two columns a second time -- run it once.
community_df = pd.concat([community_df, centrality_df], axis=1)

In [None]:
community_df

## Summarize properties of the clusters

In [None]:
def get_continent_percent(community_property):
    """Return, per community, the percentage of its airports on each continent.

    Queries ConnectedAirport nodes and their ON_CONTINENT relationships,
    grouping by the node property named ``community_property``. The result is
    one row per community with one column per continent (missing combinations
    filled with 0), plus a ``community_property`` column holding the property
    name that was passed in.
    """
    continent_query = f"""MATCH (a:ConnectedAirport)-[:ON_CONTINENT]->(c)
    WITH a.{community_property} as community, c.name as continent, count(*) as airportCount
    WITH community, collect(continent) as continents, collect(airportCount) as counts, 
    sum(airportCount) as totalAirports
    UNWIND range(0, size(continents) - 1) as i
    RETURN community,  continents[i] as continent, 
    round(100.0 * counts[i]/totalAirports, 1) as percent_on_continent"""
    long_df = gds.run_cypher(continent_query)
    # Reshape from one row per (community, continent) to one row per community.
    wide_df = (
        long_df
        .pivot(index="community", columns="continent", values="percent_on_continent")
        .reset_index()
        .fillna(0)
    )
    wide_df['community_property'] = community_property
    return wide_df

In [None]:
# One continent-percentage frame per community property, stacked together.
continent_dfs = list(map(get_continent_percent, community_properties))
continent_df = pd.concat(continent_dfs)

In [None]:
# Attach the continent percentages to the matching (property, community) rows.
continent_key = ['community_property', 'community']
community_df = pd.merge(community_df, continent_df, on=continent_key)

In [None]:
community_df

## Clean up in-memory graphs

In [None]:
# Drop every per-community subgraph still referenced by community_df.
for community_graph in community_df['subgraph'].tolist():
    gds.graph.drop(community_graph)

In [None]:
# The graph objects were dropped above, so remove the column that held them.
community_df = community_df.drop(columns='subgraph')

In [None]:
# Drop the "air-routes" projection that the community subgraphs were built from.
gds.graph.drop(G_air)

In [None]:
# List the in-memory graphs that remain, to check the clean-up above.
gds.graph.list()