<a href="https://colab.research.google.com/github/sudhang/arangodb-cugraph-hackathon/blob/master/Top_Physics_Researchers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview

In [1]:
import locale

# Force the preferred encoding to UTF-8 for the rest of the session.
# NOTE(review): presumably the usual Colab workaround for shell magics failing
# under a non-UTF-8 locale - confirm it is still needed.
# The replacement keeps the optional `do_setlocale` parameter of the real
# function so that callers using `locale.getpreferredencoding(False)` still work.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

## Step 0: Package Installation & setup

In [2]:
# 1. Install nx-arangodb via pip
# Github: https://github.com/arangodb/nx-arangodb

!pip install nx-arangodb



In [3]:
# 2. Check if you have an NVIDIA GPU
# Note: If this returns "command not found", then GPU-based algorithms via cuGraph are unavailable
# `nvidia-smi` lists the driver and the attached GPU; `nvcc --version` prints the CUDA compiler version.

!nvidia-smi
!nvcc --version

Sun Mar  9 20:33:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
# 3. Install nx-cugraph via pip
# Note: Only enable this installation if the step above is working!

!pip install nx-cugraph-cu12 --extra-index-url https://pypi.nvidia.com # Requires CUDA-capable GPU

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com


In [5]:
# 4. Install LangChain & LangGraph

!pip install --upgrade langchain langchain-community langchain-openai langgraph langchain-google-genai



In [6]:
!pip install plotly



In [7]:
!pip install gradio



In [8]:
# 5. Import the required modules

import networkx as nx
import nx_arangodb as nxadb

from arango import ArangoClient

import pandas as pd
import numpy as np
import networkx as nx
import geopandas as gpd
import plotly.express as px
import matplotlib.pyplot as plt
from random import randint
import re

from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.graphs import ArangoGraph
from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain
from langchain_core.tools import tool


[20:33:53 +0000] [INFO]: NetworkX-cuGraph is available.
INFO:nx_arangodb:NetworkX-cuGraph is available.


In [9]:
# 6. Connect to the ArangoDB database
# Host and password are read from Colab's secret store, so no credentials are
# hard-coded in the notebook.
from google.colab import userdata

myarangohost = userdata.get('ARANGO_HOST')
mypass = userdata.get('ARANGO_PASS')

# Build the client first, then open the default database as the root user.
arango_client = ArangoClient(hosts=myarangohost)
db = arango_client.db(username="root", password=mypass, verify=True)

print(db)

<StandardDatabase _system>


## Step 1: Choose & prepare your dataset for NetworkX

This section will provide a template for data transformation, including placeholders for loading CSV/JSON files, defining graph schemas, and preparing data for ingestion.

## Step 2: Convert and Load Graph Data into NetworkX

Here, we load the data of physics research papers and their authors

In [10]:
import networkx as nx

# Load the co-authorship graph from a GraphML file in the working directory.
# Later cells rely on the node attribute 'type' and the edge attribute 'relation'.
G = nx.read_graphml("coauthorship_bipartite.graphml")


Let us now check out one of the physicists and their papers and co-authors

In [11]:
def get_author_network(G, author_id):
  """Return the ego network (the node plus its direct neighbours) of `author_id`.

  Args:
    G: graph object supporting `in`, `neighbors()` and `subgraph()`.
    author_id: identifier of the node whose neighbourhood is wanted.

  Returns:
    The subgraph view induced by the node and its neighbours, or None when
    the node is not part of `G` (a message is printed in that case).
  """
  # Only a missing node is reported as "not found"; any other failure now
  # surfaces as an exception so it cannot be mistaken for a missing author.
  if author_id not in G:
    print(f"Author {author_id} not found")
    return None

  sub_nodes = {author_id} | set(G.neighbors(author_id))

  print(f"{sub_nodes=}")

  return G.subgraph(sub_nodes)

import networkx as nx
import plotly.graph_objs as go
import plotly.offline as pyo

def plot_author_network(G, author_id):
  """Draw the ego network of one author as an interactive Plotly figure.

  Author nodes are drawn as sky-blue circles labelled with their 'name',
  every other node as a light-green square labelled with its 'title'.
  Nodes lacking the label attribute fall back to their node id.

  Args:
    G: graph holding the nodes; nodes are expected to carry a 'type' attribute.
    author_id: node id of the author at the centre of the plot.

  Returns:
    None. The figure is displayed via `fig.show()`. Nothing is drawn when the
    author is not present in `G`.
  """
  G_author = get_author_network(G, author_id)
  if G_author is None:
    # Unknown author: get_author_network has already reported it.
    return

  # Fixed seed so the layout is the same on every run.
  pos = nx.spring_layout(G_author, seed=42)

  # Prepare lists for nodes separated by type.
  authors_x, authors_y, authors_text = [], [], []
  papers_x, papers_y, papers_text = [], [], []

  for node, data in G_author.nodes(data=True):
    x, y = pos[node]
    if data.get('type') == 'Author':
      authors_x.append(x)
      authors_y.append(y)
      # Display the author's name as the label.
      authors_text.append(data.get('name', str(node)))
    else:
      papers_x.append(x)
      papers_y.append(y)
      # Display the paper title as the label.
      papers_text.append(data.get('title', str(node)))

  # Create the author trace with distinct marker (circle) and color.
  authors_trace = go.Scatter(
    x=authors_x,
    y=authors_y,
    mode='markers+text',
    name='Authors',
    text=authors_text,
    textposition="top center",
    marker=dict(symbol="circle", size=20, color="skyblue"),
    hoverinfo="text"
  )

  # Create the paper trace with a different marker (square) and color.
  papers_trace = go.Scatter(
    x=papers_x,
    y=papers_y,
    mode='markers+text',
    name='Papers',
    text=papers_text,
    textposition="bottom center",
    marker=dict(symbol="square", size=20, color="lightgreen"),
    hoverinfo="text"
  )

  # Build one line trace for all edges; a None entry breaks the line between
  # consecutive edges.
  edge_x = []
  edge_y = []
  for source, target in G_author.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

  edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=1, color='#888'),
    hoverinfo='none',
    mode='lines'
  )

  # Create the Plotly figure. The title font is set through `title.font`
  # rather than the deprecated `titlefont_size` shortcut.
  fig = go.Figure(
    data=[edge_trace, authors_trace, papers_trace],
    layout=go.Layout(
      title=dict(text='Author Ego Network', font=dict(size=16)),
      showlegend=True,
      hovermode='closest',
      margin=dict(b=20, l=5, r=5, t=40),
      xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
      yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
    )
  )

  # Display the interactive visualization.
  fig.show()





In [12]:
# Example: draw the ego network of the author node "ur.011344733521.27".
plot_author_network(G, "ur.011344733521.27")


sub_nodes={'ur.0762271037.55', 'ur.07353444575.53', '10.1002/CPA.20124', 'ur.011344733521.27'}


## Step 3: Persist the Graph in ArangoDB

As in the template, we persist the graph inside ArangoDB.  


#### Bad Data Handling
I noticed that some of my data is "bad" in the sense that there are NaN values in some columns. Tools like NetworkX and Gephi can handle this, but ArangoDB does not seem to accept them and (seemingly) rolls back the whole transaction. Therefore, I clean up by replacing 'abstract', 'country', 'year', and 'gender' with default values when they are missing or NaN.

In [13]:
import math

# Placeholder written onto a node when the attribute is absent, None or a
# float NaN. The table is applied to every node, whatever its type.
MISSING_ATTRIBUTE_DEFAULTS = {
    "abstract": "",
    "country": "UNKNOWN",
    # For the authors, the year is NaN; year 0 is only a placeholder to get
    # past the problem.
    "year": 0,
    # NOTE(review): a missing gender is imputed as "male", which will bias any
    # gender statistics computed from this graph - consider a neutral value.
    "gender": "male",
}


def _is_missing(value):
    """Return True when `value` is None or a float NaN."""
    return value is None or (isinstance(value, float) and math.isnan(value))


for node, data in G.nodes(data=True):
    for attribute, default in MISSING_ATTRIBUTE_DEFAULTS.items():
        if _is_missing(data.get(attribute, None)):
            data[attribute] = default

In [14]:
# Edge definitions of the ArangoDB graph: `written_by` edges point from a Paper
# to an Author, `co_author_with` edges connect two Authors.
edge_definitions = [
    {
        "edge_collection": "written_by",
        "from_vertex_collections": ["Paper"],
        "to_vertex_collections": ["Author"],
    },
    {
        "edge_collection": "co_author_with",
        "from_vertex_collections": ["Author"],
        "to_vertex_collections": ["Author"],
    }
]

from adbnx_adapter import ADBNX_Controller, ADBNX_Adapter

class Custom_ADBNX_Controller(ADBNX_Controller):
    """Custom controller to map heterogeneous nodes and edges.

    Tells the adapter which ArangoDB collection each NetworkX node and edge
    belongs to, and how to derive a document `_key` from a node id.
    """

    def _identify_networkx_node(self, nx_node_id, nx_node, adb_v_cols):
        """Return the vertex collection of a node, taken from its 'type' attribute.

        Raises:
            ValueError: if the node has no 'type' attribute.
        """
        if "type" in nx_node:
            return nx_node["type"]  # Returns either "Author" or "Paper"
        raise ValueError(f"Node {nx_node_id} does not have a 'type' attribute.")

    def _identify_networkx_edge(self, nx_edge, from_node_id, to_node_id, nx_map, adb_e_cols):
        """Return the edge collection of an edge, based on its 'relation' attribute.

        Raises:
            ValueError: if 'relation' is missing or not one of the known values.
        """
        relation = nx_edge.get("relation", None)
        if relation == "WRITTEN_BY":
            return "written_by"
        elif relation == "CO_AUTHOR_WITH":
            return "co_author_with"
        else:
            raise ValueError(f"Unknown edge relation: {relation}")

    def _keyify_networkx_node(self, i, nx_node_id, nx_node, col):
        """Derive a document `_key` from the node id.

        Every character other than ASCII letters, digits, '_' and '-' is
        replaced by '_'. The id is converted to `str` first so that
        non-string ids (e.g. integers) do not make `re.sub` raise TypeError.
        """
        # NOTE: distinct ids that differ only in replaced characters map to the same key.
        return re.sub(r'[^a-zA-Z0-9_-]', '_', str(nx_node_id))

# Initialize ArangoDB Adapter
custom_adbnx_adapter = ADBNX_Adapter(db, Custom_ADBNX_Controller())

# Define Graph Name
graph_name = "physics"

# Delete existing graph (optional)
# drop_collections=True also asks for the graph's collections to be dropped, so
# a re-run starts from scratch; ignore_missing=True keeps the first run from
# failing when the graph does not exist yet.
db.delete_graph(graph_name, drop_collections=True, ignore_missing=True)

# Convert NetworkX Graph to ArangoDB
adb_g = custom_adbnx_adapter.networkx_to_arangodb(graph_name, G, edge_definitions)

# Verify the graph structure
print(db.graph(graph_name).edge_definitions())

[2025/03/09 20:33:58 +0000] [16579] [INFO] - adbnx_adapter: Instantiated ADBNX_Adapter with database '_system'
INFO:adbnx_adapter:Instantiated ADBNX_Adapter with database '_system'


Output()

Output()

[2025/03/09 20:34:02 +0000] [16579] [INFO] - adbnx_adapter: Created ArangoDB 'physics' Graph
INFO:adbnx_adapter:Created ArangoDB 'physics' Graph


[{'edge_collection': 'co_author_with', 'from_vertex_collections': ['Author'], 'to_vertex_collections': ['Author']}, {'edge_collection': 'written_by', 'from_vertex_collections': ['Paper'], 'to_vertex_collections': ['Author']}]


#### Abstract Search View
ArangoDB is a multi-model database, which means we are able to leverage full-text search on our data in addition to graph queries.  For that, I am creating a "search view" to search through the text of the abstracts

In [15]:
# Create ArangoSearch View
view_name = "abstract_search_view"

# If it already exists, delete it. `ignore_missing=True` covers the case where
# the view does not exist yet, while real failures (connection, permissions)
# still raise instead of being silently swallowed.
db.delete_view(view_name, ignore_missing=True)

# Create a new, still empty ArangoSearch View; its links are added in the next cell.
db.create_view(view_name, view_type="arangosearch", properties={})

print(f"Created ArangoSearch view: {view_name}")


Created ArangoSearch view: abstract_search_view


In [16]:
# Define ArangoSearch properties (indexes for searching)
view_properties = {
    "links": {
        "Paper": {  # Index Paper collection
            "fields": {
                "abstract": {"analyzers": ["text_en"]}
            }
        },
        "Author": {  # Index Author collection
            "fields": {
                "name": {"analyzers": ["text_en"]}
            }
        }
    }
}

# The view is addressed by its literal name instead of through `view_name`:
# a later cell rebinds `view_name` to the author view, so re-running this cell
# afterwards would otherwise attach these links to the wrong view.
db.update_view("abstract_search_view", view_properties)


{'global_id': 'hBE61A6883233/212448',
 'id': '212448',
 'name': 'abstract_search_view',
 'type': 'arangosearch',
 'cleanup_interval_step': 2,
 'commit_interval_msec': 1000,
 'consolidation_interval_msec': 1000,
 'consolidation_policy': {'type': 'tier',
  'segments_min': 1,
  'segments_max': 10,
  'segments_bytes_max': 5368709120,
  'segments_bytes_floor': 2097152,
  'min_score': 0},
 'primary_sort': [],
 'primary_sort_compression': 'lz4',
 'stored_values': [],
 'writebuffer_idle': 64,
 'writebuffer_active': 0,
 'writebuffer_max_size': 33554432,
 'links': {'Paper': {'analyzers': ['identity'],
   'fields': {'abstract': {'analyzers': ['text_en']}},
   'includeAllFields': False,
   'storeValues': 'none',
   'trackListPositions': False},
  'Author': {'analyzers': ['identity'],
   'fields': {'name': {'analyzers': ['text_en']}},
   'includeAllFields': False,
   'storeValues': 'none',
   'trackListPositions': False}},
 'optimizeTopK': []}

We can add some niceties like stemming and lemmatization to make the searches a bit nicer

In [17]:
# Register a custom English stemming analyzer named "stem_en".
# NOTE(review): the view links visible in this notebook only reference the
# "text_en" analyzer, so "stem_en" is not used by any search as written.
db.create_analyzer(
    name="stem_en",
    analyzer_type="stem",
    properties={ "locale": "en" },  # English stemming
    features=["frequency", "norm", "position"]  # Required features
)

{'name': '_system::stem_en',
 'type': 'stem',
 'properties': {'locale': 'en'},
 'features': ['frequency', 'position', 'norm']}

#### Author Search View
It is useful to allow one to search for authors easily without knowing the exact spelling etc

In [18]:
# Create ArangoSearch View
view_name = "author_search_view"

# If it already exists, delete it. `ignore_missing=True` covers the case where
# the view does not exist yet, while real failures (connection, permissions)
# still raise instead of being silently swallowed.
db.delete_view(view_name, ignore_missing=True)

# Create a new ArangoSearch View
# NOTE(review): the view is created with empty properties and no visible cell
# adds links to it, so it indexes nothing as written - confirm whether an
# `Author.name` link was meant to be added here.
db.create_view(view_name, view_type="arangosearch", properties={})

print(f"Created ArangoSearch view: {view_name}")

Created ArangoSearch view: author_search_view


We now have the graph inside ArangoDB and can load it back from the database.

In [19]:
# 2. Re-connect to the same Graph
# Opens the "physics" graph persisted above as an nx-arangodb graph object.

G_adb = nxadb.Graph(name="physics", db=db)

print(G_adb)

[20:34:04 +0000] [INFO]: Graph 'physics' exists.
INFO:nx_arangodb:Graph 'physics' exists.
[20:34:05 +0000] [INFO]: Default node type set to 'Author'
INFO:nx_arangodb:Default node type set to 'Author'


Graph named 'physics' with 2840 nodes and 5484 edges


## Step 4: Build the Agentic App with LangChain & LangGraph

Here, I finally start building the agentic app. I'm targeting the following types of questions:


- Find papers talking about a topic, and see the demographics (country of authors, affiliations of authors, time series of years) and graph statistics about that
- Influential Authors and gender fairness in network ranking
- Predict new collaborations
- Degree of separation between two authors


In [20]:
# 1. Create the ArangoGraph LangChain wrapper
# Reference: https://api.python.langchain.com/en/latest/graphs/langchain_community.graphs.arangodb_graph.ArangoGraph.html
# `arango_graph` is handed to the AQL chain, and its `.schema` is embedded in
# the prompts of the NetworkX tool defined below.

arango_graph = ArangoGraph(db)

In [21]:
# 2. Define the llm object
# Note: You can use any llm you want. We will be using OpenAI for example purposes.

import os

# The API key is read from Colab's secret store and exposed as an environment variable.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

# Smoke test: a single request to confirm that the key and the model name work.
llm.invoke("hello!")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 9, 'total_tokens': 19, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_eb9dce56a8', 'finish_reason': 'stop', 'logprobs': None}, id='run-c234d261-f2c3-4bbe-b5f3-6a991497d17e-0', usage_metadata={'input_tokens': 9, 'output_tokens': 10, 'total_tokens': 19, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [22]:
# Demo tool with a fixed answer. The docstring below is the description of the
# tool that is passed to `@tool`, so it is part of the agent's behaviour; the
# `query` argument itself is not used.
@tool
def favourite_person(query: str):
    """You are responsible for responding to being asked who your favourite person is.
    You must say Sudhang Shankar!
    """
    return "Sudhang Shankar!"

In [23]:
# 4. Define the Text to AQL Tool
# Reference: https://python.langchain.com/docs/integrations/graphs/arangodb/
# Reference: https://python.langchain.com/api_reference/community/chains/langchain_community.chains.graph_qa.arangodb.ArangoGraphQAChain.html

@tool
def text_to_aql_to_text(query: str):
    """This tool is available to invoke the
    ArangoGraphQAChain object, which enables you to
    translate a Natural Language Query into AQL, execute
    the query, and translate the result back into Natural Language.

    If a query can be solved using AQL, prioritise it
    """

    # Few-shot AQL examples given to the chain; they show the direction of the
    # `written_by` and `co_author_with` traversals for this graph.
    example_queries = """
        # Example 1: Find papers written by Terence Tao
          WITH Paper, Author, written_by
          FOR author IN Author
            FILTER author.name == "Terence Tao"
            FOR paper IN INBOUND author written_by
              RETURN paper

        # Example 2: Find authors with whom Terence Tao has written papers
          WITH Author, co_author_with
          FOR author IN Author
            FILTER author.name == "Terence Tao"
            FOR coauthor IN 1..1 INBOUND author co_author_with
              RETURN coauthor.name

        # Example 3: With whom has Terence Tao worked on papers?
          WITH Author, co_author_with
          FOR author IN Author
            FILTER author.name == "Terence Tao"
            FOR coauthor IN 1..1 INBOUND author co_author_with
              RETURN coauthor.name
      """

    # A dedicated, deterministic model instance is created for every call.
    chat_model = ChatOpenAI(temperature=0, model_name="gpt-4o")

    qa_chain = ArangoGraphQAChain.from_llm(
        llm=chat_model,
        graph=arango_graph,
        verbose=False,  # Setting False to reduce clutter
        allow_dangerous_requests=True,
    )
    qa_chain.aql_examples = example_queries

    answer = qa_chain.invoke(query)

    # Only the natural-language part of the chain output is handed back to the agent.
    return str(answer["result"])

In [24]:
# 5. Define the Text to NetworkX/cuGraph Tool
# Note: It is encouraged to experiment and improve this section! This is just a placeholder:

@tool
def text_to_nx_algorithm_to_text(query):
    """This tool is available to invoke a NetworkX Algorithm on
    the ArangoDB Graph. You are responsible for accepting the
    Natural Language Query, establishing which algorithm needs to
    be executed, executing the algorithm, and translating the results back
    to Natural Language, with respect to the original query.

    If the query (e.g traversals, shortest path, etc.) can be solved using the Arango Query Language, then do not use
    this tool.
    """
    # Flow: (1) ask the LLM for NetworkX code, (2) execute it against `G_adb`,
    # (3) ask the LLM to phrase `FINAL_RESULT` as an answer. Execution problems
    # are returned to the agent as an "EXEC ERROR: ..." string.

    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

    ######################
    print("1) Generating NetworkX code")

    text_to_nx = llm.invoke(f"""
    I have a NetworkX Graph called `G_adb`. It has the following schema: {arango_graph.schema}

    I have the following graph analysis query: {query}.

    Generate the Python Code required to answer the query using the `G_adb` object.

    Be very precise on the NetworkX algorithm you select to answer this query. Think step by step.

    Only assume that networkx is installed, and other base python dependencies.

    Always set the last variable as `FINAL_RESULT`, which represents the answer to the original query.

    Only provide python code that I can directly execute via `exec()`. Do not provide any instructions.

    Make sure that `FINAL_RESULT` stores a short & consice answer. Avoid setting this variable to a long sequence.

    Your code:
    """).content

    # Strip the Markdown code fences the model usually wraps its answer in.
    text_to_nx_cleaned = re.sub(r"^```python\n|```$", "", text_to_nx, flags=re.MULTILINE).strip()

    print('-'*10)
    print(text_to_nx_cleaned)
    print('-'*10)

    ######################

    print("\n2) Executing NetworkX code")

    # SECURITY: this runs LLM-generated code with the full privileges of the
    # kernel. Only use it in a disposable environment with trusted prompts.
    #
    # A single namespace is used for both globals and locals. With two separate
    # dicts, names assigned at the top level of the generated code land in the
    # locals dict and are invisible to functions, lambdas and comprehensions
    # defined in that same code, which fails with NameError.
    namespace = {"G_adb": G_adb, "nx": nx}

    try:
        exec(text_to_nx_cleaned, namespace)
    except Exception as e:
        # TODO: Consider experimenting with a code corrector (retry loop)!
        print(f"EXEC ERROR: {e}")
        return f"EXEC ERROR: {e}"

    if "FINAL_RESULT" not in namespace:
        # The generated code ran but ignored the contract of the prompt.
        message = "EXEC ERROR: generated code did not set FINAL_RESULT"
        print(message)
        return message

    text_to_nx_final = text_to_nx

    print('-'*10)
    FINAL_RESULT = namespace["FINAL_RESULT"]
    print(f"FINAL_RESULT: {FINAL_RESULT}")
    print('-'*10)

    ######################

    print("3) Formulating final answer")

    nx_to_text = llm.invoke(f"""
        I have a NetworkX Graph called `G_adb`. It has the following schema: {arango_graph.schema}

        I have the following graph analysis query: {query}.

        I have executed the following python code to help me answer my query:

        ---
        {text_to_nx_final}
        ---

        The `FINAL_RESULT` variable is set to the following: {FINAL_RESULT}.

        Based on my original Query and FINAL_RESULT, generate a short and concise response to
        answer my query.

        Your response:
    """).content

    return nx_to_text

In [25]:
# 6. Create the Agentic Application

# Tools offered to the ReAct agent built in `query_graph`.
tools = [
  text_to_aql_to_text,
  text_to_nx_algorithm_to_text,
  favourite_person
]


def query_graph(query):
    """Answer a natural-language question with a freshly built ReAct agent.

    A new model instance and agent are created on every call, the question is
    sent as a single user message, and the content of the last message of the
    resulting conversation is returned.
    """
    agent = create_react_agent(ChatOpenAI(temperature=0, model_name="gpt-4o"), tools)
    user_turn = {"role": "user", "content": query}
    conversation = agent.invoke({"messages": [user_turn]})["messages"]
    return conversation[-1].content

In [26]:
# Smoke test: a simple counting question about the Author collection.
query_graph("How many authors are in the network")



[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH Author
FOR author IN Author
  COLLECT WITH COUNT INTO length
  RETURN length
[0m
AQL Result:
[32;1m[1;3m[1740][0m

[1m> Finished chain.[0m


'There are 1,740 authors in the network.'

In [27]:
# Smoke test: a simple counting question about the Paper collection.
query_graph("How many papers are in the network")



[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH Paper
FOR paper IN Paper
  COLLECT WITH COUNT INTO length
RETURN length
[0m
AQL Result:
[32;1m[1;3m[1100][0m

[1m> Finished chain.[0m


'The network contains a total of 1,100 papers.'

In [28]:
# Traversal question: papers linked to one author through `written_by`.
query_graph("What papers did Terence Tao write?")



[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH Paper, Author, written_by
FOR author IN Author
  FILTER author.name == "Terence Tao"
  FOR paper IN INBOUND author written_by
    RETURN paper
[0m
AQL Result:
[32;1m[1;3m[{'_key': '10_1002_CPA_20124', '_id': 'Paper/10_1002_CPA_20124', '_rev': '_jVwbTwu--h', 'type': 'Paper', 'title': 'Sparse Signal Recovery via L1-Regularization and Matrix Uncertainty', 'year': 2005, 'num_citations': 6257, 'abstract': "Suppose we wish to recover a vector x_0 Є R^m (e.g., a digital signal or image) from incomplete and contaminated observations y = Ax_0 + e; A is an n by m matrix with far fewer rows than columns (n « m) and e is an error term. Is it possible to recover x_0 accurately based on the data y? \nTo recover x_0, we consider the solution x^# to the l_(1-)regularization problem min ‖x‖l_1 subject to ‖Ax - y‖l(2) ≤ Є, where Є is the size of the error term e. We show that if A obeys a uniform uncertainty princip

'Terence Tao authored the paper titled "Sparse Signal Recovery via L1-Regularization and Matrix Uncertainty," published in 2005. This influential paper, cited 6,257 times, addresses the challenge of recovering a vector from incomplete and contaminated observations using L1-regularization. It delves into the conditions necessary for accurate recovery and offers insights into the exact recovery phenomenon, as well as methodologies for nearly recovering approximately sparse signals.'

In [29]:
# Traversal question: co-authors of one author through `co_author_with`.
query_graph("With whom has Terence Tao worked on papers?")



[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH Author, co_author_with
FOR author IN Author
  FILTER author.name == "Terence Tao"
  FOR coauthor IN 1..1 INBOUND author co_author_with
    RETURN coauthor.name
[0m
AQL Result:
[32;1m[1;3m['Justin K Romberg', 'Emmanuel J Candès'][0m

[1m> Finished chain.[0m


'Terence Tao has worked on academic papers with Justin K. Romberg and Emmanuel J. Candès.'

### Get Stats about a Topic

We allow the user to input some physics-related topic, and then search our abstracts for that topic.  If we find it, we get the following information about the papers:
- Who worked on these topics
- Where in the world these authors are
- What are their affiliations
- What are the trends for this topic over time
- How influential are the authors who worked on this topic

#### Tools for the dashboard

In [33]:
# The following is public domain, so it is okay to use
WORLD_MAP_URL = "https://naturalearth.s3.amazonaws.com/110m_cultural/ne_110m_admin_0_countries.zip"
# Reads the country shapes from the URL each time this cell runs.
# NOTE(review): `world` is not referenced by any function visible in this
# notebook (the choropleth below matches on country names) - confirm it is needed.
world = gpd.read_file(WORLD_MAP_URL)

# Tool: Search Papers Based on Topic
def search_papers(search_term: str):
    """
    Finds papers matching a research topic using full-text search in ArangoDB.

    Args:
        search_term: phrase to look for in the paper abstracts.

    Returns:
        A list of dicts with the keys `id`, `title` and `year`, one per
        matching paper. An empty or blank search term yields an empty list
        without querying the database.
    """
    phrase = (search_term or "").strip()
    if not phrase:
        return []

    # Using PHRASE search gives better results than FILTER, which
    # is what I was trying before
    query = """
    WITH Paper
    FOR doc IN abstract_search_view
    SEARCH ANALYZER(
        PHRASE(doc.abstract, @search_term),
        "text_en"
    )
    RETURN { id: doc._id, title: doc.title, year: doc.year }
    """

    # The phrase is passed as-is: `%` wildcards belong to LIKE, not to PHRASE,
    # which matches the analyzed tokens of the search term.
    results = list(db.aql.execute(query, bind_vars={"search_term": phrase}))
    return results

# Tool: Find Authors of Those Papers
def get_authors_for_papers(paper_ids):
    """
    Finds authors of the given papers and their demographics.

    Args:
        paper_ids: iterable of paper document ids (the `_from` side of
            `written_by` edges).

    Returns:
        A list of dicts with the keys `name`, `country`, `affiliation` and
        `paper_id`, one per (paper, author) pair. Empty input yields an empty
        list without querying the database.
    """
    # Materialise the ids so any iterable can be used as a bind variable.
    paper_ids = list(paper_ids or [])
    if not paper_ids:
        return []

    query = """
    WITH written_by, Author
    FOR e IN written_by
    FILTER e._from IN @paper_ids
    FOR a IN Author
    FILTER a._id == e._to
    RETURN { name: a.name, country: a.country, affiliation: a.affiliation, paper_id: e._from }

    """
    results = list(db.aql.execute(query, bind_vars={"paper_ids": paper_ids}))
    return results

# Tool: visualise plots about authors and papers
def _value_counts_frame(records, column, top_n=None, sort_by_value=False):
    """Count the values of `column` in `records` as a two-column DataFrame.

    The result has the columns `[column, "count"]`. When `records` is empty or
    lacks `column`, an empty frame with those two columns is returned instead
    of raising KeyError.
    """
    frame = pd.DataFrame(records)
    if column in frame.columns:
        counts = frame[column].value_counts()
    else:
        counts = pd.Series(dtype="int64")

    if sort_by_value:
        counts = counts.sort_index()
    if top_n is not None:
        counts = counts.nlargest(top_n)

    counts_frame = counts.reset_index()
    # Name the columns explicitly; reset_index() naming differs between pandas versions.
    counts_frame.columns = [column, "count"]
    return counts_frame


def generate_plots(author_data, paper_data):
    """
    Creates interactive Plotly plots for:
    1. Country distribution (Choropleth Map)
    2. Top affiliations (Bar Chart)
    3. Yearly publication trends (Time Series) - Now using paper data

    Args:
        author_data: list of author records with `country` and `affiliation`.
        paper_data: list of paper records with `year`.

    Returns:
        {"plots": {"world_map": ..., "affiliations": ..., "yearly_trends": ...}}
        where every value is the JSON representation of a Plotly figure.
        Empty inputs (e.g. a search without matches) are counted as empty
        frames instead of failing on the missing columns.
    """

    ## **World Map (Choropleth) - Country Distribution**
    country_counts = _value_counts_frame(author_data, "country")

    fig1 = px.choropleth(
        country_counts,
        locations="country",
        locationmode="country names",
        color="count",
        title="Researcher Distribution by Country",
        color_continuous_scale="Blues"
    )

    ## **Bar Chart - Top 10 Affiliations**
    affiliation_counts = _value_counts_frame(author_data, "affiliation", top_n=10)

    fig2 = px.bar(
        affiliation_counts,
        x="affiliation",
        y="count",
        labels={"affiliation": "Institution", "count": "Number of Researchers"},
        title="Top 10 Research Institutions"
    )

    ## **Time Series - Yearly Publication Trends**
    # Sorted by year so the line runs chronologically.
    year_counts = _value_counts_frame(paper_data, "year", sort_by_value=True)

    fig3 = px.line(
        year_counts,
        x="year",
        y="count",
        labels={"year": "Year", "count": "Papers Published"},
        title="Yearly Publication Trends"
    )

    # Return the JSON representations of the plots
    return {
        "plots": {
            "world_map": fig1.to_json(),
            "affiliations": fig2.to_json(),
            "yearly_trends": fig3.to_json()
        }
    }

# Tool: Find most influential authors
def find_influential_authors():
    """
    Finds the most influential authors in the network using the PageRank algorithm.

    The `co_author_with` edges and their endpoint documents are fetched with a
    single AQL query, rebuilt as an undirected NetworkX graph, and ranked with
    `nx.pagerank`.

    Returns:
        A DataFrame with the ten highest-ranked authors and the columns
        `name`, `gender`, `country`, `affiliation` and `pagerank`.
    """

    # Step 1: Construct the graph from ArangoDB
    aql = """
      LET edges = (
        FOR edge IN co_author_with
          RETURN edge
      )
      LET nodes = (
        FOR edge IN edges
          FOR nodeId IN [edge._from, edge._to]
            COLLECT uniqueId = nodeId INTO nodesGrouped
            RETURN DOCUMENT(uniqueId)
      )
      RETURN { nodes, edges }
      """
    # The query returns exactly one row; it is executed once only, since it
    # transfers every co-author edge and endpoint document.
    result = db.aql.execute(aql).next()

    # Make the networkX graph
    # Have to do it this way, as
    # G_adb does not support subgraphs
    coauthor_graph = nx.Graph()
    for node in result["nodes"]:
        if node is None:
            # DOCUMENT() yields null for an edge endpoint that no longer exists.
            continue
        coauthor_graph.add_node(node["_key"], **node)
    for edge in result["edges"]:
        # Document ids have the form "<collection>/<key>"; keep the key only.
        source_key = edge["_from"].split("/", 1)[1]
        target_key = edge["_to"].split("/", 1)[1]
        coauthor_graph.add_edge(source_key, target_key, weight=edge.get("weight", 1))

    pagerank_scores = nx.pagerank(coauthor_graph)
    top_10_authors = sorted(pagerank_scores.items(), key=lambda item: item[1], reverse=True)[:10]
    top_authors_data = [
        {
            "name": coauthor_graph.nodes[author].get("name"),
            "gender": coauthor_graph.nodes[author].get("gender"),
            "country": coauthor_graph.nodes[author].get("country"),
            "affiliation": coauthor_graph.nodes[author].get("affiliation"),
            "pagerank": score
        }
        for author, score in top_10_authors
    ]

    top_10_authors_df = pd.DataFrame(top_authors_data)
    return top_10_authors_df

#### Dashboard
The following is a nice (?) interactive dashboard that offers a mix of free-form queries, full-text, AQL and cuGraph queries

In [None]:
import gradio as gr
import plotly.io as pio

# just a convenience function
def agent_query(question: str) -> str:
    """Gradio callback: pass the question to `query_graph` and return its answer."""
    return query_graph(question)

def generate_dashboard(user_query):
    """Fetches the response and extracts Plotly figures in one step.

    Searches the abstracts for `user_query`, looks up the authors of the
    matching papers and builds the three dashboard figures.

    Returns:
        A tuple of exactly three items in the order world map, affiliations,
        yearly trends. Each item is a Plotly figure, or None when that figure
        could not be rebuilt, so the tuple always matches the three output
        components of the dashboard.
    """
    papers = search_papers(user_query)
    paper_ids = [paper["id"] for paper in papers]

    authors = get_authors_for_papers(paper_ids)

    plotly_plots = generate_plots(authors, papers)["plots"]

    figs = []
    for key in ("world_map", "affiliations", "yearly_trends"):
        try:
            figs.append(pio.from_json(plotly_plots[key]))
        except (KeyError, TypeError, ValueError) as exc:
            # Don't crash the dashboard for one missing or invalid plot, but
            # keep its slot so the outputs stay aligned with the components.
            print(f"Could not load plot '{key}': {exc}")
            figs.append(None)

    return tuple(figs)

# Create Gradio app
with gr.Blocks() as demo:
    # Section 1: free-form question answered by the agent.
    # NOTE(review): this heading repeats the title of the search section below;
    # confirm whether a different title was intended here.
    gr.Markdown("Full-Text Abstracts Search Dashboard")
    with gr.Row():
      free_text_input = gr.Textbox(label="Enter your question")
      free_text_btn = gr.Button(">>")
      free_text_output = gr.Textbox(label="Answer")
    free_text_btn.click(
      agent_query,
      inputs=free_text_input,
      outputs=free_text_output
    )

    # Just a static dataframe
    # find_influential_authors() is called once, while the UI is being built.
    gr.Markdown("influential Authors")
    gr.DataFrame(find_influential_authors())

    # Section 3: topic search over the abstracts, rendered as three plots.
    gr.Markdown("Full-Text Abstracts Search Dashboard")
    user_query = gr.Textbox(label="Enter a topic for searching within Abstracts")
    execute_btn = gr.Button("Submit")

    # Visualisation placeholders
    plot1 = gr.Plot()
    plot2 = gr.Plot()
    plot3 = gr.Plot()

    # generate_dashboard must return one value per output component listed here.
    execute_btn.click(
        generate_dashboard,
        inputs=user_query,
        outputs=[plot1, plot2, plot3]
    )

# Launch the Gradio app
# share=True publishes a temporary public URL; with debug=True the cell keeps
# running so that errors and logs stay visible (set debug=False to release it).
demo.launch(share=True, inbrowser=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a8eb0ab6e90429762b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




[1m> Entering new ArangoGraphQAChain chain...[0m
AQL Query (1):[32;1m[1;3m
WITH Paper, Author, written_by
FOR author IN Author
  FILTER author.name == "Terence Tao"
  FOR paper IN INBOUND author written_by
    RETURN paper
[0m
AQL Result:
[32;1m[1;3m[{'_key': '10_1002_CPA_20124', '_id': 'Paper/10_1002_CPA_20124', '_rev': '_jVwbTwu--h', 'type': 'Paper', 'title': 'Sparse Signal Recovery via L1-Regularization and Matrix Uncertainty', 'year': 2005, 'num_citations': 6257, 'abstract': "Suppose we wish to recover a vector x_0 Є R^m (e.g., a digital signal or image) from incomplete and contaminated observations y = Ax_0 + e; A is an n by m matrix with far fewer rows than columns (n « m) and e is an error term. Is it possible to recover x_0 accurately based on the data y? \nTo recover x_0, we consider the solution x^# to the l_(1-)regularization problem min ‖x‖l_1 subject to ‖Ax - y‖l(2) ≤ Є, where Є is the size of the error term e. We show that if A obeys a uniform uncertainty princip