<a href="https://colab.research.google.com/github/sudhang/arangodb-cugraph-hackathon/blob/master/Top_Physics_Researchers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview



```
# This is formatted as code
```

<p align="center">
    <img src="attachment:b5af8b9d-6ec3-4eba-a73d-934ea6f1439c.png" style="height: 500px;">
</p>

In [1]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

### Step 0: Package Installation & setup

In [2]:
# 1. Install nx-arangodb via pip
# Github: https://github.com/arangodb/nx-arangodb

!pip install nx-arangodb



In [3]:
# 2. Check if you have an NVIDIA GPU
# Note: If this returns "command not found", then GPU-based algorithms via cuGraph are unavailable

!nvidia-smi
!nvcc --version

Sun Mar  9 14:04:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
# 3. Install nx-cugraph via pip
# Note: Only enable this installation if the step above is working!

!pip install nx-cugraph-cu12 --extra-index-url https://pypi.nvidia.com # Requires CUDA-capable GPU

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com


In [5]:
# 4. Install LangChain & LangGraph

!pip install --upgrade langchain langchain-community langchain-openai langgraph langchain-google-genai



In [6]:
# 5. Visualistion etc libraries -- Not from template
!pip install plotly



In [7]:
# 5. Import the required modules

import networkx as nx
import nx_arangodb as nxadb

from arango import ArangoClient

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from random import randint
import re

from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.graphs import ArangoGraph
from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain
from langchain_core.tools import tool

[14:04:41 +0000] [INFO]: NetworkX-cuGraph is available.
INFO:nx_arangodb:NetworkX-cuGraph is available.


In [8]:
# 6. Connect to the ArangoDB database
# TODO: Configure your credentials here!

from google.colab import userdata
myarangohost = userdata.get('ARANGO_HOST')
mypass = userdata.get('ARANGO_PASS')
db = ArangoClient(hosts=myarangohost).db(username="root", password=mypass, verify=True)

print(db)

<StandardDatabase _system>


### Step 1: Choose & prepare your dataset for NetworkX

This section will provide a template for data transformation, including placeholders for loading CSV/JSON files, defining graph schemas, and preparing data for ingestion.

### Step 2: Convert and Load Graph Data into NetworkX

Here, we load the data of physics research papers and their authors

In [9]:
import networkx as nx

G = nx.read_graphml("coauthorship_bipartite.graphml")


Let us now check out one of the physicists and their papers and co-authors

In [10]:
def get_author_network(G, author_id):
  try:
    sub_nodes = {author_id} | set(G.neighbors(author_id))

    print(f"{sub_nodes=}")

    return G.subgraph(sub_nodes)
  except:
    print(f"Author {author_id} not found")
    return None

import networkx as nx
import plotly.graph_objs as go
import plotly.offline as pyo

def plot_author_network(G, author_id):
  G_author = get_author_network(G, author_id)
  pos = nx.spring_layout(G_author, seed=42)

  # Prepare lists for nodes separated by type.
  authors_x, authors_y, authors_text = [], [], []
  papers_x, papers_y, papers_text = [], [], []

  for node, data in G_author.nodes(data=True):
      x, y = pos[node]
      if data.get('type') == 'Author':
          authors_x.append(x)
          authors_y.append(y)
          # Display the author's name as the label.
          authors_text.append(data.get('name', str(node)))
      else:
          papers_x.append(x)
          papers_y.append(y)
          # Display the paper title as the label.
          papers_text.append(data.get('title', str(node)))

  # Create the author trace with distinct marker (circle) and color.
  authors_trace = go.Scatter(
      x=authors_x,
      y=authors_y,
      mode='markers+text',
      name='Authors',
      text=authors_text,
      textposition="top center",
      marker=dict(symbol="circle", size=20, color="skyblue"),
      hoverinfo="text"
  )

  # Create the paper trace with a different marker (square) and color.
  papers_trace = go.Scatter(
      x=papers_x,
      y=papers_y,
      mode='markers+text',
      name='Papers',
      text=papers_text,
      textposition="bottom center",
      marker=dict(symbol="square", size=20, color="lightgreen"),
      hoverinfo="text"
  )

  # Build edge traces for all edges.
  edge_x = []
  edge_y = []
  for source, target, data in G_author.edges(data=True):
      x0, y0 = pos[source]
      x1, y1 = pos[target]
      edge_x.extend([x0, x1, None])
      edge_y.extend([y0, y1, None])

  edge_trace = go.Scatter(
      x=edge_x,
      y=edge_y,
      line=dict(width=1, color='#888'),
      hoverinfo='none',
      mode='lines'
  )

  # Create the Plotly figure.
  fig = go.Figure(
      data=[edge_trace, authors_trace, papers_trace],
      layout=go.Layout(
          title='Author Ego Network',
          titlefont_size=16,
          showlegend=True,
          hovermode='closest',
          margin=dict(b=20, l=5, r=5, t=40),
          xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
          yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
      )
  )

  # Display the interactive visualization.
  fig.show()





In [11]:
plot_author_network(G, "ur.011344733521.27")


sub_nodes={'ur.0762271037.55', '10.1002/CPA.20124', 'ur.07353444575.53', 'ur.011344733521.27'}


In [12]:
G.nodes["ur.011344733521.27"]

{'type': 'Author',
 'name': 'Terence Tao',
 'affiliation': 'University of California| Los Angeles',
 'country': 'United States',
 'num_papers': 3,
 'num_citations': 25028.0,
 'degree': 3}

In [13]:
G.nodes["10.1002/CPA.20124"]

{'type': 'Paper',
 'title': 'Unknown',
 'year': 2005.0,
 'num_citations': 6257.0,
 'abstract': "Suppose we wish to recover a vector x_0 Є R^m (e.g., a digital signal or image) from incomplete and contaminated observations y = Ax_0 + e; A is an n by m matrix with far fewer rows than columns (n « m) and e is an error term. Is it possible to recover x_0 accurately based on the data y? \nTo recover x_0, we consider the solution x^# to the l_(1-)regularization problem min ‖x‖l_1 subject to ‖Ax - y‖l(2) ≤ Є, where Є is the size of the error term e. We show that if A obeys a uniform uncertainty principle (with unit-normed columns) and if the vector x_0 is sufficiently sparse, then the solution is within the noise level ‖x^# - x_0‖l_2 ≤ C Є. As a first example, suppose that A is a Gaussian random matrix; then stable recovery occurs for almost all such A's provided that the number of nonzeros of x_0 is of about the same order as the number of observations. As a second instance, suppose one obse

In [14]:
G_terry = get_author_network(G, "ur.011344733521.27")

sub_nodes={'ur.0762271037.55', '10.1002/CPA.20124', 'ur.07353444575.53', 'ur.011344733521.27'}


### Step 3: Persist the Graph in ArangoDB

As in the template, we persist the graph inside ArangoDB.  


#### Bad Data Handling
I noticed that some of my data is "bad" in the sense that there are nan values in some columns.  Tools like networkx and gephi can handle this, but  ArangoDB seems not to like that, and rolls back the whole transaction (seemingly).  Therefore, I clean up by replacing 'country', 'abstract', id, and 'year' with default values when they are nan

In [15]:
import math

for node, data in G.nodes(data=True):
    val = data.get('abstract', None)

    if val is None or (isinstance(val, float) and math.isnan(val)):
        data['abstract'] = ""

    countryval = data.get('country', None)

    if countryval is None or (isinstance(countryval, float) and math.isnan(countryval)):
        data['country'] = "UNKNOWN"


    year = data.get('year', None)

    # For the authors, the year is nan, but just to get past the problem,
    # i set it to year 0
    if year is None or (isinstance(year, float) and math.isnan(year)):
        data['year'] = 0

In [16]:
edge_definitions = [
    {
        "edge_collection": "written_by",
        "from_vertex_collections": ["Author"],
        "to_vertex_collections": ["Paper"],
    },
    {
        "edge_collection": "co_author_with",
        "from_vertex_collections": ["Author"],
        "to_vertex_collections": ["Author"],
    }
]

from adbnx_adapter import ADBNX_Controller, ADBNX_Adapter

class Custom_ADBNX_Controller(ADBNX_Controller):
    """Custom controller to map heterogeneous nodes and edges."""

    def _identify_networkx_node(self, nx_node_id, nx_node, adb_v_cols):
        """Determine the correct ArangoDB collection for a given node."""
        if "type" in nx_node:
            return nx_node["type"]  # Returns either "Author" or "Paper"
        raise ValueError(f"Node {nx_node_id} does not have a 'type' attribute.")

    def _identify_networkx_edge(self, nx_edge, from_node_id, to_node_id, nx_map, adb_e_cols):
        """Determine the correct ArangoDB edge collection based on the 'relation' key."""
        relation = nx_edge.get("relation", None)
        if relation == "WRITTEN_BY":
            return "written_by"
        elif relation == "CO_AUTHOR_WITH":
            return "co_author_with"
        else:
            raise ValueError(f"Unknown edge relation: {relation}")

    def _keyify_networkx_node(self, i, nx_node_id, nx_node, col):
        """Generate valid ArangoDB _key values."""
        key = re.sub(r'[^a-zA-Z0-9_-]', '_', nx_node_id)
        return str(key)

# Initialize ArangoDB Adapter
custom_adbnx_adapter = ADBNX_Adapter(db, Custom_ADBNX_Controller())

# Define Graph Name
graph_name = "physics"

# Delete existing graph (optional)
db.delete_graph(graph_name, drop_collections=True, ignore_missing=True)

# Convert NetworkX Graph to ArangoDB
adb_g = custom_adbnx_adapter.networkx_to_arangodb(graph_name, G, edge_definitions)

# Verify the graph structure
print(db.graph(graph_name).edge_definitions())

[2025/03/09 14:04:44 +0000] [41268] [INFO] - adbnx_adapter: Instantiated ADBNX_Adapter with database '_system'
INFO:adbnx_adapter:Instantiated ADBNX_Adapter with database '_system'


Output()

Output()

[2025/03/09 14:04:45 +0000] [41268] [INFO] - adbnx_adapter: Created ArangoDB 'physics' Graph
INFO:adbnx_adapter:Created ArangoDB 'physics' Graph


[{'edge_collection': 'co_author_with', 'from_vertex_collections': ['Author'], 'to_vertex_collections': ['Author']}, {'edge_collection': 'written_by', 'from_vertex_collections': ['Author'], 'to_vertex_collections': ['Paper']}]


#### Abstract Search View
ArangoDB is a multi-model database, which means we are able to leverage full-text search on our data in addition to graph queries.  For that, I am creating a "search view" to search through the text of the abstracts

In [17]:
# Create ArangoSearch View
view_name = "abstract_search_view"

# If it already exists, delete it
try:
    db.delete_view(view_name)
except:
    pass

# Create a new ArangoSearch View
db.create_view(view_name, view_type="arangosearch", properties={})

print(f"Created ArangoSearch view: {view_name}")


Created ArangoSearch view: abstract_search_view


In [18]:
# Define ArangoSearch properties (indexes for searching)
view_properties = {
    "links": {
        "Paper": {  # Index Paper collection
            "fields": {
                "abstract": {"analyzers": ["text_en"]}
            }
        },
        "Author": {  # Index Author collection
            "fields": {
                "name": {"analyzers": ["text_en"]}
            }
        }
    }
}

db.update_view(view_name, view_properties)


{'global_id': 'hBE61A6883233/186845',
 'id': '186845',
 'name': 'abstract_search_view',
 'type': 'arangosearch',
 'cleanup_interval_step': 2,
 'commit_interval_msec': 1000,
 'consolidation_interval_msec': 1000,
 'consolidation_policy': {'type': 'tier',
  'segments_min': 1,
  'segments_max': 10,
  'segments_bytes_max': 5368709120,
  'segments_bytes_floor': 2097152,
  'min_score': 0},
 'primary_sort': [],
 'primary_sort_compression': 'lz4',
 'stored_values': [],
 'writebuffer_idle': 64,
 'writebuffer_active': 0,
 'writebuffer_max_size': 33554432,
 'links': {'Paper': {'analyzers': ['identity'],
   'fields': {'abstract': {'analyzers': ['text_en']}},
   'includeAllFields': False,
   'storeValues': 'none',
   'trackListPositions': False},
  'Author': {'analyzers': ['identity'],
   'fields': {'name': {'analyzers': ['text_en']}},
   'includeAllFields': False,
   'storeValues': 'none',
   'trackListPositions': False}},
 'optimizeTopK': []}

We can add some niceties like stemming and lemmatization to make the searches a bit nicer

In [19]:
db.create_analyzer(
    name="stem_en",
    analyzer_type="stem",
    properties={ "locale": "en" },  # English stemming
    features=["frequency", "norm", "position"]  # Required features
)

{'name': '_system::stem_en',
 'type': 'stem',
 'properties': {'locale': 'en'},
 'features': ['frequency', 'position', 'norm']}

#### Author Search View
It is useful to allow one to search for authors easily without knowing the exact spelling etc

In [20]:
# Create ArangoSearch View
view_name = "author_search_view"

# If it already exists, delete it
try:
    db.delete_view(view_name)
except:
    pass

# Create a new ArangoSearch View
db.create_view(view_name, view_type="arangosearch", properties={})

print(f"Created ArangoSearch view: {view_name}")

Created ArangoSearch view: author_search_view


We now have the graph inside arangoDB, and can now load it from the db

In [21]:
# 2. Re-connect to the same Graph

G_adb = nxadb.Graph(name="physics", db=db)

print(G_adb)

[14:04:46 +0000] [INFO]: Graph 'physics' exists.
INFO:nx_arangodb:Graph 'physics' exists.
[14:04:46 +0000] [INFO]: Default node type set to 'Author'
INFO:nx_arangodb:Default node type set to 'Author'


Graph named 'physics' with 2840 nodes and 5484 edges


In [22]:
query = """
WITH Paper
FOR doc IN abstract_search_view
SEARCH ANALYZER(doc.abstract LIKE "%quantum%" OR doc.abstract LIKE "%relativity%", "text_en")
SORT BM25(doc) DESC
LIMIT 5
RETURN { abstract: doc.abstract }
"""

# Execute Query
cursor = db.aql.execute(query)

# Print results
for doc in cursor:
    print(doc)


{'abstract': 'Methods are developed for discussing the photon statistics of arbitrary radiation fields in fully quantummechanical terms. In order to keep the classical limit of quantum electrodynamics plainly in view, extensive use is made of the coherent states of the field. These states, which reduce the field correlation functions to factorized forms, are shown to offer a convenient basis for the description of fields of all types. Although they are not orthogonal to one another, the coherent states form a complete set. It is shown that any quantum state of the field may be expanded in terms of them in a unique way. Expansions are also developed for arbitrary operators in terms of products of the coherent state vectors. These expansions are discussed as a general method of representing the density operator for the field. A particular form is exhibited for the density operator which makes it possible to carry out many quantum- mechanical calculations by methods resembling those of cl

### Step 4: Build the Agentic App with LangChain & LangGraph

Here, I finally start building the agentic app.  I'm targetting the following types of questions:


- Degree of separation between two authors
- Find papers talking about a topic, and see the demographics (country of authors, affiliations of authors, time series of years) and graph statistics about that
- Influential Authors
- What percentage of research is international?
- Predict new collaborations
- Network Ranking Fairness


In [23]:
# 1. Create the ArangoGraph LangChain wrapper
# Reference: https://api.python.langchain.com/en/latest/graphs/langchain_community.graphs.arangodb_graph.ArangoGraph.html

arango_graph = ArangoGraph(db)

In [24]:
# 2. Define the llm object
# Note: You can use any llm you want. We will be using OpenAI for example purposes.

import os

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

llm.invoke("hello!")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 9, 'total_tokens': 19, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_eb9dce56a8', 'finish_reason': 'stop', 'logprobs': None}, id='run-83c37152-a5a0-449e-ba1e-2aee4b860ea8-0', usage_metadata={'input_tokens': 9, 'output_tokens': 10, 'total_tokens': 19, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [25]:
# 4. Define the Text to AQL Tool
# Reference: https://python.langchain.com/docs/integrations/graphs/arangodb/
# Reference: https://python.langchain.com/api_reference/community/chains/langchain_community.chains.graph_qa.arangodb.ArangoGraphQAChain.html

@tool
def text_to_aql_to_text(query: str):
    """This tool is available to invoke the
    ArangoGraphQAChain object, which enables you to
    translate a Natural Language Query into AQL, execute
    the query, and translate the result back into Natural Language.
    """

    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

    chain = ArangoGraphQAChain.from_llm(
    	llm=llm,
    	graph=arango_graph,
    	verbose=True,
        allow_dangerous_requests=True
    )

    result = chain.invoke(query)

    return str(result["result"])

In [26]:
# 5. Define the Text to NetworkX/cuGraph Tool
# Note: It is encouraged to experiment and improve this section! This is just a placeholder:

@tool
def text_to_nx_algorithm_to_text(query):
    """This tool is available to invoke a NetworkX Algorithm on
    the ArangoDB Graph. You are responsible for accepting the
    Natural Language Query, establishing which algorithm needs to
    be executed, executing the algorithm, and translating the results back
    to Natural Language, with respect to the original query.

    If the query (e.g traversals, shortest path, etc.) can be solved using the Arango Query Language, then do not use
    this tool.
    """

    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

    ######################
    print("1) Generating NetworkX code")

    text_to_nx = llm.invoke(f"""
    I have a NetworkX Graph called `G_adb`. It has the following schema: {arango_graph.schema}

    I have the following graph analysis query: {query}.

    Generate the Python Code required to answer the query using the `G_adb` object.

    Be very precise on the NetworkX algorithm you select to answer this query. Think step by step.

    Only assume that networkx is installed, and other base python dependencies.

    Always set the last variable as `FINAL_RESULT`, which represents the answer to the original query.

    Only provide python code that I can directly execute via `exec()`. Do not provide any instructions.

    Make sure that `FINAL_RESULT` stores a short & consice answer. Avoid setting this variable to a long sequence.

    Your code:
    """).content

    text_to_nx_cleaned = re.sub(r"^```python\n|```$", "", text_to_nx, flags=re.MULTILINE).strip()

    print('-'*10)
    print(text_to_nx_cleaned)
    print('-'*10)

    ######################

    print("\n2) Executing NetworkX code")
    global_vars = {"G_adb": G_adb, "nx": nx}
    local_vars = {}

    try:
        exec(text_to_nx_cleaned, global_vars, local_vars)
        text_to_nx_final = text_to_nx
    except Exception as e:
        print(f"EXEC ERROR: {e}")
        return f"EXEC ERROR: {e}"

        # TODO: Consider experimenting with a code corrector!
        attempt = 1
        MAX_ATTEMPTS = 3

        # while attempt <= MAX_ATTEMPTS
            # ...

    print('-'*10)
    FINAL_RESULT = local_vars["FINAL_RESULT"]
    print(f"FINAL_RESULT: {FINAL_RESULT}")
    print('-'*10)

    ######################

    print("3) Formulating final answer")

    nx_to_text = llm.invoke(f"""
        I have a NetworkX Graph called `G_adb`. It has the following schema: {arango_graph.schema}

        I have the following graph analysis query: {query}.

        I have executed the following python code to help me answer my query:

        ---
        {text_to_nx_final}
        ---

        The `FINAL_RESULT` variable is set to the following: {FINAL_RESULT}.

        Based on my original Query and FINAL_RESULT, generate a short and concise response to
        answer my query.

        Your response:
    """).content

    return nx_to_text

*Find the Distance Between Two Authors*


In [27]:
from langchain.tools import StructuredTool
from langchain.chat_models import ChatOpenAI
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Define the input schema
class NameVariationInput(BaseModel):
    name: str = Field(description="The full name of the author (e.g., 'Stephen Hawking')")

# Define LLM and prompt for generating variations
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

name_variation_prompt = PromptTemplate(
    input_variables=["name"],
    template="""
    Given the scientist's name "{name}", generate possible variations including:
    - The originally input name
    - Full name (if known)
    - Name with middle initials
    - Name with the first and middle names only as initials with and without a dot
    - Shortened first names (e.g., "Alex" for "Alexander")
    - Common abbreviations and spellings
    - Variations where the last name might have spacing issues (e.g., "De Witt" vs "DeWitt")
    - Variations where the name may contain punctuation marks (e.g., "O'hara")
    - For East Asian names, try variations where the given name is before or after the family name

    If you know who the scientist is, you can also try variations that include their full middle name and initials.

    DO NOT RETURN MORE THAN 10 VARIATIONS!

    Only return a **comma-separated list** of name variations.
    """
)

name_chain = LLMChain(llm=llm, prompt=name_variation_prompt)

# Function to call LLM for name variations
def generate_name_variations(name: str):
    result = name_chain.run(name)
    return result.split(", ")

# Define LangChain Tool
name_variation_tool = StructuredTool(
    name="GenerateNameVariations",
    func=generate_name_variations,
    description="Generates possible variations of a researcher's name to improve search accuracy.",
    args_schema=NameVariationInput
)



The class `ChatOpenAI` was deprecated in LangChain 0.0.10 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-openai package and should be used instead. To use it run `pip install -U :class:`~langchain-openai` and import as `from :class:`~langchain_openai import ChatOpenAI``.


The class `LLMChain` was deprecated in LangChain 0.1.17 and will be removed in 1.0. Use :meth:`~RunnableSequence, e.g., `prompt | llm`` instead.



In [28]:
class FindAuthorIDInput(BaseModel):
    name_variations: list[str] = Field(description="A list of possible author name variations")

def get_best_author_id(name_variations):
    if not name_variations:
        return None

    # Build the OR clauses, making both sides lowercase:
    #   LOWER(doc.name) LIKE CONCAT("%", LOWER(@var0), "%") OR ...
    filter_clauses = " OR ".join(
        f'LOWER(doc.name) LIKE CONCAT("%", LOWER(@var{i}), "%")'
        for i in range(len(name_variations))
    )

    query = f"""
    FOR doc IN Author
        FILTER {filter_clauses}
        LIMIT 1
        RETURN {{ name: doc.name, id: doc._id }}
    """

    # Bind parameters
    bind_vars = {f"var{i}": v for i, v in enumerate(name_variations)}

    result = list(db.aql.execute(query, bind_vars=bind_vars))
    return result[0] if result else None


# Define LangChain Tool
author_id_tool = StructuredTool(
    name="FindAuthorID",
    func=get_best_author_id,
    description="Finds an author's ID in the ArangoDB database using multiple name variations.",
    args_schema=FindAuthorIDInput
)

In [29]:
import networkx as nx

class FindAuthorDistanceInput(BaseModel):
    author1_id: str = Field(description="The ID of the first author")
    author2_id: str = Field(description="The ID of the second author")

def shortest_path_between_authors(author1_id: str, author2_id: str):
    if nx.has_path(G_adb, author1_id, author2_id):
        path = nx.shortest_path(G_adb, source=author1_id, target=author2_id)
        return f"Degrees of separation: {len(path)-1}\nPath: {' → '.join(path)}"
    else:
        return "No connection found between these authors."

# Define LangChain Tool
distance_tool = StructuredTool(
    name="FindAuthorDistance",
    func=shortest_path_between_authors,
    description="Finds the shortest path (degrees of separation) between two researchers based on co-authorship.",
    args_schema=FindAuthorDistanceInput
)


In [30]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.prompts import PromptTemplate

# Register tools
tools = [name_variation_tool, author_id_tool, distance_tool]

# Custom decision-making prompt
custom_prompt = PromptTemplate(
    input_variables=["input", "agent_scratchpad"],
    template="""
    You are an AI assistant specializing in academic research networks.
    You have access to a **graph database** and **graph algorithms**.

    **Available Tools:**
    1. **GenerateNameVariations(name)** → Generates name variations for better author search.
    2. **FindAuthorID(name_variations)** → Finds an author's ID in ArangoDB.
    3. **FindAuthorDistance(author1_id, author2_id)** → Finds the degrees of separation between two authors.

    **Decision Rules:**
    - If the user asks for **"distance between X and Y"**:
      1. Call **GenerateNameVariations(X)**
      2. Call **GenerateNameVariations(Y)**
      3. Call **FindAuthorID(name_variations) for X and Y**
      4. Call **FindAuthorDistance(author1_id, author2_id)**

    **Your Approach:**
    1. **Identify** the best tool sequence.
    2. **Extract parameters** (author names).
    3. **Call the tools in the correct order**.
    4. **Format a clear response** for the user.

    User Query: {input}

    {agent_scratchpad}
    """
)

def query_graph(query):
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

    agent = initialize_agent(
        tools=tools,
        llm=llm,
        agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,
        agent_kwargs={"prompt": custom_prompt},
        return_intermediate_steps=True
    )

    # Invoke the agent
    response = agent.invoke({"input": query})
    return response["output"]


In [31]:
query_graph("What is the distance between Edward Witten and Terence Tao?")


LangChain agents will continue to be supported, but it is recommended for new use cases to be built with LangGraph. LangGraph offers a more flexible and full-featured framework for building agents, including support for tool-calling, persistence of state, and human-in-the-loop workflows. For details, refer to the `LangGraph documentation <https://langchain-ai.github.io/langgraph/>`_ as well as guides for `Migrating from AgentExecutor <https://python.langchain.com/docs/how_to/migrate_agent/>`_ and LangGraph's `Pre-built ReAct agent <https://langchain-ai.github.io/langgraph/how-tos/create-react-agent/>`_.





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To find the distance between Edward Witten and Terence Tao, I need to first generate possible name variations for both researchers, find their author IDs, and then calculate the distance between them.
Action:
```
{
  "action": "GenerateNameVariations",
  "action_input": {
    "name": "Edward Witten"
  }
}
```[0m


The method `Chain.run` was deprecated in langchain 0.1.0 and will be removed in 1.0. Use :meth:`~invoke` instead.




Observation: [36;1m[1;3m['Edward Witten', 'Edward J. Witten', 'E. J. Witten', 'E J Witten', 'Ed Witten', 'Edward W. Witten', 'E. Witten', 'Ed W. Witten', 'Edward Witten', 'Edward J Witten'][0m
Thought:[32;1m[1;3mTo continue finding the distance between Edward Witten and Terence Tao, I will now generate name variations for Terence Tao.

Action:
```
{
  "action": "GenerateNameVariations",
  "action_input": {
    "name": "Terence Tao"
  }
}
```[0m
Observation: [36;1m[1;3m['Terence Tao', 'Terence Chi-Shen Tao', 'T. C. Tao', 'T C Tao', 'T. Tao', 'Terence C. Tao', 'Terry Tao', 'Terence C.-S. Tao', 'Tao Terence', 'Tao T.'][0m
Thought:[32;1m[1;3mQuestion: To find the distance between Edward Witten and Terence Tao, I need to find their author IDs using the generated name variations.
Thought: I need to find the author IDs for both Edward Witten and Terence Tao using their name variations.
Action:
```
{
  "action": "FindAuthorID",
  "action_input": {
    "name_variations": ["Edward Wi

[14:05:05 +0000] [INFO]: Graph 'physics' load took 0.9547224044799805s
INFO:nx_arangodb:Graph 'physics' load took 0.9547224044799805s
[14:05:05 +0000] [INFO]: NXCG Graph construction took 0.31113290786743164s
INFO:nx_arangodb:NXCG Graph construction took 0.31113290786743164s



Observation: [38;5;200m[1;3mNo connection found between these authors.[0m
Thought:[32;1m[1;3mIt seems there is no direct co-authorship connection between Edward Witten and Terence Tao in the database. This means they have not collaborated on a paper together, nor is there a known chain of co-authors linking them within the data available. If you have any other questions or need further assistance, feel free to ask![0m

[1m> Finished chain.[0m


'It seems there is no direct co-authorship connection between Edward Witten and Terence Tao in the database. This means they have not collaborated on a paper together, nor is there a known chain of co-authors linking them within the data available. If you have any other questions or need further assistance, feel free to ask!'

*Get Stats about a Topic*
We allow the user to input some physics-related topic, and then search our abstracts for that topic.  If we find it, we get the following information about the papers:
- Who worked on these topics
- Where in the world these authors are
- What are their affiliations
- What are the trends for this topic over time
- How influential are the authors who worked on this topic

In [32]:
from langchain.tools import StructuredTool
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import plotly.express as px

WORLD_MAP_URL = "https://naturalearth.s3.amazonaws.com/110m_cultural/ne_110m_admin_0_countries.zip"
world = gpd.read_file(WORLD_MAP_URL)


# Tool: Search Papers Based on Topic
def search_papers(search_term: str):
    """
    Finds papers matching a research topic using full-text search in ArangoDB.
    """
    query = """
    WITH Paper
    FOR doc IN abstract_search_view
    SEARCH ANALYZER(
        LIKE(doc.abstract, @search_term),
        "text_en"
    )
    RETURN { id: doc._id, title: doc.title, year: doc.year }
    """

    results = list(db.aql.execute(query, bind_vars={"search_term": f"%{search_term}%"}))
    return results


# Tool: Find Authors of Those Papers
def get_authors_for_papers(paper_ids):
    """
    Finds authors of the given papers and their demographics.
    """
    query = """
    WITH written_by, Author
    FOR e IN written_by
    FILTER e._from IN @paper_ids
    FOR a IN Author
    FILTER a._id == e._to
    RETURN { name: a.name, country: a.country, affiliation: a.affiliation, paper_id: e._from }

    """
    results = list(db.aql.execute(query, bind_vars={"paper_ids": paper_ids}))
    return results


def generate_plots(author_data, paper_data):
    """
    Creates interactive Plotly plots for:
    1. Country distribution (Choropleth Map)
    2. Top affiliations (Bar Chart)
    3. Yearly publication trends (Time Series) - Now using paper data
    """

    df_authors = pd.DataFrame(author_data)
    df_papers = pd.DataFrame(paper_data)

    ## 🌍 **World Map (Choropleth) - Country Distribution**
    country_counts = df_authors["country"].value_counts().reset_index()
    country_counts.columns = ["country", "count"]  # Rename columns correctly

    fig1 = px.choropleth(
        country_counts,
        locations="country",
        locationmode="country names",
        color="count",
        title="Researcher Distribution by Country",
        color_continuous_scale="Blues"
    )

    ## 📊 **Bar Chart - Top 10 Affiliations**
    affiliation_counts = df_authors["affiliation"].value_counts().nlargest(10).reset_index()
    affiliation_counts.columns = ["affiliation", "count"]

    fig2 = px.bar(
        affiliation_counts,
        x="affiliation",
        y="count",
        labels={"affiliation": "Institution", "count": "Number of Researchers"},
        title="Top 10 Research Institutions"
    )

    ## 📈 **Time Series - Yearly Publication Trends (Using Paper Data)**
    year_counts = df_papers["year"].value_counts().sort_index().reset_index()
    year_counts.columns = ["year", "count"]  # Fix column naming

    fig3 = px.line(
        year_counts,
        x="year",
        y="count",
        labels={"year": "Year", "count": "Papers Published"},
        title="Yearly Publication Trends"
    )

    # ✅ Show plots
    # fig1.show()
    # fig2.show()
    # fig3.show()

    # Return the JSON representations of the plots
    return {
        "plots": {
            "world_map": fig1.to_json(),
            "affiliations": fig2.to_json(),
            "yearly_trends": fig3.to_json()
        }
    }

In [33]:
from langchain_core.output_parsers.json import JsonOutputParser
from langchain.output_parsers.fix import OutputFixingParser

In [34]:
from typing import List, Dict
import json

class SearchPapersInput(BaseModel):
    search_term: str

class GetAuthorsForPapersInput(BaseModel):
    paper_ids: List[str]

class GeneratePlotsInput(BaseModel):
    author_data: List[Dict]
    paper_data: List[Dict]

tools = [
    StructuredTool(
        name="SearchPapers",
        func=search_papers,
        description="Finds research papers matching a given topic.",
        args_schema=SearchPapersInput
    ),
    StructuredTool(
        name="GetAuthorsForPapers",
        func=get_authors_for_papers,
        description="Finds the authors of a given set of papers.",
        args_schema=GetAuthorsForPapersInput
    ),
    StructuredTool(
        name="GeneratePlots",
        func=generate_plots,
        description="Generates demographic visualizations for a given research field.",
        args_schema=GeneratePlotsInput
    ),
]

parser = JsonOutputParser()

custom_prompt = PromptTemplate(
    input_variables=["input", "agent_scratchpad"],
    template="""
    You are an AI research assistant with access to a **scientific knowledge graph**.


    **Available Tools:**
    1️ **SearchPapers(topic)** → Finds papers matching a research topic.
    2️ **GetAuthorsForPapers(paper_ids)** → Finds authors of those papers.
    4️ **GeneratePlots(author_data, papers_data)** → Creates demographics & trend visualizations.

    **How to Answer:**
    - If the user asks for **"papers on X"**, call `SearchPapers(X)`.
    - If the user asks **"who are the researchers in X"**, call `GetAuthorsForPapers(paper_ids)`.
    - If the user asks **"which authors are most influential?"**, call `AnalyzeAuthorNetwork(authors)`.
    - If the user asks **"show me trends"**, call `GeneratePlots(author_data, papers_data)`.
    - If the user asks **"make me a dashboard for X"**, call `GeneratePlots(author_data, papers_data)`.

    **Output format:**
    ALWAYS return a valid JSON object. DO NOT explain the output.

    ```json
    {{
        "country_plot": "...",  // JSON from Plotly
        "affiliation_plot": "...",
        "yearly_plot": "..."
    }}
    ```


    **User Query:** {input}

    {agent_scratchpad}
    """
)

def query_topic(query):
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o", verbose=False)

    agent = initialize_agent(
        tools=tools,
        llm=llm,
        agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
        verbose=False,
        agent_kwargs={"prompt": custom_prompt},
        return_intermediate_steps=True
    )

    response = agent.invoke({"input": query}, output_parser=parser)

    # DEBUG
    print(f"{response=}")

    return response

In [35]:
captured_response = query_topic("Make me a dashboard for quantum")

response={'input': 'Make me a dashboard for quantum', 'output': 'The dashboard for quantum research has been successfully created. It includes the following visualizations:\n\n1. **World Map of Researcher Distribution**: This map shows the distribution of researchers by country, highlighting the countries with the most researchers in the field of quantum studies.\n\n2. **Top 10 Research Institutions**: A bar chart displaying the top 10 institutions with the highest number of researchers contributing to quantum research.\n\n3. **Yearly Publication Trends**: A line chart illustrating the number of quantum research papers published over the years, providing insights into trends and growth in the field.\n\nIf you need further details or specific visualizations, feel free to ask!', 'intermediate_steps': [(AgentAction(tool='SearchPapers', tool_input={'search_term': 'quantum'}, log='To create a dashboard for quantum research, I need to gather data on recent research papers and their authors i

In [36]:
plotly_plots = captured_response['intermediate_steps'][2][1]["plots"]


import plotly.io as pio

for key, fig_json in plotly_plots.items():
    fig = pio.from_json(fig_json)
    fig.show()

In [37]:
# 8. (Optional) Set up UI via Gradio


!pip install gradio



In [46]:
import gradio as gr
import plotly.io as pio

def extract_and_display(captured_response):
    """Extracts Plotly figures and returns them for display in Gradio."""
    if not captured_response:
        return None, None, None  # Default empty plots

    plotly_plots = captured_response['intermediate_steps'][2][1]["plots"]

    figs = []
    for key, fig_json in plotly_plots.items():
        fig = pio.from_json(fig_json)
        figs.append(fig)

    return tuple(figs)

# Create Gradio app
with gr.Blocks() as demo:
    gr.Markdown("# 📊 Quantum Dashboard - Plotly Visualizations")

    captured_response_state = gr.State(query_topic("Make me a dashboard for quantum"))

    # We expect 3 plots
    plot1 = gr.Plot()
    plot2 = gr.Plot()
    plot3 = gr.Plot()

    # Button to trigger visualization
    with gr.Row():
        btn = gr.Button("Generate Dashboard")

    # When button is clicked, extract and display plots
    btn.click(extract_and_display, inputs=[captured_response_state], outputs=[plot1, plot2, plot3])

# Launch the Gradio app
demo.launch(share=True, debug=False)


response={'input': 'Make me a dashboard for quantum', 'output': 'The dashboard for quantum research has been successfully created. It includes the following visualizations:\n\n1. **World Map of Researcher Distribution**: This map shows the distribution of researchers by country, highlighting the countries with the most researchers in the field of quantum studies.\n\n2. **Top 10 Research Institutions**: A bar chart displaying the top 10 institutions with the highest number of researchers contributing to quantum research.\n\n3. **Yearly Publication Trends**: A line chart illustrating the number of quantum research papers published over the years, providing insights into trends and growth in the field.\n\nIf you need further details or specific visualizations, feel free to ask!', 'intermediate_steps': [(AgentAction(tool='SearchPapers', tool_input={'search_term': 'quantum'}, log='To create a dashboard for quantum research, I need to gather data on recent research papers and their authors i

DEBUG: key='world_map' {"data":[{"coloraxis":"coloraxis","geo":"geo","hovertemplate":"country=%{location}\u003cbr\u003ecount=%{z}\u003cextra\u003e\u003c\u002fextra\u003e","locationmode":"country names","locations":["Russia","United States","United Kingdom","Austria","Netherlands","UNKNOWN","Switzerland","Singapore","Spain","Portugal","Australia","Germany","Israel"],"name":"","z":[11,8,3,2,1,1,1,1,1,1,1,1,1],"type":"choropleth"}],"layout":{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"tic

