In [1]:
%pip install --upgrade --quiet  langchain langchain-community langchain-ollama langchain-experimental neo4j tiktoken yfiles_jupyter_graphs python-dotenv json-repair langchain-openai langchain_core


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [86]:
import os
import re
import json
import requests
from typing import List

from dotenv import load_dotenv
from neo4j import GraphDatabase
from pydantic import BaseModel, Field
import warnings

#LangChain imports
from langchain_community.llms.ollama import Ollama

load_dotenv()

True

In [79]:
def get_json_request(url, timeout=30):
  """Fetch `url` with an HTTP GET and return the decoded JSON body.

  Args:
    url: Address of the JSON endpoint.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout `requests` waits indefinitely, which hangs the notebook cell.

  Returns:
    The parsed JSON payload (whatever `Response.json()` produces).

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
    requests.RequestException: on connection problems or timeout.
  """
  response = requests.get(url, timeout=timeout)
  # Fail loudly on error statuses instead of trying to JSON-decode an error page.
  response.raise_for_status()
  return response.json()

def strip_html(text):
  """Return `text` with every `<...>` tag removed.

  Anything that is not a string (e.g. None) yields an empty string.
  """
  if isinstance(text, str):
    # Non-greedy match so each tag is removed individually.
    return re.sub(r"<.*?>", "", text)
  return ""

def _coordinate_or_zero(value):
  """Convert a latitude/longitude value to float, falling back to 0.

  None, empty strings and other unparsable values all map to 0 so that one
  malformed event cannot abort preprocessing of the whole feed.
  """
  if value is None:
    return 0
  try:
    return float(value)
  except (TypeError, ValueError):
    return 0


def preprocess_events(events):
  """Build a list of flat event dictionaries from raw event records.

  Each input record must provide every key read below (a missing key raises
  KeyError). Fields are copied unchanged except:
    * "description" has HTML tags removed via `strip_html`;
    * "location_latitude" / "location_longitude" are converted to float,
      with 0 used when the value is missing or not numeric.

  Args:
    events: Iterable of dict-like event records.

  Returns:
    A list with one dictionary per input event.
  """
  return [
    {
      "title": event["title"],
      "group_title": event["group_title"],
      "url": event["url"],
      "description": strip_html(event["description"]),
      "date": event["date"],
      "date_time": event["date_time"],
      "location": event["location"],
      "location_title": event["location_title"],
      "location_latitude": _coordinate_or_zero(event["location_latitude"]),
      "location_longitude": _coordinate_or_zero(event["location_longitude"]),
      "cost": event["cost"],
      "thumbnail": event["thumbnail"],
      "event_types": event["event_types"],
      "event_types_audience": event["event_types_audience"],
    }
    for event in events
  ]

In [80]:
# Working folder for the index, rooted at the current working directory.
index_root = os.path.join(os.getcwd(), 'graphrag_index')

# Make sure the "input" sub-folder exists; a pre-existing folder is fine.
input_dir = os.path.join(index_root, 'input')
os.makedirs(input_dir, exist_ok=True)

In [81]:
# Download the calendar feed and flatten it into plain dictionaries.
tamu_events_url = "https://calendar.tamu.edu/live/json/events/group"
raw_events = get_json_request(tamu_events_url)
processed_events = preprocess_events(raw_events)

# Development cap on how many events are exported (and therefore sent through
# the LLM in later cells). Set to None to export every event.
MAX_EVENTS = 3

#save processed data to file, one JSON object per line
file_path = "inputEvents.txt"
with open(file_path, 'w', encoding="utf-8") as file:
    for event in processed_events[:MAX_EVENTS]:
        file.write(json.dumps(event) + "\n")

In [53]:
# Read the exported events back: one whitespace-trimmed line per document.
with open(file_path, 'r') as file:
    loaded_docs = [line.strip() for line in file]

documents = loaded_docs

In [58]:
# Neo4j connection; URI and credentials are read from environment variables
# (a missing variable raises KeyError here).
neo4j_uri = os.environ["NEO4J_URI"]
neo4j_auth = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
driver = GraphDatabase.driver(neo4j_uri, auth=neo4j_auth)

def create_fulltext_index(tx):
    """Create the `fulltext_entity_id` full-text index if it is missing.

    The index covers the `id` property of nodes labelled `Entity`.
    `IF NOT EXISTS` makes the statement idempotent, so re-running the cell
    no longer raises just because the index was created on a previous run.

    Args:
        tx: A transaction-like object exposing `run(query)`.
    """
    query = '''
    CREATE FULLTEXT INDEX `fulltext_entity_id` IF NOT EXISTS
    FOR (n:Entity)
    ON EACH [n.id]
    '''
    tx.run(query)

def create_index():
    """Run `create_fulltext_index` inside a write transaction.

    Opens a session on the module-level `driver`; the session is closed when
    the `with` block exits. Any error raised while creating the index
    propagates to the caller.
    """
    with driver.session() as session:
        session.execute_write(create_fulltext_index)

# Index creation is best-effort: report the failure but let the notebook go on.
# Catch Exception (not a bare except) so Ctrl-C / kernel interrupts still work,
# and include the actual error so real problems are not mistaken for
# "already exists".
try:
    create_index()
except Exception as exc:
    print(f"The index already exists or there was an error: {exc}")


The index already exists or there was an error.


In [83]:
class EntityItem(BaseModel):
    """One extracted entity: its `name` and a free-form `type` string."""
    name: str
    type: str

class Entities(BaseModel):
    """Container for the list of entities extracted from one piece of text."""
    # Required field (`...` means no default); each element is an EntityItem.
    names: List[EntityItem] = Field(
        ...,
        description="List of entities with 'name' and 'type', focusing on event-related entities."
    )
    
def extract_entities(text):
    """Ask the local Ollama model for the entities mentioned in `text`.

    Args:
        text: Free text (an event record or a user question).

    Returns:
        An `Entities` instance. When the model output cannot be parsed or
        validated, the error is printed and an *empty* `Entities` is returned,
        so callers can always iterate over `.names`.
    """
    prompt = f"""
    Find relevant entities in the following text, extracting event title,
    speakers, locations, general subject matter, and other event-related entities 
    for creating a knowledge graph. 
    Format the output as a JSON list, where each item has 'name' and 'type' keys.
    Do not add any extra explanation or commentary, just the output specified above.
    Create a list of entities with `name` and `type` fields, ensuring that each entity has a 
    non-null `name` value. If you can't find the `name`, do not include the entity in the response.

    Text: "{text}"
    """

    llm = Ollama(model="mistral", temperature=0.0, num_predict=1000)

    response = llm.invoke(prompt)

    try:
        payload = response.strip()
        # Models often wrap the list in code fences or add commentary despite
        # the instructions; keep only the outermost JSON array when present.
        start, end = payload.find("["), payload.rfind("]")
        if start != -1 and end > start:
            payload = payload[start:end + 1]
        # Constructing the model validates the items and avoids the
        # version-specific `parse_obj` helper.
        return Entities(names=json.loads(payload))
    except Exception as e:
        print(f"Error parsing entities: {e}")
        # Same type as the success path, so `.names` is always available.
        return Entities(names=[])


In [87]:
# Hide DeprecationWarning messages for the rest of the kernel session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

#insert documents with extracted entities into the graph
def add_document_to_graph(document, entities):
    """Store `document` and its entities in Neo4j, linked by MENTIONS.

    For every entity a node labelled with the entity's type is merged on
    `name`; it additionally receives the `Entity` label and an `id` property
    so that the `fulltext_entity_id` index (defined on `Entity.id`) can find
    it. The document node is merged on `text` and also gets an `id`, because
    the retrieval query builds its output from `id` properties.

    Args:
        document: The document text.
        entities: An `Entities` instance. Anything without a `.names`
            attribute (e.g. an empty list) is treated as "no entities".
    """
    entity_items = getattr(entities, "names", None) or []
    if not entity_items:
        return

    with driver.session() as session:
        for entity in entity_items:
            name = (entity.name or "").strip()
            if not name:
                continue

            # Labels cannot be sent as query parameters, so the type has to be
            # interpolated. Quote it with backticks (doubling embedded ones)
            # so spaces or punctuation in LLM-produced types neither break
            # the query nor inject Cypher.
            label = (entity.type or "").strip().replace("`", "``") or "Entity"

            query = f"""
            MERGE (e:`{label}` {{name: $name}})
            MERGE (d:Document {{text: $text}})
            MERGE (d)-[:MENTIONS]->(e)
            SET e:Entity, e.id = $name, d.id = $text
            """

            session.run(query, name=name, text=document)

#process documents
# For each loaded document: extract its entities with the LLM, then write the
# document and those entities to the graph. `doc` and `entities` stay defined
# in the notebook namespace after the loop (holding the last document).
for doc in documents:
    entities = extract_entities(doc)
    add_document_to_graph(doc, entities)

# print("extracted entities:", entities)
# print(doc)

In [92]:
# Silence every warning category from here on (broader than the
# DeprecationWarning-only filter used earlier).
warnings.filterwarnings("ignore")

# Module-level model used by graph_retriever to write the final answer.
# NOTE(review): num_predict presumably caps the generated length - confirm
# against the Ollama wrapper documentation.
llm = Ollama(model="mistral", temperature=0.0, num_predict=150)

# Characters with special meaning in Lucene query syntax.
_LUCENE_SPECIAL_CHARS = re.compile(r'([+\-&|!(){}\[\]^"~*?:\\/])')


def _escape_lucene(term):
    """Backslash-escape Lucene query syntax so `term` is searched literally."""
    return _LUCENE_SPECIAL_CHARS.sub(r"\\\1", term)


def graph_retriever(question: str):
    """Answer `question` using MENTIONS relationships found in the graph.

    Steps:
      1. Extract entities from the question with `extract_entities`.
      2. For each entity, look up matching nodes through the
         `fulltext_entity_id` index and collect their MENTIONS relationships
         (both directions) as text lines.
      3. Hand the question plus the collected lines to the module-level
         `llm` and return its stripped answer.

    Args:
        question: The user's natural-language question.

    Returns:
        The model's answer as a string.
    """
    entities = extract_entities(question)  #extract entities from the question
    # Tolerate a failed extraction that produced something without `.names`.
    entity_items = getattr(entities, "names", None) or []

    graph_lines = []
    if entity_items:
        # One session for all lookups, closed deterministically on exit.
        with driver.session() as session:
            for entity in entity_items:
                # Names such as "Workshop: Intro" contain Lucene operators;
                # escape them so the full-text query does not fail to parse.
                search_term = _escape_lucene(entity.name or "").strip()
                if not search_term:
                    continue

                #query the graph for each entity's name
                # `WITH node` inside each subquery branch imports the index
                # hit; without it `node` would be a fresh variable matching
                # any node. coalesce() keeps the output non-null for nodes
                # that only carry `name` or `text`.
                response = session.run(
                    """
                    CALL db.index.fulltext.queryNodes('fulltext_entity_id', $query, {limit:2})
                    YIELD node, score
                    CALL {
                        WITH node
                        MATCH (node)-[r:MENTIONS]->(neighbor)
                        RETURN coalesce(node.id, node.name, node.text) + ' - ' + type(r) + ' -> '
                               + coalesce(neighbor.id, neighbor.name, neighbor.text) AS output
                        UNION ALL
                        WITH node
                        MATCH (node)<-[r:MENTIONS]-(neighbor)
                        RETURN coalesce(neighbor.id, neighbor.name, neighbor.text) + ' - ' + type(r) + ' -> '
                               + coalesce(node.id, node.name, node.text) AS output
                    }
                    RETURN output LIMIT 50
                    """,
                    {"query": search_term}
                )

                graph_lines.extend(
                    el['output'] for el in response if el['output'] is not None
                )

    # Join once at the end so lines from different entities stay separated.
    result = "\n".join(graph_lines)

    prompt = f"""
    Based on the following graph data, answer the user's question within 100 words.
    Do not mention the graph, just focus on answering the user's question.
    '{question}'
    
    Graph data:
    {result}
    """

    #generate a response using the LLM
    llm_response = llm.invoke(prompt).strip()

    return llm_response

#test
# Manual smoke test: ask one question and print the generated answer.
print(graph_retriever("Tell me about Aggie One Stop."))

# Release the Neo4j connection. NOTE(review): after this, cells that use
# `driver` presumably need the connection cell re-run first - confirm.
driver.close()




Aggie One Stop is a student service center at Texas A&M University. It provides various services such as registration, financial aid, billing, and academic advising all under one roof. This makes it easier for students to access the help they need in one convenient location. Additionally, Aggie One Stop offers resources like tutoring, career services, and disability services. It's a valuable resource for students at Texas A&M University.
