In [None]:
# %pip install requests python-dotenv neo4j pydantic langchain-community

In [28]:
import os
import re
import json
import requests
import time
from typing import List
from dotenv import load_dotenv
from neo4j import GraphDatabase
from pydantic import BaseModel, Field
import warnings
from langchain_community.llms.ollama import Ollama

load_dotenv()

True

In [2]:
def get_json_request(url, timeout=30):
  """Fetch `url` with an HTTP GET and return the decoded JSON body.

  Args:
    url: Address of the JSON endpoint.
    timeout: Seconds to wait on the server before giving up, so a stalled
      connection cannot hang the notebook indefinitely.

  Returns:
    The parsed JSON payload, as produced by `Response.json()`.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: On connection problems or a timeout.
  """
  response = requests.get(url, timeout=timeout)
  # Fail loudly on error statuses instead of trying to parse an error page.
  response.raise_for_status()
  return response.json()

def strip_html(text):
  """Return `text` with every `<...>` tag-shaped span removed.

  Anything that is not a string (for example None) yields an empty string.
  """
  if isinstance(text, str):
    tag_pattern = re.compile("<.*?>")
    return tag_pattern.sub("", text)
  return ""

def preprocess_events(events):
  """Reduce raw calendar events to the fields used by the rest of the notebook.

  Args:
    events: Iterable of event dicts. Each must contain every key copied below;
      a missing key raises KeyError.

  Returns:
    A list with one dict per event. `description` is passed through
    `strip_html`, and the latitude/longitude values are converted to float.
  """
  def to_coordinate(value):
    # A missing, blank, or non-numeric coordinate falls back to 0 rather than
    # aborting the whole batch with a conversion error.
    if value is None:
      return 0
    try:
      return float(value)
    except (TypeError, ValueError):
      return 0

  return [
    {
      "title": event["title"],
      "group_title": event["group_title"],
      "url": event["url"],
      "description": strip_html(event["description"]),
      "date": event["date"],
      "date_time": event["date_time"],
      "location": event["location"],
      "location_title": event["location_title"],
      "location_latitude": to_coordinate(event["location_latitude"]),
      "location_longitude": to_coordinate(event["location_longitude"]),
      "cost": event["cost"],
      "thumbnail": event["thumbnail"],
      "event_types": event["event_types"],
      "event_types_audience": event["event_types_audience"],
    }
    for event in events
  ]

In [3]:
# Root folder for the GraphRAG index, placed under the current working directory.
index_root = os.path.join(os.getcwd(), "graphrag_index")

# Make sure the "input" subfolder exists; this is a no-op when it is already there.
os.makedirs(
    os.path.join(index_root, "input"),
    exist_ok=True,
)

In [4]:
tamu_events_url = "https://calendar.tamu.edu/live/json/events/group"
raw_events = get_json_request(tamu_events_url)
processed_events = preprocess_events(raw_events)

# Upper bound on how many events are written to disk.
# 101 keeps the previous behaviour, which stopped only after writing index 100.
MAX_SAVED_EVENTS = 101

# Save processed data to file, one JSON document per line.
file_path = "inputEvents.txt"
with open(file_path, 'w', encoding="utf-8") as file:
    for event in processed_events[:MAX_SAVED_EVENTS]:
        file.write(json.dumps(event) + "\n")

In [5]:
# Read the saved events back in: each line of the file becomes one document
# string with surrounding whitespace removed.
with open(file_path, 'r') as file:
    loaded_docs = [line.strip() for line in file]

documents = loaded_docs

In [None]:
# Set up the Neo4j database connection.
# The URI, username and password are read from the NEO4J_URI, NEO4J_USERNAME
# and NEO4J_PASSWORD environment variables; a missing variable raises KeyError.
# If you change your user/password, you need to restart the kernel for the
# changes to take effect.
driver = GraphDatabase.driver(
    uri=os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
)

In [7]:
def create_fulltext_index(tx):
    """Create the `fulltext_entity_name` full-text index unless it already exists.

    The index covers the `name` property of nodes labelled `Entity`.
    `IF NOT EXISTS` makes the statement idempotent, so re-running the cell does
    not raise just because the index is already present.

    Args:
        tx: Transaction-like object exposing `run(query)`.
    """
    query = '''
    CREATE FULLTEXT INDEX fulltext_entity_name IF NOT EXISTS
    FOR (n:Entity) 
    ON EACH [n.name];
    '''
    tx.run(query)

def create_index():
    """Run `create_fulltext_index` inside a write transaction on a fresh session."""
    with driver.session() as db_session:
        db_session.execute_write(create_fulltext_index)

# Attempt the index creation once; any failure is reported and then ignored so
# the rest of the notebook can keep running.
try:
    create_index()
except Exception as index_error:
    print("The index already exists or there was an error:", index_error)

In [None]:
# A single extracted entity: both fields are required strings.
class EntityItem(BaseModel):
    name: str  # the entity's text
    type: str  # the category assigned to the entity

# Container model for all entities extracted from one piece of text.
class Entities(BaseModel):
    # Required field (`...` means no default): the list of extracted entities.
    names: List[EntityItem] = Field(
        ...,
        description="List of entities with 'name' and 'type', focusing on event-related entities."
    )

def _parse_entity_items(response):
    """Pull the JSON array out of an LLM reply and keep only the usable entities."""
    # The model may wrap the array in prose or code fences, so grab the
    # outermost [...] span instead of parsing the whole reply.
    json_match = re.search(r"\[.*\]", response, re.DOTALL)
    if not json_match:
        raise ValueError("no valid json array found in LLM response")

    usable = []
    for item in json.loads(json_match.group()):
        # Drop malformed items individually rather than failing the whole reply.
        if not isinstance(item, dict):
            continue

        name = item.get("name")
        if isinstance(name, list):
            # A list of names is flattened into one string, but only when every
            # part is a non-empty string.
            if not name or not all(isinstance(part, str) and part for part in name):
                continue
            name = ", ".join(name)
        if not isinstance(name, str) or not name.strip():
            continue

        entity_type = item.get("type")
        if not isinstance(entity_type, str) or not entity_type.strip():
            continue

        usable.append({"name": name, "type": entity_type})
    return usable


def extract_entities(text, max_retries=3):
    """Ask the `mistral` Ollama model for the entities mentioned in `text`.

    Args:
        text: Text to analyse; it is embedded verbatim in the prompt.
        max_retries: How many times the model is queried before giving up.

    Returns:
        An `Entities` instance holding every entity that has a non-blank string
        `name` and `type`. Entities that do not meet this are dropped.

    Raises:
        ValueError: If no attempt produced a reply containing a parseable JSON
            array. The last underlying error is attached as the cause.
    """
    prompt_template = f"""
    Find relevant entities in the following text, extracting "Event", "Event_Type", 
    "Event_Types_Audience", "Speakers", "Location", "Department_or_Organization", "Topic", and 
    "Date" entities. Format the output as a JSON list, where each item has 'name' and 'type' keys.
    
    Do not add any extra explanation or commentary, just the output specified above.
    Create a list of entities with `name` and `type` fields, ensuring that each entity has a 
    non-null `name` value. If you can't find the `name`, do not include the entity in the response.

    Text: "{text}"
    """

    llm = Ollama(model="mistral", temperature=0.0, num_predict=1000)

    last_error = None
    for attempt in range(max_retries):
        response = llm.invoke(prompt_template)
        print(f"attempt {attempt + 1} response:")
        print(response)

        try:
            return Entities.model_validate({"names": _parse_entity_items(response)})
        except Exception as e:
            last_error = e
            print(f"error getting entities on attempt {attempt + 1}: {e}")
            # Pause only when another attempt will actually follow.
            if attempt + 1 < max_retries:
                time.sleep(1)

    raise ValueError("failed to get a valid response from LLM") from last_error


In [None]:
# TODO: create an individual node for each audience type (faculty, staff, students, etc.) instead of one combined node.

In [31]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

def _cypher_label(raw_type):
    """Turn an entity type into a label that is safe to splice into Cypher."""
    # Labels cannot be passed as query parameters, so the text is interpolated.
    # Collapsing every run of non-word characters to "_" removes backticks,
    # spaces and punctuation that would break (or inject into) the query.
    label = re.sub(r"\W+", "_", str(raw_type))
    return label or "Entity"

#insert documents with extracted entities into the graph
def add_document_to_graph(document, entities):
    """Link `document` to each of its entities in the graph.

    For every entity, a node labelled with the (sanitised) entity type and a
    `Document` node holding the text are merged, then joined by a `MENTIONS`
    relationship.

    Args:
        document: Document text, stored as the `text` property of the node.
        entities: Object whose `names` attribute lists items with `name` and
            `type` attributes.
    """
    with driver.session() as session:
        for entity in entities.names:
            # TODO (original author): the label-per-entity-type scheme needs to
            # change later.
            label = _cypher_label(entity.type)
            query = f"""
            MERGE (e:`{label}` {{name: $name}})
            MERGE (d:Document {{text: $text}})
            MERGE (d)-[:MENTIONS]->(e)
            """

            session.run(query, name=entity.name, text=document)

#process documents
for doc in documents:
    try:
        entities = extract_entities(doc)
        # print("extracted entities:", entities)
        add_document_to_graph(doc, entities)
    except ValueError as e:
        # Report why the document was dropped, then carry on with the next one.
        print(f"Skipping text due to LLM failure: {doc}")
        print(f"  reason: {e}")

# print("extracted entities:", entities)
# print(doc)

Attempt 1 response:
```json
[
  {
    "name": "Learn to Bird 2024",
    "type": "Event"
  },
  {
    "name": "Campus Life",
    "type": "Event_Type"
  },
  {
    "name": "Students, Researcher, Staff, and/or Faculty",
    "type": "Event_Types_Audience"
  },
  {
    "name": "Texas A&M AgriLife Extension Service",
    "type": "Department_or_Organization"
  },
  {
    "name": "Birding",
    "type": "Topic"
  },
  {
    "name": "November 9, 2024",
    "type": "Date"
  },
  {
    "name": "100 W. Houston St., Sherman",
    "type": "Location"
  }
]
```
Attempt 1 response:
```json
[
  {
    "name": "Preregistration for Spring 2025",
    "type": "Event"
  },
  {
    "name": "Spring 2025 preregistration",
    "type": "Event_Type"
  },
  {
    "name": "Students, Researcher, Staff, and/or Faculty",
    "type": "Event_Types_Audience"
  },
  {
    "name": "Office of the Registrar",
    "type": "Department_or_Organization"
  },
  {
    "name": "Preregistration for spring semester",
    "type": "Topic"

In [None]:
# Silence every warning category from here on.
warnings.filterwarnings(action="ignore")

# Model used to phrase the final answer in graph_retriever.
llm = Ollama(
    model="llama3.1",
    num_predict=500,
    temperature=0.0,
)

def graph_retriever(question: str):
    """Answer `question` using event context retrieved from the graph.

    Steps:
      1. Extract entities from the question with `extract_entities`.
      2. Split every entity name on whitespace into lower-cased search terms.
      3. Score each `Event` node by how many terms occur in its name or in the
         text/name of nodes reachable through one or two `MENTIONS` hops, and
         keep the three best matches.
      4. Hand the retrieved context and the question to the module-level `llm`.

    Args:
        question: The user's natural-language question.

    Returns:
        The LLM's answer with surrounding whitespace stripped.

    Raises:
        ValueError: Propagated from `extract_entities` when no entities could
            be obtained for the question.
    """
    result = ""
    entities = extract_entities(question)  # extract entities from the question

    entity_terms = []
    for entity in entities.names:
        #split compound names into individual lower-cased terms
        entity_terms.extend([
            term.lower() for term in entity.name.split()
        ])

    if entity_terms:
        # Open the session in a `with` block so it is always closed, and read
        # the records before the block ends.
        with driver.session() as session:
            response = session.run(
                """
            MATCH (e:Event)
            CALL {
                WITH e
                MATCH (e)-[:MENTIONS*1..2]-(related)
                WHERE related.text IS NOT NULL OR related.name IS NOT NULL
                RETURN COLLECT(DISTINCT COALESCE(related.text, related.name)) as relatedTexts
            }
            WITH e, relatedTexts, [text IN relatedTexts WHERE text IS NOT NULL | toLower(text)] as lowerTexts
            WITH e, relatedTexts, lowerTexts,
                //calculate a match score based on how many terms are found
                size([term IN $query_terms WHERE 
                    toLower(e.name) CONTAINS term
                    OR ANY(text IN lowerTexts WHERE text CONTAINS term)
                ]) as matchScore
            WHERE matchScore > 0  //at least one term found in the node
            RETURN 
                'Event: ' + e.name + 
                '\nMatch Score: ' + toString(matchScore) + '/' + toString(size($query_terms)) +
                '\nContext: ' + 
                reduce(s = "", text IN relatedTexts | s + "\n- " + text) as output
            ORDER BY matchScore DESC  //show best matches first
            LIMIT 3
            """,
                {"query_terms": entity_terms}
            )

            result += "\n".join([el['output'] for el in response if el['output'] is not None])

    prompt = f"""
    Based on the following graph data, answer the user's question within 350 words.
    Do not mention the graph, just focus on answering the user's question.
    
    Graph data:
    {result}
    
    Question:
    '{question}'
    """
    
    print("results:")
    print(result)

    print("llm response")
    #generate a response using the LLM
    llm_response = llm.invoke(prompt).strip()

    return llm_response

In [74]:
# Smoke test: run one question through the retriever and show the answer.
test_question = "Which events are for students and researchers, but not for faculty or staff?"
test_answer = graph_retriever(test_question)
print(test_answer)
# Possible follow-up: let the LLM modify the Cypher query when necessary.

# Release the Neo4j connection now that the notebook is finished with it.
driver.close()



Attempt 1 response:
 [
      {
        "name": "Events",
        "type": "Event"
      },
      {
        "name": "Students",
        "type": "Event_Types_Audience"
      },
      {
        "name": "Researchers",
        "type": "Event_Types_Audience"
      },
      {
        "name": null,
        "type": "Department_or_Organization"
      },
      {
        "name": null,
        "type": "Topic"
      },
      {
        "name": null,
        "type": "Date"
      },
      {
        "name": "Faculty",
        "type": "Excluded_Audience"
      },
      {
        "name": "Staff",
        "type": "Excluded_Audience"
      }
    ]
results:
Event: Intro Backpacking Weekend with Outdoor Advenures
Match Score: 4/5
Context: 
- {"title": "Intro Backpacking Weekend with Outdoor Advenures", "group_title": "Rec Sports", "url": "https://calendar.tamu.edu/live/events/335807-intro-backpacking-weekend", "description": "\n  Experience some scenery you can\u2019t see driving along the Texas highways. See 