In [None]:
%pip install requests python-dotenv neo4j pydantic langchain-community

In [15]:
import os
import re
import json
import requests
from typing import List
from dotenv import load_dotenv
from neo4j import GraphDatabase
from pydantic import BaseModel, Field
import warnings
from langchain_community.llms.ollama import Ollama

load_dotenv()

True

In [16]:
def get_json_request(url):
  return requests.get(url).json()

def strip_html(text):
  """ remove HTML tags from a string """
  if not isinstance(text, str):
    return ""
  clean = re.compile("<.*?>")
  return re.sub(clean, "", text)

def preprocess_events(events):
  """ construct dictionary from event data """
  return [
    {
      "title": event["title"],
      "group_title": event["group_title"],
      "url": event["url"],
      "description": strip_html(event["description"]),
      "date": event["date"],
      "date_time": event["date_time"],
      "location": event["location"],
      "location_title": event["location_title"],
      "location_latitude": float(event["location_latitude"]) if event["location_latitude"] != None else 0,
      "location_longitude": float(event["location_longitude"]) if event["location_longitude"] != None else 0,
      "cost": event["cost"],
      "thumbnail": event["thumbnail"],
      "event_types": event["event_types"],
      "event_types_audience": event["event_types_audience"],
    }
    for event in events
  ]

In [33]:
index_root = os.path.join(os.getcwd(), 'graphrag_index')
os.makedirs(os.path.join(index_root, 'input'), exist_ok=True)

In [34]:
tamu_events_url = "https://calendar.tamu.edu/live/json/events/group"
raw_events = get_json_request(tamu_events_url)
processed_events = preprocess_events(raw_events)

#save processed data to file
file_path = "inputEvents.txt"
with open(file_path, 'w') as file:
    for j, event in enumerate(processed_events):
        file.write(json.dumps(event) + "\n")
        if j == 100:
            break

In [35]:
loaded_docs = []
with open(file_path, 'r') as file:
    for line in file:
        loaded_docs.append(line.strip())

documents = loaded_docs

In [41]:
#set up Neo4j database connection
driver = GraphDatabase.driver(
    uri=os.environ["NEO4J_URI"],
    auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])
)

In [42]:
def create_fulltext_index(tx):
    query = '''
    CREATE FULLTEXT INDEX fulltext_entity_name 
    FOR (n:Entity) 
    ON EACH [n.name];
    '''
    tx.run(query)

def create_index():
    with driver.session() as session:
        session.execute_write(create_fulltext_index)

try:
    create_index()
except Exception as e:
    print("The index already exists or there was an error:", e)

The index already exists or there was an error: {code: Neo.ClientError.Security.Unauthorized} {message: The client is unauthorized due to authentication failure.}


In [43]:
class EntityItem(BaseModel):
    name: str
    type: str

class Entities(BaseModel):
    names: List[EntityItem] = Field(
        ...,
        description="List of entities with 'name' and 'type', focusing on event-related entities."
    )
    
def extract_entities(text):
    prompt = f"""
    Find relevant entities in the following text, extracting "Event", "Event_Type", 
    "Event_Types_Audience", "Speakers", "Location", "Department_or_Organization", "Topic", and 
    "Date" entities. Format the output as a JSON list, where each item has 'name' and 'type' keys.
    
    Note that for Event_Type, the possible types are only: Campus Life; Training & Workshops;
    Sports & Athletics; Speakers, Forums, and Conferences; Deadlines; General Interest; Academic 
    Calendar; Arts & Entertainment; and International Students.
    
    For Event_Types_Audience, the possible audiences are only: Students, Researcher, Staff, and/or
    Faculty.

    Do not add any extra explanation or commentary, just the output specified above.
    Create a list of entities with `name` and `type` fields, ensuring that each entity has a 
    non-null `name` value. If you can't find the `name`, do not include the entity in the response.

    For example:

    Text: "title": "ICF Open", "group_title": "Department of Rec Sports", "url": "", 
    "description": "", "date": "November 7", "date_time": "3:00pm - 10:00pm", "location": null, 
    "location_title": null, "location_latitude": 0, "location_longitude": 0, "cost": null,
    "thumbnail": null, "event_types": null, "event_types_audience": null

    Your response:

    [
        {{
            "name": "ICF Open",
            "type": "Event"
        }},
        {{
            "name": "Department of Rec Sports",
            "type": "Department_or_Organization"
        }},
        {{
            "name": "November 7",
            "type": "Date"
        }}
    ]
    
    Text: "What tech events are happening on October 4 for researchers?"
    
    Your response:
    
    [
        {{
            "name": "Department of Computer Science and Engineering",
            "type": "Department_or_Organization"
        }},
        {{
            "name": "Technology",
            "type": "Topic"
        }},
        {{
            "name": "October 4",
            "type": "Date"
        }},
        {{
            "name": "Researcher",
            "type": "Event_Types_Audience"
        }}        
    ]
    
    Your response should be formatted like above, with only a single pair of
    square brackets and curly brackets used internally.

    Text: "{text}"
    """
    
    llm = Ollama(model="llama3.1", temperature=0.0, num_predict=1000)

    response = llm.invoke(prompt)
    print("llm response:")
    print(response)

    try:
        raw_entities = json.loads(response.strip())

        #ensure all entities have a 'name'
        valid_entities = []
        for entity in raw_entities:
            name = entity.get('name')
            if isinstance(name, list):
                #if name is a list, concatenate it into a single string
                entity['name'] = ", ".join(name)
            elif not isinstance(name, str):
                continue

            if isinstance(entity['name'], str):
                valid_entities.append(entity)

        entities = Entities.parse_obj({"names": valid_entities})
        return entities

    except Exception as e:
        print(f"Error parsing entities: {e}")
        return []

In [None]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

#insert documents with extracted entities into the graph
def add_document_to_graph(document, entities):
    with driver.session() as session:
        for entity in entities.names:
            # set the Cypher query with the label based on entity type
            #NEED TO CHANGE LATER**********
            query = f"""
            MERGE (e:{entity.type} {{name: $name}})
            MERGE (d:Document {{text: $text}})
            MERGE (d)-[:MENTIONS]->(e)
            """
            
            session.run(query, name=entity.name, text=document)

#process documents
for doc in documents:
    entities = extract_entities(doc)
    # print("extracted entities:", entities)
    add_document_to_graph(doc, entities)

# print("extracted entities:", entities)
# print(doc)

In [None]:
warnings.filterwarnings("ignore")

llm = Ollama(model="llama3.1", temperature=0.0, num_predict=500)

def graph_retriever(question: str):
    result = ""
    entities = extract_entities(question)  # extract entities from the question
    
    entity_terms = []
    for entity in entities.names:
        #split compound terms and add both full and partial terms
        entity_terms.extend([
            term.lower() for term in entity.name.split()
        ])
    
    if entity_terms:
        response = driver.session().run(
            """
            MATCH (e:Event)
            CALL {
                WITH e
                MATCH (e)-[:MENTIONS*1..2]-(related)
                WHERE related.text IS NOT NULL OR related.name IS NOT NULL
                RETURN COLLECT(DISTINCT COALESCE(related.text, related.name)) as relatedTexts
            }
            WITH e, relatedTexts, [text IN relatedTexts WHERE text IS NOT NULL | toLower(text)] as lowerTexts
            WITH e, relatedTexts, lowerTexts,
                // Calculate a match score based on how many terms are found
                size([term IN $query_terms WHERE 
                    toLower(e.name) CONTAINS term
                    OR ANY(text IN lowerTexts WHERE text CONTAINS term)
                ]) as matchScore
            WHERE matchScore > 0  // At least one term must match
            RETURN 
                'Event: ' + e.name + 
                '\nMatch Score: ' + toString(matchScore) + '/' + toString(size($query_terms)) +
                '\nContext: ' + 
                reduce(s = "", text IN relatedTexts | s + "\n- " + text) as output
            ORDER BY matchScore DESC  // Show best matches first
            """,
            {"query_terms": entity_terms}
        )

        result += "\n".join([el['output'] for el in response if el['output'] is not None])

    prompt = f"""
    Based on the following graph data, answer the user's question within 450 words.
    Do not mention the graph, just focus on answering the user's question.
    
    Graph data:
    {result}
    
    Question:
    '{question}'
    """
    
    print("results:")
    print(result)
    
    #generate a response using the LLM
    llm_response = llm.invoke(prompt).strip()
    
    return llm_response

In [None]:
#test
print(graph_retriever("What athletics events are happening?"))
#possibly implement: LLM should be able to modify Cypher query if necessary?*****

driver.close()