In [11]:
import os
import time

import mysql.connector
import pandas as pd
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from mysql.connector import Error
from pinecone import Pinecone, ServerlessSpec
load_dotenv()

True

In [12]:
# Placeholder for the incidents DataFrame; a later cell assigns the query result.
df = None


def get_connection():
    """Open a MySQL connection using credentials from environment variables.

    Reads DB_HOST, DB_USERNAME, DB_PASSWORD and DB_DATABASE with os.getenv
    and passes them to mysql.connector.connect.

    Returns:
        The connection object when it reports itself as connected, otherwise
        None. A mysql.connector Error is printed and results in None rather
        than being raised, so callers must check for None.
    """
    try:
        connection = mysql.connector.connect(
            host=os.getenv("DB_HOST"),
            user=os.getenv("DB_USERNAME"),
            password=os.getenv("DB_PASSWORD"),
            database=os.getenv("DB_DATABASE"),
        )

        if connection.is_connected():
            return connection

        # connect() returned a handle that is not usable: release it instead
        # of leaking it, then fall through to the explicit None below.
        connection.close()

    except Error as e:
        print(f"Error: {e}")

    return None

In [13]:
# (column, singular phrase, plural phrase) for each count that is mentioned
# only when it is present and greater than zero. Order is the output order.
_COUNT_FIELDS = (
    ("cnt_fatal_injury", "fatal injury", "fatal injuries"),
    ("cnt_sus_serious_injury", "suspected serious injury", "suspected serious injuries"),
    ("cnt_sus_minor_injury", "suspected minor injury", "suspected minor injuries"),
    ("cnt_pedestrian", "pedestrian involved", "pedestrians involved"),
    ("cnt_cyclist", "cyclist involved", "cyclists involved"),
)

# (label, column) pairs rendered as "<label>: <value>" when the value is present.
# Order is the output order.
_LABELLED_FIELDS = (
    ("Location", "incident_location"),
    ("Light condition", "light_condition"),
    ("Weather", "weather_condition"),
    ("Road surface", "road_surface"),
    ("Traffic control", "traffic_control_device_type"),
    ("Intersection type", "roadway_intersection_type"),
    ("Trafficway type", "trafficway_type"),
    ("Collision manner", "collision_manner"),
    ("Harmful event location", "harmful_event_location"),
)


def _format_count(value):
    """Render a count without a trailing '.0' when it is a whole number."""
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        return str(value)
    # Columns containing NULLs are loaded as floats, so 2 arrives as 2.0.
    return str(int(as_float)) if as_float.is_integer() else str(value)


def create_incident_description(row):
    """
    Creates a descriptive text representation of an incident for vector embedding.
    Excludes incident_id, crash_num, and incident_date as they go in metadata.

    Args:
        row: Mapping-like record (e.g. a DataFrame row or a dict) indexable by
            the column names used below. A missing column raises KeyError.

    Returns:
        A single string of sentences joined by ". " and terminated with ".".
        Null/NaN fields are skipped; counts are included only when > 0, and
        whole-number counts are written as integers ("2", not "2.0").
        If every field is null the result is just ".".
    """
    parts = []

    # First Harmful Event
    if pd.notna(row['first_harmful_event']):
        parts.append(f"First harmful event: {row['first_harmful_event']}")

    # Work Zone: any non-null value other than 'Y' is reported as "did not occur".
    if pd.notna(row['is_work_zone']):
        work_zone_text = "occurred in a work zone" if row['is_work_zone'] == 'Y' else "did not occur in a work zone"
        parts.append(f"Incident {work_zone_text}")

    # Injury, pedestrian and cyclist counts
    for column, singular, plural in _COUNT_FIELDS:
        count = row[column]
        if pd.notna(count) and count > 0:
            parts.append(f"{_format_count(count)} {singular if count == 1 else plural}")

    # Hit and Run: mentioned only when flagged 'Y'.
    if pd.notna(row['is_hit_and_run']) and row['is_hit_and_run'] == 'Y':
        parts.append("Hit and run incident")

    # Location and the remaining categorical attributes
    for label, column in _LABELLED_FIELDS:
        if pd.notna(row[column]):
            parts.append(f"{label}: {row[column]}")

    # Join all parts with periods and spaces
    return ". ".join(parts) + "."

In [14]:
# Load every row of incidents_view into a DataFrame.
connection = get_connection()
if connection is None:
    # get_connection() prints the driver error and returns None on failure;
    # stop here with a clear message instead of failing inside pandas.
    raise RuntimeError("Could not connect to the database; check the DB_* environment variables")

query = "select * from incidents_view;"
try:
    # NOTE(review): pandas documents SQLAlchemy connectables (or sqlite3) for
    # read_sql and may emit a UserWarning for a raw mysql.connector connection.
    df = pd.read_sql(query, connection)
finally:
    # The data is fully materialised in df, so the connection can be released.
    connection.close()


  df = pd.read_sql(query, connection)


In [15]:
def _incident_record(incident_row):
    """Bundle one row's embedding text with the identifiers kept as metadata."""
    return {
        "content": create_incident_description(incident_row),
        "crash_num": incident_row['crash_num'],
        "incident_id": incident_row['incident_id'],
        # Stored as a string so the date travels as plain text.
        "incident_date": str(incident_row['incident_date']),
    }


# One record per row of df, in row order.
incidents = [_incident_record(incident_row) for _, incident_row in df.iterrows()]

In [21]:
# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = os.getenv("PINECONE_INDEX_NAME")
if not index_name:
    # Without this check a missing variable surfaces later as an opaque API error.
    raise RuntimeError("PINECONE_INDEX_NAME is not set")

# Create index if it doesn't exist
index_created = index_name not in pc.list_indexes().names()
if index_created:
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI embedding dimension
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    # Index creation is asynchronous: wait until it reports ready before
    # any data-plane call is made against it.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)
if index_created:
    # A brand-new index holds no vectors, so there is nothing to clear.
    print("Created new index; no existing vectors to delete")
else:
    # Delete all existing vectors from the index so re-runs start from empty.
    index.delete(delete_all=True)
    print("Deleted all existing vectors from index")


Deleted all existing vectors from index


In [22]:
# Wrap each incident record as a LangChain Document: the description text is
# what gets embedded; the identifiers and date ride along as metadata.
documents = [
    Document(
        page_content=incident["content"],
        metadata={
            "crash_num": incident["crash_num"],
            "incident_id": incident["incident_id"],
            "incident_date": incident["incident_date"]
        }
    )
    for incident in incidents
]

# Initialize embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Attach to the index once, up front, so vector_store is always defined
# (even when there are no documents) and every batch takes the same path.
vector_store = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

# Batch indexing: upload in slices of batch_size documents.
batch_size = 100
for start in range(0, len(documents), batch_size):
    vector_store.add_documents(documents[start:start + batch_size])

print(f"Indexed {len(documents)} incidents successfully!")

Indexed 1000 incidents successfully!


In [28]:
# Spot-check: display the description text generated for the first document.
documents[0].page_content

'First harmful event: COLLISION WITH MOTOR VEHICLE IN TRAFFIC. Incident did not occur in a work zone. Location: 165 MIDDLESEX AV Somerville, MA. Light condition: DAYLIGHT. Weather: CLEAR. Road surface: DRY. Traffic control: NO CONTROLS. Intersection type: NOT AT INTERSECTION. Trafficway type: UNKNOWN. Collision manner: ANGLE. Harmful event location: UNKNOWN.'

In [39]:

# Example: look up the single closest incident to a free-text query.
# The match is by embedding similarity, so it need not contain the query text verbatim.
query = "313 BROADWAY Somerville, MA"
results = vector_store.similarity_search(query, k=1)

for match in results:
    matched_incident_id = match.metadata['incident_id']
    print(f"Incident ID: {matched_incident_id}")
    print(f"Description: {match.page_content}\n")

Incident ID: 517.0
Description: First harmful event: COLLISION WITH MOTOR VEHICLE IN TRAFFIC. Incident did not occur in a work zone. Location: 448 BROADWAY Somerville, MA. Light condition: DAYLIGHT. Weather: CLEAR. Road surface: DRY. Traffic control: NO CONTROLS. Intersection type: NOT AT INTERSECTION. Trafficway type: TWO-WAY, NOT DIVIDED. Collision manner: REAR-END. Harmful event location: ROADWAY.

