### **Imports**

In [49]:
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
print("✅ Demo tools loaded.")

✅ Demo tools loaded.


### **Model**

In [50]:
# Initialize the open-source embedding model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

print(f"✅ Model '{model_name}' is loaded and ready.")

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 4199cf0f-7dab-4875-b2e8-7d3303dff1f5)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


✅ Model 'sentence-transformers/all-MiniLM-L6-v2' is loaded and ready.


## **Text-to-Vector**
*(Main component of RAG)*

Here is a plain English question being converted to numeric vector.

**"Tell me about an accident in Virginia involving a person over 50"**

In [51]:
question = "Tell me about an accident in Virginia involving a person over 50"
# question = "How many accidents in Virginia involved alcohol?"
vector = embeddings.embed_query(question)

print("--- Your Question ---")
print(f"'{question}'")

print("\n--- Becomes a 'Vector' (a list of numbers) ---")
print(np.array(vector))
print(f"\nTotal length of the vector: {len(vector)}")


print("\n--- Becomes a 'Vector' (a list of numbers) ---")
print("\nFirst 5 numbers of the vector:")
print(np.array(vector)[:5])


--- Your Question ---
'Tell me about an accident in Virginia involving a person over 50'

--- Becomes a 'Vector' (a list of numbers) ---
[ 8.40764958e-03  1.07515968e-01  5.41817248e-02  1.56952580e-03
  2.25946419e-02  7.25160092e-02 -8.43414851e-03  6.84567317e-02
 -9.00117531e-02  8.87434781e-02  1.03949726e-01  2.02046685e-05
  1.95920523e-02 -3.52662429e-02 -1.09598853e-01 -3.74296606e-02
  4.23778445e-02  4.61610444e-02 -1.03466004e-01  8.75970721e-02
 -2.14903820e-02  3.62427831e-02 -4.98572364e-02  1.70407549e-03
 -5.71466871e-02  1.52597334e-02 -3.12855430e-02  6.81994110e-02
  1.38328727e-02  1.83666814e-02  1.59278810e-02 -6.98305368e-02
 -1.05431303e-02 -1.96186197e-03 -5.20454273e-02 -8.94385502e-02
 -1.05323121e-02  7.47971684e-02 -2.22150367e-02 -1.07342061e-02
 -4.49027270e-02 -4.61861789e-02  4.17546216e-05 -2.18059681e-02
 -2.42836233e-02  2.81966683e-02 -2.26283241e-02 -8.78868718e-03
  8.10659304e-02  5.44087263e-03 -4.63340208e-02  3.15337144e-02
 -1.60059631e-02 -

#### **Now, let's create two other sentences to compare**

In [52]:
text_similar = "What is the number of drunk driving crashes in Virginia?"
text_dissimilar = "What's the weather like in California?"

vector_similar = embeddings.embed_query(text_similar)
vector_dissimilar = embeddings.embed_query(text_dissimilar)


print("✅ Embedded two more sentences to compare.")

✅ Embedded two more sentences to compare.


#### Now, let's use **'cosine similarity'** to see how **"close"** they are.

#### A score of **1.0** is a perfect match.

In [53]:
# We need to reshape the vectors for the function
v_question = np.array(vector).reshape(1, -1)
v_similar = np.array(vector_similar).reshape(1, -1)

# Using cosine similarity to compare
v_dissimilar = np.array(vector_dissimilar).reshape(1, -1)
sim_similar = cosine_similarity(v_question, v_similar)[0][0]
sim_dissimilar = cosine_similarity(v_question, v_dissimilar)[0][0]


print("--- Similarity Results ---")
print(f"Similarity to 'drunk driving': {sim_similar:.4f}")
print(f"Similarity to 'California weather': {sim_dissimilar:.4f}")

--- Similarity Results ---
Similarity to 'drunk driving': 0.5965
Similarity to 'California weather': 0.1689


## **RAG Implementation**

### Imports

In [54]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = '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'
os.environ['OPENAI_API_KEY'] = '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'

import bs4
import sqlalchemy
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama 
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub

print("✅ RAG tools loaded.")

✅ RAG tools loaded.


## **Connecting to FARS Database**
#### *(Fatality Analysis Reporting System)*


Here the local **SQL databse** is being connected. 


Running a **SQL query** to joins all three tables *(accident, person, vehicle)*.

In [55]:
# --- 1. CONNECT TO YOUR LOCAL MYSQL DATABASE ---
db_uri = "mysql+pymysql://root:NewStrongPass!123@localhost:3306/fars"
engine = sqlalchemy.create_engine(db_uri)

# --- 2. FETCH DATA & SERIALIZE ---
documents_to_index = []

# Helper maps for state names, etc.
print("Connecting to database...")
with engine.connect() as connection:
    # query JOINS your three tables to get rich data for each accident
    query = sqlalchemy.text("""
        SELECT 
            a.ST_CASE, a.YEAR, a.STATE, a.MONTH, a.PERSONS, a.VE_FORMS,
            p.AGE, p.SEX, p.PER_TYP,
            v.MAKE, v.MODEL
        FROM 
            accident_master a
        LEFT JOIN 
            person_master p ON a.ST_CASE = p.ST_CASE
        LEFT JOIN 
            vehicle_master v ON a.ST_CASE = v.ST_CASE
        LIMIT 5000; 
    """)
    
    result = connection.execute(query)
    
    for row in result:
        
        # 1. Create the text snippet (page_content)
        content_snippet = (
            f"Accident Case {row.ST_CASE} in {row.YEAR} involved "
            f"{row.PERSONS} persons and {row.VE_FORMS} vehicles. "
            f"Details include: Person (Age: {row.AGE}, Sex: {row.SEX}), "
            f"Vehicle (Make: {row.MAKE}, Model: {row.MODEL})."
        )
        # 2. Create the metadata (for 100% traceability)
        metadata = {
            "source_table": "accident_master",
            "ST_CASE": row.ST_CASE,
            "YEAR": row.YEAR,
        }
        
        doc = Document(page_content=content_snippet, metadata=metadata)
        documents_to_index.append(doc)

print(f"Created {len(documents_to_index)} Documents from MySQL.")

Connecting to database...
Created 5000 Documents from MySQL.


## **Indexing: Creating the Vector Store**

This cell is were vectorization happens. This is the "R" (Retrieval) component.

This code takes the list of **5,000** text Documents we created in the last step and builds our **"knowledge base".**

It loads the HuggingFaceEmbeddings model. This is our "vectorizer"—a specialized tool that reads text and converts its semantic meaning into a list of numbers (a vector).

Build Vector Store: It uses Chroma.from_documents to:
- Run all 5,000 documents through the embedding model.
- Store the resulting vectors in a new Chroma database.
- Persist that database to our local disk (./capstone_chroma_db), so we don't have to re-do this process every time.

We now have a highly efficient, searchable "memory" of our FARS data.

In [56]:
# 1. Initialize an open-source embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Build and persist the vector store, runs model through 5000 documents 
vectorstore = Chroma.from_documents(
    documents=documents_to_index, 
    embedding=embeddings,
    persist_directory="./capstone_chroma_db" # folder where it will be saved
)


print("--- SUCCESSFULLY LOADED ---")
print(f"Vector store created at './capstone_chroma_db'")
print(f"Total documents indexed: {vectorstore._collection.count()}")

--- SUCCESSFULLY LOADED ---
Vector store created at './capstone_chroma_db'
Total documents indexed: 30000


## **Retrieval**

In [59]:
# Load the persisted vector store from disk
print("Loading vector store from disk...")
vectordb = Chroma(
    persist_directory="./capstone_chroma_db", 
    embedding_function=embeddings
)

# Creating the retriever
retriever = vectordb.as_retriever(search_kwargs={"k": 5}) # 'k=5' finds the top 5 snippets

print("Retriever created.")

# Testing the retriever
print("\n--- Retriever Test ---")
test_docs = retriever.invoke("Accidents in Virginia")
print(f"Found {len(test_docs)} relevant docs for 'Accidents in Virginia'")
print(f"Top result: {test_docs[0].page_content}")

Loading vector store from disk...
Retriever created.

--- Retriever Test ---
Found 5 relevant docs for 'Accidents in Virginia'
Top result: Accident Case 40208 in 75 involved 4 persons and 2 vehicles. Details include: Person (Age: 22, Sex: 2), Vehicle (Make: 12, Model: 0).


## **RAG Chain (Prompt, LLM, and Chain)**


In [60]:
# 1. Get the RAG prompt from the hub
prompt = hub.pull("rlm/rag-prompt")

# 2. Initialize an open-source LLM via Ollama
llm = ChatOllama(model="llama3")

# 3. Create the RAG chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain created successfully.")

RAG chain created successfully.


## **(Ask a Question)**

In [62]:

question = "Tell me about an accident in Virginia involving a person over 50"
# question = "Tell me about an accident in Virginia that involved animals"
# question = "Tell me about an accident in Virginia involved alcohol?"

# .stream() gives you the answer as it's being generated
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)

According to the provided context, there was an accident in Virginia involving a person over 50. The person involved was a 67-year-old male who was driving a vehicle with a make and model both listed as "69". Two vehicles were involved in the accident, and two people were affected.