In [1]:
!pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain
!pip install beautifulsoup4 sqlalchemy pymysql langchain-text-splitters sentence-transformers



### LangSmith

In [1]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

### API Keys

In [2]:
os.environ['LANGCHAIN_API_KEY'] = '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'
os.environ['OPENAI_API_KEY'] = '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'

## RAG Implementation

### Imports

In [None]:
import bs4
import sqlalchemy
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama 
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_hub import hub

ModuleNotFoundError: No module named 'langchain_hub'

In [None]:
import sys
print(sys.executable)

/opt/anaconda3/envs/fars_env/bin/python


## SQL Databse

In [None]:
# --- 1. CONNECT TO YOUR LOCAL MYSQL DATABASE ---
db_uri = "mysql+pymysql://root:NewStrongPass!123@localhost:3306/fars"
engine = sqlalchemy.create_engine(db_uri)

# --- 2. FETCH DATA & SERIALIZE ---
documents_to_index = []

# Helper maps for state names, etc.
print("Connecting to database...")
with engine.connect() as connection:
    # query JOINS your three tables to get rich data for each accident
    query = sqlalchemy.text("""
        SELECT 
            a.ST_CASE, a.YEAR, a.STATE, a.MONTH, a.PERSONS, a.VE_FORMS,
            p.AGE, p.SEX, p.PER_TYP,
            v.MAKE, v.MODEL
        FROM 
            accident_master a
        LEFT JOIN 
            person_master p ON a.ST_CASE = p.ST_CASE
        LEFT JOIN 
            vehicle_master v ON a.ST_CASE = v.ST_CASE
        LIMIT 5000; 
    """)
    
    result = connection.execute(query)
    
    for row in result:
        # 1. Create the text snippet (page_content)
        content_snippet = (
            f"Accident Case {row.ST_CASE} in {row.YEAR} involved "
            f"{row.PERSONS} persons and {row.VE_FORMS} vehicles. "
            f"Details include: Person (Age: {row.AGE}, Sex: {row.SEX}), "
            f"Vehicle (Make: {row.MAKE}, Model: {row.MODEL})."
        )
        # 2. Create the metadata (for 100% traceability)
        metadata = {
            "source_table": "accident_master",
            "ST_CASE": row.ST_CASE,
            "YEAR": row.YEAR,
        }
        
        doc = Document(page_content=content_snippet, metadata=metadata)
        documents_to_index.append(doc)

print(f"Created {len(documents_to_index)} Documents from MySQL.")

Connecting to database...
Created 5000 Documents from MySQL.


## Indexing (Embed and Store)

In [None]:
# 1. Initialize an open-source embedding model
print("Loading embedding model (this may take a moment)...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("Embedding model loaded.")

# 2. Build and persist the vector store
# runs all 5000 documents through model
print("Building and persisting vector store...")
vectorstore = Chroma.from_documents(
    documents=documents_to_index, 
    embedding=embeddings,
    persist_directory="./capstone_chroma_db" # folder where it will be saved
)

print("--- SUCCESS ---")
print(f"Vector store created at './capstone_chroma_db'")
print(f"Total documents indexed: {vectorstore._collection.count()}")

Loading embedding model (this may take a moment)...


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Embedding model loaded.
Building and persisting vector store...


NameError: name 'documents_to_index' is not defined

## Retrieval

In [None]:
# Load the persisted vector store from disk
print("Loading vector store from disk...")
vectordb = Chroma(
    persist_directory="./capstone_chroma_db", 
    embedding_function=embeddings
)

# Creating the retriever
retriever = vectordb.as_retriever(search_kwargs={"k": 5}) # 'k=5' finds the top 5 snippets

print("Retriever created.")

# Testing the retriever
print("\n--- Retriever Test ---")
test_docs = retriever.invoke("Accidents in Virginia")
print(f"Found {len(test_docs)} relevant docs for 'Accidents in Virginia'")
print(f"Top result: {test_docs[0].page_content}")

Loading vector store from disk...


  vectordb = Chroma(


Retriever created.

--- Retriever Test ---
Found 0 relevant docs for 'Accidents in Virginia'


IndexError: list index out of range

## RAG Chain (Prompt, LLM, and Chain)


In [None]:
# 1. Get the RAG prompt from the hub
# needs 'langchain_hub'
prompt = hub.pull("rlm/rag-prompt")

# 2. Initialize an open-source LLM via Ollama
llm = ChatOllama(model="llama3")

# 3. Create the RAG chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain created successfully.")

NameError: name 'hub' is not defined

## Invoke the Chain (Ask a Question)

In [5]:
question = "Tell me about an accident in Virginia involving a person over 50"

# .stream() gives you the answer as it's being generated
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)

NameError: name 'rag_chain' is not defined