In [None]:
%%capture
%pip install langchain-huggingface langchain_community annoy pypdf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
# Evaluation questions that are run through every RAG variant in this notebook.
query1 = "What percentage of total crashes in Arizona were fatal in 2023?"
query2 = "How many crashes in Arizona in 2023 were alcohol-related, and what portion of them were fatal?"
query3 = "What was the peak hour for all crashes in Arizona in 2023, and how does it differ from the peak hour for fatal crashes?"
# Typo fixed: the question previously read "where there".
query4 = "How many fatalities in crashes were there in Arizona in 1989?"
query5 = "From 2011 to 2023 which year had the least fatality in Arizona?"
query6 = "How much percentage change was there in Rural Fatalities between 2022 and 2023?"
query7 = "What is the economic cost of motor vehicle crashes for Arizona in 2023?"
# NOTE(review): query8 is identical to query6 — presumably a different question was intended; confirm.
query8 = "How much percentage change was there in Rural Fatalities between 2022 and 2023?"
query9 = "How many licensed drivers were there in 2017?"
query10 = "How many total registered vehicles were there in 2018?"
query11 = "How much is the estimated motor vehicle miles traveled in 2023?"
query12 = "How much is the Arizona fatality rate in 2023?"
query13 = "Which county had the highest economic loss?"
query14 = "What are the States Adjusted crash costs for property damage only?"
# NOTE(review): the wording of query15 looks garbled — confirm the intended question before changing it.
query15 = "How many motors vehicle crashes were victims aged 25-34?"
query16 = "How many suspected serious injuries were there for bicyclists in 2022?"
query17 = "How many fatal crashes were there on the Labor Day weekend in 2023?"

# Method 1 (Directly using the PDF tables as extracted text)

## Reading the PDF

In [2]:
from pypdf import PdfReader

In [3]:
reader = PdfReader("data/2023-Crash-Facts_0.pdf")

In [4]:
# Extract the text of every page and join the pages with a newline, so that the
# last line of one page is not fused with the first line of the next page.
all_text = "\n".join(page.extract_text() for page in reader.pages)

In [5]:
# Write the extracted PDF text to disk, creating the output folder on first use
import os

os.makedirs("output", exist_ok=True)
with open("output/2023-Crash-Facts.txt", mode="w") as out_file:
    out_file.write(all_text)

## Indexing

In [6]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("output/2023-Crash-Facts.txt")
text = loader.load()

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks that are embedded later on.
# NOTE(review): chunk_size / chunk_overlap are presumably counted in characters — confirm
# against the splitter's length function.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(text)

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Annoy

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1"
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
vectorstore = Annoy.from_documents(all_splits, embedding=embeddings)

In [10]:
vectorstore.save_local("annoy_vector_store(method_1)")

In [11]:
semantic_retriever = vectorstore.as_retriever()

## RAG

In [12]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

In [13]:
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

In [14]:
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)



In [15]:
# RAG pipeline: the incoming question goes to the retriever, whose documents are
# flattened by format_docs into the "context" value, while the question itself is
# passed through unchanged under "question". That dict is piped into the prompt,
# then the LLM, then StrOutputParser.
rag_chain = (
    {"context": semantic_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
# Run every evaluation question through the chain, in order, and keep one
# numbered answer per question.
(
    output1, output2, output3, output4, output5, output6,
    output7, output8, output9, output10, output11, output12,
    output13, output14, output15, output16, output17,
) = [
    rag_chain.invoke(question)
    for question in (
        query1, query2, query3, query4, query5, query6,
        query7, query8, query9, query10, query11, query12,
        query13, query14, query15, query16, query17,
    )
]

In [17]:
# Print each question next to the answer the chain produced for it.
for number, (question, answer) in enumerate(
    zip(
        (
            query1, query2, query3, query4, query5, query6,
            query7, query8, query9, query10, query11, query12,
            query13, query14, query15, query16, query17,
        ),
        (
            output1, output2, output3, output4, output5, output6,
            output7, output8, output9, output10, output11, output12,
            output13, output14, output15, output16, output17,
        ),
    ),
    start=1,
):
    print(f"Query {number}: {question}\nAnswer: {answer}\n")

Query 1: What percentage of total crashes in Arizona were fatal in 2023?
Answer: In 2023, there were 1,307 traffic fatalities in Arizona. Given that the total number of crashes was 122,247, the percentage of total crashes that were fatal is approximately 1.07%.

Query 2: How many crashes in Arizona in 2023 were alcohol-related, and what portion of them were fatal?
Answer: In 2023, there were 5,761 alcohol-related crashes in Arizona. Out of these, 308 were fatal, which means approximately 5.35% of alcohol-related crashes resulted in fatalities.

Query 3: What was the peak hour for all crashes in Arizona in 2023, and how does it differ from the peak hour for fatal crashes?
Answer: The peak hour for all crashes in Arizona in 2023 was from 3 PM to 4 PM. In contrast, the peak hour for fatal crashes occurred between 6 PM and 7 PM. This indicates that while overall crashes peaked earlier in the afternoon, fatal crashes were more concentrated in the evening.

Query 4: How many fatalities in cr

# Method 2 (Separating tables using Document Intelligence and providing the tables as-is)

In [2]:
with open("output_document_intelligence/cleaned_output.md", "r") as file:
    cleaned_output = file.read()

In [3]:
# Split the data into tables and text
tables = []
text = []
over_all = []

In [4]:
# Split the data by <table> tags
parts = cleaned_output.split("<table>")

In [5]:
# Sort the "<table>"-delimited chunks into tables and free text.
# The accumulators are rebuilt here so that re-running this cell does not append
# the same content a second time.
tables, text, over_all = [], [], []

for part in parts:
    if "</table>" in part:
        # Split once at the first closing tag: the left side is the table body,
        # the right side is ordinary text. Splitting only once keeps all trailing
        # text even if the chunk contains a stray extra "</table>".
        table_content, remaining_text = part.split("</table>", 1)
        table_markup = f"<table>{table_content}</table>"
        tables.append(table_markup)
        over_all.append(table_markup)
        # Add any text after the table
        remaining_text = remaining_text.strip()
        if remaining_text:
            text.append(remaining_text)
            over_all.append(remaining_text)
    else:
        # Add text that is not part of a table
        plain_text = part.strip()
        if plain_text:
            text.append(plain_text)
            over_all.append(plain_text)

In [6]:
# Report how many items ended up in each bucket
for label, bucket in (("Tables", tables), ("Text", text), ("Overall", over_all)):
    print(f"{label}: {len(bucket)}")

Tables: 96
Text: 52
Overall: 148


In [7]:
# Interleave tables and text by position: tables[0], text[0], tables[1], text[1], ...
# Once the shorter list runs out, the rest of the longer one follows in order.
combined_data = []
for position in range(max(len(tables), len(text))):
    # A one-element slice is empty past the end of a list, so no bounds check is needed
    combined_data += tables[position:position + 1]
    combined_data += text[position:position + 1]

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Annoy

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1"
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
vectorstore = Annoy.from_texts(combined_data, embedding=embeddings)

In [10]:
vectorstore.save_local("annoy_vector_store(method_2)")

In [11]:
semantic_retriever = vectorstore.as_retriever()

## RAG

In [12]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

In [13]:
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

In [14]:
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Return the documents' ``page_content`` values joined by blank lines."""
    pieces = []
    for document in docs:
        pieces.append(document.page_content)
    return "\n\n".join(pieces)



In [15]:
# RAG pipeline: the incoming question goes to the retriever, whose documents are
# flattened by format_docs into the "context" value, while the question itself is
# passed through unchanged under "question". That dict is piped into the prompt,
# then the LLM, then StrOutputParser.
rag_chain = (
    {"context": semantic_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
# Run every evaluation question through the chain, in order, and keep one
# numbered answer per question.
(
    output1, output2, output3, output4, output5, output6,
    output7, output8, output9, output10, output11, output12,
    output13, output14, output15, output16, output17,
) = [
    rag_chain.invoke(question)
    for question in (
        query1, query2, query3, query4, query5, query6,
        query7, query8, query9, query10, query11, query12,
        query13, query14, query15, query16, query17,
    )
]

In [17]:
# Print each question next to the answer the chain produced for it.
for number, (question, answer) in enumerate(
    zip(
        (
            query1, query2, query3, query4, query5, query6,
            query7, query8, query9, query10, query11, query12,
            query13, query14, query15, query16, query17,
        ),
        (
            output1, output2, output3, output4, output5, output6,
            output7, output8, output9, output10, output11, output12,
            output13, output14, output15, output16, output17,
        ),
    ),
    start=1,
):
    print(f"Query {number}: {question}\nAnswer: {answer}\n")

Query 1: What percentage of total crashes in Arizona were fatal in 2023?
Answer: In 2023, fatal crashes in Arizona accounted for 0.98% of all crashes. There were a total of 1,197 fatal crashes reported.

Query 2: How many crashes in Arizona in 2023 were alcohol-related, and what portion of them were fatal?
Answer: In 2023, alcohol-related crashes accounted for 4.71% of all crashes in Arizona, and they represented 25.73% of all fatal crashes. The specific number of alcohol-related crashes and fatalities is not provided in the context. Therefore, I don't know the exact figures for crashes and fatalities.

Query 3: What was the peak hour for all crashes in Arizona in 2023, and how does it differ from the peak hour for fatal crashes?
Answer: The retrieved context does not specify the peak hour for all crashes in Arizona in 2023 or how it differs from the peak hour for fatal crashes. Therefore, I don't know the answer.

Query 4: How many fatalities in crashes where there in Arizona in 1989?

# Method 3 (Summarizing each table using an LLM and providing the summary paragraph along with the table)

## Preprocessing the data

### Reading the cleaned output

In [2]:
import re

In [3]:
with open("output_document_intelligence/cleaned_output.md", "r") as file:
    cleaned_output = file.read()

In [4]:
# Split the data into tables and text
tables = []
text = []
over_all = []

In [5]:
# Split the data by <table> tags
parts = cleaned_output.split("<table>")

In [6]:
# Sort the "<table>"-delimited chunks into tables and free text.
# The accumulators are rebuilt here so that re-running this cell does not append
# the same content a second time.
tables, text, over_all = [], [], []

for part in parts:
    if "</table>" in part:
        # Split once at the first closing tag: the left side is the table body,
        # the right side is ordinary text. Splitting only once keeps all trailing
        # text even if the chunk contains a stray extra "</table>".
        table_content, remaining_text = part.split("</table>", 1)
        table_markup = f"<table>{table_content}</table>"
        tables.append(table_markup)
        over_all.append(table_markup)
        # Add any text after the table
        remaining_text = remaining_text.strip()
        if remaining_text:
            text.append(remaining_text)
            over_all.append(remaining_text)
    else:
        # Add text that is not part of a table
        plain_text = part.strip()
        if plain_text:
            text.append(plain_text)
            over_all.append(plain_text)

In [7]:
# Report how many items ended up in each bucket
for label, bucket in (("Tables", tables), ("Text", text), ("Overall", over_all)):
    print(f"{label}: {len(bucket)}")

Tables: 96
Text: 52
Overall: 148


### Reading the summarized table file

In [8]:
with open("output_document_intelligence/summarized_tables_prompt2.md", "r") as file:
    summarized_table = file.read()

In [9]:
# Regex that pulls one summary out of the summarized-tables file.
# A summary starts after a "Table <number>:" header and a blank line; the lazy
# group captures everything up to the blank line that precedes either the next
# "Table <number>:" header or a run of nine dots.
# NOTE(review): a final summary that is followed by neither terminator is not
# matched — confirm the file always ends with the dotted line.
pattern = r"Table \d+:\n\n(.*?)\n\n(?=Table \d+:|\.\.\.\.\.\.\.\.\.)"

In [10]:
# Extract tables
tables_summary_cleaned = re.findall(pattern, summarized_table, re.DOTALL)

### Merging the tables with their summaries

In [11]:
table_summary_mapping = {}

In [12]:
# Pair each table with its summary by position.
import warnings

if len(tables) != len(tables_summary_cleaned):
    # zip() silently stops at the shorter input, so a count mismatch would drop
    # items (and may mean the pairing is shifted) without any visible error.
    warnings.warn(
        f"Found {len(tables)} tables but {len(tables_summary_cleaned)} summaries; "
        "unpaired items are ignored and the pairing may be misaligned."
    )

for table, summary in zip(tables, tables_summary_cleaned):
    table_summary_mapping[table] = summary.strip()

## Indexing

In [13]:
# One string per table for indexing: the table markup followed by its summary.
table_list = [
    f"Table: {table}\nSummary: {summary}"
    for table, summary in table_summary_mapping.items()
]

In [14]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Annoy

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1"
)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
vectorstore = Annoy.from_texts(table_list, embedding=embeddings)

In [16]:
vectorstore.save_local("annoy_vector_store(method_3)")

In [25]:
semantic_retriever = vectorstore.as_retriever()

## RAG

In [26]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

In [27]:
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

In [28]:
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join every document's ``page_content`` into one string, blank-line separated."""
    separator = "\n\n"
    return separator.join(map(lambda document: document.page_content, docs))



In [29]:
# RAG pipeline: the incoming question goes to the retriever, whose documents are
# flattened by format_docs into the "context" value, while the question itself is
# passed through unchanged under "question". That dict is piped into the prompt,
# then the LLM, then StrOutputParser.
rag_chain = (
    {"context": semantic_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [30]:
# Run every evaluation question through the chain, in order, and keep one
# numbered answer per question.
(
    output1, output2, output3, output4, output5, output6,
    output7, output8, output9, output10, output11, output12,
    output13, output14, output15, output16, output17,
) = [
    rag_chain.invoke(question)
    for question in (
        query1, query2, query3, query4, query5, query6,
        query7, query8, query9, query10, query11, query12,
        query13, query14, query15, query16, query17,
    )
]

In [31]:
# Print each question next to the answer the chain produced for it.
for number, (question, answer) in enumerate(
    zip(
        (
            query1, query2, query3, query4, query5, query6,
            query7, query8, query9, query10, query11, query12,
            query13, query14, query15, query16, query17,
        ),
        (
            output1, output2, output3, output4, output5, output6,
            output7, output8, output9, output10, output11, output12,
            output13, output14, output15, output16, output17,
        ),
    ),
    start=1,
):
    print(f"Query {number}: {question}\nAnswer: {answer}\n")

Query 1: What percentage of total crashes in Arizona were fatal in 2023?
Answer: In 2023, 1,197 out of a total of 122,247 crashes in Arizona were fatal. This means that approximately 0.98% of all crashes were fatal.

Query 2: How many crashes in Arizona in 2023 were alcohol-related, and what portion of them were fatal?
Answer: In 2023, there were 5,761 alcohol-related crashes in Arizona. Out of these, 308 were fatal, which means approximately 5.34% of alcohol-related crashes resulted in fatalities.

Query 3: What was the peak hour for all crashes in Arizona in 2023, and how does it differ from the peak hour for fatal crashes?
Answer: The peak hour for all crashes in Arizona in 2023 was from 3 PM to 4 PM. In contrast, the peak hour for fatal crashes occurred from 6 PM to 7 PM. This indicates that the highest volume of crashes does not coincide with the highest number of fatal incidents.

Query 4: How many fatalities in crashes where there in Arizona in 1989?
Answer: In Arizona in 1989, 

# Method 4 (Sending the summarized paragraph only)

## Preprocessing the data

### Reading the cleaned output

In [2]:
import re

In [3]:
with open("output_document_intelligence/cleaned_output.md", "r") as file:
    cleaned_output = file.read()

In [4]:
# Split the data into tables and text
tables = []
text = []
over_all = []

In [5]:
# Split the data by <table> tags
parts = cleaned_output.split("<table>")

In [6]:
# Sort the "<table>"-delimited chunks into tables and free text.
# The accumulators are rebuilt here so that re-running this cell does not append
# the same content a second time.
tables, text, over_all = [], [], []

for part in parts:
    if "</table>" in part:
        # Split once at the first closing tag: the left side is the table body,
        # the right side is ordinary text. Splitting only once keeps all trailing
        # text even if the chunk contains a stray extra "</table>".
        table_content, remaining_text = part.split("</table>", 1)
        table_markup = f"<table>{table_content}</table>"
        tables.append(table_markup)
        over_all.append(table_markup)
        # Add any text after the table
        remaining_text = remaining_text.strip()
        if remaining_text:
            text.append(remaining_text)
            over_all.append(remaining_text)
    else:
        # Add text that is not part of a table
        plain_text = part.strip()
        if plain_text:
            text.append(plain_text)
            over_all.append(plain_text)

In [7]:
# Report how many items ended up in each bucket
for label, bucket in (("Tables", tables), ("Text", text), ("Overall", over_all)):
    print(f"{label}: {len(bucket)}")

Tables: 96
Text: 52
Overall: 148


### Reading the summarized table file

In [8]:
with open("output_document_intelligence/summarized_tables_prompt2.md", "r") as file:
    summarized_table = file.read()

In [9]:
# Regex that pulls one summary out of the summarized-tables file.
# A summary starts after a "Table <number>:" header and a blank line; the lazy
# group captures everything up to the blank line that precedes either the next
# "Table <number>:" header or a run of nine dots.
# NOTE(review): a final summary that is followed by neither terminator is not
# matched — confirm the file always ends with the dotted line.
pattern = r"Table \d+:\n\n(.*?)\n\n(?=Table \d+:|\.\.\.\.\.\.\.\.\.)"

In [10]:
# Extract tables
tables_summary_cleaned = re.findall(pattern, summarized_table, re.DOTALL)

### Merging the tables with their summaries

In [11]:
table_summary_mapping = {}
summary_table_mapping = {}

In [12]:
# Pair each table with its summary by position and record the pair in both
# directions (table -> summary and summary -> table).
import warnings

if len(tables) != len(tables_summary_cleaned):
    # zip() silently stops at the shorter input, so a count mismatch would drop
    # items (and may mean the pairing is shifted) without any visible error.
    warnings.warn(
        f"Found {len(tables)} tables but {len(tables_summary_cleaned)} summaries; "
        "unpaired items are ignored and the pairing may be misaligned."
    )

for table, summary in zip(tables, tables_summary_cleaned):
    cleaned_summary = summary.strip()
    table_summary_mapping[table] = cleaned_summary
    # If two tables share an identical summary, the later table replaces the earlier one here.
    summary_table_mapping[cleaned_summary] = table

In [13]:
summaries = list(summary_table_mapping.keys())

In [14]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Annoy

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1"
)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
vectorstore = Annoy.from_texts(summaries, embedding=embeddings)

In [16]:
vectorstore.save_local("annoy_vector_store(method_4)")

In [42]:
semantic_retriever = vectorstore.as_retriever(search_type="mmr")

## RAG

In [43]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

In [44]:
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

In [45]:
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Build the prompt context from the retrieved summary documents.

    Each document's ``page_content`` is used as a key into
    ``summary_table_mapping`` and replaced by the table stored there; the
    resulting tables are joined with blank lines. A ``page_content`` that is
    not a key of the mapping raises ``KeyError``.
    """
    matched_tables = [summary_table_mapping[doc.page_content] for doc in docs]
    return "\n\n".join(matched_tables)



In [46]:
from langchain_core.runnables import RunnableParallel

# Generates the answer from a dict that already holds the retrieved documents
# under "sources" and the user question under "question".
answer_chain = (
    {
        "context": lambda inputs: format_docs(inputs["sources"]),
        "question": lambda inputs: inputs["question"],
    }
    | prompt
    | llm
    | StrOutputParser()
)

# The retriever runs once per question. Its documents are returned under
# "sources" and are the same ones the answer is generated from, so the reported
# sources cannot differ from the context the LLM saw. The result is a dict with
# the keys "sources", "question" and "answer".
rag_chain_with_source = RunnableParallel(
    {"sources": semantic_retriever, "question": RunnablePassthrough()}
).assign(answer=answer_chain)

In [47]:
# Run every evaluation question through the chain, in order, and keep one
# numbered result per question.
(
    output1, output2, output3, output4, output5, output6,
    output7, output8, output9, output10, output11, output12,
    output13, output14, output15, output16, output17,
) = [
    rag_chain_with_source.invoke(question)
    for question in (
        query1, query2, query3, query4, query5, query6,
        query7, query8, query9, query10, query11, query12,
        query13, query14, query15, query16, query17,
    )
]

In [48]:
def format_sources(sources):
    """Return the ``page_content`` of every source document, one per line."""
    lines = []
    for doc in sources:
        lines.append(f"{doc.page_content}")
    return "\n".join(lines)

# Show each question with its generated answer and the retrieved sources.
for i, (query, output) in enumerate(
    zip(
        (
            query1, query2, query3, query4, query5, query6,
            query7, query8, query9, query10, query11, query12,
            query13, query14, query15, query16, query17,
        ),
        (
            output1, output2, output3, output4, output5, output6,
            output7, output8, output9, output10, output11, output12,
            output13, output14, output15, output16, output17,
        ),
    ),
    start=1,
):
    print(f"Query {i}: {query}\nAnswer: {output['answer']}\nSources:\n{format_sources(output['sources'])}\n")

Query 1: What percentage of total crashes in Arizona were fatal in 2023?
Answer: In 2023, the percentage of total crashes in Arizona that were fatal is approximately 0.97%. This is calculated by dividing the total fatalities (1,307) by the total crashes (122,247) and multiplying by 100.
Sources:
The table presents a comparative analysis of crash statistics in Arizona for the years 2022 and 2023. It encompasses various categories related to traffic incidents, including total crashes, fatalities, and injuries, as well as specific data on alcohol-related incidents, urban and rural statistics, and the impact on motorcyclists, pedestrians, and bicyclists. Additionally, it examines vehicle miles traveled and calculates fatalities and injuries per 100 million vehicle miles. The table highlights changes in these metrics over the two years, indicating trends in road safety and the effectiveness of traffic regulations and interventions.
The table presents a comprehensive breakdown of fatal crash