Step 1: Install Required Libraries
Install LangChain and dependencies:

In [1]:
pip install langchain openai chromadb sentence-transformers PyPDF2 langchain-community langchain-huggingface


Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 k

Step 2 Embedding Layer (Using LangChain's Embedding Model)
We can use LangChain's SentenceTransformerEmbeddings for embedding documents or sentences.

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.document_loaders import PyPDFLoader

class EmbeddingLayer:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        #self.embedding_model = SentenceTransformerEmbeddings(model_name)
        self.embedding_model = HuggingFaceEmbeddings(model_name=model_name)


    def process_document(self, pdf_path):
        # Use LangChain's PDF loader to process the document
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()  # This loads the pages as documents
        return documents

    def embed_documents(self, documents):
        # Embed documents using LangChain's embedding model
        return self.embedding_model.embed_documents([doc.page_content for doc in documents])

    def chunk_documents(self, documents, chunk_size=100):
        # Chunk documents into smaller chunks
        chunks = []
        for doc in documents:
            text = doc.page_content
            chunks += [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        return chunks


Search Layer (Using LangChain’s Vector Database)
We will use Chroma for vector search and storing embeddings.

In [3]:
class SearchLayer:
    def __init__(self, model, db_path="./vector_db"):
        self.embedding_model = model
        self.vector_store = Chroma(persist_directory=db_path, embedding_function=self.embedding_model)

    def index_chunks(self, chunks):
        # Index chunks into Chroma DB
        self.vector_store.add_texts(chunks)

    def search(self, query, top_k=3):
        # Search the database using the query embedding
        query_embedding = self.embedding_model.embed_query(query)
        results = self.vector_store.similarity_search(query, k=top_k)
        return results


Generation Layer (Using OpenAI for Question Answering)
We will use LangChain’s built-in load_qa_chain to create a Q&A pipeline using the OpenAI LLM.

In [4]:
class GenerationLayer:
    def __init__(self, api_key):
        self.llm = OpenAI(openai_api_key=api_key)

    def generate_answer(self, query, retrieved_chunks):
        # Clean retrieved chunks: Remove None or empty strings
        clean_chunks = [chunk.strip() for chunk in retrieved_chunks if chunk and isinstance(chunk, str)]

        # Wrap the cleaned chunks as Document objects
        docs = [Document(page_content=chunk) for chunk in clean_chunks]

        # Combine the retrieved chunks for the Q&A generation
        chain = load_qa_chain(self.llm, chain_type="stuff")

        # Run the QA chain with the input documents and the query
        answer = chain.invoke({"input_documents": docs, "question": query})  # Correct argument names

        return answer

Main Execution
Finally, we’ll combine everything and process the document, index it, and execute the query-answer generation pipeline.

In [5]:
pip install pypdf

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/298.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m256.0/298.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0


In [6]:
from langchain.schema import Document

def main(pdf_path, queries):
    # Initialize layers
    embedding_layer = EmbeddingLayer()
    search_layer = SearchLayer(embedding_layer.embedding_model)
    generation_layer = GenerationLayer(api_key="-")

    # Step 1: Process the document
    documents = embedding_layer.process_document(pdf_path)

    # Step 2: Chunk the document
    chunks = embedding_layer.chunk_documents(documents, chunk_size=100)

    # Step 3: Index chunks
    search_layer.index_chunks(chunks)

    # Step 4: Execute queries
    for query in queries:
        print(f"\nQuery: {query}")
        results = search_layer.search(query)

        # Retrieve the top-k chunks
        retrieved_chunks = results
        print("Top Retrieved Chunks:", [result.page_content for result in retrieved_chunks])

        # Step 5: Generate answer using retrieved chunks
        answer = generation_layer.generate_answer(query, retrieved_chunks)
        print("Generated Answer:", answer)

if __name__ == "__main__":
    pdf_path = "/content/Principal-Sample-Life-Insurance-Policy.pdf"
    queries = [
        "What are the benefits included under the Group Policy for Life Insurance?",
        "What is the coverage for Dependent Life Insurance?",
        "What is considered a 'Qualifying Event' for Accelerated Benefits under this policy?"
    ]
    main(pdf_path, queries)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  self.vector_store = Chroma(persist_directory=db_path, embedding_function=self.embedding_model)
  self.llm = OpenAI(openai_api_key=api_key)



Query: What are the benefits included under the Group Policy for Life Insurance?
Top Retrieved Chunks: [" a part of the premium for their Dependent's insurance under \nthis Group Policy. ", " Group Policy; or \n \nc. the date the last premium is paid for the Member's Dependent Life Insurance;", 'ble for Member Life Insurance under this Group Policy \nif the person is eligible under any other Gro']


stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(self.llm, chain_type="stuff")


Generated Answer: {'input_documents': [], 'question': 'What are the benefits included under the Group Policy for Life Insurance?', 'output_text': "\nThe benefits included under the Group Policy for Life Insurance can vary depending on the specific policy, but they may include things like a death benefit for the policyholder's beneficiaries, cash value accumulation, and potentially the ability to convert the policy to an individual policy in the future."}

Query: What is the coverage for Dependent Life Insurance?
Top Retrieved Chunks: ['Dependent Life Insurance \n \nDependent Life Insurance is available only with respect to Dependents of', " eligible for such insurance; or \n(2) the Dependent spouse's Dependent Life Insurance terminates as ", 'le \n \nIf a Dependent dies while insured for Dependent Life Insurance under this Group Policy, The \nP']
Generated Answer: {'input_documents': [], 'question': 'What is the coverage for Dependent Life Insurance?', 'output_text': "\n\nI don't know.

Implementation of reasoning agents, such as conversational, data analytics, or SQL agents, using LangChain.

In [7]:
import sqlite3

# Define the database connection string
db_connection_string = "example.db"

# Connect to the SQLite database
conn = sqlite3.connect(db_connection_string)
cursor = conn.cursor()

# Create a sample table (e.g., 'policies')
cursor.execute('''CREATE TABLE IF NOT EXISTS policies (
                    PolicyNumber TEXT PRIMARY KEY,
                    PolicyHolder TEXT,
                    IssueDate TEXT,
                    CoverageType TEXT,
                    EffectiveDate TEXT,
                    TerminationDate TEXT,
                    PremiumAmount REAL,
                    PremiumFrequency TEXT,
                    BenefitAmount REAL,
                    BenefitConditions TEXT,
                    Exclusions TEXT,
                    Rider TEXT
                )''')

# Commit the transaction and close the connection
conn.commit()



In [8]:
# Sample data to insert
data = [
    ("GL S654", "ISLAND DOE", "2020-05-10", "Life", "2020-06-01", "2025-06-01", 1500.50, "Monthly", 50000.00, "Must be diagnosed with a terminal illness", "No coverage for self-inflicted injuries", "Accidental Death Rider"),
    ("GL S653", "CALIFORNIA", "2021-06-10", "Accidental Death & Dismemberment", "2021-07-01", "2023-07-01", 1200.75, "Quarterly", 75000.00, "Must be in a valid accident", "No coverage for pre-existing conditions", "Accidental Injury Rider"),
    ("GL S655", "DOE", "2020-05-20", "Life", "2020-06-01", "2025-06-01", 1500.50, "Monthly", 70000.00, "Must be diagnosed with a terminal illness", "No coverage for self-inflicted injuries", "Accidental Death Rider"),
    ("GL S656", "CALIFORNIA Jhon", "2021-06-20", "Accidental Death & Dismemberment", "2021-07-01", "2023-07-01", 1200.75, "Quarterly", 85000.00, "Must be in a valid accident", "No coverage for pre-existing conditions", "Accidental Injury Rider")
]

# Insert data into the table
cursor.executemany('''INSERT INTO policies (
                        PolicyNumber, PolicyHolder, IssueDate, CoverageType,
                        EffectiveDate, TerminationDate, PremiumAmount,
                        PremiumFrequency, BenefitAmount, BenefitConditions,
                        Exclusions, Rider)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', data)

# Commit the transaction and close the connection
conn.commit()


In [9]:

import pandas as pd

# Query the database to fetch all records
cursor.execute("SELECT * FROM policies")
rows = cursor.fetchall()
# Fetch column names for the DataFrame
column_names = [description[0] for description in cursor.description]

# Load the data from SQLite into a pandas DataFrame
df = pd.DataFrame(rows, columns=column_names)



# Display the DataFrame
print(df)
# Display the result
for row in rows:
    print(row)


  PolicyNumber     PolicyHolder   IssueDate                      CoverageType  \
0      GL S654       ISLAND DOE  2020-05-10                              Life   
1      GL S653       CALIFORNIA  2021-06-10  Accidental Death & Dismemberment   
2      GL S655              DOE  2020-05-20                              Life   
3      GL S656  CALIFORNIA Jhon  2021-06-20  Accidental Death & Dismemberment   

  EffectiveDate TerminationDate  PremiumAmount PremiumFrequency  \
0    2020-06-01      2025-06-01        1500.50          Monthly   
1    2021-07-01      2023-07-01        1200.75        Quarterly   
2    2020-06-01      2025-06-01        1500.50          Monthly   
3    2021-07-01      2023-07-01        1200.75        Quarterly   

   BenefitAmount                          BenefitConditions  \
0        50000.0  Must be diagnosed with a terminal illness   
1        75000.0                Must be in a valid accident   
2        70000.0  Must be diagnosed with a terminal illness   
3     

In [10]:
pip install langchain_experimental

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Downloading langchain_experimental-0.3.4-py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_experimental
Successfully installed langchain_experimental-0.3.4


In [11]:
# Agent Systems
from langchain.sql_database import SQLDatabase
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain_experimental.agents import create_pandas_dataframe_agent


from langchain_experimental.tools import PythonREPLTool
from langchain.tools import Tool
from langchain.agents import initialize_agent





class AgentSystems:
    def __init__(self, api_key):
        self.llm = OpenAI(openai_api_key=api_key)

    def conversational_agent(self):
        # Define a conversational tool using the LLM
        tools = [
            Tool(
                name="Chat Tool",
                func=lambda query: self.llm(query),
                description="Responds conversationally to user queries"
            )
        ]
        return initialize_agent(tools, self.llm, agent="zero-shot-react-description")

    def data_analytics_agent(self, df):
      return create_pandas_dataframe_agent(self.llm, df, verbose=True, allow_dangerous_code=True)

    def sql_agent(self, db_connection_string):
      # Initialize the SQL database
      db = SQLDatabase.from_uri(db_connection_string)

      # Create the SQL database toolkit
      toolkit = SQLDatabaseToolkit(db=db, llm=self.llm)

      # Initialize the agent
      return initialize_agent(toolkit.get_tools(), self.llm, agent="zero-shot-react-description")

# Extending the main function to include agent systems
def main_with_agents(pdf_path, queries, api_key, db_connection_string=None, dataframe=None):
    # Initialize layers
    embedding_layer = EmbeddingLayer()
    search_layer = SearchLayer(embedding_layer.embedding_model)
    generation_layer = GenerationLayer(api_key=api_key)
    agent_systems = AgentSystems(api_key=api_key)

    # Step 1: Process the document
    documents = embedding_layer.process_document(pdf_path)

    # Step 2: Chunk the document
    chunks = embedding_layer.chunk_documents(documents, chunk_size=100)

    # Step 3: Index chunks
    search_layer.index_chunks(chunks)

    # Step 4: Execute queries
    for query in queries:
        print(f"\nQuery: {query}")
        results = search_layer.search(query)

        # Retrieve the top-k chunks
        retrieved_chunks = results
        print("Top Retrieved Chunks:", [result.page_content for result in retrieved_chunks])

        # Step 5: Generate answer using retrieved chunks
        answer = generation_layer.generate_answer(query, retrieved_chunks)
        print("Generated Answer:", answer)

    # Step 6: Demonstrate Agent Systems
    print("\n--- Agent Systems Demonstration ---")

    # Conversational agent
    conversational_agent = agent_systems.conversational_agent()
    conversation_query = "Explain the purpose of this policy."
    print(f"\nConversational Agent Response:\n{conversational_agent.run(conversation_query)}")

    # Data Analytics Agent (if a DataFrame is provided)
    if dataframe is not None:
        analytics_agent = agent_systems.data_analytics_agent(dataframe)
        analytics_query = "Calculate the duration of each policy by subtracting EffectiveDate from TerminationDate and display"
        print(f"\nData Analytics Agent Response:\n{analytics_agent.run(analytics_query)}")

    # SQL Agent (if a DB connection string is provided)
    if db_connection_string is not None:
        sql_agent = agent_systems.sql_agent(db_connection_string)
        sql_query = "SELECT COUNT(*) FROM policies WHERE CoverageType='Life';"
        print(f"\nSQL Agent Response:\n{sql_agent.run(sql_query)}")

if __name__ == "__main__":
    pdf_path = "/content/Principal-Sample-Life-Insurance-Policy.pdf"
    queries = [
        "What are the benefits included under the Group Policy for Life Insurance?",
        "What is the coverage for Dependent Life Insurance?",
        "What is considered a 'Qualifying Event' for Accelerated Benefits under this policy?"
    ]
    main_with_agents(pdf_path, queries, api_key="",
                     db_connection_string="sqlite:///example.db", dataframe=df)



Query: What are the benefits included under the Group Policy for Life Insurance?
Top Retrieved Chunks: [" a part of the premium for their Dependent's insurance under \nthis Group Policy. ", " a part of the premium for their Dependent's insurance under \nthis Group Policy. ", " Group Policy; or \n \nc. the date the last premium is paid for the Member's Dependent Life Insurance;"]
Generated Answer: {'input_documents': [], 'question': 'What are the benefits included under the Group Policy for Life Insurance?', 'output_text': ' The Group Policy for Life Insurance typically includes benefits such as death benefits, accidental death and dismemberment, and optional coverage for dependents.'}

Query: What is the coverage for Dependent Life Insurance?
Top Retrieved Chunks: ['Dependent Life Insurance \n \nDependent Life Insurance is available only with respect to Dependents of', 'Dependent Life Insurance \n \nDependent Life Insurance is available only with respect to Dependents of', " eligible 

  return initialize_agent(tools, self.llm, agent="zero-shot-react-description")
  print(f"\nConversational Agent Response:\n{conversational_agent.run(conversation_query)}")
  func=lambda query: self.llm(query),



Conversational Agent Response:
The purpose of this policy is to create a framework for governing behavior and promoting a safe and fair environment for all individuals involved.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to use datetime library to calculate the duration
Action: python_repl_ast
Action Input: import datetime[0m[36;1m[1;3m[0m[32;1m[1;3mI need to convert the EffectiveDate and TerminationDate columns to datetime objects
Action: python_repl_ast
Action Input: df['EffectiveDate'] = pd.to_datetime(df['EffectiveDate'])[0m[36;1m[1;3mNameError: name 'pd' is not defined[0m[32;1m[1;3m I need to import pandas library to use the pd.to_datetime() function
Action: python_repl_ast
Action Input: import pandas as pd[0m[36;1m[1;3m[0m[32;1m[1;3m I can now convert the EffectiveDate and TerminationDate columns to datetime objects
Action: python_repl_ast
Action Input: df['EffectiveDate'] = pd.to_datetime(df['EffectiveDate'])[0m[36;1m[1;3m

In [12]:
print(df[['PolicyNumber', 'Duration']])

  PolicyNumber  Duration
0      GL S654 1826 days
1      GL S653  730 days
2      GL S655 1826 days
3      GL S656  730 days
