## Setup and Import Libraries

In [2]:
import os
import cassio
from langchain_community.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from datasets import load_dataset
from PyPDF2 import PdfReader
from typing_extensions import Concatenate
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["ASTRA_DB_APPLICATION_TOKEN"] = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
os.environ["ASTRA_DB_ID"] = os.getenv("ASTRA_DB_ID")

## Loading PDF

In [5]:
pdf_reader = PdfReader('2024_budget.pdf')

In [6]:
raw_text = ''
for i, page in enumerate(pdf_reader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [7]:
# raw_text

## Initialize Cassandra Connection

In [8]:
cassio.init(
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    database_id=os.environ["ASTRA_DB_ID"]
)

## LangChain LLM Model and Embedding

In [9]:
llm = OpenAI()
embeddings = OpenAIEmbeddings()

## Create LangChain Vector Store

In [10]:
vector_store = Cassandra(
    embedding=embeddings,
    table_name='qa_mini_demo',
    session=None,
    keyspace=None
)

## Text Splitting

In [11]:
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)

texts = text_splitter.split_text(raw_text)

Created a chunk of size 854, which is longer than the specified 800


## Loading into Vector Store

In [12]:
vector_store.add_texts(texts[:50])

print(f"Inserted {len(texts[:50])} headlines")

vector_index = VectorStoreIndexWrapper(
    vectorstore=vector_store
)

Inserted 50 headlines


## Run Queries

In [14]:
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat is your question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break

    if query_text.lower() == "quit":
        continue

    first_question = False

    print(f"QUESTION: {query_text}")
    answer = vector_index.query(question=query_text, llm=llm).strip()
    print(f"ANSWER: {answer}")

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in vector_store.similarity_search_with_score(query=query_text, k=4):
        print(f"{score, doc.page_content[:84]}")
    


Enter your question (or type 'quit' to exit):  What is Subsidy to PASSCO for Wheat Reserve Stock


QUESTION: What is Subsidy to PASSCO for Wheat Reserve Stock
ANSWER: The Subsidy to PASSCO for Wheat Reserve Stock is 5,700 million rupees in the Revised Budget of 2023-24.
FIRST DOCUMENTS BY RELEVANCE:
(0.8921051498624912, 'Pakistan Energy  Rev olv ing Fund (PERA)Prov ision f or Power Subsidy\nShortf all in ')
(0.8920865311833888, 'Pakistan Energy  Rev olv ing Fund (PERA)Prov ision f or Power Subsidy\nShortf all in ')
(0.8914661998021669, '10,000 10,000 12,000\n19 7,000 8,500 8,000\n20 3,000 1,500 4,000\n60,000 60,000 68,000\n')
(0.8913224720588764, '10,000 10,000 12,000\n19 7,000 8,500 8,000\n20 3,000 1,500 4,000\n60,000 60,000 68,000\n')



What is your question (or type 'quit' to exit):  quit
