In [2]:
# Standard library: environment lookups and hidden prompts for credentials
import os
from getpass import getpass

# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

In [3]:
from PyPDF2 import PdfReader

In [4]:
# Credentials are read from the environment, falling back to a hidden prompt,
# so that no secret is ever stored in the notebook source or its outputs.
# SECURITY: an Astra DB application token used to be hardcoded here; treat that
# token as leaked and revoke/rotate it in the Astra DB console.
ASTRA_DB_APPLICATION_TOKEN = (
    os.environ.get("ASTRA_DB_APPLICATION_TOKEN")
    or getpass("Astra DB application token: ")
)
# The database ID is an identifier rather than a secret; the previous value is
# kept as the default and can be overridden with the ASTRA_DB_ID variable.
ASTRA_DB_ID = os.environ.get("ASTRA_DB_ID", "99498bcd-ef25-4498-b948-f5fccf9ed132")

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [5]:
# Location of the PDF to index. Set the PDF_PATH environment variable to use a
# different file; the default is the path this notebook was originally run with.
PDF_PATH = os.environ.get("PDF_PATH", '/home/ravi0531rp/Downloads/aaa.pdf')
pdfreader = PdfReader(PDF_PATH)

In [6]:
from typing_extensions import Concatenate
# Extract the text of every page of the PDF and join it into a single string.
# Pages for which extract_text() returns an empty/falsy value are skipped.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(text for text in page_texts if text)

In [7]:
# Preview only the beginning of the extracted text; displaying the whole
# document bloats the saved notebook output.
raw_text[:1000]

'Real-Time Flying Object Detection with YOLOv8\nDillon Reis*, Jordan Kupec, Jacqueline Hong, Ahmad Daoudi\nGeorgia Institute of Technology\ndreis7@gatech.edu *, jkupec3@gatech.edu, jhong356@gatech.edu, adaoudi3@gatech.edu\nAbstract\nThis paper presents a generalized model for real-time\ndetection of ﬂying objects that can be used for transfer\nlearning and further research, as well as a reﬁned model\nthat is ready for implementation. We achieve this by train-\ning our ﬁrst (generalized) model on a data set containing\n40 different classes of ﬂying objects, forcing the model to\nextract abstract feature representations. We then perform\ntransfer learning with these learned parameters on a data\nset more representative of “real world” environments (i.e.\nhigher frequency of occlusion, small spatial sizes, rotations,\netc.) to generate our reﬁned model. Object detection of ﬂy-\ning objects remains challenging due to large variance ob-\nject spatial sizes/aspect ratios, rate of speed, occl

In [8]:
# Initialize the Astra DB connection through CassIO with the token and
# database ID configured earlier in the notebook.
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

In [None]:
# Two OpenAI-backed objects sharing one API key:
#   embedding - handed to the vector store when it is constructed
#   llm       - handed to the index wrapper when a question is asked
embedding = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
# Vector store over the "demo" table, using `embedding` for the texts it stores.
# session and keyspace are left as None - presumably so the connection set up
# by cassio.init() is used; confirm against the LangChain Cassandra docs.
astra_vector_store = Cassandra(
    table_name="demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter configuration: break on newlines, chunk_size=800 and
# chunk_overlap=200, with lengths measured in characters via len().
text_splitter = CharacterTextSplitter(
    length_function=len,
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
)

# Split the full extracted PDF text into a list of chunks.
texts = text_splitter.split_text(raw_text)

In [None]:
# Preview the first 50 chunks produced by the text splitter.
texts[:50]

In [None]:
# Take the slice once so the inserted texts and the reported count always agree.
texts_to_insert = texts[:50]
astra_vector_store.add_texts(texts_to_insert)

# The inserted items are chunks of the PDF text (not headlines).
print(f"Inserted {len(texts_to_insert)} text chunks.")

# Wrap the vector store in an index object that is queried together with an LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

In [None]:
# Interactive question-answering loop over the indexed text.
# It runs until the user types 'quit' (in any letter case); blank input simply
# re-prompts without counting as a question.
first_question = True
while True:
    # The prompt wording changes once the first real question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if not query_text:
        continue
    if query_text.lower() == "quit":
        break

    first_question = False

    # Ask the index wrapper for an answer, using `llm` for this query.
    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    # List the 4 documents returned by the store's similarity search, each with
    # its score and the first 100 characters of its content.
    print("FIRST DOCUMENTS BY RELEVANCE:")
    matches = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in matches:
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:100]))