### Power of Cassandra and ChatGPT for PDF Data Ingestion and Question Answering using AstraDB & Langchain 🦜
[**Link to my YouTube Channel**](https://www.youtube.com/BhaveshBhatt8791?sub_confirmation=1)

# Installs

In [1]:
!pip install -q cassandra-driver
!pip install -q langchain
!pip install -q openai
!pip install -q pypdf
!pip install -q cassio>=0.1.1
!pip install -q tiktoken==0.4.0
!pip install langchain_community
!pip install -U langchain-openai


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.3.7 requires tiktoken<1,>=0.7, but you have tiktoken 0.4.0 which is incompatible.[0m[31m
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Using cached tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
Installing collected packages: tiktoken
  Attempting uninstall: tiktoken
    Found existing installation: tiktoken 0.4.0
    Uninstalling tiktoken-0.4.0:
      Successfully uninstalled tiktoken-0.4.0
Successfully installed tiktoken-0.9.0


Check the installed Versions

In [2]:
import importlib
import pkg_resources

def check_package_version(package_name):
    try:
        module = importlib.import_module(package_name)
        version = pkg_resources.get_distribution(package_name).version
        print(f"{package_name}: {version} (Installed Correctly)")
    except ImportError:
        print(f"{package_name} is NOT installed.")
    except pkg_resources.DistributionNotFound:
        print(f"{package_name} is installed but version information is not available.")
    except Exception as e:
        print(f"Error checking {package_name}: {e}")

packages = [
    "cassandra-driver",
    "langchain",
    "openai",
    "pypdf",
    "cassio",
    "langchai_community",
    "tiktoken"
]

for package in packages:
    check_package_version(package)


  import pkg_resources


cassandra-driver is NOT installed.
langchain: 0.3.20 (Installed Correctly)
openai: 1.61.1 (Installed Correctly)
pypdf: 5.3.1 (Installed Correctly)
cassio: 0.1.10 (Installed Correctly)
langchai_community is NOT installed.
tiktoken: 0.9.0 (Installed Correctly)


# Cassandra Import

In [3]:
import cassandra
print (cassandra.__version__)

3.29.2


In [4]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json

# This secure connect bundle is autogenerated when you donwload your SCB,
# if yours is different update the file name below
cloud_config= {
  'secure_connect_bundle': '/content/secure-connect-pdf-qna.zip'
}

# This token json file is autogenerated when you donwload your token,
# if yours is different update the file name below
with open("/content/TokenAstra.json") as f:
    secrets = json.load(f)

CLIENT_ID = secrets["clientId"]
CLIENT_SECRET = secrets["secret"]

auth_provider = PlainTextAuthProvider(CLIENT_ID, CLIENT_SECRET)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

row = session.execute("select release_version from system.local").one()
if row:
  print(row[0])
else:
  print("An error occurred.")



4.0.11-3f93fd6872a3


Collecting langchain_community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.41 (from langchain_community)
  Downloading langchain_core-0.3.41-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.20 (from langchain_community)
  Downloading langchain-0.3.20-py3-none-any.whl.metadata (7.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

# Import

In [10]:
from langchain.llms.openai import OpenAI
from langchain.llms.openai import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAI, OpenAIEmbeddings

from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, PyPDFLoader

# OS Import

In [11]:
import os
import getpass

api_key = getpass.getpass("Enter your OpenAI API Key: ").strip()

if api_key:
    os.environ['OPENAI_API_KEY'] = api_key
    print("✅ API Key set successfully!")
else:
    print("❌ Failed to set API Key. Please enter a valid key.")


Enter your OpenAI API Key: ··········
✅ API Key set successfully!


# Initialization

In [12]:
llm = OpenAI(temperature=0)
openai_embeddings = OpenAIEmbeddings()

In [27]:
table_name = 'pdf_q_n_a_table_1'
keyspace = "pdf_q_n_a_test"

index_creator = VectorstoreIndexCreator(
    vectorstore_cls = Cassandra,
    embedding = openai_embeddings,
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 400,
        chunk_overlap = 30,
    ),

    vectorstore_kwargs={
        'session': session,
        'keyspace': keyspace,
        'table_name': table_name,
    },
)

# Loading PDF

In [28]:
loader = PyPDFLoader("/test.pdf")
pages = loader.load_and_split()



In [24]:
len(pages)

15

In [25]:
pages[1]

Document(metadata={'producer': 'macOS Version 15.3.1 (Build 24D70) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20250305055202Z00'00'", 'moddate': "D:20250305055202Z00'00'", 'source': '/test.pdf', 'total_pages': 15, 'page': 1, 'page_label': '2'}, page_content='___________________________________________________________________________________                            \n \nNgoba Village, Lango Geog, Paro: Bhutan. Tel:(+975 8)2729289 /272992, 17111980 Fax :8)272993 \n   Email: reservations@naksel.com, Website: www.naksel.com \na. Agent: a person employed to make reservation for the guest or a person \ncontracted by the guest to make reservations for him/her/ them. \nb. Full deposit: the deposit to be made by the agent, which includes the room \ncharge(s), food charge(s) and applicable taxes for the reserved duration of \nstay in the hotel in advance at the time of booking. \nc. High season: The months of March, April, May, September, October and \nNovember. \nd. Low seaso

# Load to Index

In [29]:
pdf_index = index_creator.from_loaders([loader])



In [30]:
default_query = f'SELECT * FROM {keyspace}.{table_name}'

rows = session.execute(default_query)

for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'row_id: {row.row_id}')
    print(f'embedding_vector: {str(row.vector)[:64]} ...')
    print(f'body_blob: {row.body_blob[:64]} ...')
    print(f'metadata_blob: {row.metadata_s}')

print('\n...')


Row 0:
row_id: 7818b9fbc9b847aa895115c7a51b5ca7
embedding_vector: [0.024553582072257996, 0.018064232543110847, 0.00553581304848194 ...
body_blob: Check in - Check out Terms & Conditions
Check-in time: 13:00 hrs ...
metadata_blob: {'creationdate': "D:20250305055202Z00'00'", 'creator': 'PyPDF', 'moddate': "D:20250305055202Z00'00'", 'page': '6.0', 'page_label': '7', 'producer': 'macOS Version 15.3.1 (Build 24D70) Quartz PDFContext', 'source': '/test.pdf', 'total_pages': '15.0'}

Row 1:
row_id: ca323136add54c0da635df7a99163fd4
embedding_vector: [0.022329911589622498, 0.009975356981158257, -0.0137701127678155 ...
body_blob: Children 12 years and above: They are charged full price for mea ...
metadata_blob: {'creationdate': "D:20250305055202Z00'00'", 'creator': 'PyPDF', 'moddate': "D:20250305055202Z00'00'", 'page': '4.0', 'page_label': '5', 'producer': 'macOS Version 15.3.1 (Build 24D70) Quartz PDFContext', 'source': '/test.pdf', 'total_pages': '15.0'}

Row 2:
row_id: 6ced3066d43d42c2a7e497

# Asking Questions to the PDF

In [31]:
query_1 = "What is the rate of Yangkhil?"
pdf_index.query_with_sources(query_1, llm=llm)



{'question': 'What is the rate of Yangkhil?',
 'answer': ' The rate of Yangkhil is not mentioned.\n',
 'sources': ''}

In [34]:
query_2 = "How many hotels"
pdf_index.query_with_sources(query_2, llm=llm)



{'question': 'How many hotels',
 'answer': ' There are 3 types of rooms available at the hotel: Junior Suite, Deluxe Suite, and Luxury Suite. The rates for these rooms vary depending on single or double occupancy. The hotel also offers discounts for agents, ranging from 10% to 36% based on the room category. \n',
 'sources': '/test.pdf'}