In [34]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings


# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio
from PyPDF2 import PdfReader
from dotenv import load_dotenv
import os

In [35]:
# Open the call-transcript PDF that the following cells extract text from.
pdfreader1 = PdfReader("Rocketium - Banco_Pann_Discovery_transcript.pdf")

# Load settings from a local .env file into the process environment; later
# cells read the Astra DB and OpenAI credentials from the environment.
# Kept as the final expression so the cell still echoes its return value.
load_dotenv()

True

In [36]:
# NOTE(review): `Concatenate` is not referenced in any visible cell; it looks
# like an accidental auto-import. Confirm nothing later uses it, then remove.
from typing_extensions import Concatenate

# Read the text of every page in the PDF and join it into a single string.
# Pages whose extraction yields nothing (empty or None) are skipped.
# The pieces are collected in a list and joined once, rather than growing the
# string with `+=` on every iteration.
page_texts = []
for page in pdfreader1.pages:
    content = page.extract_text()
    if content:
        page_texts.append(content)
raw_text = ''.join(page_texts)

In [37]:
# Echo the extracted transcript to check how the PDF text came out.
# NOTE(review): this displays the whole transcript, including participant
# names, in the saved notebook output; consider showing only a slice.
raw_text

"Bonjour .\nMayo\nFadina\n00:00:12 \nOh,\nman.\nYou\nsee\nthe\nmessage\nI\nsent\nyou\nabout\nquickly ,\nto\nyou\nand\nAvinash,\nby\nthe\nway?\nI\nsent\nthe\ndm\nYeah.\nI\ndon't\nknow .\nSometimes\nI\nI\nthink\nI'd\nsend\nthings\none.\nI\ndon't\nactually ,\nI\ndon't\nhit\nsend.\nI\njust\ntype\nit\nout,\nand\nit's\nin\nmy\ndrafts\nthe\nwhole\ntime.\nOkay .\nRavi\nMaurya\n00:00:33 \nYeah.\nI\nthink\nI\nread\nit\nlast\nnight.\nAbout\nyeah.\nYeah.\nYeah.\nThat\none.\nYeah.\nI\ngot\nit.\nYeah.\nSo\nthe\nthe\nis\nthis\nbeing\nrecorded,\nlike,\nthe\ncall\none?\nOh,\nyeah.\nI\nthink\nthey're\nrecording\nthis.\nLet's\noffline\noffline.\nOkay .\nCool.\nMayo\nFadina\n00:01:22 \nSo\nwe\nhave\nPriscilla.\nRavi\nMaurya\n00:01:24 \nDiya,\nand\nMicrosoft.\nMayo\nFadina\n00:01:27 \nAndre.\nYeah.\nI\nthink\nsomeone\nelse\njoined,\naccepted\nlate\nyesterday .\nSo\nI\nthink\nYuri.\nMayo\nFadina\n00:01:35 \nYuri\nis\nfrom\nwhat\nI\nremember .\nJust\nhang\non.\nMayo\nFadina\n00:01:46 \nSo\njust\nsee\nmy\nset

In [38]:
# Initialise CassIO's connection to Astra DB. Both credentials are read from
# the environment (load_dotenv() was called earlier, so values from a .env
# file are available); a missing variable is passed through as None.
cassio.init(
    database_id=os.environ.get('ASTRA_DB_ID'),
    token=os.environ.get('ASTRA_DB_APPLICATION_TOKEN'),
)

In [39]:
# Both OpenAI clients use the same API key taken from the environment.
# `llm` is the model passed to the index queries further down;
# `embedding` is handed to the Cassandra vector store.
llm = OpenAI(
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
)
embedding = OpenAIEmbeddings(
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
)

In [40]:
# Show which model the OpenAI wrapper is using; no model was named when `llm`
# was constructed, so this reveals the default that was picked.
llm.model_name

'gpt-3.5-turbo-instruct'

In [41]:
# Vector store on the "Rocketium3" table, using the OpenAI embedding object
# created earlier.
# session/keyspace are passed as None explicitly; presumably the store then
# falls back to the connection registered by cassio.init() (TODO confirm).
astra_vector_store = Cassandra(
    table_name="Rocketium3",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [42]:
from langchain.text_splitter import CharacterTextSplitter

# Split the transcript with CharacterTextSplitter so that no single piece
# increases the token size too much: break on newlines, target a chunk size
# of 800 (measured with len, i.e. characters) and keep an overlap of 20
# between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    length_function=len,
    separator="\n",
    chunk_overlap=20,
    chunk_size=800,
)
texts = text_splitter.split_text(raw_text)

In [43]:
# Echo the list of chunks produced by the splitter to inspect the boundaries.
# NOTE(review): this displays every chunk of the transcript in the saved
# notebook output; consider showing only the first few.
texts

["Bonjour .\nMayo\nFadina\n00:00:12 \nOh,\nman.\nYou\nsee\nthe\nmessage\nI\nsent\nyou\nabout\nquickly ,\nto\nyou\nand\nAvinash,\nby\nthe\nway?\nI\nsent\nthe\ndm\nYeah.\nI\ndon't\nknow .\nSometimes\nI\nI\nthink\nI'd\nsend\nthings\none.\nI\ndon't\nactually ,\nI\ndon't\nhit\nsend.\nI\njust\ntype\nit\nout,\nand\nit's\nin\nmy\ndrafts\nthe\nwhole\ntime.\nOkay .\nRavi\nMaurya\n00:00:33 \nYeah.\nI\nthink\nI\nread\nit\nlast\nnight.\nAbout\nyeah.\nYeah.\nYeah.\nThat\none.\nYeah.\nI\ngot\nit.\nYeah.\nSo\nthe\nthe\nis\nthis\nbeing\nrecorded,\nlike,\nthe\ncall\none?\nOh,\nyeah.\nI\nthink\nthey're\nrecording\nthis.\nLet's\noffline\noffline.\nOkay .\nCool.\nMayo\nFadina\n00:01:22 \nSo\nwe\nhave\nPriscilla.\nRavi\nMaurya\n00:01:24 \nDiya,\nand\nMicrosoft.\nMayo\nFadina\n00:01:27 \nAndre.\nYeah.\nI\nthink\nsomeone\nelse\njoined,\naccepted\nlate\nyesterday .\nSo\nI\nthink\nYuri.\nMayo\nFadina\n00:01:35 \nYuri\nis\nfrom\nwhat\nI\nremember .\nJust",
 "I\nremember .\nJust\nhang\non.\nMayo\nFadina\n00:01:46

In [44]:
# Add every transcript chunk to the vector store.
# NOTE(review): re-running this cell calls add_texts again with the same
# chunks; confirm whether that creates duplicate rows before re-executing.
astra_vector_store.add_texts(texts)

# `texts` holds transcript chunks (not headlines), so report them as such.
print("Inserted %i text chunks." % len(texts))

# Wrap the store so the cells below can query it with an LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 72 headlines.


In [45]:
# Ask the index for a short bullet-point recap of the call, then print the
# question followed by the answer with surrounding whitespace stripped.
query_text = "Give a quick recap of the things discussed in the call in 3 to 4 bullet points(limit each bullet point to 15-20 words)"
print("\nQUESTION: \"%s\"" % query_text)

answer = (
    astra_vector_index
    .query(query_text, llm=llm)
    .strip()
)
print("ANSWER: \"%s\"\n" % answer)



QUESTION: "Give a quick recap of the things discussed in the call in 3 to 4 bullet points(limit each bullet point to 15-20 words)"
ANSWER: "1. Introductions and schedule for a custom demonstration.
2. Discussion of the next steps for the process and alignment on a custom demo using assets.
3. Confirmation of next Wednesday as the date for the custom demo, and meeting the solution team."



In [46]:
# Ask the index for three buyer question/answer pairs, then print the
# question followed by the answer with surrounding whitespace stripped.
query_text = "Generate 3 question-answer pair (include answers to the questions also) which are frequently asked by the buyer"
print("\nQUESTION: \"%s\"" % query_text)

answer = (
    astra_vector_index
    .query(query_text, llm=llm)
    .strip()
)
print("ANSWER: \"%s\"\n" % answer)



QUESTION: "Generate 3 question-answer pair (include answers to the questions also) which are frequently asked by the buyer"
ANSWER: "1. Question: How does your platform help with managing digital assets? 
Answer: Our platform allows you to easily upload and organize all your digital assets, such as photos, templates, and presets. This makes it easier to access and use them for your marketing campaigns. 

2. Question: Can your platform provide data and insights from past campaigns to inform future design decisions? 
Answer: Yes, our platform offers media recall and research capabilities, allowing you to understand how your previous campaigns performed and make informed design decisions for future campaigns. 

3. Question: How many sizes and channels does your platform support for digital marketing campaigns? 
Answer: Our platform supports up to 35 different sizes across 5-6 channels, including popular channels like Google Display Network and Facebook, as well as emerging channels like 