In [None]:
! sudo apt-get update

In [None]:
! sudo apt-get install poppler-utils

In [None]:
!sudo apt-get install libleptonica-dev tesseract-ocr libtesseract-dev python3-pil tesseract-ocr-eng tesseract-ocr-script-latn

In [4]:
! pip install -q langchain-community langchain-core unstructured langchain-google-genai langchain-astradb datasets pypdf "unstructured[pdf]" "unstructured[pptx]" unstructured-pytesseract tesseract-ocr

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_astradb import AstraDBVectorStore
from langchain.indexes import VectorstoreIndexCreator
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

import nltk
from google.colab import userdata
import os

In [6]:
# Fetch the NLTK tokenizer and POS-tagger data
# (presumably required by unstructured's text partitioning — confirm).
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [7]:
# Secrets are read from Colab's user secret store instead of being hard-coded.
ASTRA_DB_APPLICATION_TOKEN = userdata.get("ASTRA_DB_APPLICATION_TOKEN")
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")

# Non-secret connection settings for the Astra DB instance.
ASTRA_DB_API_ENDPOINT = "https://dee13a4f-1af1-45e0-80d7-263f7ee83c3a-us-east-2.apps.astra.datastax.com"
ASTRA_DB_KEYSPACE = "practice"

# Also export both credentials as environment variables
# (presumably so the LangChain integrations can pick them up — confirm).
os.environ.update(
    ASTRA_DB_APPLICATION_TOKEN=ASTRA_DB_APPLICATION_TOKEN,
    GOOGLE_API_KEY=GOOGLE_API_KEY,
)

In [8]:
loader = DirectoryLoader("docs")

In [9]:
splitter = RecursiveCharacterTextSplitter(chunk_size=512,chunk_overlap=64)

In [10]:
docs = loader.load_and_split(text_splitter=splitter)

In [11]:
len(docs)

60

In [12]:
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [13]:
vector_store = AstraDBVectorStore(
    embedding = embedding,
    collection_name="multimodaldata",
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN
)

In [14]:
inserted_ids = vector_store.add_documents(docs)

In [15]:
retriever = vector_store.as_retriever()

In [16]:
retriever

VectorStoreRetriever(tags=['AstraDBVectorStore', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_astradb.vectorstores.AstraDBVectorStore object at 0x790721342e50>, search_kwargs={})

In [17]:
prompt_template = '''
You are AI expert which can answer any question regarding AI

Context:
{context}

Question:
{question}

Answer:
'''

In [18]:
prompt_template = ChatPromptTemplate.from_template(prompt_template)

In [19]:
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [20]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser()

)

In [23]:
chain.invoke("What is LawSetu")

'Based on the provided text, LawSetu is an AI-powered platform that leverages Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG) to improve efficiency in legal and business decision-making.  It automates legal document drafting (contracts, affidavits, etc.), streamlines legal research by summarizing judgments and case law, and provides real-time business insights.  The platform is designed for scalability (handling increasing demands and multiple Indian languages) and usability (multi-language support and voice interface).  It aims to reduce manual effort and improve accuracy and efficiency for both legal and business professionals.'