In [None]:
# Install necessary Python packages for working with LangChain, Weaviate, and PDFs.
! pip install langchain-openai
! pip install langchain-community
! pip install weaviate-client
! pip install pypdf

In [2]:
# Import PyPDFDirectoryLoader to load PDF documents from a directory.
from langchain_community.document_loaders import PyPDFDirectoryLoader

# Load PDF documents from the "data" directory.
loader = PyPDFDirectoryLoader("data")
data = loader.load()

In [3]:
# Display the loaded data.
data

[Document(page_content='Generative AI ', metadata={'source': 'data/Generative AI.pdf', 'page': 0}),
 Document(page_content='What you will learn? \n●Generative AI? \n●Large Language Models (LLMs) \n●OpenAI \n●Langchain \n●Vector Database \n●Llama Index \n●Open Source LLM model \n●End to End Project ', metadata={'source': 'data/Generative AI.pdf', 'page': 1}),
 Document(page_content='Generative AI \n●ChatGPT \n●Google Bard \n●Meta Llama 2 ', metadata={'source': 'data/Generative AI.pdf', 'page': 2}),
 Document(page_content='What is Generative AI? \nGenerative AI generate new data based on training sample.Generative model \ncan generate Image,Text, Audio, Videos etc. data as output. \nSo generative AI is a very huge topics, \n-Generative Image model \n-Generative Language model ', metadata={'source': 'data/Generative AI.pdf', 'page': 3}),
 Document(page_content='Generative Model: \nQuestions R esponses ', metadata={'source': 'data/Generative AI.pdf', 'page': 4}),
 Document(page_content='Wh

In [4]:
# Import RecursiveCharacterTextSplitter for splitting documents into chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create a text splitter that splits text into chunks of 1000 characters with a 20 character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
docs = text_splitter.split_documents(data)

In [5]:
# Display the number of document chunks.
len(docs)

15

In [6]:
# Import userdata module for accessing user data in Google Colab.
from google.colab import userdata

# Retrieve the OpenAI API key from user data.
OPENAI_API_KEY = userdata.get('OPEN_AI_KEY')

In [7]:
# Import OpenAIEmbeddings to generate embeddings for documents.
from langchain_openai.embeddings import OpenAIEmbeddings

# Initialize OpenAI embeddings with the API key.
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

In [8]:
# Retrieve Weaviate API key and cluster URL from user data.
WEAVIATE_API_KEY = userdata.get('WEAVIATE_API_KEY')
WEAVIATE_CLUSTER = userdata.get('WEAVIATE_CLUSTER')

In [9]:
# Import Weaviate client and LangChain Weaviate vector store.
import weaviate
from langchain.vectorstores import Weaviate

# Connect to the Weaviate cluster using the API key.
auth_config = weaviate.auth.AuthApiKey(api_key = WEAVIATE_API_KEY)
WEAVIATE_URL = WEAVIATE_CLUSTER

client = weaviate.Client(
    url = WEAVIATE_URL,
    additional_headers = {"X-OpenAI-Api-key": OPENAI_API_KEY},
    auth_client_secret = auth_config,
    startup_period = 10
)

In [10]:
# Check if the Weaviate client is ready.
client.is_ready()

True

In [11]:
# Delete all existing schemas in the Weaviate cluster.
client.schema.delete_all()
client.schema.get()

# Define a new schema for storing document vectors.
schema = {
    "classes": [
        {
            "class": "Chatbot",
            "description": "Documents for chatbot",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

# Create the schema in the Weaviate client.
client.schema.create(schema)

# Initialize the Weaviate vector store with the client, specifying the class and content property.
vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])

In [12]:
# Prepare text and metadata pairs for adding to the vector store.
text_meta_pair = [(doc.page_content, doc.metadata) for doc in docs]
texts, meta = list(zip(*text_meta_pair))

# Add the texts and their metadata to the Weaviate vector store.
vectorstore.add_texts(texts, meta)

['024d0e8a-d276-4b6b-bd1e-9e53ec1de30c',
 '87b579f7-48a5-4755-b700-77b64dbbc283',
 '30b04ef6-e88e-4538-b3a0-1d2d7b1f0d6f',
 'ead95a97-b028-4560-8999-4f7212413b56',
 '8e74112e-7c36-469b-9eab-cab737be5f9d',
 'b76445ba-2200-45ef-a9e9-02584fa01732',
 '631676d4-17a9-4f02-a227-65e8fb35bc3f',
 '6e0f0ef0-85e3-4ef8-8521-14ea8467277a',
 '09a7f1f8-4d00-42c4-8fc9-5c928b162c9a',
 'c2f3cef3-a864-40b8-85da-7cc56916e8e0',
 'd2c29f15-a1bb-4df2-9ccf-0c3cc5efd0d2',
 '002be3ab-17d5-4bea-9199-d745e30ab76a',
 '547ec298-d179-4c83-872a-3c3c87030d2f',
 '5f0512b5-0c05-4587-b7a2-c6f395e9f820',
 '199fa0ef-286d-4123-959f-d85101fd3052']

In [13]:
# Define a query to search for similar documents.
query = "What is LLM?"

# Perform a similarity search in the vector store with the query, retrieving the top 3 results.
docs = vectorstore.similarity_search(query,top_k=3)

In [14]:
# Display the retrieved documents.
docs

[Document(page_content='What makes LLM so Powerful? \n●In case of LLM, one model can be used for a whole variety of tasks like:- \nText generation, Chatbot, summarizer, translation, code generation \n& so on … \nSo, LLM is subset of Deep Learning & it has some properties merge with \nGenerative AI', metadata={'source': 'data/Generative AI.pdf'}),
 Document(page_content='What is LLMs? \nLarge Language Models (LLMs) are foundational machine learning models that use deep learning  \nalgorithms to process and understand natural language. These models are trained on massive amounts  \nof text data to learn patterns and entity relationships in the language.  \nIt is a language  model which is responsible for performing task such as text to text generation  , text to  \nimage generation  and image to text generations .', metadata={'source': 'data/Generative AI.pdf'}),
 Document(page_content='Why LLM so Powerful? \n●Train the model for a speciﬁc task', metadata={'source': 'data/Generative AI.p

In [15]:
# Import necessary modules for creating a RetrievalQA chain.
from langchain.chains import RetrievalQA

In [16]:
from langchain_openai import OpenAI

In [17]:
# Initialize the OpenAI language model with the API key.
llm = OpenAI(api_key=OPENAI_API_KEY)

In [18]:
# Create a RetrievalQA chain using the language model and the Weaviate vector store retriever.
llm_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=vectorstore.as_retriever())

In [19]:
# Define a query to be answered by the RetrievalQA chain.
query = "What is LLM?"

# Print the answer to the query by invoking the RetrievalQA chain.
print(llm_chain.invoke(query))

{'query': 'What is LLM?', 'result': ' Large Language Models (LLMs) are foundational machine learning models that use deep learning algorithms to process and understand natural language. They are trained on massive amounts of text data to learn patterns and relationships in language, making them powerful tools for tasks such as text generation, chatbots, summarization, translation, and code generation. LLMs are a subset of deep learning and have properties that merge with Generative AI.'}
