In [98]:
!pip install langchain-google-genai langchain-community google-generativeai



In [99]:
!pip install langchain-pinecone pinecone



In [100]:
!pip install PyPDF2



In [101]:
import google.generativeai as genai

In [102]:
from pinecone import Pinecone, ServerlessSpec

In [103]:
from langchain_pinecone import PineconeVectorStore

In [104]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [108]:
from google.colab import userdata
generative_api = userdata.get('generative')
pc_api = userdata.get('pc')


In [109]:
genai.configure(api_key = generative_api)

In [110]:
pc = Pinecone(api_key = pc_api)

In [111]:
index_name = "vectordb1"

In [112]:
pc.list_indexes()

[
    {
        "name": "vectordb1",
        "metric": "cosine",
        "host": "vectordb1-y39klm2.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1024,
        "deletion_protection": "disabled",
        "tags": null,
        "embed": {
            "model": "llama-text-embed-v2",
            "field_map": {
                "text": "text"
            },
            "dimension": 1024,
            "metric": "cosine",
            "write_parameters": {
                "dimension": 1024.0,
                "input_type": "passage",
                "truncate": "END"
            },
            "read_parameters": {
                "dimension": 1024.0,
                "input_type": "query",
                "truncate": "END"
            

In [113]:
if index_name not in [i['name'] for i in pc.list_indexes()]:
  pc.create_index(
      name = index_name,
      dimension = 1024,
      metric = "cosine",
      spec = ServerlessSpec(cloud = 'aws', region = 'us-east-1')
  )

In [114]:
pc.delete_index("vectordb1")   # delete old one

pc.create_index(
    name="vectordb1",
    dimension=768,    # must match embeddings
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)


{
    "name": "vectordb1",
    "metric": "cosine",
    "host": "vectordb1-y39klm2.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [115]:
from langchain.embeddings import SentenceTransformerEmbeddings

In [116]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

In [117]:
index = pc.Index(index_name)

In [118]:
def get_chunks(text):
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size = 1000,
      chunk_overlap = 200,
      length_function = len
  )
  chunks = text_splitter.split_text(text)
  return chunks

In [120]:
def get_vector_store(text_chunks, index, embeddings):
  vector_store = PineconeVectorStore(
      index = index,
      embedding = embeddings,
      pinecone_api_key= pc_api
  )
  vector_store.add_texts(text_chunks)
  return vector_store

In [121]:
def get_rel_text(query, db):
  docs = db.similarity_search(query, k = 1)
  return docs[0].page_content

In [122]:
pdf_path = "/content/Computer-Basics--computer_basics2.pdf"

In [123]:
from PyPDF2 import PdfReader

In [124]:
pdf_reader = PdfReader(pdf_path)

In [125]:
text = ''

In [126]:
for page in pdf_reader.pages:
  text += page.extract_text()

In [127]:
text

'COMPUTER BASICS \nSeema Sirpal \nDelhi University Computer Centre What is a Computer? \nAn electronic device that stores, retrieves, \nand processes data, and can be programmed with instructions. A computer is composed of hardware and software, and can exist in a variety of sizes and configurations. Hardware & Software \nThe term hardware refers to the physical \ncomponents of your computer such as the system unit, mouse, keyboard, monitor etc. \nThe software is the instructions that makes \nthe computer work. Software is held either on your computers hard disk, CD-ROM, DVD or on a diskette (floppy disk) and is loaded (i.e. copied) from the disk into the computers RAM (Random Access Memory), as and when required. \nTypes of Computers \nMini and Mainframe Computers \nVery powerful, used by large \norganisations such an banks to control the entire business operation.  Very expensive! \nPersonal Computers \nCheap and easy to use.  Often used as stand-alone computers or in a network.  May

In [128]:
chunks = get_chunks(text)

In [129]:
chunks

['COMPUTER BASICS \nSeema Sirpal \nDelhi University Computer Centre What is a Computer? \nAn electronic device that stores, retrieves, \nand processes data, and can be programmed with instructions. A computer is composed of hardware and software, and can exist in a variety of sizes and configurations. Hardware & Software \nThe term hardware refers to the physical \ncomponents of your computer such as the system unit, mouse, keyboard, monitor etc. \nThe software is the instructions that makes \nthe computer work. Software is held either on your computers hard disk, CD-ROM, DVD or on a diskette (floppy disk) and is loaded (i.e. copied) from the disk into the computers RAM (Random Access Memory), as and when required. \nTypes of Computers \nMini and Mainframe Computers \nVery powerful, used by large \norganisations such an banks to control the entire business operation.  Very expensive! \nPersonal Computers',
 'Types of Computers \nMini and Mainframe Computers \nVery powerful, used by lar

In [130]:
vector_db = get_vector_store(chunks, index, embeddings)

In [131]:
model = genai.GenerativeModel(
    model_name = 'gemini-2.0-flash',
    system_instruction = """

    You are a very experienced answer provider,
    Based on the relevant content given to you, you have the ability to easily answer the query asked by the user.

    """
)

In [132]:
def get_rag_prompt(user_query, relevant_text):

  rag_prompt = f"""

  The provided content are, the user query and the relevant text taken from the required documents.
  Based on the relevant text, please answer the user query. You can also use your own knowledge.
  But please, try to stick on to the relevant content instead of deviating from the topic space.

  User Query : {user_query}

  Relevant Content : {relevant_text}

  """

  return rag_prompt

In [139]:
query = "Whats basics things in Computer Science?"

In [140]:
rel_text = get_rel_text(query, vector_db)

In [141]:
prompt = get_rag_prompt(query, rel_text)

In [142]:
response = model.generate_content(prompt)

In [143]:
from IPython.display import Markdown, display

In [144]:
display(Markdown(response.text))

The basics of computer science mentioned in the provided text are:

*   **Binary Numbering System:** Computers use a binary numbering system, processing data in ones and zeros.
*   **Bit:** The fundamental unit of storage, representing a 1 or 0.
*   **Byte:** Consists of eight bits.
*   **Kilobyte (KB):** 1024 bytes.
*   **Megabyte (MB):** 1024 kilobytes.
*   **Gigabyte (GB):** 1024 megabytes.
*   **Microprocessors:** The "brain" of the computer, like Intel Pentium CPUs.
*   **CPU (Central Processing Unit):** A crucial component, often an Intel Pentium, determining the computer's speed (measured in MHz).


In [146]:
chunks = ['My name is SHahla Rafiq']

chunk_size = 3
chunk_overlap = 2

chunk1 = ['My name is']
chunk2 = ['name is Shahla']
chunk3 = ['is Shahla Rafiq']