In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# SECURITY: 'xx' is only a placeholder. Do not save or share this notebook with a
# real key in this line; later cells read the key back via os.getenv("GEMINI_API_KEY").
os.environ["GEMINI_API_KEY"]='xx'

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from pypdf import PdfReader

def load_pdf(file_path):
    """
    Reads the text content from a PDF file and returns it as a single string.

    Parameters:
    - file_path (str): The file path to the PDF file.

    Returns:
    - str: The text of every page, concatenated in page order with no
      separator added between pages. A page that yields no text contributes
      an empty string.

    Raises:
    - FileNotFoundError: If the specified file_path does not exist.
    - pypdf.errors.PdfReadError: If the PDF file is malformed or cannot be read
      (this notebook uses pypdf, not PyPDF2).

    Example:
    >>> pdf_text = load_pdf("example.pdf")
    >>> print(pdf_text)
    This is the text content extracted from the PDF file.
    """
    reader = PdfReader(file_path)

    # Join once instead of growing a string page by page; `or ""` guards
    # against a page whose extraction returns nothing.
    return "".join(page.extract_text() or "" for page in reader.pages)

In [3]:
# Location of the source PDF (a Kaggle input-dataset path).
file_path = '/kaggle/input/union-speech/state_of_the_union.pdf'

# Replace the path above with your own file path, then extract the full text.
pdf_text = load_pdf(file_path)

In [4]:
# Show the extracted text to eyeball the extraction quality.
pdf_text

'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress \nand the Cabinet. Justices of the Supreme Court. My fellow Americans.  \nLast year COVID -19 kept us apart. This year we are finally together again. Tonight, we meet as \nDemocrats, Republicans, and Independents. But most importantly as Americans. With a duty to one \nanother, to the American people, to the Constitution. And with an  unwavering resolve that freedom will \nalways triumph over tyranny.  \n \nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world, thinking he could \nmake it bend to his menacing ways. But he badly miscalculated. He thought he could roll into Ukraine \nand the world would roll over. Instead, he met a  wall of strength he never imagined. He met the \nUkrainian people. From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their \ndetermination, inspires the world.  \n \nGroups of citizens blocking tanks wi

## Splitting data into small chunks 
 -  split on each paragraph (simple way)
  -  More methods:

In [10]:
import re
def split_text(text: str):
    r"""
    Splits a text string into paragraph-sized chunks.

    The text is cut on the literal separator "\n \n" (newline, space, newline),
    which splits the document paragraph by paragraph. Chunks that are empty or
    contain only whitespace are dropped so that blank content is never passed
    on for embedding. Surviving chunks are returned unmodified (not stripped).

    Parameters:
    - text (str): The input text to be split.

    Returns:
    - List[str]: The non-blank chunks, in their original order.

    """
    chunks = re.split('\n \n', text)
    # `chunk.strip()` is falsy for both "" and whitespace-only pieces, which
    # appear when several separators occur back to back.
    return [chunk for chunk in chunks if chunk.strip()]

# Chunk the extracted PDF text; each list element is one chunk to embed later.
chunked_text = split_text(text=pdf_text)

In [12]:
# Inspect the second chunk (index 1) as a quick sanity check of the split.
chunked_text[1]

'Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world, thinking he could \nmake it bend to his menacing ways. But he badly miscalculated. He thought he could roll into Ukraine \nand the world would roll over. Instead, he met a  wall of strength he never imagined. He met the \nUkrainian people. From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their \ndetermination, inspires the world.  '

## Embed the chunked data
 - Use the Gemini API to create an embedding for each chunk

In [14]:
%%capture
!pip install pypdf==4.0.0
!pip install PyPDF2==3.0.1
!pip install pandas==2.2.0
!pip install chromadb==0.4.22
!pip install google-generativeai==0.3.2

In [26]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os
from typing import List, Tuple

In [27]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Chroma embedding function that delegates to the Gemini embedding API.

    Instances are callable: Chroma hands over a batch of documents and gets
    back the embeddings returned by `genai.embed_content`.

    Parameters (of the call):
    - input (Documents): The documents to embed.

    Returns:
    - Embeddings: The value stored under the 'embedding' key of the API response.

    Raises:
    - ValueError: If GEMINI_API_KEY is unset/empty, or if the API response
      has no 'embedding' key.
    """
    def __call__(self, input: Documents) -> Embeddings:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Gemini API key is not provided")
        genai.configure(api_key=api_key)

        # NOTE(review): the same task type is used for every call, including
        # query embedding at search time - confirm this is the intended setting.
        request = {
            "model": "models/embedding-001",
            "content": input,
            "task_type": "retrieval_document",
            "title": "Custom query",
        }

        # Progress/debug trace of what is being sent for embedding.
        print(f"Using task_type: {request['task_type']}")
        print(f"Input content: {input}")

        result = genai.embed_content(**request)

        if 'embedding' in result:
            return result['embedding']
        raise ValueError("The response does not contain 'embedding' key")

## Store the embedded data in the vector database

[Vector Index](https://www.datastax.com/guides/what-is-a-vector-index)

In [28]:
import chromadb
from typing import List

In [33]:
def create_chroma_db(documents: List, path: str, name: str) -> Tuple[chromadb.Collection, str]:
    """
    Builds a persistent Chroma collection and fills it with the given documents.

    The collection is opened with `get_or_create=True`, so an existing
    collection of the same name at `path` is reused rather than raising.
    Each document is added on its own, with its position in `documents`
    (as a string) used as its id.

    Parameters:
    - documents: An iterable of documents to be added to the collection.
    - path (str): Directory in which the Chroma database is persisted.
    - name (str): Name of the collection inside the database.

    Returns:
    - Tuple[chromadb.Collection, str]: The collection and the name it was given.
    """
    client = chromadb.PersistentClient(path=path)
    collection = client.create_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
        get_or_create=True,
    )

    # One add call per document; the embedding function runs on each add.
    for position, document in enumerate(documents):
        collection.add(documents=document, ids=str(position))

    return collection, name


In [34]:
# Create (or reuse) the persistent collection under /kaggle/working/ and add
# every chunk to it; embeddings are produced by GeminiEmbeddingFunction.
db, name = create_chroma_db(
    documents=chunked_text,
    path="/kaggle/working/",
    name="rag_experiment_chromadb"
)

Using task_type: retrieval_document
Input content: ['Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress \nand the Cabinet. Justices of the Supreme Court. My fellow Americans.  \nLast year COVID -19 kept us apart. This year we are finally together again. Tonight, we meet as \nDemocrats, Republicans, and Independents. But most importantly as Americans. With a duty to one \nanother, to the American people, to the Constitution. And with an  unwavering resolve that freedom will \nalways triumph over tyranny.  ']
Using task_type: retrieval_document
Input content: ['Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world, thinking he could \nmake it bend to his menacing ways. But he badly miscalculated. He thought he could roll into Ukraine \nand the world would roll over. Instead, he met a  wall of strength he never imagined. He met the \nUkrainian people. From President Zelenskyy to every Ukrainian, their fearlessnes

In [35]:
def load_chroma_collection(path, name):
    """
    Opens a previously created Chroma collection.

    The collection is bound to a fresh GeminiEmbeddingFunction, which is what
    the collection uses to embed query text.

    Parameters:
    - path (str): Directory in which the Chroma database is persisted.
    - name (str): Name of the collection inside the database.

    Returns:
    - chromadb.Collection: The loaded collection.
    """
    client = chromadb.PersistentClient(path=path)
    return client.get_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

# Re-open the collection using the same path and name it was created with.
db=load_chroma_collection(path="/kaggle/working/", name="rag_experiment_chromadb")

In [36]:
def get_relevant_passage(query, db, n_results):
    """
    Retrieves the documents most relevant to a query from a collection.

    Parameters:
    - query (str): The text to search for.
    - db: Collection-like object exposing `query(query_texts=..., n_results=...)`.
    - n_results (int): Number of results to request.

    Returns:
    - The first entry of the response's 'documents' field (the hits for the
      single query that was sent), or None if anything went wrong. Failures
      are reported with a printed message instead of being raised.
    """
    try:
        result = db.query(query_texts=[query], n_results=n_results)
        if 'documents' not in result:
            raise KeyError("'documents' key not found in the response")
        # Exactly one query text was sent, so its hits are at index 0.
        return result['documents'][0]
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"An error occurred: {e}")
        return None

# Example usage: fetch the 3 chunks closest to the query and print them
# (prints None if retrieval failed).
relevant_text = get_relevant_passage(query="Sanctions on Russia", db=db, n_results=3)
print(relevant_text)

Using task_type: retrieval_document
Input content: ['Sanctions on Russia']
['Together with our allies, we are right now enforcing powerful economic sanctions. We are cutting off \nRussia’s largest banks from the international financial system. Preventing Russia’s central bank from \ndefending the Russian Ruble, making Putin’s $630 Bill ion “war fund” worthless. We are choking off \nRussia’s access to technology that will sap its economic strength and weaken its military for years to \ncome.  ', 'And tonight I am announcing that we will join our allies in closing off American airspace to all Russian \nflights – further isolating Russia – and adding an additional squeeze on their economy. The Ruble has lost \n30% of its value. The Russian stock market h as lost 40% of its value and trading remains suspended. \nRussia’s economy is reeling and Putin alone is to blame.  ', 'We meet tonight in an America that has lived through two of the hardest years this nation has ever \nfaced. The pandem

## Generation task: answer questions from the document to test the pipeline

In [37]:
def make_rag_prompt(query, relevant_passage):
    """
    Builds the generation prompt from a question and its reference passage.

    Parameters:
    - query (str): The user's question, inserted as-is.
    - relevant_passage (str): Reference text. Single and double quotes are
      removed and newlines become spaces before it is inserted.

    Returns:
    - str: The fully formatted prompt.
    """
    # One-pass equivalent of dropping both quote characters and flattening newlines.
    cleanup = str.maketrans({"'": None, '"': None, "\n": " "})
    cleaned_passage = relevant_passage.translate(cleanup)

    template = """You are a helpful and informative bot that answers questions using text from the reference passage included below. \
    Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
    However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
    strike a friendly and converstional tone. \
    If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'

    ANSWER:
    """
    return template.format(query=query, relevant_passage=cleaned_passage)

In [41]:
import google.generativeai as genai

def generate_answer(prompt):
    """
    Sends a prompt to the 'gemini-pro' model and returns the reply text.

    Parameters:
    - prompt (str): The full prompt to send.

    Returns:
    - The `.text` of the model's response.

    Raises:
    - ValueError: If the GEMINI_API_KEY environment variable is unset or empty.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")

    genai.configure(api_key=api_key)
    response = genai.GenerativeModel('gemini-pro').generate_content(prompt)
    return response.text

In [42]:
def generate_output(db, query, n_results=3):
    """
    Answers a query with retrieval-augmented generation over a collection.

    Retrieves the most relevant chunks, merges them into one passage, builds
    the prompt and asks the model for an answer.

    Parameters:
    - db: The collection to search (passed through to get_relevant_passage).
    - query (str): The user's question.
    - n_results (int): Number of chunks to retrieve. Defaults to 3.

    Returns:
    - The generated answer text.

    Raises:
    - ValueError: If retrieval failed (get_relevant_passage returned None),
      or if generate_answer finds no API key.
    """
    relevant_text = get_relevant_passage(query, db, n_results=n_results)

    # get_relevant_passage reports failures by returning None; fail here with a
    # clear message rather than with a TypeError from str.join below.
    if relevant_text is None:
        raise ValueError("Could not retrieve any passage for the query; see the error printed above")

    # Join with a space so the end of one chunk never fuses with the start of the next.
    passage = " ".join(relevant_text)
    prompt = make_rag_prompt(query, relevant_passage=passage)

    return generate_answer(prompt)

In [43]:
# End-to-end run: reopen the persisted collection, then retrieve + generate.
db=load_chroma_collection(path="/kaggle/working/",
                          name="rag_experiment_chromadb") # replace with your collection name

answer = generate_output(db,query="what sanctions have been placed on Russia")
print(answer)

Using task_type: retrieval_document
Input content: ['what sanctions have been placed on Russia']
Sure, I can answer that.  
    As a result of Russia's invasion of Ukraine, the United States, along with the European Union, United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and Switzerland, have imposed extensive economic sanctions on Russia. These sanctions are designed to cripple Russia's economy and pressure Putin to withdraw his forces from Ukraine.  
    The sanctions target Russia's largest banks, cutting them off from the international financial system. They also prevent Russia's central bank from defending the Russian ruble, making its currency nearly worthless. Additionally, the sanctions restrict Russia's access to technology that will weaken its military and economy in the long term. The sanctions have already had a significant impact on Russia's economy, causing the ruble to lose 30% of its value and the stock market to lose 40% of its value.
