In [None]:
pip install chromadb

In [None]:
import os
from getpass import getpass

# Never store the API key in the notebook itself: anything saved here is visible to
# everyone who can read the file. Use a key already present in the environment, or
# prompt for one without echoing it.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Enter your Gemini API key: ")

In [None]:
import re
import pandas as pd
import numpy as np

In [None]:
# Load the reviews dataset; point this at your own CSV file
df = pd.read_csv(filepath_or_buffer='/content/targetfile.csv')

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

In [None]:
import pandas as pd

# `df` is expected to be a DataFrame with a column named 'Review'
# in which each row contains a single review
def split_text(text: str):
    """
    Splits a text string into a list of non-empty substrings, paragraph by paragraph.

    The text is split on the literal separator "\\n \\n" (newline, space, newline).

    Parameters:
    - text (str): The input text to be split. Non-string values (for example the
      None/NaN found in missing cells) are treated as having no text.

    Returns:
    - List[str]: The non-empty substrings obtained by splitting the input text;
      an empty list when there is nothing to split.
    """
    # re.split raises TypeError on non-string input, so missing cells are handled up front
    if not isinstance(text, str):
        return []
    parts = re.split('\n \n', text)
    return [part for part in parts if part != ""]

# Split every review in the 'Review' column into paragraph chunks and
# keep the resulting lists in a new 'text_split' column
df['text_split'] = df['Review'].apply(split_text)


In [None]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os
import chromadb

In [None]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Embedding function for Chroma that delegates to the Gemini embedding API.

    Instances are callable: they take a collection of documents and return the
    value stored under the "embedding" key of the Gemini `embed_content` response.
    The API key is read from the GEMINI_API_KEY environment variable on every
    call, and the Gemini client is configured with it before embedding.

    Parameters (of `__call__`):
    - input (Documents): The documents to be embedded.

    Returns:
    - Embeddings: The "embedding" entry of the Gemini response for `input`.

    Raises:
    - ValueError: If GEMINI_API_KEY is unset or empty.

    Example:
    >>> embed = GeminiEmbeddingFunction()
    >>> vectors = embed(["Document 1", "Document 2", "Document 3"])
    """
    def __call__(self, input: Documents) -> Embeddings:
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=api_key)

        # Embed the input as retrieval documents under a fixed title
        response = genai.embed_content(
            model="models/embedding-001",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

In [None]:
import pandas as pd
import chromadb

def create_chroma_db(df, path, name):
    """
    Creates a Chroma collection from the provided DataFrame, path, and collection name.

    Every chunk of every row is stored as its own document. One id is generated per
    document, in the form "<row position>_<chunk position>" (for example "0_0",
    "0_1", "1_0"), so the number of ids always matches the number of documents
    passed to `add`. Rows whose chunk list is empty are skipped.

    Parameters:
    - df (DataFrame): DataFrame with a column named 'text_split' where each row contains a list of non-empty substrings.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - Tuple[chromadb.Collection, str]: A tuple containing the created Chroma Collection and its name.
    """
    chunk_lists = df['text_split'].tolist()
    chroma_client = chromadb.PersistentClient(path=path)
    # NOTE(review): create_collection is expected to fail when a collection with this
    # name already exists at `path` - confirm for the installed chromadb version.
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for row_idx, chunks in enumerate(chunk_lists):
        if not chunks:
            # Nothing to store for this row
            continue
        # A single id for a multi-chunk row would not match the document count
        chunk_ids = [f"{row_idx}_{chunk_idx}" for chunk_idx in range(len(chunks))]
        db.add(documents=chunks, ids=chunk_ids)

    return db, name


# Location of the persistent Chroma database and the name of the collection to create in it
chroma_path, chroma_name = '/content/target_dataset', 'reviews_collections'

# Build the collection from the split reviews
db, collection_name = create_chroma_db(df, path=chroma_path, name=chroma_name)


In [None]:
def load_chroma_collection(path, name):
    """
    Opens an existing Chroma collection stored at the given path.

    The collection is fetched with a GeminiEmbeddingFunction attached as its
    embedding function.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The loaded Chroma Collection.
    """
    client = chromadb.PersistentClient(path=path)
    return client.get_collection(
        name=name,
        embedding_function=GeminiEmbeddingFunction(),
    )

# Reopen the persisted reviews collection
db = load_chroma_collection(
    path='/content/target_dataset',
    name="reviews_collections",
)

In [None]:
def get_relevant_passage(query, db, n_results):
    """
    Looks up the stored documents that best match a query.

    Parameters:
    - query (str): The query text to search with.
    - db: The collection to search; its `query` method is called with
      `query_texts=[query]` and `n_results=n_results`.
    - n_results (int): How many matches to request.

    Returns:
    - The first entry of the 'documents' field of the query result, i.e. the
      documents returned for the single query that was submitted.
    """
    results = db.query(query_texts=[query], n_results=n_results)
    documents_per_query = results['documents']
    return documents_per_query[0]

# Example usage: fetch the three stored chunks closest to a sample query
relevant_text = get_relevant_passage(
    query="Sanctions on Russia",
    db=db,
    n_results=3,
)

In [None]:
def make_rag_prompt(query, relevant_passage):
  """
  Builds the prompt sent to the generative model.

  Single quotes and double quotes are removed from the passage and its newlines
  are replaced by spaces; the query and the cleaned passage are then inserted
  into a fixed instruction template.

  Parameters:
  - query (str): The masked review text, inserted after "QUESTION:".
  - relevant_passage (str): The retrieved text, inserted after "PASSAGE:".

  Returns:
  - str: The completed prompt.
  """
  # Drop both kinds of quotes and flatten newlines to spaces in a single pass
  cleanup_table = str.maketrans({"'": None, '"': None, "\n": " "})
  cleaned_passage = relevant_passage.translate(cleanup_table)
  #user can change the prompt according to the need
  template = """Fill the [MASK] with diverse words related to doctors and hospitals, which makes the review semantically and contextually correct according to an doctor review. The review must contain words so that it seems like a proper review given to a doctor or hospital by its patient.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

  ANSWER:
  """
  return template.format(query=query, relevant_passage=cleaned_passage)


In [None]:
import google.generativeai as genai
def generate_answer_api(prompt, model_name='gemini-pro'):
    """
    Sends a prompt to a Gemini generative model and returns the generated text.

    Parameters:
    - prompt (str): The prompt to send to the model.
    - model_name (str): Name of the Gemini model to use. Defaults to 'gemini-pro',
      so existing calls that pass only a prompt behave as before.

    Returns:
    - The `text` attribute of the model response.

    Raises:
    - ValueError: If the GEMINI_API_KEY environment variable is unset or empty.
    """
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    return response.text

In [None]:
def generate_answer(db, query, n_results=3):
    """
    Answers a query with retrieval-augmented generation.

    The most relevant chunks are retrieved from the collection, combined into a
    single passage, wrapped in the RAG prompt, and sent to the generative model.

    Parameters:
    - db: The collection to retrieve relevant chunks from.
    - query (str): The query (masked review) to answer.
    - n_results (int): How many chunks to retrieve. Defaults to 3.

    Returns:
    - The answer returned by `generate_answer_api` for the built prompt.
    """
    relevant_chunks = get_relevant_passage(query, db, n_results=n_results)
    # Join with a space so the last word of one chunk does not fuse with the
    # first word of the next
    passage = " ".join(relevant_chunks)
    prompt = make_rag_prompt(query, relevant_passage=passage)
    return generate_answer_api(prompt)

In [None]:
# Reopen the persisted collection
db = load_chroma_collection(
    path='/content/target_dataset',  # replace with path of your persistent directory
    name="reviews_collections",      # replace with the collection name
)

# Ask the model to fill in the masked review and show the result
answer = generate_answer(db, "i love the [MASK] of [MASK]")
print(answer)

Uploading folder to Drive

In [None]:
from google.colab import drive
import shutil
import os

In [None]:
# Make Google Drive available under /content/drive
drive.mount(mountpoint='/content/drive')

# Folder whose contents will be uploaded
folder_path = '/content/target_dataset'

In [None]:
# Zip the folder; make_archive adds the ".zip" extension and returns the archive's path
archive_path = shutil.make_archive('/content/target_dataset', 'zip', folder_path)

# Destination path in Google Drive
destination_path = '/content/drive/My Drive/'

# Move the zipped folder to Google Drive. The full target file name is given (not just
# the directory) because moving into a directory that already holds a file with the
# same name raises an error, which would make this cell fail when it is run again.
shutil.move(archive_path, os.path.join(destination_path, os.path.basename(archive_path)))