In [1]:
from git import Repo
import os
from dotenv import load_dotenv

from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

In [2]:
load_dotenv()

True

In [3]:
! mkdir code_base

In [4]:
repo_path = "code_base/"

Repo.clone_from("https://github.com/saurav-sabu/Question_Answering_System_Using_Gemini",to_path=repo_path)

<git.repo.base.Repo 'c:\\GenAI\\GenAI Projects\\Source-Code-Analysis\\experiments\\code_base\\.git'>

In [5]:
loader = GenericLoader.from_filesystem(repo_path+"/src",
                                       glob="**/*",
                                       suffixes=[".py"],
                                       parser=LanguageParser(language=Language.PYTHON,parser_threshold=500))

In [6]:
document = loader.load()

In [7]:
document

[Document(metadata={'source': 'code_base\\src\\data_ingestion.py', 'language': <Language.PYTHON: 'python'>}, page_content='from llama_index.core import SimpleDirectoryReader\nfrom exception import CustomException\nfrom logger import logging\n\nimport sys\n\ndef load_data(data):\n\n    try:\n        logging.info("Data loading started------")\n        loader = SimpleDirectoryReader("data")\n        documents = loader.load_data()\n\n        logging.info("Data loading completed-----")\n        return documents\n    \n    except Exception as e:\n        logging.info("Exception in loading data-------")\n        raise CustomException(e,sys)\n\n\n\n'),
 Document(metadata={'source': 'code_base\\src\\embedding.py', 'language': <Language.PYTHON: 'python'>}, page_content='from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext, load_index_from_storage\nfrom llama_index.embeddings.gemini import GeminiEmbedding\n\nfrom src.data_ingestion import load_data\nfrom src.model_api imp

In [8]:
splitter = RecursiveCharacterTextSplitter.from_language(language=Language.PYTHON,
                                                        chunk_size=2000,
                                                        chunk_overlap=200)

In [9]:
texts = splitter.split_documents(document)

In [10]:
len(texts)

3

In [11]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [12]:
embedding = OpenAIEmbeddings(disallowed_special=())

In [13]:
vectordb = Chroma.from_documents(texts,embedding=embedding,persist_directory="./data")

In [14]:
llm = ChatOpenAI()

In [15]:
memory = ConversationSummaryMemory(llm=llm,memory_key="chat_history",return_messages=True)

In [16]:
chain = ConversationalRetrievalChain.from_llm(llm,
                                              retriever=vectordb.as_retriever(search_type="mmr",search_kwargs={"k":3}),
                                              memory=memory)

In [17]:
question = "tell me about download_gemini_embedding function?"

result = chain.invoke(question)
result["answer"]

Number of requested results 20 is greater than number of elements in index 3, updating n_results = 3


'The `download_gemini_embedding` function is responsible for downloading a Gemini embedding model, storing the vector embedding, and querying the results. It uses a GeminiEmbedding model with the specified model name "models/embedding-001" to create an embedding model. The function then creates a ServiceContext with default values for parameters like chunk size and overlap, and uses this context to create a VectorStoreIndex from the provided document. Finally, it persists the storage context and returns a query engine for querying the results. If an exception occurs during this process, a CustomException is raised.'