In [1]:
import os

from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

### Clone the GitHub repository

In [2]:
# Show the notebook's working directory; the relative paths used in the cells
# below ("test_repo/", "./data") resolve against it.
%pwd

'd:\\Data_Science_stuff\\Generative_AI_classes\\code_analyzer\\research'

In [3]:
# Create the checkout directory from Python instead of the shell: this works on
# every platform and does not fail when the cell is re-run and the directory
# already exists.
os.makedirs("test_repo", exist_ok=True)

In [4]:
repo_path = "test_repo/"
repo_url = "https://github.com/srivanoo21/house_price_prediction"

# Cloning into a directory that already holds a checkout raises an error, so
# reuse the existing clone when this cell is re-run and only clone on the
# first run.
if os.path.isdir(os.path.join(repo_path, ".git")):
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from(repo_url, to_path=repo_path)

# Display the repository object as the cell output.
repo

<git.repo.base.Repo 'd:\\Data_Science_stuff\\Generative_AI_classes\\code_analyzer\\research\\test_repo\\.git'>

In [14]:
repo_path = "test_repo/"

# Parser that treats the loaded files as Python source.
# NOTE(review): parser_threshold=500 is presumably a minimum size below which
# a file is kept whole instead of being split — confirm against the
# LanguageParser documentation.
python_parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)

# Walk the whole checkout recursively but only pick up *.py files.
loader = GenericLoader.from_filesystem(
    repo_path,
    glob="**/*",
    suffixes=[".py"],
    parser=python_parser,
)

In [15]:
# Read and parse every matching file of the repository into Document objects.
documents = loader.load()

# Preview only the first document: displaying the full list prints the source
# of every file in the repository into the notebook output.
documents[:1]

[Document(page_content='from fastapi import FastAPI\nimport uvicorn\nimport sys\nimport os\nfrom fastapi.templating import Jinja2Templates\nfrom starlette.responses import RedirectResponse\nfrom fastapi.responses import Response\nfrom src.house_pricing.pipeline.prediction import PredictionPipeline\n\n\napp = FastAPI()\n\n@app.get("/", tags=[\'authentication\'])\nasync def index():\n    return RedirectResponse(url="/docs")\n\n\n@app.get("/train")\nasync def training():\n    try:\n        os.system("python main.py")\n        return Response("Training successful !!")\n    \n    except Exception as e:\n        return Response(f"Error Occurred! {e}")\n    \n\n@app.post("/predict")\nasync def predict_route():\n    try:\n        obj = PredictionPipeline()\n        obj.get_transformed_data()\n        obj.predict_data()\n        return Response("Prediction successful !!")\n\n    except Exception as e:\n        return Response(f"Error Occurred! {e}")\n\n\nif __name__ == "__main__":\n    uvicorn.

### Chunking

In [16]:
# Splitter configured for Python source.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) — confirm.
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=2000,
    chunk_overlap=200,
)

In [17]:
# Break the loaded documents into chunks using the splitter configured above;
# `texts` is what gets embedded and indexed later in the notebook.
texts = documents_splitter.split_documents(documents)

In [18]:
# Sanity check: number of chunks produced by the splitter.
len(texts)

32

### Embedding Model

# Never hardcode the API key in the notebook: anything committed here is
# exposed to every reader of the file. Reuse a key that is already exported in
# the environment; otherwise prompt for it without echoing it to the output.
# NOTE(review): the key that was previously committed in this cell must be
# treated as compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [10]:
# Embedding model used to vectorise the code chunks.
# NOTE(review): disallowed_special=() presumably tells the tokenizer not to
# reject text that contains special-token strings, which can occur in source
# code — confirm against the OpenAIEmbeddings documentation.
embeddings = OpenAIEmbeddings(disallowed_special=())

### Knowledge base (vector DB)

Here we are using the Chroma library to create a vector database (vectordb) from a list of text documents (texts) with associated embeddings (embeddings).

The `persist_directory` argument is the directory where the vector database will be stored.
After creating the vector database, `persist()` is called to save it to disk.

In [19]:
# Embed every chunk in `texts` with `embeddings`, index the vectors in a Chroma
# store rooted at ./data, then write the store to disk.
# NOTE(review): the output recorded for this cell is a ValueError from Chroma's
# EmbeddingFunction signature check, and the message points at the 0.4.16
# migration notes — presumably the installed chromadb is newer than this
# langchain wrapper supports; confirm and pin compatible versions.
vectordb = Chroma.from_documents(texts, embedding=embeddings, persist_directory='./data')
vectordb.persist()

ValueError: Expected EmbeddingFunction.__call__ to have the following signature: odict_keys(['self', 'input']), got odict_keys(['self', 'args', 'kwargs'])
Please see https://docs.trychroma.com/embeddings for details of the EmbeddingFunction interface.
Please note the recent change to the EmbeddingFunction interface: https://docs.trychroma.com/migration#migration-to-0416---november-7-2023 


### LLM Wrapper

In [None]:
# Chat model that answers the questions. The default model is used here;
# uncomment the next line (and drop the one after it) to use GPT-4 instead.
# llm = ChatOpenAI(model_name="gpt-4")
llm = ChatOpenAI()

In [None]:
# Conversation memory backed by the same chat model; it is exposed to the
# chain under the "chat_history" key and returned as message objects.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

In [None]:
# Wire the vector-store retriever, the chat model and the conversation memory
# into a single conversational question-answering chain.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectordb.as_retriever(
        search_type="mmr",       # maximal-marginal-relevance search
        search_kwargs={"k": 3},  # number of chunks requested per query
    ),
    memory=memory,
)

### Q&A

# Question to ask about the indexed repository.
question = "what is DataIngestion class?"

In [None]:
# Run the question through the chain and print only the answer text from the
# returned mapping.
result = qa(question)
print(result['answer'])