In [1]:
import os

from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

### Clone Github repositories

In [2]:
%pwd

'd:\\Data_Science_stuff\\Generative_AI_classes\\code_analyzer\\research'

In [5]:
!mkdir test_repo

In [6]:
repo_path = "test_repo/"

Repo.clone_from("https://github.com/srivanoo21/house_price_prediction", to_path=repo_path)

<git.repo.base.Repo 'd:\\Data_Science_stuff\\Generative_AI_classes\\code_analyzer\\research\\test_repo\\.git'>

In [7]:
repo_path = "test_repo/"

loader = GenericLoader.from_filesystem(repo_path,
                                        glob = "**/*",
                                       suffixes=[".py"],
                                       parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)
)

In [8]:
documents = loader.load()
documents

[Document(page_content='from fastapi import FastAPI\nimport uvicorn\nimport sys\nimport os\nfrom fastapi.templating import Jinja2Templates\nfrom starlette.responses import RedirectResponse\nfrom fastapi.responses import Response\nfrom src.house_pricing.pipeline.prediction import PredictionPipeline\n\n\napp = FastAPI()\n\n@app.get("/", tags=[\'authentication\'])\nasync def index():\n    return RedirectResponse(url="/docs")\n\n\n@app.get("/train")\nasync def training():\n    try:\n        os.system("python main.py")\n        return Response("Training successful !!")\n    \n    except Exception as e:\n        return Response(f"Error Occurred! {e}")\n    \n\n@app.post("/predict")\nasync def predict_route():\n    try:\n        obj = PredictionPipeline()\n        obj.get_transformed_data()\n        obj.predict_data()\n        return Response("Prediction successful !!")\n\n    except Exception as e:\n        return Response(f"Error Occurred! {e}")\n\n\nif __name__ == "__main__":\n    uvicorn.

### Chunkings

In [9]:
documents_splitter = RecursiveCharacterTextSplitter.from_language(language = Language.PYTHON,
                                                             chunk_size = 2000,
                                                             chunk_overlap = 200)

In [10]:
texts = documents_splitter.split_documents(documents)

In [11]:
len(texts)

32

### Embedding Model

In [12]:
#os.environ["OPENAI_API_KEY"] = "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"

In [15]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-2.6.1-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.32.0 (from sentence-transformers)
  Using cached transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
Using cached sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
Using cached transformers-4.39.3-py3-none-any.whl (8.8 MB)
Installing collected packages: transformers, sentence-transformers


ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'd:\\Data_Science_stuff\\Generative_AI_classes\\code_analyzer\\venv\\Lib\\site-packages\\transformers\\models\\bark\\configuration_bark.py'
Consider using the `--user` option or check the permissions.



In [17]:
#embeddings = OpenAIEmbeddings()
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### Knowledge base (vector DB)

Here we are using the Chroma library to create a vector database (vectordb) from a list of text documents (texts) with associated embeddings (embeddings).

The 'persist_directory' is a directory where the vector database will be stored as arguments.
After creating the vector database, 'persist()' is called to save the vector database to disk

In [18]:
vectordb = Chroma.from_documents(texts, embedding_function, persist_directory='./db')
vectordb.persist()

### LLM Wrapper

In [19]:
# llm = ChatOpenAI(model_name="gpt-4")
llm = ChatOpenAI()

  warn_deprecated(


In [20]:
# In essence, this line sets up a memory component specifically designed to store and potentially retrieve a 
# conversation history associated with a particular LLM. This history can then be used by the LLM to provide 
# more contextually relevant responses in an ongoing conversation.

memory = ConversationSummaryMemory(llm=llm, memory_key="chat_history", return_messages=True)

In [21]:
# This suggests the retrieval method might use a technique called "Maximum Marginal Relevance" (MMR). 
# MMR aims to retrieve documents that are both relevant to the query and dissimilar to each other, 
# promoting diversity in the retrieved answers.

# This line configures a question-answering component within a conversational system. 
# It creates a retrieval chain specifically designed for the provided LLM, defines a retrieval method that 
# leverages an MMR-based search, and potentially incorporates the conversation history for contextually 
# relevant answers.

qa = ConversationalRetrievalChain.from_llm(llm, retriever=vectordb.as_retriever(search_type="mmr", search_kwargs={"k":3}), memory=memory)

### Q&A

In [23]:
question = "what is DataIngestion class?"

In [24]:
result = qa(question)
print(result['answer'])

  warn_deprecated(


The purpose of the `DataIngestion` class is to provide functionality for ingesting and managing data through pipelines. The class includes methods for handling data ingestion, validation, transformation, and model training tasks, which are essential stages in a typical data science pipeline.
