In [2]:
import os
from getpass import getpass

In [None]:
from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

# Clone the GitHub repository

In [None]:
# Show the kernel's current working directory; the relative paths used below
# ("test_repo/", "./data") are resolved against it.
%pwd

In [None]:
# Create the directory the repository is cloned into.
# os.makedirs(..., exist_ok=True) is portable and, unlike `!mkdir`, does not
# fail when the directory is already there on a re-run.
os.makedirs("test_repo", exist_ok=True)

In [None]:
repo_path = "test_repo/"
repo_url = "https://github.com/entbappy/End-to-end-ML-Project-Implementation"

# Clone only when no checkout exists yet: cloning into a directory that already
# holds the repository fails, which would break a Restart & Run All.
if not os.path.isdir(os.path.join(repo_path, ".git")):
    Repo.clone_from(repo_url, to_path=repo_path)

In [None]:
repo_path = "test_repo/"

# Directory whose Python sources get indexed. os.path.join avoids the doubled
# separator ("test_repo//src/mlProject") that string concatenation produced,
# since repo_path already ends with "/".
source_dir = os.path.join(repo_path, "src", "mlProject")

# Walk source_dir recursively ("**/*") but keep only files ending in ".py",
# handing each one to a Python-configured LanguageParser.
# NOTE(review): parser_threshold=500 is presumably the minimum file length (in
# lines) before syntax-aware splitting kicks in; confirm against LanguageParser.
loader = GenericLoader.from_filesystem(
    source_dir,
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)

In [None]:
# Run the loader over the selected files; the result is what gets split into
# chunks further down.
documents = loader.load()

In [None]:
# Report how many documents were loaded and preview only the first few,
# instead of dumping the whole list into the cell output.
print(f"{len(documents)} documents loaded")
documents[:3]

# Chunking

In [None]:
# Splitter set up for Python source, with chunk_size=2000 and chunk_overlap=200
# (sizes are presumably counted in characters; confirm against the splitter docs).
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON,
    chunk_size=2000,
    chunk_overlap=200,
)

In [None]:
# Split the loaded documents into chunks using the Python-aware splitter.
texts = documents_splitter.split_documents(documents)

In [None]:
# Number of chunks produced by the split; these are what get embedded below.
len(texts)

# Embedding model

In [None]:
# Never store a real key in the notebook. Reuse OPENAI_API_KEY when the
# environment already provides it; otherwise prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Embedding client used to vectorise the chunks.
# NOTE(review): disallowed_special=() presumably switches off the tokenizer's
# special-token check so arbitrary source text can be embedded; confirm.
embeddings = OpenAIEmbeddings(disallowed_special=())

# Knowledge base (vector DB)

In [None]:
persist_dir = "./data"

# Building the index embeds every chunk through the embeddings client, and
# re-running from_documents against an existing store adds the same chunks
# again. Reuse the persisted store when one is already on disk; delete
# ./data to force a rebuild after the sources change.
if os.path.isdir(persist_dir) and os.listdir(persist_dir):
    vectordb = Chroma(persist_directory=persist_dir, embedding_function=embeddings)
else:
    vectordb = Chroma.from_documents(texts, embedding=embeddings, persist_directory=persist_dir)
    vectordb.persist()

# LLM Wrapper

In [None]:
# Chat model created with the library's default settings (no model name given).
# To use GPT-4 instead: llm = ChatOpenAI(model_name="gpt-4")
llm = ChatOpenAI()

In [None]:
# Conversation memory backed by the same llm. Its content is exposed under the
# "chat_history" key, with return_messages=True.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

In [None]:
# Retriever over the vector store: "mmr" search type, 3 results per query.
retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 3})

# Conversational retrieval chain wired to the llm, the retriever and the
# conversation memory, so the chat history is carried between questions.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
)

# Q&A

In [None]:
# Question that is put to the chain in the next cell.
question = "what is DataIngestion class?"

In [None]:
# Ask the chain, keep its full response in `result`, and print only the answer text.
result = qa(question)
answer = result["answer"]
print(answer)