In [1]:
import os
from git import Repo    # for cloning GitHub codebases
from langchain.text_splitter import Language   # for context-aware splitting
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain   # since we're using conversational memory/history
 

### Clone GitHub repositories

In [3]:
REPO_PATH = "test_repo/"

Repo.clone_from(
    "https://github.com/entbappy/End-to-end-ML-Project-Implementation",
    to_path=REPO_PATH
)

<git.repo.base.Repo '/Users/debarchan/PycharmProjects/Source-Code-Analysis/research/test_repo/.git'>

In [4]:
loader = GenericLoader.from_filesystem(
    path=REPO_PATH + 'src/mlProject',
    glob="**/*",
    suffixes=[".py"],
    parser=LanguageParser(
        language=Language.PYTHON,
        parser_threshold=500
    )
)

In [5]:
documents = loader.load()

In [6]:
documents[:2]

[Document(page_content='import os\nimport sys\nimport logging\n\nlogging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"\n\nlog_dir = "logs"\nlog_filepath = os.path.join(log_dir,"running_logs.log")\nos.makedirs(log_dir, exist_ok=True)\n\n\nlogging.basicConfig(\n    level= logging.INFO,\n    format= logging_str,\n\n    handlers=[\n        logging.FileHandler(log_filepath),\n        logging.StreamHandler(sys.stdout)\n    ]\n)\n\nlogger = logging.getLogger("mlProjectLogger")', metadata={'source': 'test_repo/src/mlProject/__init__.py', 'language': <Language.PYTHON: 'python'>}),

### Chunking

In [7]:
documents_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=2000,
    chunk_overlap=200
)

texts = documents_splitter.split_documents(documents)

In [8]:
len(texts)

19

### Embedding Model

In [9]:
os.environ["OPENAI_API_KEY"] = "********************************"

In [11]:
embeddings = OpenAIEmbeddings(disallowed_special=())

### Knowledge Base (Vector DB)

In [12]:
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./db"
)
vectordb.persist()

### LLM Wrapper

In [18]:
llm = ChatOpenAI()

In [19]:
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key='chat_history',
    return_messages=True
)

In [20]:
# we want to do question-answering kind of a thing 
qa = ConversationalRetrievalChain.from_llm(
    llm=llm, 
    retriever=vectordb.as_retriever(
        search_type='mmr', search_kwargs={"k": 3}),
    memory=memory
)

### Q&A

In [23]:
question = "what is DataIngestion class?"

In [24]:
result = qa(question)
result

Number of requested results 20 is greater than number of elements in index 19, updating n_results = 19


{'question': 'what is DataIngestion class?',
 'chat_history': [SystemMessage(content='The human inquires about the DataIngestion class. The AI explains that the purpose of the `DataIngestion` class is to download and extract files, specifically in the context of being instantiated with a configuration to download and extract a zip file as part of the data ingestion process in the training pipeline.', additional_kwargs={})],
 'answer': 'The `DataIngestion` class is designed to handle the data ingestion process. In the context provided, the `DataIngestion` class is responsible for downloading a file and extracting a zip file based on the configuration provided to it during initialization.'}

In [25]:
print(result['answer'])

The `DataIngestion` class is designed to handle the data ingestion process. In the context provided, the `DataIngestion` class is responsible for downloading a file and extracting a zip file based on the configuration provided to it during initialization.
