Install dependencies

In [None]:
%pip install openai tiktoken chromadb langchain esprima faiss-cpu unstructured
%pip install "unstructured[md]"


In [None]:
import langchain
import dotenv
import os
import openai

# Populate the process environment from a .env file (when one is found), so the
# OPENAI_* settings read with os.getenv later in this notebook can be kept out
# of the notebook itself.
dotenv.load_dotenv()


In [None]:
# from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders.parsers.language import LanguageParser


Load source code for analysis from file system
LangChain currently ships parsers for only a limited set of languages. You may need to extend the LangChain libraries for your input language, or fall back to using `DirectoryLoader`.

In [None]:
# Location of the codebase to analyse, relative to this notebook.
repo_path = "../input-src"

# Alternative file loaders can be used depending on the language of the code
# being analysed and the availability of a specific parser, e.g.:
#
#   loader = GenericLoader.from_filesystem(
#       repo_path,
#       glob="**/*",
#       suffixes=[".php"],
#       parser=LanguageParser(language=Language.PHP, parser_threshold=50)
#   )

# Walk the tree below repo_path and pick up every *.php file.
loader = DirectoryLoader(repo_path, glob="**/*.php", recursive=True)
documents = loader.load()

# Cell output: how many documents were loaded.
len(documents)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks using the splitter's
# PHP-specific separators, so chunks tend to end on language boundaries.
php_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PHP,
    chunk_size=4000,
    chunk_overlap=200,
)
# Backward-compatible alias: this splitter used to be called `python_splitter`
# even though it is configured for PHP. Keep the old name working.
python_splitter = php_splitter

texts = php_splitter.split_documents(documents)

# Cell output: number of chunks produced.
len(texts)

Adjusting the `k` value used for document retrieval can have a significant impact on the quality of the model's answers. Too few results lead to insufficient context; too many can lead to misleading or low-relevance responses.

In [None]:
# OpenAI connection settings, all taken from the environment. Any variable that
# is not set comes back as None.
openai_api_base, openai_api_key, openai_api_type, openai_api_version = (
    os.getenv(env_name)
    for env_name in (
        'OPENAI_API_BASE',
        'OPENAI_API_KEY',
        'OPENAI_API_TYPE',
        'OPENAI_API_VERSION',
    )
)

# Alternative vector store: from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    disallowed_special=(),
    openai_api_base=openai_api_base,
    openai_api_key=openai_api_key,
    openai_api_type=openai_api_type,
    openai_api_version=openai_api_version,
    chunk_size=16,
)

# Embed every chunk of the target codebase and index the vectors in FAISS.
db = FAISS.from_documents(texts, embeddings)

# MMR retrieval: return k results selected from fetch_k candidates.
# NOTE(review): k=100 chunks of up to 4000 characters each is a lot of context
# for a single prompt — confirm it fits the deployed model's context window.
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 100, "fetch_k": 150},
)

In [None]:
from langchain.chat_models import AzureChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain

# Name of the Azure OpenAI chat deployment, read from the environment.
openai_deploy_name = os.getenv('OPENAI_DEPLOY_NAME')

# Chat model; reuses the connection settings read for the embeddings.
llm = AzureChatOpenAI(
    deployment_name=openai_deploy_name,
    openai_api_base=openai_api_base,
    openai_api_version=openai_api_version,
    openai_api_key=openai_api_key,
    openai_api_type=openai_api_type,
    temperature=0.7,
    verbose=True,
)

# Conversation memory, built on the same LLM and exposed to the chain
# under the "chat_history" key.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

# Question-answering chain wired to the code retriever and the memory above.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
)

Interactively query the codebase

In [None]:
# Interactive question/answer loop over the indexed codebase.
# Type "quit" (any letter case, surrounding whitespace ignored) to stop.
while True:
    try:
        question = input("Ask a question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the prompt was interrupted: leave the loop
        # cleanly instead of surfacing a traceback.
        break

    if not question:
        # Don't spend an LLM call (and a full retrieval) on a blank question.
        continue
    if question.lower() == "quit":
        break
    print("Question: ", question)

    result = qa(question)
    print("Answer: ", result['answer'])