In [1]:
## The pipeline for QA over code follows the same steps as document question answering, with some differences.
#   In particular, we can employ a splitting strategy that does a few things:
#     Loads each top-level function and class in the code into a separate document.
#     Puts the remaining code into a separate document.
#     Retains metadata about where each split comes from.

In [1]:
from util.config import load_configuration

# Read the API connection settings from the properties file.
PROP_FILE = 'app.properties'
configs = load_configuration(PROP_FILE)


def _setting(key):
    """Return the `.data` value of the `key` entry in `configs`."""
    return configs.get(key).data


# Endpoint / credential settings.
API_TYPE = _setting("API_TYPE")
API_VERSION = _setting("API_VERSION")
API_KEY = _setting("API_KEY")
API_BASE = _setting("API_BASE")

# Deployment names for the chat model and the embedding model.
LLM_ENGINE_GPT35_16K = _setting("LLM_ENGINE_GPT35_16K")
ADA_MODEL = _setting("ADA_MODEL")

In [2]:
from git import Repo
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language

In [4]:
# Clone the repository to analyse, or reuse the working copy from a previous run.
import os

repo_path = "/mnt/c/Temp/code_analysis_llm"
if os.path.isdir(os.path.join(repo_path, ".git")):
    # `git clone` refuses to write into an existing, non-empty directory, so
    # re-running this cell would raise. Open the existing checkout instead to
    # keep the notebook re-runnable from a fresh kernel.
    repo = Repo(repo_path)
else:
    repo = Repo.clone_from("https://github.com/sibendu/KafkaReader", to_path=repo_path)

In [6]:
# Load: walk the Java source tree of the cloned repo and parse every .java file
# with the Java language parser.
java_source_root = repo_path + "/src/main/java"
java_parser = LanguageParser(language=Language.JAVA, parser_threshold=500)

loader = GenericLoader.from_filesystem(
    java_source_root,
    glob="**/*",
    suffixes=[".java"],
    exclude=["**/non-utf8-encoding.py"],
    parser=java_parser,
)
documents = loader.load()

# Number of documents produced by the loader.
len(documents)

7

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks using Java-aware separators.
# The splitter is configured for Language.JAVA, so it is named accordingly.
java_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.JAVA, chunk_size=2000, chunk_overlap=200
)
# Backward-compatible alias for the previous (misleading) variable name.
python_splitter = java_splitter

texts = java_splitter.split_documents(documents)

# Number of chunks produced by the splitter.
len(texts)

14

In [18]:
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings

# Both Azure clients share the same endpoint, key and API version; only the
# deployment name differs.
azure_connection = dict(
    azure_endpoint=API_BASE,
    api_key=API_KEY,
    openai_api_version=API_VERSION,
)

# Chat model used for query generation and answering.
llm = AzureChatOpenAI(azure_deployment=LLM_ENGINE_GPT35_16K, **azure_connection)

# Embedding model used to embed the code chunks.
embeddings = AzureOpenAIEmbeddings(azure_deployment=ADA_MODEL, **azure_connection)

In [14]:
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Embed every code chunk and build a FAISS index over the embeddings.
vector = FAISS.from_documents(documents=texts, embedding=embeddings)

In [16]:
# Expose the index as a retriever; `k` is passed through as a search option.
retrieval_options = {"k": 8}
retriever = vector.as_retriever(
    search_type="mmr",  # Also test "similarity"
    search_kwargs=retrieval_options,
)

In [19]:
## Chat

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Step 1: a prompt that asks the LLM to turn the conversation so far into a
# search query, wired into a history-aware retriever.
search_query_messages = [
    ("placeholder", "{chat_history}"),
    ("user", "{input}"),
    (
        "user",
        "Given the above conversation, generate a search query to look up to get information relevant to the conversation",
    ),
]
prompt = ChatPromptTemplate.from_messages(search_query_messages)
retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

# Step 2: a prompt that answers the question from the retrieved context.
# NOTE: `prompt` is rebound here; from this point on it refers to the answer prompt.
answer_messages = [
    (
        "system",
        "Answer the user's questions based on the below context:\n\n{context}",
    ),
    ("placeholder", "{chat_history}"),
    ("user", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(answer_messages)
document_chain = create_stuff_documents_chain(llm, prompt)

# Step 3: combine retrieval and answering into a single chain.
qa = create_retrieval_chain(retriever_chain, document_chain)

In [20]:
# Ask a single question about the code base and show the raw answer string.
question = "What are the processing steps in processMessage?"
chain_input = {"input": question}
result = qa.invoke(chain_input)
result["answer"]

'The processing steps in the `processMessage` method are as follows:\n\n1. Parse the message JSON and extract the required fields such as `serialNumber`, `objectType`, `objectId`, `terminalName`, `inetName`, `siteName`, `satelliteName`, `spaceCraftName`, `teleportName`, `beamName`, `serviceArea`, `customerName`, `terminalType`, `mobilityType`, `terminalModel`, `streamType`, `resolution`, and `timestamp`.\n2. Parse the timestamp string and convert it to a Date object.\n3. Extract the `metrics` JSON object from the message.\n4. Extract the metric name and value from the metrics JSON object.\n5. Check if the metric name is one of the predefined metrics to watch.\n6. If the metric value is not null, process the message by saving the metric to the database.\n7. If the serial number exists in the database, create a new `IntelMetric` object and save it.\n8. If the serial number does not exist in the database, create a new `IntelObject` object, add the `IntelMetric` to it, and save it.\n9. Set

In [24]:
from bs4 import BeautifulSoup
# The answer is plain text / Markdown, not HTML. Print it directly: running it
# through an HTML parser would treat any "<...>" sequence in the answer (for
# example Java generics such as List<String>) as a tag and alter the text.
print(result["answer"])

The processing steps in the `processMessage` method are as follows:

1. Parse the message JSON and extract the required fields such as `serialNumber`, `objectType`, `objectId`, `terminalName`, `inetName`, `siteName`, `satelliteName`, `spaceCraftName`, `teleportName`, `beamName`, `serviceArea`, `customerName`, `terminalType`, `mobilityType`, `terminalModel`, `streamType`, `resolution`, and `timestamp`.
2. Parse the timestamp string and convert it to a Date object.
3. Extract the `metrics` JSON object from the message.
4. Extract the metric name and value from the metrics JSON object.
5. Check if the metric name is one of the predefined metrics to watch.
6. If the metric value is not null, process the message by saving the metric to the database.
7. If the serial number exists in the database, create a new `IntelMetric` object and save it.
8. If the serial number does not exist in the database, create a new `IntelObject` object, add the `IntelMetric` to it, and save it.
9. Set the `proce