# Generative AI - Prompt Agent - Loop

In [1]:
import os
from dotenv import load_dotenv
import openai
from langchain_openai import ChatOpenAI
# JSON loader
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path
# Vector DB
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
# prompt
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate
# summarization
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains import LLMChain, MapReduceChain, load_summarize_chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain_community.document_loaders import TextLoader

# 1. Static variable

In [2]:
# Name of the on-disk FAISS index that is loaded with FAISS.load_local below.
vectorDB_declare = "faiss_vector_declare_index_db"
# Further index names; presumably sibling FAISS indexes (not loaded in this notebook) -- TODO confirm.
vectorDB_handle = "faiss_vector_handle_index_db"
vectorDB_exif_keyword = "faiss_vector_exif_keyword_index_db"
# Scratch directory and file used to hand the LLM answer over to the summarization step.
code_purpose_directory = "code_purpose"
# Build the path with os.path.join instead of hand-concatenating a doubled "//" separator.
code_purpose_file = os.path.join(code_purpose_directory, "code_purpose.txt")
# Directory that holds the per-app "*_declare.json" input files.
declare_code_block_path = "./declare_code_block"

In [3]:
class LLMAnswer:
    """Plain record of one analysis result that can be dumped as JSON.

    Attributes:
        app_name: Name of the app whose code block was analysed.
        answer: Final answer text produced for that code block.
        similar_app: Names of the apps whose code was retrieved as similar.
    """

    def __init__(self, app_name, answer, similar_app):
        # Assigned left to right, so the attribute (and JSON key) order is
        # app_name, answer, similar_app.
        self.app_name, self.answer, self.similar_app = app_name, answer, similar_app

    def to_json(self):
        """Return the instance attributes as a JSON object indented by 4 spaces."""
        return json.dumps(vars(self), indent=4)

# 2. Function

In [4]:
# Setup model
# load_dotenv() is called first so that OPENAI_API_KEY can be provided through a .env file.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
# os.getenv returns None when the variable is not set; no check is made here.
openai.api_key = api_key
# temperature=0 is requested for the chat model used by every chain below.
llm = ChatOpenAI(model="gpt-4-turbo",temperature=0)
# Setup embedding
embeddings = OpenAIEmbeddings()
# loading database
# SECURITY NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data from the index directory; only use it with index files you
# created yourself or otherwise trust -- see the LangChain FAISS.load_local docs.
db_connect = FAISS.load_local(vectorDB_declare, embeddings,allow_dangerous_deserialization=True)
# Number of vectors stored in the loaded index.
print(db_connect.index.ntotal) 
#print(db_connect.index_to_docstore_id)

64495


In [5]:
# Function create directory
def create_directory(directory_path):
    """Create ``directory_path`` (including missing parents) unless the path exists.

    Prints one status line saying whether the directory was created or was
    already present. Returns None.
    """
    if os.path.exists(directory_path):
        print(f"Directory '{directory_path}' already exists.")
        return
    os.makedirs(directory_path)
    print(f"Directory '{directory_path}' created successfully.")
# Function write string to text file
def write_string_to_file(string, file_path):
    """Write ``string`` to ``file_path`` in text mode, replacing any previous content.

    The file is opened with mode 'w' and no explicit encoding, so the
    platform default encoding applies. Prints a confirmation line and
    returns None.
    """
    with open(file_path, 'w') as out_handle:
        out_handle.write(string)
    print(f"String written to '{file_path}' successfully.")
# Function delete file
def delete_file(file_path):
    """Remove ``file_path`` if the path exists and print what happened.

    A missing path is not an error: a "does not exist" line is printed
    instead. Returns None.
    """
    if not os.path.exists(file_path):
        print(f"File '{file_path}' does not exist.")
        return
    os.remove(file_path)
    print(f"File '{file_path}' deleted successfully.")
# Function list all files in a directory
def list_files_in_directory(directory_path):
    """Return the sorted names of the regular files directly inside ``directory_path``.

    Sub-directories are skipped and the listing is not recursive. Only the
    bare file names are returned, not full paths.

    The result is sorted because ``os.listdir`` returns entries in arbitrary
    order; callers that index into the list (for example taking element 0)
    would otherwise pick a different file from one machine or run to the next.

    Args:
        directory_path: Directory to inspect.

    Returns:
        list[str]: File names in ascending order.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot be
            read (for example when it does not exist).
    """
    return sorted(
        entry
        for entry in os.listdir(directory_path)
        if os.path.isfile(os.path.join(directory_path, entry))
    )
# Function load JSON
def load_json_data(file_path,json_root):
    """Load the entries found under ``json_root`` of a JSON file via ``JSONLoader``.

    The loader is given the jq expression ``.<json_root>[]`` and
    ``text_content=False``; whatever ``JSONLoader.load()`` produces is
    returned unchanged.
    """
    jq_expression = "." + json_root + "[]"
    json_loader = JSONLoader(
        file_path=file_path,
        jq_schema=jq_expression,
        text_content=False,
    )
    return json_loader.load()
# Function get sub-string in string
def get_sub_string(input_string, delimiter='/'):
    """Return the last ``delimiter``-separated segment of ``input_string``, cut at its first dot.

    Everything from the FIRST '.' of that segment onwards is dropped, not just
    the final extension: ``"dir/App-1.0_declare.json"`` yields ``"App-1"``.
    A segment without a dot is returned whole.
    """
    # Text after the last delimiter (the whole string when there is none).
    last_segment = input_string.rsplit(delimiter, 1)[-1]
    # Keep only what precedes the first dot.
    stem, _, _ = last_segment.partition('.')
    return stem

# 3. Main

In [6]:
# Create the scratch directory in which write_string_to_file later stores code_purpose_file.
create_directory(code_purpose_directory)

Directory 'code_purpose' already exists.


In [7]:
# Create retriever
# as_retriever() is called without arguments, so the retriever's default search settings apply.
retriever = db_connect.as_retriever()
# Bare expression: displays the retriever repr as the cell output.
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x784badde78e0>)

In [8]:
# List all files (names only, no directory prefix) found in declare_code_block_path.
declare_code_files = list_files_in_directory(declare_code_block_path)

In [9]:
# Load data: only the first entry of declare_code_files is processed in this cell.
declare_code_file = f"{declare_code_block_path}/{declare_code_files[0]}"
print(f"File name :{declare_code_file}")
declare_code_documents = load_json_data(declare_code_file, "declare")
# The app name is derived from the "source" metadata of the first loaded document.
metadata_input = declare_code_documents[0].metadata["source"]
print(metadata_input)
app_name = get_sub_string(metadata_input)
print(app_name)
# Dump every loaded document, preceded by a numbered banner line.
for i, document in enumerate(declare_code_documents):
    print(f"******************** Document page_content no-{i} ********************")
    print(document)

File name :./declare_code_block/Gallery-Photo-Gallery-App-1.0_declare.json
/root/metaLeak-ml-llm-rag-ubuntu/declare_code_block/Gallery-Photo-Gallery-App-1.0_declare.json
Gallery-Photo-Gallery-App-1
******************** Document page_content no-0 ********************
page_content='public static final int[] q;' metadata={'source': '/root/metaLeak-ml-llm-rag-ubuntu/declare_code_block/Gallery-Photo-Gallery-App-1.0_declare.json', 'seq_num': 1}
******************** Document page_content no-1 ********************
page_content='final int[] a2 = (Object)c.h(this.g);' metadata={'source': '/root/metaLeak-ml-llm-rag-ubuntu/declare_code_block/Gallery-Photo-Gallery-App-1.0_declare.json', 'seq_num': 2}
******************** Document page_content no-2 ********************
page_content='final byte[] b = new byte[int2];' metadata={'source': '/root/metaLeak-ml-llm-rag-ubuntu/declare_code_block/Gallery-Photo-Gallery-App-1.0_declare.json', 'seq_num': 3}
******************** Document page_content no-3 ******

In [10]:
# Pick the code block used as the retrieval query.
# NOTE(review): index 81 is hard-coded; it raises IndexError when the loaded
# file has fewer than 82 documents.
query = declare_code_documents[81].page_content
# Alternative sample kept for manual experiments:
# query = declare_code_documents[7].page_content
print(query)
print(type(query))

final d[] array = { new d("NewSubfileType", 254, 4), new d("SubfileType", 255, 4), new d(256, 3, 4, "ImageWidth"), new d(257, 3, 4, "ImageLength"), new d("BitsPerSample", 258, 3), new d("Compression", 259, 3), new d("PhotometricInterpretation", 262, 3), new d("ImageDescription", 270, 2), new d("Make", 271, 2), new d("Model", 272, 2), new d(273, 3, 4, "StripOffsets"), new d("Orientation", 274, 3), new d("SamplesPerPixel", 277, 3), new d(278, 3, 4, "RowsPerStrip"), new d(279, 3, 4, "StripByteCounts"), new d("XResolution", 282, 5), new d("YResolution", 283, 5), new d("PlanarConfiguration", 284, 3), new d("ResolutionUnit", 296, 3), new d("TransferFunction", 301, 3), new d("Software", 305, 2), new d("DateTime", 306, 2), new d("Artist", 315, 2), new d("WhitePoint", 318, 5), new d("PrimaryChromaticities", 319, 5), new d("SubIFDPointer", 330, 4), new d("JPEGInterchangeFormat", 513, 4), new d("JPEGInterchangeFormatLength", 514, 4), new d("YCbCrCoefficients", 529, 5), new d("YCbCrSubSampling", 5

In [11]:
# query = """
# "final e[] array = { new e(\"NewSubfileType\", 254, 4), new e(\"SubfileType\", 255, 4), new e(\"ImageWidth\", 256, 3, 4), new e(\"ImageLength\", 257, 3, 4), new e(\"BitsPerSample\", 258, 3), new e(\"Compression\", 259, 3), new e(\"PhotometricInterpretation\", 262, 3), new e(\"ImageDescription\", 270, 2), new e(\"Make\", 271, 2), new e(\"Model\", 272, 2), new e(\"StripOffsets\", 273, 3, 4), new e(\"Orientation\", 274, 3), new e(\"SamplesPerPixel\", 277, 3), new e(\"RowsPerStrip\", 278, 3, 4), new e(\"StripByteCounts\", 279, 3, 4), new e(\"XResolution\", 282, 5), new e(\"YResolution\", 283, 5), new e(\"PlanarConfiguration\", 284, 3), new e(\"ResolutionUnit\", 296, 3), new e(\"TransferFunction\", 301, 3), new e(\"Software\", 305, 2), new e(\"DateTime\", 306, 2), new e(\"Artist\", 315, 2), new e(\"WhitePoint\", 318, 5), new e(\"PrimaryChromaticities\", 319, 5), new e(\"SubIFDPointer\", 330, 4), new e(\"JPEGInterchangeFormat\", 513, 4), new e(\"JPEGInterchangeFormatLength\", 514, 4), new e(\"YCbCrCoefficients\", 529, 5), new e(\"YCbCrSubSampling\", 530, 3), new e(\"YCbCrPositioning\", 531, 3), new e(\"ReferenceBlackWhite\", 532, 5), new e(\"Copyright\", 33432, 2), new e(\"ExifIFDPointer\", 34665, 4), new e(\"GPSInfoIFDPointer\", 34853, 4), new e(\"SensorTopBorder\", 4, 4), new e(\"SensorLeftBorder\", 5, 4), new e(\"SensorBottomBorder\", 6, 4), new e(\"SensorRightBorder\", 7, 4), new e(\"ISO\", 23, 3), new e(\"JpgFromRaw\", 46, 7), new e(\"Xmp\", 700, 1) };",
# """
# Retrieve the documents most similar to the query code block.
docs_similar = retriever.invoke(query)
print(len(docs_similar))
print(docs_similar)
# (Direct alternative without the retriever wrapper: db_connect.similarity_search(query).)
# Map each hit to an app name derived from its "source" metadata path.
similar_apps = [
    get_sub_string(similar_doc.metadata["source"]) for similar_doc in docs_similar
]
print("++++++++++++++++++++++++++")
print("Similar app is a list", str(similar_apps))

4
[Document(page_content='final d[] array = { new d("NewSubfileType", 254, 4), new d("SubfileType", 255, 4), new d(256, 3, 4, "ImageWidth"), new d(257, 3, 4, "ImageLength"), new d("BitsPerSample", 258, 3), new d("Compression", 259, 3), new d("PhotometricInterpretation", 262, 3), new d("ImageDescription", 270, 2), new d("Make", 271, 2), new d("Model", 272, 2), new d(273, 3, 4, "StripOffsets"), new d("Orientation", 274, 3), new d("SamplesPerPixel", 277, 3), new d(278, 3, 4, "RowsPerStrip"), new d(279, 3, 4, "StripByteCounts"), new d("XResolution", 282, 5), new d("YResolution", 283, 5), new d("PlanarConfiguration", 284, 3), new d("ResolutionUnit", 296, 3), new d("TransferFunction", 301, 3), new d("Software", 305, 2), new d("DateTime", 306, 2), new d("Artist", 315, 2), new d("WhitePoint", 318, 5), new d("PrimaryChromaticities", 319, 5), new d("SubIFDPointer", 330, 4), new d("JPEGInterchangeFormat", 513, 4), new d("JPEGInterchangeFormatLength", 514, 4), new d("YCbCrCoefficients", 529, 5), n

In [12]:
# First, we need a prompt that we can pass into an LLM to generate this search query
# NOTE(review): this `prompt` is only printed here; the next cell rebinds
# `prompt` before any chain is built, so this variant is never used by a chain.
prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name="chat_history"),
    ("user","{input}"),
    ("user","Given the above programming code block, generate a search query to look up information relevant to the conversation")
])
print(prompt)

input_variables=['chat_history', 'input'] input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]} messages=[MessagesPlaceholder(variable_name='chat_history'), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Given the above programming code block, generate a search query to look up information relevant to the conversation'))]


In [13]:
# Step 1: prompt asking the LLM to turn the conversation into a search query.
# `prompt` is reused as a scratch name and is rebound again further down.
prompt = ChatPromptTemplate.from_messages(
    [
        ("placeholder", "{chat_history}"),
        ("user", "{input}"),
        (
            "user",
            "Given the above conversation, generate a search query to look up to get information relevant to the conversation",
        ),
    ]
)

# Step 2: combine the LLM, the FAISS retriever and the query-generation prompt.
retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

# Step 3: answering prompt; {context} is presumably filled with the retrieved
# documents by the stuff-documents chain -- confirm against the LangChain docs.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the user's questions based on the below context:\n\n{context}",
        ),
        ("placeholder", "{chat_history}"),
        ("user", "{input}"),
    ]
)
document_chain = create_stuff_documents_chain(llm, prompt)

# Step 4: final chain = retrieval step followed by the answering step.
qa = create_retrieval_chain(retriever_chain, document_chain)

In [14]:
# Build the question. {docs_similar} interpolates the string form of the whole
# retrieved list (page content and metadata of every Document), not only the code text.
question = f"""
What is Exif metadata in this list (Make, Model, Software, GPS, Datetime) included in this code block below? \n
{docs_similar} \n
If EXIF is not found in the code block, the simple answer is "the code block is unrelated to any kind of EXIF metadata."
"""
print("---QUESTION---")
print(question)
# Only "input" is supplied; no chat_history is passed to the chain.
result = qa.invoke({"input": question})
code_purpose = result["answer"]
print("---CODE PURPOSE---")
print(code_purpose)
#print(type(code_purpose))
# Persist the answer so that the summarization step can load it from disk with TextLoader.
write_string_to_file(code_purpose, code_purpose_file)

---QUESTION---

What is Exif metadata in this list (Make, Model, Software, GPS, Datetime) included in this code block below? 

[Document(page_content='final d[] array = { new d("NewSubfileType", 254, 4), new d("SubfileType", 255, 4), new d(256, 3, 4, "ImageWidth"), new d(257, 3, 4, "ImageLength"), new d("BitsPerSample", 258, 3), new d("Compression", 259, 3), new d("PhotometricInterpretation", 262, 3), new d("ImageDescription", 270, 2), new d("Make", 271, 2), new d("Model", 272, 2), new d(273, 3, 4, "StripOffsets"), new d("Orientation", 274, 3), new d("SamplesPerPixel", 277, 3), new d(278, 3, 4, "RowsPerStrip"), new d(279, 3, 4, "StripByteCounts"), new d("XResolution", 282, 5), new d("YResolution", 283, 5), new d("PlanarConfiguration", 284, 3), new d("ResolutionUnit", 296, 3), new d("TransferFunction", 301, 3), new d("Software", 305, 2), new d("DateTime", 306, 2), new d("Artist", 315, 2), new d("WhitePoint", 318, 5), new d("PrimaryChromaticities", 319, 5), new d("SubIFDPointer", 330, 4)

In [15]:
# Summary
# Template for the summarization step: {text} is the only placeholder, and the
# model is asked to answer with names from the fixed list or with "No".
prompt_template = """
You are provided with a paragraph listing the types of EXIF metadata in the code block.
----------
{text}
----------
Question: Examine the paragraph and summarize which types of EXIF metadata the above paragraph relates to in the inclusion list (Make, Model, Software, GPS, Datetime) if EXIF metadata exists. 
Please only respond with the EXIF metadata name in this list (Make, Model, Software, GPS, Datetime). If you can't find it, answer "No".
EXIF metadata name:
"""
print(prompt_template)


You are provided with a paragraph listing the types of EXIF metadata in the code block.
----------
{text}
----------
Question: Examine the paragraph and summarize which types of EXIF metadata the above paragraph relates to in the inclusion list (Make, Model, Software, GPS, Datetime) if EXIF metadata exists. 
Please only respond with the EXIF metadata name in this list (Make, Model, Software, GPS, Datetime). If you can't find it, answer "No".
EXIF metadata name:



In [16]:
# Build a PromptTemplate from the summary template and display it.
# NOTE(review): the next cell rebinds `prompt` from the same template string,
# so this cell serves only to inspect the template.
prompt = PromptTemplate.from_template(prompt_template)
prompt

PromptTemplate(input_variables=['text'], template='\nYou are provided with a paragraph listing the types of EXIF metadata in the code block.\n----------\n{text}\n----------\nQuestion: Examine the paragraph and summarize which types of EXIF metadata the above paragraph relates to in the inclusion list (Make, Model, Software, GPS, Datetime) if EXIF metadata exists. \nPlease only respond with the EXIF metadata name in this list (Make, Model, Software, GPS, Datetime). If you can\'t find it, answer "No".\nEXIF metadata name:\n')

In [17]:
# Summarization chain of type "stuff" driven by the custom summary prompt.
prompt = PromptTemplate(template=prompt_template,input_variables=["text"])
stuff_chain = load_summarize_chain(llm,
                                   chain_type="stuff",
                                   prompt=prompt,
                                   # verbose=True makes the chain log the formatted prompt when it runs
                                   verbose = True
                                  )
# Sanity check: show the template that the chain will actually use.
print(stuff_chain.llm_chain.prompt.template)


You are provided with a paragraph listing the types of EXIF metadata in the code block.
----------
{text}
----------
Question: Examine the paragraph and summarize which types of EXIF metadata the above paragraph relates to in the inclusion list (Make, Model, Software, GPS, Datetime) if EXIF metadata exists. 
Please only respond with the EXIF metadata name in this list (Make, Model, Software, GPS, Datetime). If you can't find it, answer "No".
EXIF metadata name:



In [18]:
# Reload the answer that was written to code_purpose_file and run it through
# the summarization chain.
loader = TextLoader(code_purpose_file)
code_purpose_docs = loader.load()
print(code_purpose_docs)
# The result is a dict; its "output_text" entry holds the summary and is read in the next step.
output_summary = stuff_chain.invoke(code_purpose_docs)
output_summary

[Document(page_content='The EXIF metadata included in the code block are:\n\n- "Make" (tag 271, type 2)\n- "Model" (tag 272, type 2)\n- "Software" (tag 305, type 2)\n- "DateTime" (tag 306, type 2)\n- "GPSInfoIFDPointer" (tag 34853, type 4)\n\nThese tags are part of the EXIF metadata standard, which is used to store information related to the image such as the camera settings and other attributes when the photo was taken.', metadata={'source': 'code_purpose//code_purpose.txt'})]


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are provided with a paragraph listing the types of EXIF metadata in the code block.
----------
The EXIF metadata included in the code block are:

- "Make" (tag 271, type 2)
- "Model" (tag 272, type 2)
- "Software" (tag 305, type 2)
- "DateTime" (tag 306, type 2)
- "GPSInfoIFDPointer" (tag 34853, type 4)

These tags are part of the EXIF metadata standard, which is used to sto

{'input_documents': [Document(page_content='The EXIF metadata included in the code block are:\n\n- "Make" (tag 271, type 2)\n- "Model" (tag 272, type 2)\n- "Software" (tag 305, type 2)\n- "DateTime" (tag 306, type 2)\n- "GPSInfoIFDPointer" (tag 34853, type 4)\n\nThese tags are part of the EXIF metadata standard, which is used to store information related to the image such as the camera settings and other attributes when the photo was taken.', metadata={'source': 'code_purpose//code_purpose.txt'})],
 'output_text': 'Make, Model, Software, GPS, Datetime'}

In [19]:
# Keep only the summary text produced by the summarization chain.
answer = output_summary["output_text"]

In [20]:
# Bundle the app name, the summarized answer and the list of similar apps into one record.
llm_answer_instance = LLMAnswer(app_name, answer, similar_apps)

In [21]:
# Show the final record as indented JSON.
print("LLMAnswer as JSON:", llm_answer_instance.to_json())

LLMAnswer as JSON: {
    "app_name": "Gallery-Photo-Gallery-App-1",
    "answer": "Make, Model, Software, GPS, Datetime",
    "similar_app": [
        "Gallery-Photo-Gallery-App-1",
        "Photo-Compress-Image-Resize-3",
        "File-Manager-File-Explorer-2",
        "Gallery-Photo-Gallery-Album-1"
    ]
}


In [22]:
# Clean up: remove the scratch file that was written after the question was answered.
delete_file(code_purpose_file)

File 'code_purpose//code_purpose.txt' deleted successfully.
