# Generative AI - Vector Store

In [78]:
import os
from dotenv import load_dotenv
import openai
from langchain_openai import ChatOpenAI
# JSON loader
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# 1. Setup and helper functions

In [79]:
# Setup model
# Load environment variables via python-dotenv (presumably from a local .env file).
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set; no check is made here.
api_key = os.getenv('OPENAI_API_KEY')
openai.api_key = api_key
# Chat model handle; temperature=0 is passed to minimise sampling randomness.
llm = ChatOpenAI(model="gpt-4-turbo",temperature=0)
# Function: list all regular files in a directory
def list_files_in_directory(directory_path):
    """Return the names of the regular files directly inside ``directory_path``.

    Sub-directories are skipped and the listing is not recursive. The names
    are returned sorted because ``os.listdir`` yields entries in arbitrary
    order, which would otherwise make the order in which files are processed
    differ from run to run.

    Args:
        directory_path: Path of the directory to list.

    Returns:
        list[str]: Sorted file names (names only, not full paths).

    Raises:
        OSError: If the directory cannot be listed (e.g. it does not exist).
    """
    return sorted(
        name
        for name in os.listdir(directory_path)
        if os.path.isfile(os.path.join(directory_path, name))
    )
# Function: load documents from a JSON file
def load_json_data(file_path, json_root):
    """Load the entries found under ``json_root`` of a JSON file.

    Args:
        file_path: Path of the JSON file to read.
        json_root: Name of the key whose array is iterated; it is turned
            into the jq expression ``.<json_root>[]``.

    Returns:
        The result of ``JSONLoader.load()`` for that jq expression.
    """
    # Build the jq expression that selects every element under the root key.
    entries_schema = "".join([".", json_root, "[]"])
    return JSONLoader(
        file_path=file_path,
        jq_schema=entries_schema,
        text_content=False,
    ).load()
# Function: remove a directory (and everything in it) if it exists
def remove_directory(directory_path):
    """Delete ``directory_path`` together with its contents, if it exists.

    ``os.rmdir`` only succeeds on empty directories, so it cannot clear a
    directory that already holds files; ``shutil.rmtree`` removes the whole
    tree. Failures are reported on stdout rather than raised, so a failed
    clean-up does not abort the calling cell.

    Args:
        directory_path: Path of the directory to remove.

    Returns:
        None. A status line is printed in every case.
    """
    # Imported locally so this helper does not depend on the import cell.
    import shutil

    if not os.path.exists(directory_path):
        print(f"Directory '{directory_path}' does not exist.")
        return
    try:
        shutil.rmtree(directory_path)
        print(f"Directory '{directory_path}' removed successfully.")
    except OSError as e:
        print(f"Error: {directory_path} : {e.strerror}")
# Function (debug helper): write a list to a text file, one item per line
def write_list_to_file(data_list, file_path):
    """Write ``str(item)`` for each item of ``data_list`` on its own line.

    Any existing file at ``file_path`` is overwritten (mode ``'w'`` truncates,
    so no separate delete is needed). The file is written as UTF-8 so that
    non-ASCII content does not depend on the platform's default encoding.

    Args:
        data_list: Iterable of items to write.
        file_path: Destination file path.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(str(item) + '\n' for item in data_list)

# 2. Static variables

In [80]:
# Input directories, given relative to the notebook's working directory.
# Each one is listed for files further down in the notebook.
declare_code_block_path = "./declare_code_block"
handle_code_block_path = "./handle_code_block"
exif_keyword_code_block_path = "./exif_keyword_code_block_re"

# 3. Main

In [81]:
# 1. Collect the file names found in each input directory.
(
    declare_code_files,
    handle_code_block_files,
    exif_keyword_code_block_files,
) = (
    list_files_in_directory(source_dir)
    for source_dir in (
        declare_code_block_path,
        handle_code_block_path,
        exif_keyword_code_block_path,
    )
)

In [82]:
# Embedding client, created with the library's default settings.
embeddings = OpenAIEmbeddings()
# Accumulator lists, one per code-block category; each starts empty and is
# meant to collect the documents loaded from that category's files.
declare_code_documents_total = []
handle_code_documents_total = []
exif_keyword_code_documents_total = []

In [83]:
print("==============================DOCUMENT LOADER======================================")
# Load every exif-keyword JSON file and pool its documents into one list.
for i, file_name in enumerate(exif_keyword_code_block_files):
    print(f"------------------------- Loop-{i}-------------------------")
    exif_keyword_code_file = f"{exif_keyword_code_block_path}/{file_name}"
    print(f"File :{exif_keyword_code_file}")
    exif_keyword_code_documents = load_json_data(exif_keyword_code_file, "exifkeyword")
    print(f"Length exif_keyword_code json_loader: {len(exif_keyword_code_documents)}")
    print(exif_keyword_code_documents)
    exif_keyword_code_documents_total += exif_keyword_code_documents
print("*****************************************************************************")
print(f"Length exif_keyword_code json_loader total: {len(exif_keyword_code_documents_total)}")
print("*****************************************************************************")
# Dump the pooled documents (one str() per line) for manual inspection.
write_list_to_file(exif_keyword_code_documents_total, "exif_keyword_code_documents_total.json")

------------------------- Loop-0-------------------------
File :./exif_keyword_code_block_re/Gallery-Photo-Vault-Album-1.0.3_exif_keyword.json
Length exif_keyword_code json_loader: 69
[Document(page_content='if ((("Make".equals(e.b) || "Model".equals(e.b)) && value.i(this.g).contains("PENTAX")) || ("Compression".equals(e.b) && value.h(this.g) == 65535)) {', metadata={'source': '/root/metaLeak-ml-llm-rag-ubuntu/exif_keyword_code_block_re/Gallery-Photo-Vault-Album-1.0.3_exif_keyword.json', 'seq_num': 1}), Document(page_content='final e e89 = new e("GPSTrackRef", 14, 2);', metadata={'source': '/root/metaLeak-ml-llm-rag-ubuntu/exif_keyword_code_block_re/Gallery-Photo-Vault-Album-1.0.3_exif_keyword.json', 'seq_num': 2}), Document(page_content='final e e99 = new e("GPSDestBearing", 24, 5);', metadata={'source': '/root/metaLeak-ml-llm-rag-ubuntu/exif_keyword_code_block_re/Gallery-Photo-Vault-Album-1.0.3_exif_keyword.json', 'seq_num': 3}), Document(page_content='final e e103 = new e("GPSAreaIn

In [84]:
print("==============================VECTOR STORE======================================")
# Embed the pooled exif-keyword documents and ingest them into a FAISS index.
vector_handle = FAISS.from_documents(exif_keyword_code_documents_total,embeddings)
print("Length vector exif_keyword_code : "+str(vector_handle.index.ntotal))
# Persist the index locally, clearing any previous copy of the directory first.
remove_directory("faiss_vector_exif_keyword_index_db")
# Save the store built in this cell. The previous code saved `vector_declare`,
# a name this notebook never defines, so it either raised NameError or wrote
# an unrelated index left over in the kernel.
vector_handle.save_local("faiss_vector_exif_keyword_index_db")

Length vector exif_keyword_code : 9926
Directory 'faiss_vector_exif_keyword_index_db' does not exist.
