In [12]:
import json
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

In [16]:
class JSONLoader(BaseLoader):
    def __init__(
        self,
        file_path: Union[str, Path],
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[dict, dict], dict]] = None,
    ):
        """
        Initializes the JSONLoader with a file path, an optional content key to extract specific content,
        and an optional metadata function to extract metadata from each record.
        """
        self.file_path = Path(file_path).resolve()
        self.content_key = content_key 
        self.metadata_func = metadata_func

    def create_documents(self, processed_data):
        """
        Creates Document objects from processed data.
        """
        documents = []
        for item in processed_data:
            content = item.get('content', '')  
            metadata = item.get('metadata', {})
            document = Document(page_content=content, metadata=metadata)
            documents.append(document)
        return documents

    def process_json(self, data):
        """
        Processes JSON data to prepare for document creation, extracting content based on the content_key
        and applying the metadata function if provided.
        """
        processed_data = []
        if isinstance(data, list):
            for item in data:
                content = item.get(self.content_key, '') if self.content_key else ''
                metadata = {}
                if self.metadata_func and isinstance(item, dict):
                    metadata = self.metadata_func(item, {})
                processed_data.append({'content': content, 'metadata': metadata})
        return processed_data


    def load(self) -> List[Document]:
            """
            Load and return documents from the JSON file.
            """
            docs = []
            with open(self.file_path, mode="r", encoding="utf-8") as json_file:
                try:
                    data = json.load(json_file)
                    processed_json = self.process_json(data)
                    docs = self.create_documents(processed_json)
                except json.JSONDecodeError:
                    print("Error: Invalid JSON format in the file.")
            return docs

In [17]:
loader = JSONLoader(
    file_path="./ex.json",
)

In [20]:
with open('ex.json', 'r') as file:
    data = json.load(file)


print(data)

{'users': [{'id': 1, 'username': 'user1', 'email': 'user1@example.com'}, {'id': 2, 'username': 'user2', 'email': 'user2@example.com'}, {'id': 3, 'username': 'user3', 'email': 'user3@example.com'}]}


In [22]:
datal=loader.load()
print(datal)

[]


In [4]:
from dotenv import load_dotenv

# .env 파일 로드
load_dotenv()

True

In [5]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# 문서를 로드합니다.
documents = loader.load()
# 문자 기반으로 텍스트를 분할하는 CharacterTextSplitter를 생성합니다. 청크 크기는 1000이고 청크 간 중복은 없습니다.
text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0)
# 로드된 문서를 분할합니다.
texts = text_splitter.split_documents(documents)
# OpenAI 임베딩을 생성합니다.
embeddings = OpenAIEmbeddings()


In [7]:
# 분할된 텍스트와 임베딩을 사용하여 FAISS 벡터 데이터베이스를 생성합니다.
db = FAISS.from_documents(texts, embeddings)

IndexError: list index out of range

In [6]:
embedded_texts = embeddings.embed_documents(texts)

# 임베딩 결과를 확인합니다.
for i, embedded_text in enumerate(embedded_texts):
    print(f"Text {i+1} embedding:")
    print(embedded_text)