In [1]:
import os
from pathlib import Path
from llama_index.llms.ollama import Ollama
from llama_index.core import VectorStoreIndex
from llama_index.core.embeddings import resolve_embed_model
from llama_index.readers.json import JSONReader
from llama_index.core.node_parser import JSONNodeParser
from llama_index.readers.file import FlatReader

In [2]:
# Resolve the embedding model from its string spec.
# NOTE(review): presumably "local:" selects a locally-run BAAI/bge-m3 model — confirm against the
# llama_index docs. `embed_model` is not passed to any of the cells visible in this part of the notebook.
embed_model = resolve_embed_model("local:BAAI/bge-m3")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# LLM client backed by Ollama, using the "mistral" model. It is handed to the
# TitleExtractor in the ingestion-pipeline cell further down.
# NOTE(review): request_timeout is presumably in seconds — confirm.
llm = Ollama(
    model="mistral",
    request_timeout=180.0,
)

In [3]:
from llama_index.core import SimpleDirectoryReader

# Explicit list of the three KPI JSON files to ingest.
kpi_json_paths = [
    "./data/energy_cost_related.json",
    "./data/production_related.json",
    "./data/time_related.json",
]

# Read the listed files into `documents` (a list of llama_index Document objects).
directory_reader = SimpleDirectoryReader(input_files=kpi_json_paths)
documents = directory_reader.load_data()

In [4]:
# Display the first loaded document to inspect its metadata and raw JSON text.
documents[0]

Document(id_='6ea4ef66-06fa-4c12-965c-9028075ce587', embedding=None, metadata={'file_path': 'data\\energy_cost_related.json', 'file_name': 'energy_cost_related.json', 'file_type': 'application/json', 'file_size': 7409, 'creation_date': '2024-11-11', 'last_modified_date': '2024-11-11'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='{\n    "kpi": {\n      "consumption": {\n        "average": [\n          { "machine": "Assembly Machine 1", "month": "2024-03", "value": 0.0 },\n          { "machine": "Assembly Machine 2", "month": "2024-03", "value": 0.0 },\n          { "machine": "Assembly Machine 3", "month": "2024-03", "value": 0.0 },\n          { "machine": "Large Capacity Cutting Machine 1", "month": "2024-03", "value": 0.0013015137854767644 },\n      

In [4]:
DATA_PATH = "./data/"

# Build `documents` from every *.json file located directly inside DATA_PATH,
# loading each file with FlatReader so it can afterwards be split into nodes.
# (Alternative loader, not used here: JSONReader().load_data(input_file=file_path).)
flat_reader = FlatReader()  # a single reader instance is reused for all files
documents = []
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore documents[0], which a later cell mutates) reproducible.
for filename in sorted(os.listdir(DATA_PATH)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(DATA_PATH, filename)
    if not os.path.isfile(file_path):
        # skip directories (or other non-files) that merely carry a .json suffix
        continue
    documents.extend(flat_reader.load_data(Path(file_path)))

# Split the loaded JSON documents into nodes, keeping document metadata and
# previous/next relationships between nodes.
# Note: later cells in this notebook rebind both `parser` and `nodes`.
parser = JSONNodeParser(include_metadata=True, include_prev_next_rel=True)
nodes = parser.get_nodes_from_documents(documents)


In [7]:
# Attach a custom "machines" metadata entry to the first document (mutates it in place);
# nodes derived from this document afterwards carry the same entry.
documents[0].metadata.update({"machines": "Ragnarok"})

In [8]:
from llama_index.core.node_parser import SimpleNodeParser

# Re-split `documents` with a default-configured SimpleNodeParser.
# This rebinds the notebook-level `parser` and `nodes` names, replacing whatever
# an earlier cell stored in them.
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents)
# Show how many nodes the split produced.
len(nodes)

9

In [9]:
# Display the first node to inspect its metadata and its relationship to the source document.
nodes[0]

TextNode(id_='2cb629c8-a767-4018-8e2b-b7d700fc8ede', embedding=None, metadata={'file_path': 'data\\energy_cost_related.json', 'file_name': 'energy_cost_related.json', 'file_type': 'application/json', 'file_size': 7409, 'creation_date': '2024-11-11', 'last_modified_date': '2024-11-11', 'machines': 'Ragnarok'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6ea4ef66-06fa-4c12-965c-9028075ce587', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data\\energy_cost_related.json', 'file_name': 'energy_cost_related.json', 'file_type': 'application/json', 'file_size': 7409, 'creation_date': '2024-11-11', 'last_modified_date': '2024-11-11', 'machines': 'Ragnarok'}, hash='6093478eda45a44f3868b0ccad298913

In [18]:
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

# Only TitleExtractor is enabled in this cell; QuestionsAnsweredExtractor,
# SummaryExtractor and EntityExtractor are not used here.

# Step 1: split each document into chunks (chunk_size=512, chunk_overlap=128,
# default separator).
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128)

# Step 2: LLM-based title extraction using the notebook-level `llm`.
# NOTE(review): `nodes=5` is presumably the number of nodes consulted per title — confirm.
title_extractor = TitleExtractor(llm=llm, nodes=5)

# Chain both transformations; they are applied in list order.
pipeline = IngestionPipeline(transformations=[text_splitter, title_extractor])

# Run the pipeline over the already-loaded `documents`; the result rebinds `nodes`.
nodes = pipeline.run(documents=documents, in_place=True, show_progress=True)

In [17]:
# Inspect the metadata dict of the first node.
# NOTE(review): this cell's execution count (17) is lower than the ingestion-pipeline
# cell's (18), so the saved output below it may predate the pipeline run — re-run to confirm.
nodes[0].metadata

{'file_path': 'data\\energy_cost_related.json',
 'file_name': 'energy_cost_related.json',
 'file_type': 'application/json',
 'file_size': 7409,
 'creation_date': '2024-11-11',
 'last_modified_date': '2024-11-11',
 'machines': 'Ragnarok'}