In [8]:
import os
from pathlib import Path
from llama_index.llms.ollama import Ollama
from llama_index.core import VectorStoreIndex
from llama_index.core.embeddings import resolve_embed_model
from llama_index.readers.json import JSONReader
from llama_index.core.node_parser import JSONNodeParser
from llama_index.readers.file import FlatReader



In [9]:
embed_model = resolve_embed_model("local:BAAI/bge-m3")

In [10]:
llm = Ollama(model="mistral", request_timeout=180.0)

In [21]:
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(
    input_files=["./data/energy_cost_related.json", 
                 "./data/production_related.json",
                 "./data/time_related.json"
                ]
).load_data()

In [4]:
documents[0]

Document(id_='6ea4ef66-06fa-4c12-965c-9028075ce587', embedding=None, metadata={'file_path': 'data\\energy_cost_related.json', 'file_name': 'energy_cost_related.json', 'file_type': 'application/json', 'file_size': 7409, 'creation_date': '2024-11-11', 'last_modified_date': '2024-11-11'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='{\n    "kpi": {\n      "consumption": {\n        "average": [\n          { "machine": "Assembly Machine 1", "month": "2024-03", "value": 0.0 },\n          { "machine": "Assembly Machine 2", "month": "2024-03", "value": 0.0 },\n          { "machine": "Assembly Machine 3", "month": "2024-03", "value": 0.0 },\n          { "machine": "Large Capacity Cutting Machine 1", "month": "2024-03", "value": 0.0013015137854767644 },\n      

In [11]:
DATA_PATH = "./data/"

# creating the documents out of the json files
documents = []
for filename in os.listdir(DATA_PATH):
    if filename.endswith(".json"):
        file_path = os.path.join(DATA_PATH, filename)
        documents.extend(FlatReader().load_data(Path(file_path)))     # if we want to load the data to then split it into nodes
        # documents.extend(reader.load_data(input_file=file_path))
parser = JSONNodeParser(include_metadata=True,
                        include_prev_next_rel=True)
nodes = parser.get_nodes_from_documents(documents) 


In [8]:
from llama_index.core.node_parser import SimpleNodeParser

parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents)
len(nodes)

9

In [9]:
nodes[0]

TextNode(id_='2cb629c8-a767-4018-8e2b-b7d700fc8ede', embedding=None, metadata={'file_path': 'data\\energy_cost_related.json', 'file_name': 'energy_cost_related.json', 'file_type': 'application/json', 'file_size': 7409, 'creation_date': '2024-11-11', 'last_modified_date': '2024-11-11', 'machines': 'Ragnarok'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6ea4ef66-06fa-4c12-965c-9028075ce587', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data\\energy_cost_related.json', 'file_name': 'energy_cost_related.json', 'file_type': 'application/json', 'file_size': 7409, 'creation_date': '2024-11-11', 'last_modified_date': '2024-11-11', 'machines': 'Ragnarok'}, hash='6093478eda45a44f3868b0ccad298913

In [6]:
import nest_asyncio
import nltk
nltk.download('punkt_tab')
nest_asyncio.apply()

In [18]:
documents= [documents[0]]

In [19]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
from llama_index.extractors.entity import EntityExtractor

from llama_index.core.node_parser import TokenTextSplitter

from llama_index.core.ingestion import IngestionPipeline


text_splitter = TokenTextSplitter(
    # separator=" ", 
    chunk_size=512, 
    chunk_overlap=128
)

#if you wanna create some custom extractor

# class CustomExtractor(BaseExtractor):
#     def extract(self, nodes):
#         metadata_list = [
#             {
#                 "custom": (
#                     node.metadata["document_title"]
#                     + "\n"
#                     + node.metadata["excerpt_keywords"]
#                 )
#             }
#             for node in nodes
#         ]
#         return metadata_list

transformations = [
    text_splitter,
    # TitleExtractor(nodes=3,llm=llm),
    QuestionsAnsweredExtractor(questions=2,llm=llm),
    SummaryExtractor(summaries=["prev", "self"],llm=llm),
    KeywordExtractor(keywords=4,llm=llm),
    EntityExtractor(prediction_threshold=0.5,llm=llm),
]


pipeline = IngestionPipeline(
    transformations=transformations
)

nodes = pipeline.run(
    documents=documents,
    in_place=True,
    show_progress=True,
)

Parsing nodes: 100%|█████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.22it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.29it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:51<00:00,  2.85s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:59<00:00,  3.29s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:16<00:00,  1.12it/s]
Extracting entities: 100%|█████████████████████████████████████████████████████████████| 18/18 [01:12<00:00,  4.02s/it]


In [20]:
nodes[0].metadata

{'filename': 'monthly_Large_Capacity_Cutting_Machine_2.json',
 'extension': '.json',
 'document_title': '2024-March Performance Analysis of Large Capacity Cutting Machine 2: Good Cycles, Bad Cycles, Production, Consumption, Average Cycle Time, and Identification of Bad Cycles Data\n\nThis title provides a clear overview of the content, including the specific machine (Large Capacity Cutting Machine 2), the time period (March 2024), the types of data presented (Good Cycles, Bad Cycles, Production, Consumption, Average Cycle Time, and Bad Cycles Data), and the purpose of the analysis.',
 'questions_this_excerpt_can_answer': '1. What was the average cycle time, minimum cycle time, and maximum cycle time for Large Capacity Cutting Machine 2 in March 2024 according to the given data?\n\n2. How many bad cycles, on average, were recorded for Large Capacity Cutting Machine 2 during the month of March 2024 based on this analysis?\n\n(Summarizing the context: The document contains performance ana

In [21]:
len(nodes)

18