In [1]:
# llama-parse is async-first; running its synchronous entry points inside a
# notebook requires nest_asyncio, so apply it once before anything else runs.
import nest_asyncio
nest_asyncio.apply()

In [2]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core import SimpleDirectoryReader
import json
import os

In [3]:
# EMBEDDING MODEL
# OpenAI embedding model handed to the semantic splitter below.
embed_model = OpenAIEmbedding(model='text-embedding-3-large')

# SEMANTIC SPLITTER
# NOTE(review): per the LlamaIndex docs, buffer_size is the number of neighbouring
# sentences grouped when comparing similarity, and breakpoint_percentile_threshold
# is the dissimilarity percentile that triggers a new chunk -- confirm against the
# installed llama-index version.
splitter = SemanticSplitterNodeParser(
    embed_model=embed_model,
    buffer_size=1,
    breakpoint_percentile_threshold=95,
)

In [None]:
# txt_dir = './S01_LlamaParsedMarkdownText/'
# lstxt = [f for f in os.listdir(txt_dir) if f.endswith('.txt')]
# lstxt

['Industrial_Gas_Turbines_AMY_Razak.txt',
 'msd_servo_drive.txt',
 'pcs7_compendium_part_a_en-US_en-US.txt',
 'pcs7_compendium_part_b_en-US_en-US.txt']

In [4]:
# READ EVERY FILE FOUND IN THE PARSED-TEXT FOLDER
documents = SimpleDirectoryReader(
    input_dir='./S01_LlamaParsedMarkdownText/',
).load_data()
# SHOW THE DOCUMENT COUNT, THEN THE SOURCE FILE NAME OF EACH DOCUMENT
print(len(documents))
[loaded.metadata['file_name'] for loaded in documents]

4


['Industrial_Gas_Turbines_AMY_Razak.txt',
 'msd_servo_drive.txt',
 'pcs7_compendium_part_a_en-US_en-US.txt',
 'pcs7_compendium_part_b_en-US_en-US.txt']

In [None]:
# SEMANTIC CHUNKING
# Split each loaded document into nodes with `splitter` and save the nodes as
# JSON, one output file per document, named after the document's source file.
output_dir = './S02_SemanticChunkedJson'
# Create the output folder up front so open() below does not raise
# FileNotFoundError when the folder does not exist yet.
os.makedirs(output_dir, exist_ok=True)

for i, doc in enumerate(documents):
    # PREP
    filename = doc.metadata['file_name']
    print(i,filename)
    # SEMANTIC SPLIT
    nodes = splitter.get_nodes_from_documents([doc])
    nodes_json = [node.to_dict() for node in nodes]
    # SAVE JSON
    # Swap only the final extension: str.replace('.txt', '.json') would also
    # rewrite a '.txt' in the middle of the name and would leave a non-.txt
    # name unchanged, writing JSON under the wrong extension.
    output_name = os.path.splitext(filename)[0] + '.json'
    output_path = f"{output_dir}/{output_name}"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(nodes_json, f)
    print(i,output_name)
    print('')

In [None]:
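# OPTIONAL CHECK: reload saved nodes from JSON (output_path is whatever the save loop above set last)
# from llama_index.core.schema import TextNode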

# with open(output_path) as f:
#     nodes_json = json.load(f) 

# nodes = [TextNode.from_dict(node_dict) for node_dict in nodes_json]