In [1]:
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, ServiceContext, Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser
import chromadb
import os
# Fail fast with a readable message when the OpenAI key is missing.
# Writing os.getenv(...) back into os.environ is a no-op when the variable is
# set and raises an opaque TypeError when it is not (environ values must be str).
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )

# ModelScope document-segmentation pipeline; later cells call it as
# p(documents=...) to split raw text into segments.
p = pipeline(
    task=Tasks.document_segmentation,
    model='iic/nlp_bert_document-segmentation_chinese-base',
)

# On-disk Chroma collection that backs the vector index.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("bible_vector")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# llm=None disables the LLM (the cell output reports a MockLLM being used);
# only the OpenAI embedding model is configured here.
# NOTE(review): the captured stderr echoes this line as a warning source —
# presumably a ServiceContext deprecation notice; confirm against the installed
# llama_index version and consider migrating to its replacement.
service_context = ServiceContext.from_defaults(llm=None, embed_model=OpenAIEmbedding())

2024-04-27 18:28:48,093 - modelscope - INFO - PyTorch version 2.2.1+cu121 Found.
2024-04-27 18:28:48,096 - modelscope - INFO - Loading ast index from C:\Users\wangz\.cache\modelscope\ast_indexer
2024-04-27 18:28:48,400 - modelscope - INFO - Loading done! Current index file version is 1.13.1, with md5 d00a6633a83861bd55e9102c13c16e14 and a total number of 972 components indexed
2024-04-27 18:28:59,536 - modelscope - INFO - initiate model from C:\Users\wangz\.cache\modelscope\hub\iic\nlp_bert_document-segmentation_chinese-base
2024-04-27 18:28:59,537 - modelscope - INFO - initiate model from location C:\Users\wangz\.cache\modelscope\hub\iic\nlp_bert_document-segmentation_chinese-base.
2024-04-27 18:28:59,540 - modelscope - INFO - initialize model from C:\Users\wangz\.cache\modelscope\hub\iic\nlp_bert_document-segmentation_chinese-base
  service_context = ServiceContext.from_defaults(llm=None, embed_model=OpenAIEmbedding())


LLM is explicitly disabled. Using MockLLM.


In [2]:
import json

file_path = './new_bible.json'

# Open the JSON file and load its content into a dict.
# Assumes each top-level key is a book name mapping to a list of chapter
# texts — TODO confirm against the data file.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# The splitter does not depend on the loop variables, so build it once
# instead of once per chapter.
parser = SentenceSplitter()

all_nodes = []

for book, chapters in data.items():
    for chapter_idx, chapter_text in enumerate(chapters):
        # Segment the chapter with the ModelScope pipeline; the result's
        # 'text' field is split on '\n\t', blank pieces are dropped, and
        # remaining tab characters are stripped.
        result = p(documents=chapter_text)
        text_list = [
            text.replace('\t', '')
            for text in result['text'].split('\n\t')
            if text.strip()
        ]
        documents = [Document(text=t) for t in text_list]
        nodes = parser.get_nodes_from_documents(documents)
        for node in nodes:
            # "chapter" is the 0-based position of the chapter within its book.
            node.metadata["book"] = book
            node.metadata["chapter"] = chapter_idx
        all_nodes.extend(nodes)

# NOTE(review): the storage context is backed by a persistent Chroma
# collection, so re-running this cell presumably inserts the nodes again —
# confirm before re-executing.
vector_index = VectorStoreIndex(all_nodes, service_context=service_context, storage_context=storage_context)



In [3]:
# Sanity check: total number of nodes collected across all books and chapters.
len(all_nodes)

7922

In [4]:
# Preview only the first few nodes; echoing the whole list writes the repr of
# every node into the notebook output.
all_nodes[:3]

[TextNode(id_='0695c809-a030-424b-b847-6048520599cc', embedding=None, metadata={'book': '创世记', 'chapter': 0}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='4ac51723-18f4-4b28-85a8-57b3f3aaa3dd', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='dd578fcc916daabbd3e4694700b51cb512b3de08f2cf91c37ed3d428db41c126'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='2f669785-5637-47ed-8ee1-f260242b578f', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='0adaf5f091f982827bbd0d0727ebe956944dd681b58eb97e704a33bbf95394b1')}, text='第一章创造天地万物1起初，神创造天地。2地是空虚混沌；深渊上一片黑暗；神的灵运行在水面上。', start_char_idx=0, end_char_idx=45, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 TextNode(id_='2f669785-5637-47ed-8ee1-f260242b578f', embedding=None, metadata={'book': '创世记', 'chapter': 0}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], rela