## Step 1: Setting Up Development Environment

In [1]:
#markdown Install the libraries:
%pip install openai tiktoken llama_hub llama_index pypdf accelerate sentence_transformers llama_index neo4j 

Note: you may need to restart the kernel to use updated packages.


## 提前准备
- Docker Desktop
- Azure OpenAI 或者 GPT-4（免费的 OpenAI GPT-3.5 有 token 限制）
- Python 版本需低于 3.12（本项目使用 Python 3.11.6）

In [2]:
# Download the dataset (https://www.kaggle.com/datasets/hinepo/harry-potter-books-in-pdf-1-7) and place the PDF files in the ./data directory

# Import libraries

import os 
from pathlib import Path
import tiktoken

from llama_index.llms import AzureOpenAI


from llama_index.text_splitter import TokenTextSplitter
from llama_index.llms import AzureOpenAI , OpenAI
from llama_index import ServiceContext, PromptHelper

from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import set_global_service_context

from llama_index import Document


import logging
import sys

from IPython.display import Markdown, display

# Send INFO-level (and above) log records to stdout so they appear in the cell output.
# A single basicConfig call installs exactly one stdout handler on the root logger.
# `force=True` first removes any handlers already attached, so re-running this cell
# never stacks up extra handlers and each record is printed once rather than repeated.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,  # switch to logging.DEBUG for more verbose output
    force=True,
)

ImportError: cannot import name 'AzureOpenAI' from 'llama_index.llms' (unknown location)

### Step 2: Load Documents

In [None]:
# Azure OpenAI configuration
# os.environ['AZURE_OPENAI_API_KEY'] = "xxx"
# os.environ['AZURE_OPENAI_ENDPOINT'] = "https://xxx.openai.azure.com"
# os.environ['OPENAI_API_VERSION'] = "xxx"

# OpenAI configuration
# os.environ['OPENAI_API_KEY'] = ""

# Load the files found in the local `data` directory (reading PDFs requires `pypdf`).
documents = SimpleDirectoryReader(input_dir='data').load_data()

# Report the container type and the element type separately: `documents` is the
# collection returned by load_data(), `documents[0]` is one loaded document.
print(f" docs is a {type(documents)},  where each element is a {type(documents[0])} object")

# Output
# docs is a <class 'list'>,  where each element is a <class 'llama_index.schema.Document'> object

ImportError: pypdf is required to read PDF files: `pip install pypdf`

### Step 3: Creating Text Chunks

In [None]:
# Token-based splitter: chunk size 1024 with an overlap of 20 between neighbouring
# chunks; splits on spaces first and falls back to newlines.
text_splitter = TokenTextSplitter(
    chunk_size=1024,
    chunk_overlap=20,
    separator=" ",
    backup_separators=["\n"],
)

# Optional: run the splitter on a tiny in-memory document to check that it works.
nodes = text_splitter.get_nodes_from_documents(
    [Document(text="long text")],
    show_progress=False,
)

1022


### Step 4: Creating Knowledge Repositories

In [None]:
# Azure OpenAI configuration: the LLM used to synthesise answers.
llm = AzureOpenAI(
    engine='gpt-35-turbo-16k',
    temperature=0,
    max_tokens=256,
    use_azure_ad=False,
    timeout=60000,
)
# OpenAI configuration (alternative to the Azure deployment above)
# llm = OpenAI(model='gpt-3.5-turbo', temperature=0, max_tokens=256)

# Prompt sizing limits handed to the service context.
prompt_helper = PromptHelper(
    context_window=4096,
    num_output=256,
    chunk_overlap_ratio=0.1,
    chunk_size_limit=None,
)

# Bundle the LLM, a locally run embedding model and the prompt helper together.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    # text_splitter=text_splitter,
    prompt_helper=prompt_helper,
)

# Register the bundle as the default for calls that do not pass a service context explicitly.
set_global_service_context(service_context)

### Step 5: Build and Persist the Vector Index (default store; Neo4j is introduced in Step 7)

In [None]:
# Build a vector index over the loaded documents with the explicitly supplied service context.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

# Persist the index's storage context; no directory is passed, so the library default is used.
index.storage_context.persist()

1009
1009
1009
1009
1009


### Step 6: Query Index

In [None]:
# NOTE: this request will time out.
question = "Who did Harry Potter kill to be victorious"
query_engine = index.as_query_engine(service_context=service_context)
response = query_engine.query(question)
print(response)

# Output
# Based on the context information provided, it is not stated that Harry Potter killed anyone to be victorious

### Step 7: Initiate the Neo4j vector wrapper

In [None]:
from llama_index.vector_stores import Neo4jVectorStore

# Connection settings for the Neo4j instance. Each value can be overridden through an
# environment variable so real credentials never need to be written into the notebook;
# the fallbacks are the original local demo values.
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "pleaseletmein")
url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")

# Must equal the length of the vectors produced by the embedding model in use.
# The global service context is configured with "local:BAAI/bge-small-en-v1.5", which
# produces 384-dimensional embeddings; the previous value of 1536 did not match them.
embed_dim = 384

neo4j_vector = Neo4jVectorStore(username, password, url, embed_dim)

### Step 8: Build the VectorStoreIndex with the Graph Database

In [None]:
from llama_index.storage.storage_context import StorageContext
from IPython.display import Markdown, display

# Storage context whose vector store is the Neo4j wrapper (`neo4j_vector`).
storage_context = StorageContext.from_defaults(vector_store=neo4j_vector)

# Rebuild the index over the same documents, this time on top of that storage context.
# No service context is passed here.
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

### Step 9: Query the index:

In [None]:
# Ask a question against the current `index` and render the answer in bold.
question = "Who is Harry Potter"
query_engine = index.as_query_engine()
response = query_engine.query(question)

answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

# Output
# Harry Potter is a character in the book "Harry Potter and the Half-Blood Prince" by J.K. Rowling.
