Notebook demonstrating document indexing using LlamaIndex, Pinecone, and OpenAI.

# Imports

In [None]:
import os

import llama_index
import openai
from llama_index.core import (
    Document,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.indices.service_context import ServiceContext
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI as OpenAILLM
from llama_index.vector_stores.pinecone import PineconeVectorStore
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Read the API keys from the environment so they are not hard-coded in the notebook.
# A missing variable raises KeyError right here.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
PINECONE_KEY = os.environ["PINECONE_KEY"]

In [None]:
# Set the key as a module-level attribute on the openai package.
openai.api_key = OPENAI_API_KEY

In [None]:
# Pinecone client authenticated with PINECONE_KEY.
pinecone_client = Pinecone(api_key=PINECONE_KEY)

# Read data

In [None]:
# Path of the PDF to index (relative, so it is resolved against the working directory).
# Source of the report:
# https://aiindex.stanford.edu/wp-content/uploads/2023/04/HAI_AI-Index-Report_2023.pdf
pdf_file = "HAI_AI-Index-Report_2023.pdf"

In [None]:
# Reader restricted to the single file named by pdf_file.
reader = SimpleDirectoryReader(input_files=[pdf_file])

In [None]:
# Load the file into a collection of documents
# (presumably one document per PDF page; confirm against the reader's behaviour).
data = reader.load_data()

In [None]:
# Number of documents the reader produced.
len(data)

In [None]:
# Collapse the documents spread across pages into a single Document,
# separating the individual texts with a blank line.
document = Document(
    text="\n\n".join(doc.text for doc in data),
)

# Indexing

In [None]:
# Settings shared by the Pinecone index and the LLM.
metric = "cosine"  # similarity metric used when the Pinecone index is created
dim = 1536  # vector dimension of the Pinecone index; presumably the output size of the default OpenAIEmbedding model (confirm)
model = "gpt-3.5-turbo"
# max_tokens caps the length of a completion. The previous value (10240) is
# larger than the 4,096 completion tokens gpt-3.5-turbo allows, so any request
# made with this LLM would be rejected by the API. 1024 stays well inside the
# limit and leaves room in the context window for the retrieved text.
llm = OpenAILLM(model=model, temperature=0, max_tokens=1024, api_key=OPENAI_API_KEY)

In [None]:
# Sentence-window retrieval: node parser configured with a window size and with
# the metadata keys used for the window and for the original text.
# NOTE(review): window_size presumably counts sentences on each side of the
# target sentence; confirm against the SentenceWindowNodeParser documentation.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=2,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [None]:
# Bundle the LLM, an OpenAIEmbedding created with default arguments, and the
# sentence-window node parser into one context object for the index build.
# NOTE(review): ServiceContext is deprecated in recent llama_index releases
# (Settings is the replacement); confirm against the installed version.
sentence_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=OpenAIEmbedding(),
    node_parser=node_parser,
)

In [None]:
index_name = "rag-demo"

# Create the serverless index only when it does not exist yet. An unconditional
# create_index call fails when the name is already taken, which made this cell
# impossible to re-run after the first successful execution.
if index_name not in pinecone_client.list_indexes().names():
    pinecone_client.create_index(
        name=index_name,
        dimension=dim,
        metric=metric,
        spec=ServerlessSpec(cloud="aws", region="us-west-2"),
    )

In [None]:
# Handle to the Pinecone index named by index_name, wrapped as a LlamaIndex vector store.
pinecone_index = pinecone_client.Index(index_name)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
# Storage context backed by that vector store; it is passed to the index build.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Build the vector index from the single merged document, using the
# Pinecone-backed storage context and the sentence-window service context.
# NOTE(review): presumably this parses, embeds and uploads the nodes to
# Pinecone, i.e. it makes network calls to OpenAI and Pinecone; confirm.
index = VectorStoreIndex.from_documents(
    [document],
    storage_context=storage_context,
    service_context=sentence_context,
)