In [None]:
import os
import sys
import logging
import openai
from dotenv import load_dotenv
from llama_index import (
    VectorStoreIndex,
    LLMPredictor,
    ServiceContext,
    StorageContext,
    PromptHelper,
)
from langchain.llms.openai import OpenAIChat
from langchain import OpenAI

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Hand the OpenAI key to the openai client; the value is None when the
# OPENAI_API_KEY variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# log
# Route INFO-and-above records from the root logger to stdout.
# basicConfig() does nothing when the root logger already has a handler, so a
# handler must not be attached before calling it (otherwise the INFO level is
# never applied). force=True also replaces handlers left over from a previous
# run of this cell, so re-running it does not produce duplicated log lines.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [None]:
import pinecone
from llama_index.vector_stores import PineconeVectorStore

# Connect to Pinecone with credentials read from the environment.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT")
pinecone.init(
    environment=pinecone_environment,
    api_key=pinecone_api_key,
)

# Get a handle on the index named by the PINECONE_INDEX variable.
pinecone_index = pinecone.Index(os.environ.get("PINECONE_INDEX"))

# Check the current state of the Pinecone index (displayed as the cell output).
pinecone_index.describe_index_stats()

In [None]:
# Collect the names of the entries in ./pdf that end with ".pdf" and show them.
file_names = list(
    filter(lambda entry: entry.endswith(".pdf"), os.listdir("./pdf"))
)
print(file_names)

In [None]:
from pathlib import Path
from llama_index import download_loader

# Directory holding the source PDFs. The same location is used for listing and
# for loading, so the cell no longer depends on one machine's absolute path.
PDF_DIR = Path("./pdf")

file_names = [
    file_name
    for file_name in os.listdir(PDF_DIR)
    if file_name.endswith(".pdf")
]

PDFReader = download_loader("PDFReader")
loader = PDFReader()

# prod_descs maps each PDF file name to whatever the loader returned for it;
# all_docs is the same content flattened into one list, in file_names order.
prod_descs = {}
all_docs = []
for file_name in file_names:
    prod_descs[file_name] = loader.load_data(PDF_DIR / file_name)
    all_docs.extend(prod_descs[file_name])

In [None]:
import tiktoken

# Build the Pinecone-backed vector store.
cl100k_encoding = tiktoken.get_encoding("cl100k_base")
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    tokenizer=cl100k_encoding,
)

# Contexts handed to the index: storage points at the Pinecone vector store,
# and the service context is created with a chunk overlap of 50.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(chunk_overlap=50)

# NOTE(review): presumably this embeds all_docs and writes the vectors to
# Pinecone — confirm before re-running, as it may repeat the upload.
index = VectorStoreIndex.from_documents(
    documents=all_docs,
    service_context=service_context,
    storage_context=storage_context,
)

In [None]:
# Split the loaded documents into nodes using the service context's node parser.
nodes = service_context.node_parser.get_nodes_from_documents(all_docs)

In [None]:
# Number of nodes produced from the documents.
len(nodes)

In [None]:
# Re-check the Pinecone index statistics.
pinecone_index.describe_index_stats()

# vector DB에서 metadata filtering으로 vector 불러오기  
각 vector의 metadata에는 원본 text, PDF 페이지, PDF 파일 이름, 앞뒤 노드에 관한 정보가 들어 있다.  
불러온 벡터를 노드로 만든 뒤 list index, tree index 등으로 변환하는 방법이 있을까?

In [13]:
import pinecone
from llama_index.vector_stores import PineconeVectorStore

# Connect to Pinecone with credentials read from the environment.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT")
pinecone.init(
    environment=pinecone_environment,
    api_key=pinecone_api_key,
)

# Get a handle on the index named by the PINECONE_INDEX variable.
pinecone_index = pinecone.Index(os.environ.get("PINECONE_INDEX"))

# Check the current state of the Pinecone index (displayed as the cell output).
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [28]:
from pathlib import Path
from llama_index import download_loader

# Directory holding the source PDFs. The same location is used for listing and
# for loading, so the cell no longer depends on one machine's absolute path.
PDF_DIR = Path("./pdf")

file_names = [
    file_name
    for file_name in os.listdir(PDF_DIR)
    if file_name.endswith(".pdf")
]

PDFReader = download_loader("PDFReader")
loader = PDFReader()

# prod_descs maps each PDF file name to whatever the loader returned for it;
# all_docs is the same content flattened into one list, in file_names order.
prod_descs = {}
all_docs = []
for file_name in file_names:
    prod_descs[file_name] = loader.load_data(PDF_DIR / file_name)
    all_docs.extend(prod_descs[file_name])

In [36]:
# Split the loaded documents into nodes using the service context's node parser.
# NOTE(review): service_context is not created in this section; this relies on
# the earlier cell that builds it having been run in the same kernel.
nodes = service_context.node_parser.get_nodes_from_documents(all_docs)

In [35]:
# Re-check the Pinecone index statistics.
pinecone_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 266}},
 'total_vector_count': 266}