### Indexing

In [29]:
# Standard library
import getpass
import json
import os
import time
from typing import Sequence
from uuid import uuid4
from uuid import NAMESPACE_URL, uuid5

# Third-party
import bs4
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
#!pip install langchain_pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone
from pinecone import ServerlessSpec
from typing_extensions import Annotated, TypedDict

In [30]:
# Credentials are read from local JSON files under secrets/ so that no key is
# ever written into the notebook itself.
with open("secrets/openai_key.json", "r", encoding="utf-8") as f:
    openai_secrets = json.load(f)
# OpenAI() below is constructed without an explicit key, so the key is handed
# over through the environment.
os.environ["OPENAI_API_KEY"] = openai_secrets["openai_api_key"]
client = OpenAI()

with open("secrets/pinecone_key.json", "r", encoding="utf-8") as f:
    pinecone_secrets = json.load(f)
pc = Pinecone(api_key=pinecone_secrets["pinecone_api_key"])

index_name = "langchain-test-index"  # change if desired

# The index has to exist before pc.Index() can open it, so create it here when
# it is missing. Without this, a fresh Pinecone project makes this cell fail on
# Restart & Run All.
if index_name not in [index_info["name"] for index_info in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        # Must equal the embedding size; 3072 is the default output size of
        # text-embedding-3-large, the model configured below.
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Creation is asynchronous: poll until the index reports ready.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

# Retriever that returns at most 3 chunks and drops any whose similarity score
# is below 0.5.
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.5},
)

In [31]:
# Make sure the Pinecone index exists: collect the names of all indexes in the
# project and create ours only when it is not among them.
existing_indexes = [entry["name"] for entry in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # 3072 matches the default vector size of text-embedding-3-large, the
        # model configured for `embeddings`.
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the new index reports that it is ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

In [32]:
# Load every supported file in ./data, split its text into overlapping chunks
# and wrap each chunk in a Document tagged with the file it came from.
#
# Recursive character splitting divides the input text into smaller chunks of
# similar size, hierarchically and iteratively, using a set of separators.
data_dir = "./data"

# The splitter settings are the same for every file, so build it once instead
# of once per file.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

formatted_documents = []
# os.listdir() returns names in arbitrary order; sorting keeps the chunk order
# identical from run to run.
for file in sorted(os.listdir(data_dir)):
    path = os.path.join(data_dir, file)
    # Compare the real extension so that a name such as "notes_pdf" (no dot)
    # is not mistaken for a PDF.
    extension = os.path.splitext(file)[1].lower()

    if extension == ".pdf":
        # The loader yields a list of Documents (one per page in its default
        # mode). Join them with a newline so the last word of one page is not
        # fused to the first word of the next.
        pages = PyPDFLoader(path).load()
        text = "\n".join(page.page_content for page in pages)
    elif extension == ".txt":
        # Explicit encoding: the platform default differs between systems.
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        print(f"File {file} type not supported")
        continue

    # One Document per chunk; "source" records the originating file name.
    formatted_documents.extend(
        Document(page_content=chunk, metadata={"source": file})
        for chunk in text_splitter.split_text(text)
    )

In [None]:
# Give every chunk an ID derived from its source file and its position within
# that file. Random uuid4 IDs differ on every run, so re-executing the
# notebook would store a second copy of every chunk; name-based uuid5 IDs are
# the same each time, so a re-run writes to the same records (Pinecone upserts
# by ID).
chunks_seen_per_source = {}
uuids = []
for document in formatted_documents:
    source = document.metadata["source"]
    position = chunks_seen_per_source.get(source, 0)
    chunks_seen_per_source[source] = position + 1
    uuids.append(str(uuid5(NAMESPACE_URL, f"{source}#{position}")))

# Nothing to embed or upload when no supported file was found.
if formatted_documents:
    vector_store.add_documents(documents=formatted_documents, ids=uuids)

In [None]:
# Show only the first few chunks as a sanity check; displaying the whole list
# would flood the cell output with the full text of every document.
formatted_documents[:3]