In [1]:
import os
import time
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec

# Vector size produced by our embedding model.
VECTOR_DIMENSION = 768
# Name of the Pinecone index used by this notebook.
INDEX_NAME = "thtpledgeinfo"

# Pull variables from a local .env file into the process environment.
load_dotenv()

from langchain_groq import ChatGroq

# GROQ_API_KEY must be set in the environment (create a .env file for this).
key = os.environ.get("GROQ_API_KEY")
llm = ChatGroq(
    model="llama3-8b-8192",
    api_key=key,
)

  from tqdm.autonotebook import tqdm


In [None]:
# Load every .txt file found (recursively) under docs/.
loader = DirectoryLoader(path='docs/', glob="**/*.txt")
docs = loader.load()

# Sentence-transformers model used to embed the chunks.
embed = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Chunk the documents before embedding.
# May want to play around with chunking sizes to make it run better.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Vector store backend: Pinecone.
# (A ChromaDB variant, `Chroma.from_documents(documents=splits, embedding=embed)`,
# and the legacy `pinecone.init(...)` call were tried previously and are not used.)
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
spec = ServerlessSpec(
    cloud='aws',
    region='us-east-1',
)

In [None]:
# Drop the existing index (if any) so the index is rebuilt from scratch.
# Deletion is best-effort: a failure is reported and the cell continues.
# Catch `Exception` rather than using a bare `except:` so that
# KeyboardInterrupt / SystemExit still propagate, and show the actual reason
# instead of hiding it.
try:
    pc.delete_index(INDEX_NAME)
except Exception as exc:
    print(f"Could not delete index: {exc}")

# Create a new index.
print("Creating index...")
pc.create_index(
    INDEX_NAME,
    dimension=VECTOR_DIMENSION,  # must match the embedding model's vector size
    metric='cosine',
    spec=spec
)