In [1]:
# Hugging Face with langchain

In [5]:
# Standard library: used to read credentials from the process environment.
import os 
# python-dotenv helpers for locating and loading a .env file.
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(); override=True is
# passed so values from the file take precedence over already-set variables.
load_dotenv(find_dotenv(), override=True)

# Hugging Face API token, read by later cells. Indexing os.environ (rather than
# using .get) raises KeyError immediately if the variable is missing.
hf_key = os.environ["HUGGINGFACEHUB_API_TOKEN"]

In [13]:
def chunk_data(data, file_name=None, chunk_size=256, chunk_overlap=20):
    """Split loaded documents into overlapping chunks.

    Args:
        data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        file_name: Unused by this function. Kept (now optional) so existing
            call sites that pass it keep working.
        chunk_size: Maximum chunk size forwarded to the splitter.
        chunk_overlap: Overlap between consecutive chunks forwarded to the
            splitter. Defaults to 20, the value that was previously fixed.

    Returns:
        The chunks produced by the splitter for ``data``.
    """
    # Imported locally so the cell can be run without a notebook-wide import.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

In [10]:
# Use this function only if you have created a new index and are storing the documents in the vector store.
def embedding_and_storing(index_name, chunks):
    """Embed ``chunks`` with a Hugging Face endpoint model and upload them to Pinecone.

    Args:
        index_name: Name of the Pinecone index passed to
            ``PineconeVectorStore.from_documents``.
        chunks: Already-chunked documents to embed and store.

    Returns:
        The ``PineconeVectorStore`` returned by ``from_documents``.

    Note:
        Reads the notebook-level ``hf_key`` for the Hugging Face API token, and
        prints a status line before and after the upload.
    """
    from langchain_pinecone import PineconeVectorStore
    from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings

    # Collect the embedding settings first, then build the embedder from them.
    embedder_settings = {
        "model": "mixedbread-ai/mxbai-embed-large-v1",
        "task": "feature-extraction",
        "huggingfacehub_api_token": hf_key,
    }
    hf_embedder = HuggingFaceEndpointEmbeddings(**embedder_settings)

    print("only applicable if you have created a new Index....")
    store = PineconeVectorStore.from_documents(
        documents=chunks, embedding=hf_embedder, index_name=index_name
    )
    print("Documents successfully uploaded to pinecone!!")
    return store

In [11]:
from pinecone import Pinecone

# Notebook-level Pinecone client.
# The constructor takes the key as `api_key`; any other keyword name does not
# set the client's API key. os.environ.get returns None when the variable is
# unset, so a missing key surfaces from the client rather than as a KeyError here.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

In [None]:
def creating_new_index(index_name, dimension=1024):
    """Create a serverless Pinecone index named ``index_name`` if it does not exist.

    Args:
        index_name: Name of the index to look up and, when absent, create.
        dimension: Vector dimension for a newly created index. Defaults to
            1024 to match ``mixedbread-ai/mxbai-embed-large-v1``, the embedding
            model used in this notebook; pass another value when pairing the
            index with a different embedding model.

    Returns:
        None. Progress is reported with ``print``.

    Note:
        The API key is read from the ``PINECONE_API_KEY`` environment variable.
        New indexes use the cosine metric on AWS ``us-east-1``.
    """
    from pinecone import Pinecone, ServerlessSpec

    client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

    # list_indexes() returns index descriptions rather than plain strings, so
    # membership has to be tested against the index names.
    existing_names = client.list_indexes().names()
    if index_name in existing_names:
        print(f"Index {index_name} already exists.....", end='')
        return

    # The index was not found, so create it.
    print(f"Creating an index name...........{index_name}")
    client.create_index(
        name=index_name,
        dimension=dimension,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )
    print("Done creating Index..")