In [9]:
import os
from openai import OpenAI

from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

In [10]:
api_key = os.getenv("OpenAI_API_KEY")

In [11]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-3.5-turbo")

In [12]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("data/PythonNotesForProfessionals.pdf")
pages = loader.load_and_split()

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(docs, language='python',chunk_size=5000,chunk_overlap=500):
    
    python_splitter = RecursiveCharacterTextSplitter.from_language(
        language=language, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    
    texts = python_splitter.split_documents(pages)
    return texts

In [14]:
docs = split_text(pages)

In [15]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
persist_directory='chroma_db'

vector_db = Chroma.from_documents(
        documents=docs, persist_directory=persist_directory, embedding=embeddings
)

In [17]:
vector_db.persist()

In [18]:
vector_db

<langchain_community.vectorstores.chroma.Chroma at 0x7f3d601cb4d0>

In [19]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma('chroma_db',embeddings)

In [25]:
db.as_retriever

<langchain_community.vectorstores.chroma.Chroma at 0x7f3c7934e650>

In [20]:
query ='How do I find even numbers'

In [22]:
even = vector_db.similarity_search(query)

In [24]:
even

[]