In [11]:
import pandas as pd
import chromadb
from openai import OpenAI
import os
from dotenv import load_dotenv
from chromadb.utils import embedding_functions
load_dotenv()
openai_client = OpenAI(api_key=os.getenv('OPEN_AI_API_KEY'))
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI


# Imports to build a complete chat model through the langchain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain


In [13]:
# Fetch the OpenAI key from the environment (load_dotenv() ran in the import
# cell) and create the chat model used in the rest of the notebook.
api_key = os.environ.get('OPEN_AI_API_KEY')
model = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=api_key,
)


  model = ChatOpenAI(api_key=api_key, model="gpt-4o-mini")


In [17]:
# Load the *.txt files found in the text-files folder, reading each one with
# TextLoader. `document` holds the loaded (unsplit) documents.
loader = DirectoryLoader(
    "../data/text-files/",
    glob="*.txt",
    loader_cls=TextLoader,
)
document = loader.load()

In [21]:

# Chunking configuration: break on blank lines first, then on single newlines,
# with a target chunk size of 1000 characters and a 20-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    separators=["\n\n", "\n"],
)

In [24]:
# Split the loaded files into smaller chunks with the text_splitter instance.
# Note: `document` (the loaded files) and `documents` (the resulting chunks)
# are two different variables. In the original run, 21 files became 223 chunks.
documents = text_splitter.split_documents(documents=document)

In [32]:
# Embedding function backed by OpenAI; it is handed to the vector store, which
# uses it to embed the chunks and the queries.
embedding = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=api_key,
)

  embedding = OpenAIEmbeddings(api_key=api_key, model="text-embedding-3-small")


In [38]:
# Build the Chroma vector store (langchain_community.vectorstores) in its own
# directory, separate from any other store used in this project.
persits_directory = '../data/chroma_db_real_world.db'

# Re-running Chroma.from_documents against an existing persist directory embeds
# every chunk again and appends it to the stored collection, so each re-run of
# this cell would add duplicates (and cost another round of embedding calls).
# Reopen the persisted store when it already exists; build it only once.
# Delete the directory to force a rebuild after the source files change.
if os.path.isdir(persits_directory) and os.listdir(persits_directory):
    vectordb = Chroma(
        persist_directory=persits_directory, embedding_function=embedding
    )
else:
    vectordb = Chroma.from_documents(
        documents=documents, embedding=embedding, persist_directory=persits_directory
    )  # Creates the Chroma store and persists the embeddings to the directory


In [40]:
# Query the vector store for the chunks most similar to a question.
# `retriever` keeps the default search settings for use elsewhere in the notebook.
retriever = vectordb.as_retriever()

# The number of results is a retriever setting (search_kwargs), not an argument
# of invoke(): depending on the LangChain version, a `k` passed to invoke() may
# be silently ignored. Use a retriever configured for the top 2 hits instead.
res_docs = vectordb.as_retriever(search_kwargs={"k": 2}).invoke(
    "how much did microsoft raise?"
)
print(res_docs[0].page_content)

[Document(metadata={'source': '..\\data\\text-files\\05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt'}, page_content='April 28, 2023\n\nVC firms including Sequoia Capital, Andreessen Horowitz, Thrive and K2 Global are picking up new shares, according to documents seen by TechCrunch. A source tells us Founders Fund is also investing. Altogether the VCs have put in just over $300 million at a valuation of $27 billion to $29 billion. This is separate to a big investment from Microsoft announced earlier this year, a person familiar with the development told TechCrunch, which closed in January. The size of Microsoft’s investment is believed to be around $10 billion, a figure we confirmed with our source.\n\nApril 25, 2023\n\nCalled ChatGPT Business, OpenAI describes the forthcoming offering as “for professionals who need more control over their data as well as enterprises seeking to manage their end users.”'), Document(metadata={'source': '..\\data\\text-files\\05

In [57]:
# Show every retrieved chunk that may contain the answer to the question asked
# above. A later cell answers the question from these chunks with gpt-4o-mini,
# the affordable model.
for position, doc in enumerate(res_docs, start=1):
    print(f'Document: {position}')
    print(doc.page_content)
    print('****')

Document: 1
April 28, 2023

VC firms including Sequoia Capital, Andreessen Horowitz, Thrive and K2 Global are picking up new shares, according to documents seen by TechCrunch. A source tells us Founders Fund is also investing. Altogether the VCs have put in just over $300 million at a valuation of $27 billion to $29 billion. This is separate to a big investment from Microsoft announced earlier this year, a person familiar with the development told TechCrunch, which closed in January. The size of Microsoft’s investment is believed to be around $10 billion, a figure we confirmed with our source.

April 25, 2023

Called ChatGPT Business, OpenAI describes the forthcoming offering as “for professionals who need more control over their data as well as enterprises seeking to manage their end users.”
****
Document: 2
Microsoft doubles down on AI with new Bing features The company's betting the farm on generative AI

Microsoft is embarking on the next phase of Bing’s expansion. And — no surpris