In [1]:
from dotenv import load_dotenv
import os

# Pull the settings stored in the local .env file into the process environment.
load_dotenv()

# Read the OpenAI key back out of the environment (None when it is not set).
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Set the LangChain tracing flag and endpoint variables in one step, then read
# the LangChain key the same way as the OpenAI one.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
})
langchain_api_key = os.environ.get("LANGCHAIN_API_KEY")

In [2]:
import spacy

# Loaded spaCy pipelines keyed by model name, so a model is read from disk only
# once per kernel session instead of on every extract_topic() call.
_NLP_CACHE = {}


def _get_nlp(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for `model_name`, loading it on first use."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def extract_topic(sentence):
    """Build a short topic string from the nouns in `sentence`.

    The sentence is run through the "en_core_web_sm" spaCy pipeline and only
    tokens tagged as NOUN that are neither stopwords nor punctuation are kept.

    Args:
        sentence: Text to analyse.

    Returns:
        The kept token texts, in their original order, joined by single
        spaces. The result is an empty string when no token qualifies.
    """
    doc = _get_nlp()(sentence)

    # Keep nouns only, dropping stopwords and punctuation.
    keywords = [
        token.text
        for token in doc
        if token.pos_ == 'NOUN' and not token.is_stop and not token.is_punct
    ]

    return ' '.join(keywords)

# Demo: derive a topic from a free-text request and tag it with its domain.
sentence = "Help me do better demand forecasting and inventory management"
domain = 'e-commerce'
topic = f"{extract_topic(sentence)} in {domain}"
print("Topic:", topic)

Topic: forecasting inventory management in e-commerce


In [2]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import chromadb.utils.embedding_functions as embedding_functions
import os
import glob




In [3]:
import os
import glob

# Point 'folder_path' at the folder that holds the PDF knowledge base.
folder_path = './pdf_knowledge_base'
pdf_files = glob.glob(folder_path + '/*.pdf')

# Show each matched path next to its bare file name.
for path in pdf_files:
    file = os.path.basename(path)
    print(path, file)


./pdf_knowledge_base\test1.pdf test1.pdf
./pdf_knowledge_base\test2.pdf test2.pdf
./pdf_knowledge_base\test3.pdf test3.pdf


In [4]:
folder_path = './pdf_knowledge_base'
# Sorted so the knowledge bases are built (and printed) in a stable order;
# glob returns its matches in arbitrary order.
pdf_files = sorted(glob.glob(folder_path + '/*.pdf'))

# Neither the splitter nor the embedding model depends on the file being
# processed, so build each of them once instead of once per PDF. Creating
# `embedding` before the loop also guarantees the name exists for later cells
# even when the folder contains no PDFs.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)

## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embedding = OpenAIEmbeddings()

# Build one persisted Chroma store per PDF, named after the file.
# NOTE(review): re-running this cell against an existing persist_directory
# presumably adds the same chunks again -- confirm, and clear ./db first if so.
for path in pdf_files:
    file = os.path.basename(path)
    file_name_without_extension = os.path.splitext(file)[0]

    # Load the PDF into LangChain documents
    loader = PyPDFLoader(path)
    docs = loader.load()

    # Split the documents into small overlapping chunks
    splits = text_splitter.split_documents(docs)

    # Embed and store the chunks.
    # Supplying a persist_directory stores the embeddings on disk.
    persist_directory = './db/'+file_name_without_extension+'_Knoweldge_base'

    print(persist_directory)

    vectordb = Chroma.from_documents(documents=splits,
                                     embedding=embedding,
                                     persist_directory=persist_directory)
    vectordb.persist()
    # Drop the reference to this store before moving on to the next file.
    vectordb = None

./db/test1_Knoweldge_base
./db/test2_Knoweldge_base
./db/test3_Knoweldge_base


In [5]:
# Re-open one of the knowledge bases that was persisted to disk; from here on
# it is used like any other vector store. The reference is cleared first,
# then rebound to the store opened from ./db/test1_Knoweldge_base.
vectordb = None
vectordb = Chroma(
    persist_directory="./db/test1_Knoweldge_base",
    embedding_function=embedding,
)

In [6]:
# Expose the vector store through a retriever object. as_retriever() is called
# with no arguments, so no search options are overridden here.
retriever = vectordb.as_retriever()

In [7]:
#### RETRIEVAL and GENERATION ####

# Prompt: pulled from the LangChain hub by name. The chain built below in this
# cell supplies it with "context" and "question" inputs.
prompt = hub.pull("rlm/rag-prompt")

# LLM: chat model that turns the filled-in prompt into an answer.
# temperature=0 is presumably chosen to keep answers as repeatable as possible.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Post-processing
def format_docs(docs):
    """Join the `page_content` of each document into one string.

    Args:
        docs: Iterable of objects that expose a `page_content` string.

    Returns:
        The page contents in input order, separated by a blank line
        ("\\n\\n"). An empty iterable yields an empty string.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# Question to put to the knowledge base.
question = "what is Procurement and Supplier Management?"

# Chain: the incoming question is passed through unchanged under "question"
# and also sent to the retriever, whose documents are flattened by format_docs
# into "context". Both feed the prompt, then the LLM, and the reply is parsed
# to a plain string.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Kept as the last expression so the notebook displays the answer.
rag_chain.invoke(question)

'Procurement and Supplier Management involves managing procurement processes and relationships with suppliers. It includes activities such as issuing purchase orders and monitoring inventory levels. Effective management can lead to better supply chain management and faster order fulfillment.'