In [15]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
import openai
import os
import bson
from pymongo import MongoClient


In [37]:
from pymongo import MongoClient

def loadDataFromMongo(mongo_url='mongodb://localhost:27017/', db_name='gradhack', collection_name='discovery'):
    """Fetch every document of one MongoDB collection as a list.

    NOTE: a later cell defines `loadDataFromMongo` again; once that cell has
    run, the later definition is the one in effect.

    Args:
        mongo_url: Connection string handed to `MongoClient`.
        db_name: Name of the database to open.
        collection_name: Name of the collection to read.

    Returns:
        A list holding every document yielded by `collection.find()`.
    """
    # Use the client as a context manager so the connection is closed even if
    # the query raises; previously the client was never closed.
    with MongoClient(mongo_url) as client:
        collection = client[db_name][collection_name]

        # Materialise the cursor while the connection is still open.
        return list(collection.find())


In [None]:
# Lightweight stand-in for a document object
class SimpleDocument:
    """Pair a piece of text with its metadata.

    Attributes are named `page_content` and `metadata` (presumably to match
    what the LangChain text splitter reads -- confirm against the library).
    """

    def __init__(self, text, metadata=None):
        # Fall back to a fresh dict per instance when no metadata is supplied.
        if metadata is None:
            metadata = {}
        self.page_content = text
        self.metadata = metadata

# Reduce a record to metadata values of simple types
def filter_complex_metadata(metadata):
    """Return a copy of `metadata` limited to simple values.

    Values that are `str`, `int`, `float` or `bool` are kept unchanged,
    `bson.ObjectId` values are converted with `str()`, and every other value
    (lists, nested dicts, None, ...) is left out.

    Args:
        metadata: Mapping of field name to value.

    Returns:
        A new dict, in the original key order, with only the kept fields.
    """
    primitive_types = (str, int, float, bool)
    simple = {}
    for field, raw in metadata.items():
        if isinstance(raw, primitive_types):
            simple[field] = raw
            continue
        if isinstance(raw, bson.ObjectId):
            # ObjectId is not a primitive; store its string form instead.
            simple[field] = str(raw)
    return simple

# Function to load data from MongoDB
# NOTE: this redefines the `loadDataFromMongo` from an earlier cell; once this
# cell has run, this is the definition in effect.
def loadDataFromMongo(mongo_url='mongodb://localhost:27017/', db_name='gradhack', collection_name='discovery'):
    """Fetch every document of one MongoDB collection as a list.

    Args:
        mongo_url: Connection string handed to `MongoClient`.
        db_name: Name of the database to open.
        collection_name: Name of the collection to read.

    Returns:
        A list holding every document yielded by `collection.find()`.
    """
    # Use the client as a context manager so the connection is closed even if
    # the query raises; previously the client was never closed.
    with MongoClient(mongo_url) as client:
        collection = client[db_name][collection_name]

        # Materialise the cursor while the connection is still open.
        return list(collection.find())

In [43]:


# Load data from MongoDB
data = loadDataFromMongo()


def _format_client_record(record):
    """Render a structured client record as the plain-text block to be embedded."""
    parts = [
        f"Client Name: {record.get('name', 'N/A')}\n",
        f"Client Age: {record.get('age', 'N/A')}\n",
        f"Client Address: {record.get('address', 'N/A')}\n",
    ]
    # A missing (or empty) list simply contributes no lines.
    for account in record.get('accounts') or ():
        parts.append(
            f"Account ID: {account.get('account_id', 'N/A')}, Type: {account.get('account_type', 'N/A')}, Balance: {account.get('balance', 'N/A')}\n"
        )
    for transaction in record.get('transactions') or ():
        parts.append(
            f"Transaction ID: {transaction.get('transaction_id', 'N/A')}, Account ID: {transaction.get('account_id', 'N/A')}, Date: {transaction.get('date', 'N/A')}, Amount: {transaction.get('amount', 'N/A')}, Description: {transaction.get('description', 'N/A')}\n"
        )
    return "".join(parts)


# Turn every MongoDB record into a SimpleDocument
docs = []
for doc in data:
    # Records with a 'content' field are used verbatim (assumed to hold the
    # text data); anything else is treated as structured client data.
    text = doc['content'] if 'content' in doc else _format_client_record(doc)

    # Keep 'content' out of the metadata: it is already the document text, and
    # leaving it in would attach the whole source text to every chunk produced
    # when the document is split later.
    metadata = filter_complex_metadata(
        {key: value for key, value in doc.items() if key != 'content'}
    )
    docs.append(SimpleDocument(text=text, metadata=metadata))

In [44]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Splitter configured with chunk_size=1500 and chunk_overlap=150
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1500)

# Break the loaded documents into chunks
splits = text_splitter.split_documents(documents=docs)



In [45]:
# One ID per chunk, derived from its position: doc_0, doc_1, ...
ids = [f"doc_{position}" for position, _ in enumerate(splits)]

# Embedding model handed to the vector store
embedding = OpenAIEmbeddings()

# Directory in which the Chroma vector store is persisted
persist_directory = 'docs/chroma/'

# Build the vector store from the chunks
vectordb = Chroma.from_documents(
    ids=ids,
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory,
)

# Report how many entries the underlying collection holds
print(vectordb._collection.count())

462


In [46]:
# Show only a small sample: printing every chunk floods the cell output and
# bloats the saved notebook.
print(splits[:3])

[Document(page_content="1 The followining information pertains specifically to Discovery Bank Gold account  The Discovery Bank  Gold Card Account  Qualifying income from R100,000 to R350,000 a year  R100 per month  Everyday banking with fantastic rewards     If you're looking for an account that's flexible, affordable and gives you more than just a  credit card, then the Discovery Bank Gold Card Account is the smart choice for you.  Plus, you can unlock fantastic value and rewards with your good banking behaviour.  Why are we calling it more than just a credit card? Because you get a credit card with full  transactional capabilities including the ability to set up third-party debit orders, make  digital payments and accept salary deposits. You get up to 55 days' interest-free credit  on select transactions, and you can manage your credit easily with the most flexible  credit facility in the market. Plus, you can add as many free savings accounts as you  need to achieve your goals. And 

In [47]:
# SECURITY: never write the OpenAI key into the notebook. The key that used to
# be hard-coded on this line must be treated as leaked and revoked.
# Take the key from the environment; prompt for it (without echo) only when it
# is missing, so the secret never ends up in the saved file.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [48]:
from langchain.chat_models import ChatOpenAI
# Chat model used by the QA chains below (model name kept in its own variable)
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(temperature=0, model_name=llm_name)

  warn_deprecated(


In [49]:
from langchain.chains import RetrievalQA

# Baseline retrieval-QA chain over the vector store.
# NOTE: `qa_chain` is reassigned in a later cell with a custom prompt.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [56]:
from langchain.prompts import PromptTemplate

# Build enhanced prompt
# The template carries two placeholders, {context} and {question}
# (presumably filled by the RetrievalQA chain with the retrieved text and the
# user's query -- confirm against the LangChain docs). The string is sent to
# the model as-is, so whitespace and wording inside it are runtime behaviour.
template = """
You are a highly knowledgeable and reliable financial advisor. 
Use the provided context to answer the questions accurately and professionally.
If the context does not contain enough information or you are unsure of the answer, clearly state that you don't know. 
Always prioritize the user's financial well-being and avoid making assumptions.
Always elborate in your answers and tell the user why you came up with the conclusion you came up with.

Context:
{context}

Question: {question}

Professional Answer:
"""

# Wrap the raw string in a PromptTemplate so it can be passed to the chain
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Rebuild the QA chain with the custom prompt; the result will also carry the
# source documents (return_source_documents=True).
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    retriever=vectordb.as_retriever(),
)


In [67]:
# Ask the chain a question. `invoke` replaces calling the chain object
# directly (`qa_chain({...})`), which is deprecated in LangChain 0.1+.
question = "What account do you reccomend for John Doe?"
result = qa_chain.invoke({"query": question})

# The chunks the answer was drawn from are available under
# result["source_documents"] when the chain was built with
# return_source_documents=True.
# Display the generated answer
result["result"]

"Based on the information provided, I would recommend the Discovery Bank Gold Card Account for John Doe. This account offers flexibility, affordability, and a range of benefits beyond just a credit card. With qualifying income within the specified range, John Doe would be eligible for this account. The account provides transactional capabilities, interest-free credit on select transactions, the ability to set up third-party debit orders, make digital payments, and accept salary deposits. Additionally, John Doe can add free savings accounts as needed and access Real-Time Forex Accounts. Engaging with the Vitality Money program can also unlock lifestyle and travel rewards. Overall, this account seems to align well with John Doe's financial needs and goals."