In [1]:
import pandas as pd
import numpy as np
import langchain
import langchain_community 
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma


In [2]:
# Read the finance CSV into a DataFrame, then flatten it into a list with one dict per row.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where that file exists.
FINANCE_CSV_PATH = 'C:/Users/KIIT/OneDrive/Desktop/FINANCE_AI/Finance_data.csv'
df = pd.read_csv(FINANCE_CSV_PATH)
df_fin = df.to_dict(orient='records')

In [3]:
# Preview the question text generated for every record in df_fin.
for record in df_fin:
    question = (
        f"I'm a {record['age']}-year-old {record['gender']} looking to invest in "
        f"{record['Avenue']} for {record['Purpose']} over the next {record['Duration']}. "
        "What are my options?"
    )
    print(question)

I'm a 34-year-old Female looking to invest in Mutual Fund for Wealth Creation over the next 1-3 years. What are my options?
I'm a 23-year-old Female looking to invest in Mutual Fund for Wealth Creation over the next More than 5 years. What are my options?
I'm a 30-year-old Male looking to invest in Equity for Wealth Creation over the next 3-5 years. What are my options?
I'm a 22-year-old Male looking to invest in Equity for Wealth Creation over the next Less than 1 year. What are my options?
I'm a 24-year-old Female looking to invest in Equity for Wealth Creation over the next Less than 1 year. What are my options?
I'm a 24-year-old Female looking to invest in Mutual Fund for Wealth Creation over the next 1-3 years. What are my options?
I'm a 27-year-old Female looking to invest in Equity for Wealth Creation over the next 3-5 years. What are my options?
I'm a 21-year-old Male looking to invest in Mutual Fund for Wealth Creation over the next 3-5 years. What are my options?
I'm a 35-yea

# 
Pre-Processing the Data into Prompt-Response Format

In [4]:
# Turn every record in df_fin into a {"prompt": ..., "response": ...} pair.
def _render_prompt(record):
    """Return the investor's question built from the record's profile columns."""
    return (
        f"I'm a {record['age']}-year-old {record['gender']} looking to invest in "
        f"{record['Avenue']} for {record['Purpose']} over the next {record['Duration']}. "
        "What are my options?"
    )


def _render_response(record):
    """Return the multi-line answer text; every line, including the last, ends with a newline."""
    lines = [
        "Based on your preferences, here are your investment options:",
        f"- Mutual Funds: {record['Mutual_Funds']}",
        f"- Equity Market: {record['Equity_Market']}",
        f"- Debentures: {record['Debentures']}",
        f"- Government Bonds: {record['Government_Bonds']}",
        f"- Fixed Deposits: {record['Fixed_Deposits']}",
        f"- PPF: {record['PPF']}",
        f"- Gold: {record['Gold']}",
        f"Factors considered: {record['Factor']}",
        f"Objective: {record['Objective']}",
        f"Expected returns: {record['Expect']}",
        f"Investment monitoring: {record['Invest_Monitor']}",
        "Reasons for choices:",
        f"- Equity: {record['Reason_Equity']}",
        f"- Mutual Funds: {record['Reason_Mutual']}",
        f"- Bonds: {record['Reason_Bonds']}",
        f"- Fixed Deposits: {record['Reason_FD']}",
        f"Source of information: {record['Source']}",
    ]
    return "\n".join(lines) + "\n"


prompt_response_data = [
    {"prompt": _render_prompt(record), "response": _render_response(record)}
    for record in df_fin
]

# Show only the first few pairs as a sanity check.
prompt_response_data[:5]

[{'prompt': "I'm a 34-year-old Female looking to invest in Mutual Fund for Wealth Creation over the next 1-3 years. What are my options?",
  'response': 'Based on your preferences, here are your investment options:\n- Mutual Funds: 1\n- Equity Market: 2\n- Debentures: 5\n- Government Bonds: 3\n- Fixed Deposits: 7\n- PPF: 6\n- Gold: 4\nFactors considered: Returns\nObjective: Capital Appreciation\nExpected returns: 20%-30%\nInvestment monitoring: Monthly\nReasons for choices:\n- Equity: Capital Appreciation\n- Mutual Funds: Better Returns\n- Bonds: Safe Investment\n- Fixed Deposits: Fixed Returns\nSource of information: Newspapers and Magazines\n'},
 {'prompt': "I'm a 23-year-old Female looking to invest in Mutual Fund for Wealth Creation over the next More than 5 years. What are my options?",
  'response': 'Based on your preferences, here are your investment options:\n- Mutual Funds: 4\n- Equity Market: 3\n- Debentures: 2\n- Government Bonds: 1\n- Fixed Deposits: 5\n- PPF: 6\n- Gold: 7\

# Storing Data in a Vector DB

In [5]:
from langchain.docstore.document import Document

# Wrap each prompt/response pair in a Document whose text holds both parts,
# labelled "Prompt:" and "Response:" and separated by a newline.
documents = [
    Document(page_content=f"Prompt: {pair['prompt']}\nResponse: {pair['response']}")
    for pair in prompt_response_data
]
     

In [6]:
# Split every document into small, slightly overlapping chunks.
# NOTE(review): the sample records shown earlier are several hundred characters long, so a
# chunk size of 100 cuts each record into many pieces — confirm this granularity is intended.
CHUNK_SIZE = 100
CHUNK_OVERLAP = 10

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
texts = text_splitter.split_documents(documents)

In [7]:
from langchain.embeddings import HuggingFaceEmbeddings

# Hugging Face embedding model used for the vector store built later in the notebook.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
hg_embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  hg_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


In [8]:
from langchain.vectorstores import FAISS

In [15]:

# Directory the FAISS index is saved to (passed to create_faiss_vector_db below).
persist_directory= 'docs/faiss_rag_new/'

# Number of chunks embedded and indexed per batch.
# NOTE(review): the value was chosen to stay under a limit of 166 reported by an error message
# that is not shown in this notebook — confirm that limit still applies to the FAISS store.
BATCH_SIZE = 150  # Slightly below the max limit for safety

# Function to create FAISS vector database in batches
def create_faiss_vector_db(documents, embedding, persist_directory, batch_size=BATCH_SIZE):
    """Build a FAISS vector store from ``documents`` in batches and save it to disk.

    Args:
        documents: Sequence of objects exposing ``page_content``. Only that text
            is indexed.
        embedding: Embeddings object handed to ``FAISS.from_texts``; the resulting
            store reuses it for every later batch.
        persist_directory: Folder passed to ``save_local`` once all batches are indexed.
        batch_size: Maximum number of documents embedded per batch.

    Returns:
        The populated FAISS vector store.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``documents`` is empty.
    """
    # Fail early with a clear message; otherwise an empty input or negative step would
    # leave the store as None and crash later on ``None.save_local``.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if len(documents) == 0:
        raise ValueError("documents is empty; there is nothing to index")

    vectordb_fin = None

    for start in range(0, len(documents), batch_size):
        texts = [doc.page_content for doc in documents[start:start + batch_size]]

        if vectordb_fin is None:
            # The first batch creates the store.
            vectordb_fin = FAISS.from_texts(texts=texts, embedding=embedding)
        else:
            # Later batches are appended directly, so each batch is embedded exactly once
            # (no separate embed_documents call and no temporary store to merge).
            vectordb_fin.add_texts(texts)

    # Persist the finished index.
    vectordb_fin.save_local(persist_directory)

    return vectordb_fin

# Build and save the FAISS index from the split chunks (`texts`) using the Hugging Face embeddings.
vectordb_fin = create_faiss_vector_db(
    documents=texts,
    embedding=hg_embeddings,
    persist_directory=persist_directory,
)




In [None]:
from langchain.chains import RetrievalQA

# NOTE(review): `llm` is not defined in any cell shown in this notebook; assign a
# LangChain-compatible language model to `llm` before running this cell, or it raises NameError.

# Retrieve the 5 most similar chunks for each query.
retriever_fin = vectordb_fin.as_retriever(search_kwargs={"k": 5})

# "stuff" chain type; only the answer is returned, not the source documents.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever_fin,
    return_source_documents=False,
)

query = "I'm a 34-year-old female looking to invest in mutual funds for wealth creation over the next 1-3 years. What are my options?"

# Use invoke() rather than calling the chain object directly, which is deprecated.
result = qa.invoke({"query": query})
result