In [1]:
# Importing necessary libs
import pandas as pd
import numpy as np
import os
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from dotenv import load_dotenv
import os



In [2]:

PATH = 'data/'


def merge_datasets():
    """
    Function to merge all the datasets in the specified directory and create a single dataset
    with an additional 'spending_type' column to identify income and expenses.
    """
    df_income = [] 
    df_expense = []
    for file in os.listdir(PATH):
        if file.endswith('.xlsx') or file.endswith('.csv'):  # Corrected the condition
            # Data with income
            df_income_temp = pd.read_excel(f'{PATH}{file}', skiprows=1, sheet_name='Income')
            df_income_temp['spending_type'] = 'Income'  # identifier column
            df_income.append(df_income_temp)

            # Data with expense
            df_expense_temp = pd.read_excel(f'{PATH}{file}', skiprows=1, sheet_name='Expenses')
            df_expense_temp['spending_type'] = 'Expense'  # identifier column
            df_expense.append(df_expense_temp)

    # concatenate income and expense data into a single DataFrame
    df_income = pd.concat(df_income, ignore_index=True)
    df_expense = pd.concat(df_expense, ignore_index=True)

    # merge income and expense data
    df_combined = pd.concat([df_income, df_expense], ignore_index=True)
    
    # Cleaning the data
    df_combined.sort_values(by='Date and time', inplace=True)
    # remove columns
    df_combined.drop(['Amount in account currency', 'Account currency','Transaction amount in transaction currency', 'Transaction currency', 'Tags', 'Account','Default currency', 'Comment'], axis=1, inplace=True) # can include if needed

    return df_combined


In [3]:
df = merge_datasets()
# sperating date by year and month and day
df['Year'] = pd.DatetimeIndex(df['Date and time']).year
df['Month'] = pd.DatetimeIndex(df['Date and time']).month
# df['Day'] = pd.DatetimeIndex(df['Date and time']).day

df.drop(columns=['Date and time'], inplace=True)
df
#renaming columns
df = df.rename(columns={'spending_type': 'Type', 'Amount in default currency': 'Amount',})

# # reordering columns
df = df[['Year', 'Month', 'Type', 'Category', 'Amount']]


df_expense= df[df['Type'] == 'Expense']
df_income = df[df['Type'] == 'Income']

In [4]:
# total by type
df_total = df.groupby(['Type'], as_index=False)['Amount'].sum()
df_total

Unnamed: 0,Type,Amount
0,Expense,41572.86
1,Income,45533.14


In [5]:
# grouped by category for expense
dfexpense_category = df_expense.groupby(['Category'], as_index=False)['Amount'].sum()
dfexpense_category

Unnamed: 0,Category,Amount
0,Balancing,171.93
1,Cafe,4678.76
2,Education,19795.87
3,Fashion,428.78
4,Gadgets,85.71
5,Gifts,627.72
6,Groceries,2340.62
7,Haircut,65.55
8,Home,10700.0
9,Laptop repair,135.6


In [6]:
dfexpense_month = df_expense.groupby(['Year','Month'], as_index=False)['Amount'].sum()
dfexpense_month

Unnamed: 0,Year,Month,Amount
0,2023,4,50.0
1,2023,5,1435.02
2,2023,6,454.49
3,2023,7,536.26
4,2023,8,1999.06
5,2023,9,2278.62
6,2023,10,8480.87
7,2023,11,2376.88
8,2023,12,429.03
9,2024,1,1143.25


In [7]:
df_income_month = df_income.groupby(['Year','Month'], as_index=False)['Amount'].sum()
df_income_month

Unnamed: 0,Year,Month,Amount
0,2023,5,3384.0
1,2023,7,2013.0
2,2023,8,5090.0
3,2023,9,5000.0
4,2023,10,3055.7
5,2023,11,2000.0
6,2023,12,2243.21
7,2024,1,100.0
8,2024,2,2854.38
9,2024,3,4303.57


In [8]:
# saving the data frames as txt 
# Define the file path where you want to save the output
file_path = "financial_summary_llama.txt"

# Open the file in write mode
with open(file_path, "w") as file:
    
    file.write("You are a financial advisor andb based on the spending habits from the summary data below you will give financial advice.\n")
    file.write(" If anything besides financial advice is aksed by the user you will not answer and reply you will only givefinancial advice.\n")
    file.write("Just do the calculating do not show then and give direct answers\n")
    file.write("\n\n") 
    file.write("You will first give a brief on the users spending habits and give stats in numbers look into the data before answering this do calculation and anything you need and then you will give financial advice based on the user data.\n")
    
        
    file.write("\n\n") 
    file.write("Also try to sound as human as possile dont give generic advice act like a person in a conversation:\n") 
    
    file.write("\n\n") 
    file.write("The user data is below as follows:\n") 
    file.write("\n\n") 
    
    file.write("This is dataframe for total expenses and imcome:\n")
    file.write(df_total.to_string())  # Convert the expenditure DataFrame to string
    file.write("\n\n")  # Add a new line for separation
    
    file.write("This is the expenses by month :\n")
    file.write(dfexpense_month.to_string())  
    file.write("\n\n") 
    
    file.write("This is the income by month :\n")
    file.write(dfexpense_month.to_string())  
    file.write("\n\n") 
    
    file.write("This is the expenses in different category :\n")
    file.write(dfexpense_category.to_string())  
    file.write("\n\n") 

# The file is automatically closed after the 'with' block
print(f"Data saved successfully to {file_path}")


Data saved successfully to financial_summary_llama.txt


In [9]:
file_path = "financial_summary_llama.txt"

with open(file_path, "r") as file:
    file_content = file.read()

In [10]:
from langchain.document_loaders import CSVLoader
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
import ollama

# Define custom embeddings for Ollama
class OllamaEmbeddings(Embeddings):
    def __init__(self, model="mxbai-embed-large"):
        self.model = model

    def embed_documents(self, texts):
        embeddings = []
        for text in texts:
            response = ollama.embeddings(model=self.model, prompt=text)
            embeddings.append(response["embedding"])
        return embeddings

    def embed_query(self, text):
        if isinstance(text, dict) and "question" in text:
            text = text["question"]
        response = ollama.embeddings(model=self.model, prompt=text)
        return response["embedding"]

# Load the financial summary data
file_path = "financial_summary_llama.txt"
with open(file_path, "r") as file:
    file_content = file.read()

# Split the document for vectorization
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_text(file_content)

# Use vector database Chroma to store embeddings
embedding_model = OllamaEmbeddings(model="mxbai-embed-large")
vector_store = Chroma.from_texts(
    documents=documents,
    embedding=embedding_model,
    persist_directory="chroma_db"
)
vector_store.persist()

# Prompt template
prompt_template = """
You are an AI financial advisor. Your sole purpose is to provide financial advice based on the user's financial data provided below. 
You are only allowed to answer questions directly related to this data or general financial advice contextualized by the user's financial habits.

- If the user asks anything unrelated to financial advice or the data provided, politely decline by stating:
  "I am designed to provide financial advice based solely on your provided financial data. Please ask a financial-related question."

- Always refer to the financial data when giving advice and ensure your responses are specific to the user's data.

- Respond in a conversational, human-like tone, and avoid generic statements.

The user's financial data is as follows:
{context}

Question: {question}

Your response:
"""
prompt = ChatPromptTemplate.from_template(prompt_template)

# Define retriever
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Custom query validator
def validate_query(query):
    allowed_keywords = ["financial", "income", "expense", "budget", "savings", "investment", "spending"]
    return any(keyword in query.lower() for keyword in allowed_keywords)

# Custom chain handler
setup_and_retrieval = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
)

def custom_chain_handler(input_data):
    question = input_data["question"]
    if not validate_query(question):
        return "I am designed to provide financial advice based solely on your provided financial data. Please ask a financial-related question."

    chain = setup_and_retrieval | prompt | (
        lambda x: ollama.chat(
            model="llama3.2",
            messages=[{"role": "user", "content": x.to_string()}]
        )["message"]["content"]
    )
    return chain.invoke(input_data)

# Example query
query = "What are my biggest expense categories?"
result = custom_chain_handler({"question": query})
print(result)


TypeError: Chroma.from_texts() missing 1 required positional argument: 'texts'