In [None]:
# Copyright 2024 Rittman Analytics ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Profit & Loss Report Chatbot (RAG + SQL Agent)

## Overview

This Jupyter notebook implements an advanced question-answering system for Profit and Loss (P&L) data analysis. It leverages Large Language Models (LLMs), Google BigQuery Vector Storage, and LangChain to provide intelligent, context-aware responses to financial queries.

## Key Technologies

- **Large Language Models (LLMs)**: Powers natural language understanding and generation.
- **Google BigQuery Vector Storage**: Stores and retrieves pre-analyzed financial data as vectors.
- **LangChain**: Orchestrates the interaction between LLMs, vector storage, and SQL databases.
- **SQL Agent**: Dynamically generates and executes SQL queries based on natural language questions.

## Core Functionality

### 1. Vector Store Retrieval

- Utilizes `BigQueryVectorStore` from LangChain for efficient similarity search.
- Pre-analyzed financial reports are stored as vector embeddings.
- `VertexAIEmbeddings` is used to generate embeddings for queries and documents.

### 2. LLM-Powered Decision Making

- `should_query_vector_store()` function uses an LLM to decide whether to query the vector store or use SQL.
- LLM considers the question content, time frame, and available data to make this decision.

### 3. Dynamic SQL Query Generation

- Employs LangChain's SQL Agent to convert natural language questions into SQL queries.
- `create_sql_agent()` sets up an agent with access to database schema and query execution capabilities.
- `SQLDatabaseToolkit` provides the agent with necessary tools for SQL operations.

### 4. Intelligent Question Processing

- `extract_date_from_question()` uses regular expressions to detect the month and year mentioned in a query.
- `find_matching_values()` identifies relevant financial categories and groups mentioned in the question.

### 5. Answer Generation and Evaluation

- Combines information from vector store and SQL queries to generate comprehensive answers.
- `evaluate_answer_relevance()` uses an LLM to assess the relevance and quality of the generated answer.

## LangChain Components Used

- `ChatOpenAI`: Interface for the LLM (e.g., GPT-4).
- `ConversationChain`: Manages conversation context.
- `LLMChain`: Executes specific LLM tasks like decision making and evaluation.
- `PromptTemplate`: Structures prompts for consistent LLM interactions.
- `SQLDatabase`: Provides an interface to the SQL database.
- `BigQueryVectorStore`: Manages vector storage and retrieval in BigQuery.

## Setup and Configuration

1. Install required packages:
   ```
   pip install bs4 langchain langchain-community langchain-openai langchain-google-vertexai langchain-google-community google-cloud-bigquery sqlalchemy sqlalchemy-bigquery
   ```

2. Set up Google Cloud credentials and BigQuery access.

3. Configure the notebook variables:
   - `project`: Your Google Cloud project ID
   - `dataset`: BigQuery dataset name
   - `service_account_file`: Path to your Google Cloud service account key

4. Set up your OpenAI API key as an environment variable.

## Usage

1. Initialize the notebook components and start the interactive prompt (type `quit` to leave it):
   ```python
   main(reload_vector_storage=True)
   ```

2. Once `main()` has run, you can also ask a single question programmatically:
   ```python
   question = "What was our revenue in May 2024?"
   response = ask_question(question)
   print(response)
   ```

## Customization

- Modify `vector_store_content_description` to match your financial data structure.
- Adjust `extract_date_from_question()` for different date formats.
- Customize SQL views in `determine_view()` to match your database schema.

## Advanced Features

- **Hybrid Retrieval**: Combines vector similarity search with SQL queries for comprehensive answers.
- **Dynamic Time Awareness**: Automatically adjusts queries based on the time frame mentioned in the question.
- **Relevance Scoring**: Uses LLM to evaluate the quality and relevance of generated answers.

In [None]:
pip install bs4 langchain langchain-community langchain-google-community langchain_google_vertexai langchain-openai openai chromadb tiktoken tabulate sqlalchemy sqlalchemy-bigquery google-cloud-bigquery

In [None]:
from google.cloud import bigquery
from sqlalchemy import *
from sqlalchemy.engine import create_engine
from sqlalchemy.schema import *
import os
from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain, LLMChain
from langchain.prompts import PromptTemplate
import pandas as pd
from collections import defaultdict
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_google_community import BigQueryVectorStore
import re
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd
from dateutil.relativedelta import relativedelta


# Connection settings for the BigQuery project/dataset that holds the P&L data.
service_account_file = "/content/ra-development-bf14f75f80cd.json"
project = "ra-development"
dataset = "analytics_finance_demo"
sqlalchemy_url = f'bigquery://{project}/{dataset}?credentials_path={service_account_file}'

# Keep an OpenAI key that is already exported in the environment; only fall
# back to the (empty) placeholder when the variable is unset or blank, so that
# running this cell never wipes a working key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = ""

# Plain-English description of what the vector store holds. It is sent to the
# LLM by should_query_vector_store() and main() appends the actual date range
# covered by the stored analyses to it.
vector_store_content_description = """
The vector store contains pre-created analysis with the following information:
1. Key Metrics
   - Revenue, Overheads, Cost of Delivery, Gross & Net Profit, Retained Earnings net amounts
   - Account Sub-Category and Account Group net amounts
2. Key Metrics Calculation:
   - Month-over-month, year-to-date, and budget variance calculations
   - Gross margin percentages
3. Significant Transaction Identification:
   - Transactions exceeding a certain percentage threshold of the total account group amount
   - Cancelling transactions are identified and excluded
   - Context for each significant transaction (new or changed from previous month)
4. Overhead Trend Analysis:
   - Monthly growth rates for overhead categories over the last 6 months
   - Account groups with significant average monthly growth (> 10%)
5. Identification of New Repeating Transactions:
   - Transactions with the same description appearing consistently over the last 3 months

The analysis covers the last three months and the current year-to-date at summary level for category, subcategory, and account group levels.
"""

# Create a BigQuery client authenticated with the service-account key file.
# It is used for the plain SQL reads in this notebook (pl_reports, lookup tables,
# available months).
client = bigquery.Client.from_service_account_json(service_account_file)

# Initialize embedding model; it is handed to the vector store below.
embedding = VertexAIEmbeddings(
    model_name="textembedding-gecko@latest",
    project=project
)

# Initialize BigQueryVectorStore on the `pl_reports_vector_storage` table of the
# configured project/dataset. get_available_months() reads the same table.
vector_store = BigQueryVectorStore(
    project_id=project,
    dataset_name=dataset,
    table_name="pl_reports_vector_storage",
    location="europe-west2",
    embedding=embedding,
)

def load_vector_storage():
    """Copy the monthly analyses from `pl_reports` into the vector store.

    Reads every row of `{project}.{dataset}.pl_reports` and adds each analysis
    text (report, invoice and recurring-payments analysis) to the module-level
    `vector_store`, tagged with `month` and `analysis_type` metadata.

    All texts are submitted in one `add_texts` call instead of one call per
    text. Analyses that are NULL or blank are skipped because there is nothing
    to embed for them.

    Note: this function does not remove anything that was loaded earlier, so
    calling it repeatedly submits the same texts again.
    """
    query = f"""
    SELECT date_month as month, report_analysis, invoice_analysis, recurring_payments_analysis
    FROM `{project}.{dataset}.pl_reports`
    """
    df = client.query(query).to_dataframe()

    analysis_types = ['report_analysis', 'invoice_analysis', 'recurring_payments_analysis']
    texts = []
    metadatas = []
    for _, row in df.iterrows():
        for analysis_type in analysis_types:
            text = row[analysis_type]
            # NULL columns arrive as None/NaN; only real, non-blank strings can be embedded.
            if not isinstance(text, str) or not text.strip():
                continue
            texts.append(text)
            metadatas.append({
                'month': row['month'],
                'analysis_type': analysis_type
            })

    if texts:
        vector_store.add_texts(texts, metadatas=metadatas)

    print("Vector storage loaded successfully.")

def get_available_months():
    """Return the distinct `month` values stored in the vector table, newest first.

    Runs a query against `{project}.{dataset}.pl_reports_vector_storage` with
    the module-level BigQuery `client` and returns the `month` column as a list.
    """
    months_sql = f"""
    SELECT DISTINCT month as month
    FROM `{project}.{dataset}.pl_reports_vector_storage`
    ORDER BY month DESC
    """
    return client.query(months_sql).to_dataframe()['month'].tolist()

def get_valid_time_range(available_months):
    """Return the `(start, end)` period covered by the stored analyses.

    Args:
        available_months: Month values found in the vector store. They must be
            mutually comparable and support `- relativedelta(...)` and
            `.replace(day=1)` (for example `datetime.date` objects).

    Returns:
        A tuple `(valid_start, latest_month)`. `latest_month` is the newest
        available month and `valid_start` is the first day of the month two
        months before the earliest available month. Returns `(None, None)`
        when `available_months` is empty, so callers must check for `None`
        before formatting the values.
    """
    if not available_months:
        return None, None

    latest_month = max(available_months)
    earliest_month = min(available_months)

    # The valid range includes two months before the earliest available month
    # (the notebook describes each analysis as comparing to the two previous months).
    valid_start = (earliest_month - relativedelta(months=2)).replace(day=1)

    return valid_start, latest_month

def create_lookup_tables():
    """Build lowercase-name lookups for account groups, sub-categories and categories.

    Queries the distinct `account_report_group`, `account_report_sub_category`
    and `account_category` values from
    `{project}.{dataset}.profit_and_loss_report_account_group`.

    Returns:
        A dict with the keys `'group'`, `'sub_category'` and `'category'`.
        Each value is a `defaultdict(list)` mapping a lowercased name to the
        list of original spellings that lowercase to it. NULL values are
        ignored and every original spelling is listed once.
    """
    query = f"""
    SELECT DISTINCT
        account_report_group,
        account_report_sub_category,
        account_category
    FROM `{project}.{dataset}.profit_and_loss_report_account_group`
    """
    df = client.query(query).to_dataframe()

    columns = {
        'group': 'account_report_group',
        'sub_category': 'account_report_sub_category',
        'category': 'account_category',
    }
    lookups = {key: defaultdict(list) for key in columns}

    for key, column in columns.items():
        # DISTINCT applies to the whole row, so one name can repeat across rows;
        # dropna() guards against NULLs, which have no .lower().
        for value in df[column].dropna().unique():
            lookups[key][value.lower()].append(value)

    return lookups

# Build the name lookups once, when this cell runs (this issues a BigQuery query).
lookups = create_lookup_tables()

# NOTE(review): `memory` and `conversation_chain` are created here but are not
# referenced by any function visible in this notebook.
memory = ConversationBufferMemory()
# SQLAlchemy-backed handle on the same BigQuery dataset, used by the SQL agent toolkit.
db = SQLDatabase.from_uri(sqlalchemy_url)
# Chat model shared by the summarise / decide / evaluate chains and the SQL agent.
llm = ChatOpenAI(
    model="gpt-4",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
conversation_chain = ConversationChain(
    llm=llm,
    memory=memory
)
toolkit = SQLDatabaseToolkit(db=db, llm=llm)
# SQL agent that ask_question() runs with a natural-language instruction.
# NOTE(review): top_k=1000 is presumably the row limit the agent is told to
# apply to its queries -- confirm against the create_sql_agent documentation.
agent_executor = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    top_k=1000,
)

def find_matching_values(question, lookups):
    """Find the account groups, sub-categories and categories named in a question.

    Args:
        question: The user's question.
        lookups: Mapping of level (`'group'`, `'sub_category'`, `'category'`)
            to a dict of lowercased name -> list of original spellings, as
            returned by `create_lookup_tables()`.

    Returns:
        A dict with a set of matched original spellings per level. The three
        standard levels are always present, possibly as empty sets.

    Names are matched case-insensitively as whole phrases, so multi-word names
    ("cost of delivery") and names followed by punctuation ("revenue?") are
    found, while a name that only occurs inside a longer word is not.
    """
    question_text = question.lower()
    matches = {
        'group': set(),
        'sub_category': set(),
        'category': set()
    }

    for key, names in lookups.items():
        for name, originals in names.items():
            parts = name.split()
            if not parts:
                continue  # an empty name would match every question
            # Allow any run of whitespace between the words of a multi-word name
            # and require non-word characters (or the string edge) on both sides.
            phrase = r'\s+'.join(re.escape(part) for part in parts)
            if re.search(r'(?<!\w)' + phrase + r'(?!\w)', question_text):
                matches.setdefault(key, set()).update(originals)

    return matches

def construct_filter_clause(matches):
    """Turn matched account names into a SQL filter expression.

    Args:
        matches: Mapping of level (`'group'`, `'sub_category'`, `'category'`)
            to a collection of account names, as returned by
            `find_matching_values()`.

    Returns:
        One `column IN ('a', 'b')` condition per non-empty level, joined with
        ` AND `, or an empty string when nothing matched. Values are sorted so
        the output is deterministic, and backslashes / single quotes inside a
        value are backslash-escaped so that names such as "Director's Loan"
        still form a valid string literal.
    """
    # Column names as selected by create_lookup_tables().
    # NOTE(review): confirm the sub-categories and categories views expose the
    # same column names as the account-group view.
    columns = {
        'group': 'account_report_group',
        'sub_category': 'account_report_sub_category',
        'category': 'account_category',
    }

    clauses = []
    for key, values in matches.items():
        if not values:
            continue
        column = columns.get(key, f"account_{key}")
        quoted_values = [
            "'" + str(value).replace("\\", "\\\\").replace("'", "\\'") + "'"
            for value in sorted(values)
        ]
        clauses.append("{0} IN ({1})".format(column, ", ".join(quoted_values)))

    return " AND ".join(clauses)

def determine_view(matches):
    """Pick the most detailed P&L view that the matched names call for.

    A matched account group selects the account-group view, otherwise a
    matched sub-category selects the sub-categories view, and everything else
    uses the categories view.
    """
    views_by_level = (
        ('group', "profit_and_loss_report_account_group"),
        ('sub_category', "profit_and_loss_report_sub_categories"),
    )
    for level, view_name in views_by_level:
        if matches[level]:
            return view_name
    return "profit_and_loss_report_categories"

def extract_date_from_question(question):
    """Extract the month a question refers to as a `YYYY-MM-01` string.

    Args:
        question: The user's question.

    Returns:
        `"YYYY-MM-01"` when a 20xx year and an English month name are found,
        `"YYYY-01-01"` when only a year is found (defaults to January), and
        `None` when the question contains no 20xx year.

    A month name written directly before the year ("March 2024",
    "March of 2024", "March, 2024") takes precedence. This stops the verb
    "may" from being read as the month May when the question also contains a
    real month/year pair. Month names are resolved from a fixed English list,
    so the result does not depend on the process locale.
    """
    month_names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )
    month_alternatives = '|'.join(month_names)
    year_pattern = r'\b(20\d{2})\b'
    month_pattern = r'\b(' + month_alternatives + r')\b'
    adjacent_pattern = r'\b(' + month_alternatives + r')[\s,]+(?:of\s+)?(20\d{2})\b'

    adjacent_match = re.search(adjacent_pattern, question, re.IGNORECASE)
    if adjacent_match:
        month_name = adjacent_match.group(1)
        year = adjacent_match.group(2)
    else:
        # Fall back to the first year and the first month name found anywhere.
        year_match = re.search(year_pattern, question)
        if not year_match:
            return None
        year = year_match.group(1)
        month_match = re.search(month_pattern, question, re.IGNORECASE)
        if not month_match:
            return f"{year}-01-01"  # Default to January if only year is specified
        month_name = month_match.group(1)

    month_num = month_names.index(month_name.lower()) + 1
    return f"{year}-{month_num:02d}-01"

def strip_html(html_content):
    """Return the text of `html_content` as extracted by BeautifulSoup's "html.parser"."""
    return BeautifulSoup(html_content, "html.parser").get_text()

def summarize_content(question, content):
    """Ask the LLM for a summary of `content` limited to what answers `question`.

    Builds a prompt from the question and the content, runs it through the
    module-level `llm` and returns the reply with surrounding whitespace
    removed.
    """
    summary_template = """Given the following question and content, provide a concise summary of the content that is directly relevant to answering the question.
        Ignore any information that doesn't pertain to the question. All amounts should be stated in GBP (£).

        Question: {question}

        Content: {content}

        Relevant Summary:"""

    summary_prompt = PromptTemplate(
        input_variables=["question", "content"],
        template=summary_template,
    )
    raw_summary = LLMChain(llm=llm, prompt=summary_prompt).run(
        question=question,
        content=content,
    )
    return raw_summary.strip()

def should_query_vector_store(question):
    """Ask the LLM whether the pre-created analysis can answer `question`.

    The LLM is given the question, `vector_store_content_description` and the
    valid time range (the module-level `valid_time_range` that `main()` sets).

    Args:
        question: The user's question.

    Returns:
        `True` when the LLM answers "Yes", otherwise `False`. Also returns
        `False`, without calling the LLM, when no valid time range is
        available (because `main()` has not run yet or the vector store has
        no months), since the prompt cannot be built without it.
    """
    # Read the global defensively: it only exists after main() has run, and it
    # is (None, None) when the vector store has no months.
    time_range = globals().get("valid_time_range")
    if not time_range or time_range[0] is None or time_range[1] is None:
        print("Valid time range is unavailable. Defaulting to not using vector store.")  # Debug print
        return False

    prompt = PromptTemplate(
        input_variables=["question", "content_description", "valid_time_range"],
        template="""Given the following question and description of the content in a vector store,
        determine if the vector store is likely to contain information that can answer the question.
        Consider the following:
        1. Does the question ask about any of the key metrics or analyses mentioned in the content description?
        2. Does the question fall within the time frame covered by the vector store? Valid time range: {valid_time_range}
        3. Is the level of detail requested (category, subcategory, account group) available in the vector store?

        Respond with 'Yes' if the vector store is likely to contain relevant information, or 'No' if it's unlikely or unclear.

        Question: {question}

        Vector Store Content Description:
        {content_description}

        Decision (Yes/No):
        Explanation:"""
    )

    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(question=question,
                         content_description=vector_store_content_description,
                         valid_time_range=f"{time_range[0].strftime('%B %Y')} to {time_range[1].strftime('%B %Y')}")

    print(f"LLM Response for vector store query decision:\n{response}")  # Debug print

    # Extract the decision from the response. Accept both "Decision (Yes/No): Yes"
    # and the shorter "Decision: Yes"; the trailing \b keeps words that merely
    # start with "no" (e.g. "Nothing") from being read as a decision.
    decision_match = re.search(r'Decision(?:\s*\(Yes/No\))?\s*:\s*(Yes|No)\b', response, re.IGNORECASE)
    if decision_match:
        decision = decision_match.group(1).lower()
        print(f"Extracted decision: {decision}")  # Debug print
        return decision == 'yes'

    # If no clear decision is found in the expected format, look for a 'Yes' at the beginning of the response
    if response.strip().lower().startswith('yes'):
        print("'Yes' found at the start of the response. Using vector store.")  # Debug print
        return True

    print("No clear 'Yes' decision found in LLM response. Defaulting to not using vector store.")  # Debug print
    return False

def evaluate_answer_relevance(question, answer):
    """Have the LLM rate how well `answer` addresses `question`.

    Returns the raw LLM reply, which the prompt asks to contain a relevance
    score (0-100%) and a short explanation.
    """
    relevance_template = """Given the following question and answer, evaluate how well the answer addresses the question.
        Provide a relevance score as a percentage and a brief explanation.

        Question: {question}

        Answer: {answer}

        Relevance Score (0-100%):
        Explanation:"""

    relevance_prompt = PromptTemplate(
        input_variables=["question", "answer"],
        template=relevance_template,
    )
    relevance_chain = LLMChain(llm=llm, prompt=relevance_prompt)
    return relevance_chain.run(question=question, answer=answer)

def ask_question(question):
    """Answer a P&L question from the pre-created analysis or, failing that, via SQL.

    Steps:
      1. If `should_query_vector_store()` approves, run a similarity search
         (restricted to the month named in the question, when there is one)
         and summarise the best-matching analysis.
      2. If the vector store was not used, or returned nothing, have the SQL
         agent query the P&L view chosen by `determine_view()`, filtered by
         the date and account names found in the question.
      3. Append the LLM's relevance evaluation of the answer.

    Args:
        question: The user's question.

    Returns:
        The answer followed by a "Relevance Evaluation:" section.
    """
    extracted_date = extract_date_from_question(question)
    answer = None  # stays None until the vector store yields an answer

    if should_query_vector_store(question):
        filter_dict = {"month": extracted_date} if extracted_date else {}
        relevant_docs = vector_store.similarity_search(question, k=1, filter=filter_dict)

        if relevant_docs:
            stripped_answer = strip_html(relevant_docs[0].page_content)
            summarized_answer = summarize_content(question, stripped_answer)
            answer = f"Based on the pre-created analysis: {summarized_answer}"

    if answer is None:
        # Either the vector store was not suitable or it had no matching
        # document: answer the question with a SQL query instead.
        matches = find_matching_values(question, lookups)
        view_name = determine_view(matches)

        # Join only the conditions that exist, so the clause never starts with a dangling "AND".
        conditions = []
        if extracted_date:
            conditions.append(f"date_trunc(date_month, MONTH) = DATE('{extracted_date}')")
        value_filter = construct_filter_clause(matches)
        if value_filter:
            conditions.append(value_filter)
        filter_clause = " AND ".join(conditions)

        if filter_clause:
            filter_instruction = (
                f"Use the following SQL filter clause in your query: {filter_clause}\n"
                "        Please construct and execute a SQL query to answer the question, making sure to include the filter clause."
            )
        else:
            filter_instruction = "Please construct and execute a SQL query to answer the question."

        instruction = f"""You are a knowledgeable finance data analyst working for Rittman Analytics.
        Use the `{project}.{dataset}.{view_name}` view to answer this question.
        {filter_instruction}
        Do not include markdown-style triple backticks in the SQL you generate and try to use or validate.
        Question is: {question}
        """

        answer = agent_executor.run(instruction)

    relevance_evaluation = evaluate_answer_relevance(question, answer)

    return f"{answer}\n\nRelevance Evaluation:\n{relevance_evaluation}"

def main(reload_vector_storage=False):
    """Prepare the chatbot's time-range state and run the interactive question loop.

    Args:
        reload_vector_storage: When True, call `load_vector_storage()` before
            anything else.

    Side effects:
        Sets the module-level `valid_time_range` and rewrites
        `vector_store_content_description` so that it ends with the actual
        period covered by the stored analyses. The period paragraph is
        replaced, not stacked, when `main()` is called more than once.

    The loop reads questions from standard input until the user types `quit`
    (any case, surrounding whitespace ignored), input ends, or the user
    interrupts. Blank lines are ignored.
    """
    global valid_time_range, vector_store_content_description

    if reload_vector_storage:
        load_vector_storage()

    available_months = get_available_months()
    valid_time_range = get_valid_time_range(available_months)

    # Update the vector_store_content_description with the actual time range.
    # Drop a period paragraph left by an earlier call first, so repeated calls
    # do not keep growing the description.
    range_marker = "\n    The analysis covers the period from "
    base_description = vector_store_content_description.split(range_marker, 1)[0]
    start_month, end_month = valid_time_range
    if start_month is not None and end_month is not None:
        vector_store_content_description = (
            base_description
            + range_marker
            + f"{start_month.strftime('%B %Y')} to {end_month.strftime('%B %Y')}.\n"
            + "    For each month in this range, the analysis includes data for that month, comparisons to the two previous months, and year-to-date figures.\n"
            + "    "
        )
    else:
        # get_valid_time_range() returns (None, None) when no months are stored.
        vector_store_content_description = base_description
        print("No months were found in the vector store; the time range of the pre-created analysis is unknown.")

    print("Hi! Ask me a question about our company's profit and loss data")
    while True:
        try:
            question = input()
        except (EOFError, KeyboardInterrupt):
            break  # no more input, or the user interrupted the prompt

        if question.strip().lower() == 'quit':
            break
        if not question.strip():
            continue  # nothing to ask

        response = ask_question(question)
        print(f"{response}\n")
        print("Is there anything else I can answer for you? Or type QUIT to exit")

# Start the interactive session when this code is executed directly
# (main() blocks on input() until the user types "quit").
if __name__ == "__main__":
    main(reload_vector_storage=False)  # Set to True to reload vector storage