#### Friday, December 20, 2024

In [1]:
# Deliberately set the OPENAI_API_KEY to an invalid value to ensure that the code is not using it.
import os
os.environ['OPENAI_API_KEY'] = "Nope!"

In [2]:
# %pip install langchain langchain_openai langchain_mistralai pandas tqdm --upgrade --quiet

## Generate Ground Truth Data with GPT-4 API

Creating the known labels or ground truth data can be time consuming and expensive. You can use GPT-4 to _generate the ground truth data_ for you. This is useful for training your own models, and for evaluating the performance of other models. Then you can use these evals to test whether the open source or smaller / faster / cheaper models are performing as well as the larger / slower / more expensive models.

In [3]:
import pandas as pd
from tqdm import tqdm
import requests
import io

# Dataset URL:
url = "https://storage.googleapis.com/oreilly-content/transaction_data_with_expanded_descriptions.csv"

# Download the file from the URL:
downloaded_file = requests.get(url)

# Load the transactions dataset and only look at 20 transactions:
df = pd.read_csv(io.StringIO(downloaded_file.text))[:20]
df.head()

Unnamed: 0,Transaction Description
0,cash deposit at local branch
1,cash deposit at local branch
2,withdrew money for rent payment
3,withdrew cash for weekend expenses
4,purchased books from the bookstore


In [4]:
# Run through the dataset using GPT-4 to correctly classify the transactions:
from langchain_openai.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic.v1 import BaseModel
from typing import Literal, Union


In [5]:
# 1. Define the model:
# model = ChatOpenAI(
#     model="gpt-4-1106-preview",
#     model_kwargs={"response_format": {"type": "json_object"}},
# )

# Chat Model Output Parser:
# model = ChatOpenAI() 
lmstudio = "http://localhost:1234/v1"
model = "qwen2.5-14b-instruct@q8_0" # lmstudio-community/Qwen2.5-14B-Instruct-GGUF :  Qwen2.5-14B-Instruct-Q4_K_M.gguf

# chat = ChatOpenAI(base_url=lmstudio, model=model, api_key="LMStudio",  model_kwargs={"response_format": {"type": "json_object"}})
chat = ChatOpenAI(base_url=lmstudio, model=model, api_key="LMStudio")

In [6]:
system_prompt = """You are are an expert at analyzing bank transactions, 
you will be categorising a single transaction. 
Always return a transaction type and category: do not return None.
Format Instructions:
{format_instructions}"""

user_prompt = """Transaction Text:
{transaction}"""

In [7]:
# 2. Define the prompt:
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system_prompt,
        ),
        (
            "user",
            user_prompt,
        ),
    ]
)


In [8]:
# 3. Define the pydantic model:
class EnrichedTransactionInformation(BaseModel):
    transaction_type: Union[
        Literal["Purchase", "Withdrawal", "Deposit", "Bill Payment", "Refund"], None
    ]
    transaction_category: Union[
        Literal["Food", "Entertainment", "Transport", "Utilities", "Rent", "Other"],
        None,
    ]


In [9]:
# 4. Define the output parser:
output_parser = PydanticOutputParser(pydantic_object=EnrichedTransactionInformation)

In [10]:
# 5. Create an LCEL chain:
# chain = prompt | model | output_parser
chain = prompt | chat | output_parser

In [11]:
# 6. Invoke the chain for the whole dataset:
results = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    transaction = row["Transaction Description"]
    result = chain.invoke(
        {
            "transaction": transaction,
            "format_instructions": output_parser.get_format_instructions(),
        }
    )
    results.append(result)

100%|██████████| 20/20 [01:04<00:00,  3.22s/it]


In [12]:
# 7. Add the results to the dataframe, as columns transaction type and transaction category
transaction_types = []
transaction_categories = []

for result in results:
    transaction_types.append(result.transaction_type)
    transaction_categories.append(result.transaction_category)

In [13]:
df["transaction_type"] = transaction_types
df["transaction_category"] = transaction_categories
df.head()

Unnamed: 0,Transaction Description,transaction_type,transaction_category
0,cash deposit at local branch,Deposit,Other
1,cash deposit at local branch,Deposit,Other
2,withdrew money for rent payment,Withdrawal,Rent
3,withdrew cash for weekend expenses,Withdrawal,Other
4,purchased books from the bookstore,Purchase,Other


In [14]:
df.to_csv("_transactions_with_enriched_data.csv", index=False)

## Obtain the Accuracy of Mistral API:

In [15]:
from dotenv import load_dotenv
load_dotenv()

False

In [16]:
import pandas as pd
from tqdm import tqdm
import requests
import io

# Dataset URL:
url = "https://storage.googleapis.com/oreilly-content/transactions_with_enriched_data.csv"

# Download the file from the URL:
downloaded_file = requests.get(url)

# Load the transactions dataset:
df = pd.read_csv(io.StringIO(downloaded_file.text))
df.head()

Unnamed: 0,Transaction Description,transaction_type,transaction_category
0,cash deposit at local branch,Deposit,Other
1,cash deposit at local branch,Deposit,Other
2,withdrew money for rent payment,Withdrawal,Rent
3,withdrew cash for weekend expenses,Withdrawal,Other
4,purchased books from the bookstore,Purchase,Other


In [17]:
from langchain_mistralai.chat_models import ChatMistralAI
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic.v1 import BaseModel
from typing import Literal, Union
import os

In [18]:
# 1. Define the model:
mistral_api_key = os.environ["MISTRAL_API_KEY"]

In [None]:
print(mistral_api_key)

In [20]:
model = ChatMistralAI(model="mistral-small", mistral_api_key=mistral_api_key)

In [21]:
# 2. Define the prompt:
system_prompt = """You are are an expert at analyzing bank transactions, 
you will be categorising a single transaction. 
Always return a transaction type and category: do not return None.
Format Instructions:
{format_instructions}"""

user_prompt = """Transaction Text:
{transaction}"""

In [22]:
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system_prompt,
        ),
        (
            "user",
            user_prompt,
        ),
    ]
)

In [23]:
# 3. Define the pydantic model:
class EnrichedTransactionInformation(BaseModel):
    transaction_type: Union[
        Literal["Purchase", "Withdrawal", "Deposit", "Bill Payment", "Refund"], None
    ]
    transaction_category: Union[
        Literal["Food", "Entertainment", "Transport", "Utilities", "Rent", "Other"],
        None,
    ]

In [24]:
# 4. Define the output parser:
output_parser = PydanticOutputParser(pydantic_object=EnrichedTransactionInformation)

In [25]:
# 5. Create an LCEL chain:
chain = prompt | model | output_parser

In [26]:
# 6. Invoke the chain for the first transaction:
transaction = df.iloc[0]["Transaction Description"]

In [27]:
# This will often fail because Mistral puts a backslash in the JSON keys ...
result = chain.invoke(
        {
            "transaction": transaction,
            "format_instructions": output_parser.get_format_instructions(),
        }
    )
print(result) 
# This will often fail because Mistral puts a backslash in the JSON keys 
# i.e. "transaction\_type" instead of "transaction_type"

OutputParserException: Invalid json output: {
"transaction\_type": "Deposit",
"transaction\_category": "Other"
}

The above code intentionally fails, so let's fit it.

In [30]:
from langchain_core.output_parsers import StrOutputParser

# 7. Define a function to try to fix and remove the backslashes:
def remove_back_slashes(string):
    cleaned_string = string.replace("\\", "") # double slash to escape the slash
    return cleaned_string

chain = prompt | model | StrOutputParser() | remove_back_slashes | output_parser

transaction = df.iloc[0]["Transaction Description"]
result = chain.invoke(
        {
            "transaction": transaction,
            "format_instructions": output_parser.get_format_instructions(),
        }
    )
print(result) 

transaction_type='Deposit' transaction_category='Other'


In [31]:
# 8. Invoke the chain for the whole dataset:
results = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    transaction = row["Transaction Description"]
    try:
        result = chain.invoke(
            {
                "transaction": transaction,
                "format_instructions": output_parser.get_format_instructions(),
            }
        )
    except:
        result = EnrichedTransactionInformation(
            transaction_type=None, transaction_category=None
        )
        
    results.append(result)

100%|██████████| 20/20 [00:04<00:00,  4.81it/s]


In [32]:
# 9. Add the results to the dataframe, as columns transaction type and transaction category
transaction_types = []
transaction_categories = []

for result in results:
    transaction_types.append(result.transaction_type)
    transaction_categories.append(result.transaction_category)

df["mistral_transaction_type"] = transaction_types
df["mistral_transaction_category"] = transaction_categories
df.head()

Unnamed: 0,Transaction Description,transaction_type,transaction_category,mistral_transaction_type,mistral_transaction_category
0,cash deposit at local branch,Deposit,Other,Deposit,Other
1,cash deposit at local branch,Deposit,Other,Deposit,Other
2,withdrew money for rent payment,Withdrawal,Rent,Withdrawal,Rent
3,withdrew cash for weekend expenses,Withdrawal,Other,,
4,purchased books from the bookstore,Purchase,Other,,


In [33]:
# 10. Evaluate answers using LangChain evaluators
from langchain.evaluation import load_evaluator

evaluator = load_evaluator("exact_match")

# loop through the dataframe and evaluate the predictions
transaction_types = []
transaction_categories = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    transaction_type = row["transaction_type"]
    predicted_transaction_type = row["mistral_transaction_type"]
    transaction_category = row["transaction_category"]
    predicted_transaction_category = row["mistral_transaction_category"]

    transaction_type_score = evaluator.evaluate_strings(
        prediction=predicted_transaction_type,
        reference=transaction_type,
    )

    transaction_category_score = evaluator.evaluate_strings(
        prediction=predicted_transaction_category,
        reference=transaction_category,
    )

    transaction_types.append(transaction_type_score)
    transaction_categories.append(transaction_category_score)

100%|██████████| 20/20 [00:00<00:00, 16614.39it/s]


In [34]:
accuracy_score = 0

for transaction_type_score, transaction_category_score in zip(
    transaction_types, transaction_categories
):
    accuracy_score += transaction_type_score['score'] + transaction_category_score['score']

accuracy_score = accuracy_score / (len(transaction_types) * 2)
print(f"Accuracy score: {accuracy_score}")

Accuracy score: 0.2


## Compare Mistral API with GPT-3.5 API:

In [35]:
from langchain_openai.chat_models import ChatOpenAI

# 1. Define the model:
# model = ChatOpenAI(
#     model="gpt-3.5-turbo-1106",
#     model_kwargs={"response_format": {"type": "json_object"}},
# )

# chain = prompt | model | output_parser
chain = prompt | chat | output_parser

# 2. Invoke the chain for the first transaction:
transaction = df.iloc[0]["Transaction Description"]
result = chain.invoke(
        {
            "transaction": transaction,
            "format_instructions": output_parser.get_format_instructions(),
        }
    )
print(result)

transaction_type='Deposit' transaction_category='Other'


In [36]:
# 3. Invoke the chain for the whole dataset:
results = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    transaction = row["Transaction Description"]
    try:
        result = chain.invoke(
            {
                "transaction": transaction,
                "format_instructions": output_parser.get_format_instructions(),
            }
        )
    except:
        result = EnrichedTransactionInformation(
            transaction_type=None, transaction_category=None
        )
    
    results.append(result)

100%|██████████| 20/20 [00:59<00:00,  2.98s/it]


In [37]:
# 4. Add the results to the dataframe, as columns transaction type and transaction category
transaction_types = []
transaction_categories = []

for result in results:
    transaction_types.append(result.transaction_type)
    transaction_categories.append(result.transaction_category)

df["gpt3.5_transaction_type"] = transaction_types
df["gpt3.5_transaction_category"] = transaction_categories

df.head()

Unnamed: 0,Transaction Description,transaction_type,transaction_category,mistral_transaction_type,mistral_transaction_category,gpt3.5_transaction_type,gpt3.5_transaction_category
0,cash deposit at local branch,Deposit,Other,Deposit,Other,Deposit,Other
1,cash deposit at local branch,Deposit,Other,Deposit,Other,Deposit,Other
2,withdrew money for rent payment,Withdrawal,Rent,Withdrawal,Rent,Withdrawal,Rent
3,withdrew cash for weekend expenses,Withdrawal,Other,,,Withdrawal,Other
4,purchased books from the bookstore,Purchase,Other,,,Purchase,Other


In [38]:
# Loop through the dataframe and evaluate the predictions
transaction_types = []
transaction_categories = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    transaction_type = row["transaction_type"]
    predicted_transaction_type = row["gpt3.5_transaction_type"]
    transaction_category = row["transaction_category"]
    predicted_transaction_category = row["gpt3.5_transaction_category"]

    transaction_type_score = evaluator.evaluate_strings(
        prediction=predicted_transaction_type,
        reference=transaction_type,
    )

    transaction_category_score = evaluator.evaluate_strings(
        prediction=predicted_transaction_category,
        reference=transaction_category,
    )

    transaction_types.append(transaction_type_score)
    transaction_categories.append(transaction_category_score)

accuracy_score = 0

for transaction_type_score, transaction_category_score in zip(
    transaction_types, transaction_categories
):
    accuracy_score += transaction_type_score['score'] + transaction_category_score['score']

accuracy_score = accuracy_score / (len(transaction_types) * 2)
print(f"Accuracy score: {accuracy_score}")

100%|██████████| 20/20 [00:00<00:00, 11240.26it/s]

Accuracy score: 0.975





In [40]:
# Pairwise comparison between GPT-4 and Mistral
from langchain.evaluation import load_evaluator
evaluator = load_evaluator("labeled_pairwise_string")

row = df.iloc[0]
transaction = row["Transaction Description"]
gpt3pt5_category = row["gpt3.5_transaction_category"]
gpt3pt5_type = row["gpt3.5_transaction_type"]
mistral_category = row["mistral_transaction_category"]
mistral_type = row["mistral_transaction_type"]
reference_category = row["transaction_category"]
reference_type = row["transaction_type"]

# put the data into JSON format for the evaluator
gpt3pt5_data = f"""{{
    "transaction_category": "{gpt3pt5_category}",
    "transaction_type": "{gpt3pt5_type}"
}}"""

mistral_data = f"""{{
    "transaction_category": "{mistral_category}",
    "transaction_type": "{mistral_type}"
}}"""

reference_data = f"""{{
    "transaction_category": "{reference_category}",
    "transaction_type": "{reference_type}"
}}"""

# set up the prompt input for context for the evaluator
input_prompt = """You are are an expert at analyzing bank transactions, 
you will be categorising a single transaction. 
Always return a transaction type and category: do not return None.
Format Instructions:
{format_instructions}
Transaction Text:
{transaction}
"""

In [41]:
evaluator.evaluate_string_pairs(
    prediction=gpt3pt5_data,
    prediction_b=mistral_data,
    input=input_prompt.format(
        format_instructions=output_parser.get_format_instructions(),
        transaction=transaction),
    reference=reference_data,
)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: Nope!. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
# Run through the whole dataset and add the pairwise comparison scores to the dataframe:
pairwise_scores = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    transaction = row["Transaction Description"]
    gpt3pt5_category = row["gpt3.5_transaction_category"]
    gpt3pt5_type = row["gpt3.5_transaction_type"]
    mistral_category = row["mistral_transaction_category"]
    mistral_type = row["mistral_transaction_type"]
    reference_category = row["transaction_category"]
    reference_type = row["transaction_type"]

    # put the data into JSON format for the evaluator
    gpt3pt5_data = f"""{{
        "transaction_category": "{gpt3pt5_category}",
        "transaction_type": "{gpt3pt5_type}"
    }}"""

    mistral_data = f"""{{
        "transaction_category": "{mistral_category}",
        "transaction_type": "{mistral_type}"
    }}"""

    reference_data = f"""{{
        "transaction_category": "{reference_category}",
        "transaction_type": "{reference_type}"
    }}"""

    # set up the prompt input for context for the evaluator
    input_prompt = """You are are an expert at analyzing bank transactions, 
    you will be categorising a single transaction. 
    Always return a transaction type and category: do not return None.
    Format Instructions:
    {format_instructions}
    Transaction Text:
    {transaction}
    """

    pairwise = evaluator.evaluate_string_pairs(
        prediction=gpt3pt5_data,
        prediction_b=mistral_data,
        input=input_prompt.format(
            format_instructions=output_parser.get_format_instructions(),
            transaction=transaction),
        reference=reference_data,
    )
    pairwise_scores.append(pairwise)

# Add the pairwise scores to the dataframe:
reasoning = []
scores = []

for score in pairwise_scores:
    reasoning.append(score['reasoning'])
    scores.append(score['score'])

df['pairwise_reasoning'] = reasoning
df['pairwise_score'] = scores

df.head()
    

In [None]:
row = df.iloc[3]
print("Transaction: ", row['Transaction Description'])
print()
print("Reference:", row['transaction_type'], row['transaction_category'])
print("GPT-3.5 (A):", row['gpt3.5_transaction_type'], row['gpt3.5_transaction_category'])
print("Mistral (B):", row['mistral_transaction_type'], row['mistral_transaction_category'])

print()
print(row['pairwise_reasoning'])