## Step 1 - Train a model for transaction categorization

In [1]:
import os

import faiss
import joblib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Ask for the file path
file_path = input("Enter the path to the dataset file: ")

# Load the transaction data.
# The CSV must provide 'Payee', 'Memo', 'Tran Type' and 'Category' columns.
df = pd.read_csv(file_path)

# Combine Payee, Memo and Tran Type for better transaction context.
# Every column is NaN-filled and cast to str before concatenation: a single
# missing value would otherwise turn the whole concatenated string into NaN,
# which the sentence encoder cannot process.
df['transaction_context'] = (
    df['Payee'].fillna("").astype(str)
    + " " + df['Memo'].fillna("").astype(str)
    + " " + df['Tran Type'].fillna("").astype(str)
)

# Extract combined text and categories (missing categories become "")
transaction_contexts = df['transaction_context'].tolist()
categories = df['Category'].fillna("").tolist()

# Use a sentence transformer model to generate L2-normalized embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(transaction_contexts, normalize_embeddings=True)

# Split data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, categories, test_size=0.2, random_state=42
)

# Train a simple logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Save the model, the embeddings and the per-row category labels.
# The output directory is created first so the cell works on a fresh checkout.
os.makedirs('models', exist_ok=True)
joblib.dump(clf, 'models/transaction_classifier.pkl')
np.save('models/transaction_embeddings.npy', embeddings)
joblib.dump(categories, 'models/transaction_categories.pkl')

# Build FAISS index over ALL embeddings; row i of the index corresponds to
# categories[i], which is what the lookup step relies on.
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
# FAISS expects a C-contiguous float32 matrix
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
faiss.write_index(index, 'models/transaction_faiss.index')

# Evaluate the model on the held-out split
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred, zero_division=1))


Enter the path to the dataset file:  data/ASB-Functor-2023-2025-data.csv


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Accuracy: 0.7396694214876033
                                   precision    recall  f1-score   support

                                        0.74      0.97      0.84        33
       Automobile > Car Insurance       1.00      0.00      0.00         1
            Automobile > Gas/Fuel       1.00      0.50      0.67         6
         Automobile > Maintenance       1.00      0.00      0.00         3
             Automobile > Parking       0.50      1.00      0.67         1
 Bills > Electricity, Bills > Gas       0.00      1.00      0.00         0
 Bills > Gas, Bills > Electricity       1.00      0.50      0.67         2
       Bills > Internet/Broadband       1.00      0.67      0.80         3
             Bills > Mobile Phone       0.75      0.60      0.67         5
                    Bills > Phone       1.00      0.00      0.00         1
                    Bills > Water       1.00      1.00      1.00         3
                          Cashout       1.00      0.00      0.00      

## Step 2 - Use the trained model to categorize transactions

In [2]:
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Load Model and Data
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts
# produced by the training step, never files from an untrusted source.
clf = joblib.load('models/transaction_classifier.pkl')
index = faiss.read_index('models/transaction_faiss.index')
categories = joblib.load('models/transaction_categories.pkl')

# Load the same sentence encoder that produced the training embeddings, so
# this cell does not depend on a `model` variable left over from Step 1.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to categorize new transactions
def categorize_transaction(payee, threshold=0.5):
    """Suggest a category for a payee string.

    The payee is embedded with the notebook-level sentence encoder `model`,
    its nearest neighbour is looked up in the FAISS `index`, and that
    neighbour's label from `categories` is used as the suggestion. The
    classifier `clf` only supplies the confidence value (its highest class
    probability); it does not choose the returned label.

    Args:
        payee: Free-text payee description; surrounding whitespace is ignored.
        threshold: Minimum classifier confidence needed to return the
            category without a manual-review warning.

    Returns:
        The suggested category, "Uncategorized" when the payee is empty, no
        neighbour is found or the neighbour has no label, or a
        "Not too sure ..." message when confidence is below `threshold`.
    """
    text = payee.strip() if payee else ""
    if not text:
        # Nothing to embed; avoid matching an arbitrary neighbour.
        return "Uncategorized"

    # Request normalized embeddings, exactly as in training, so the
    # classifier and the index see vectors on the same scale.
    vector = model.encode([text], normalize_embeddings=True)

    _, neighbours = index.search(vector, k=1)  # Get the closest match
    nearest = neighbours[0][0]
    if nearest == -1:
        return "Uncategorized"

    # Training rows without a category are stored as "", so map an empty
    # label to the same sentinel instead of returning a blank string.
    predicted_category = categories[nearest] or "Uncategorized"
    probability = max(clf.predict_proba(vector)[0])  # Get model confidence

    if probability < threshold:
        return f"Not too sure ({probability}): Please categorize manually ({predicted_category} suggested)"

    return predicted_category

# Provide Payees
# Strip each name so the printed label has no stray whitespace.
payees = [
    name.strip()
    for name in input("Enter payees separated by commas: ").split(',')
]

for payee in payees:
    if not payee:
        # Skip empty entries produced by stray or trailing commas.
        continue
    print(f"Category for {payee}: {categorize_transaction(payee)}")

Enter payees separated by commas:  kfc, fuyao


Category for kfc: Food & Dining > Dining/Eating Out
Category for  fuyao: Housing > Maintenance


Category for KFC: Food & Dining > Dining/Eating Out
Category for  PB tech: Not too sure (0.3653024109084847): Please categorize manually (Digital device suggested)


## Step 3 - Convert models to ONNX format

In [12]:
import joblib
import json
import skl2onnx
from skl2onnx.common.data_types import FloatTensorType
import onnx

# Restore the fitted scikit-learn classifier from disk
classifier = joblib.load("models/transaction_classifier.pkl")

# Debug aid: report the estimator type and the input width it expects
print("Model class:", classifier.__class__)
print("Number of features:", classifier.n_features_in_)

# Single float tensor input: variable batch size, fixed feature count
initial_type = [
    ("float_input", FloatTensorType([None, classifier.n_features_in_])),
]

# Convert to ONNX targeting opset 13, with the zipmap option turned off
try:
    onnx_model = skl2onnx.convert_sklearn(
        classifier,
        initial_types=initial_type,
        options={'zipmap': False},
        target_opset=13,
    )
except Exception as exc:
    # Report the failure, then let it propagate so the cell stops here
    print("❌ Error converting model to ONNX:", exc)
    raise
else:
    print("✅ ONNX model converted successfully!")

# Write the serialized ONNX graph next to the pickled classifier
with open("models/transaction_classifier.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_model.SerializeToString())

print("✅ ONNX model saved!")

# Reload the per-row category labels saved during training
categories = joblib.load("models/transaction_categories.pkl")

# Re-export them as JSON
with open("models/transaction_categories.json", "w") as json_file:
    json.dump(categories, json_file)

print("✅ Categories saved as JSON!")


Model class: <class 'sklearn.linear_model._logistic.LogisticRegression'>
Number of features: 384
✅ ONNX model converted successfully!
✅ ONNX model saved!
✅ Categories saved as JSON!


## Step 4 - Verify ONNX

In [13]:
import onnxruntime as ort
import numpy as np
import onnx

# Load ONNX model
model_path = "models/transaction_classifier.onnx"
print("🔍 Verifying ONNX Model: ", model_path)
onnx_model = onnx.load(model_path)

# Check ONNX model details.
# A model can import several operator sets (one per domain). Reporting only
# the first entry is misleading (it printed 1 here although opset 13 was
# targeted), so every domain/version pair is listed. An empty domain string
# denotes the default ONNX operator set.
print(f"✅ Model IR Version: {onnx_model.ir_version}")
for opset in onnx_model.opset_import:
    print(f"✅ Opset Version: {opset.version} (domain: {opset.domain or 'ai.onnx'})")

# Initialize ONNX runtime session
try:
    session = ort.InferenceSession(model_path)
    print("✅ ONNX Model Loaded Successfully!")
except Exception as e:
    print("❌ Failed to Load ONNX Model:", e)
    # Re-raise instead of exit(): exit() would shut down the notebook kernel.
    raise

# Verify model inputs
inputs = session.get_inputs()
print("✅ Model Inputs:")
for inp in inputs:
    print(f"   - Name: {inp.name}, Shape: {inp.shape}, Type: {inp.type}")

# Create a test input (random vector with the correct size).
# A seeded generator keeps the verification output reproducible across runs.
input_name = inputs[0].name
input_shape = inputs[0].shape

rng = np.random.default_rng(42)
test_input = rng.random((1, input_shape[1])).astype(np.float32)
print("🔍 Test Input Shape:", test_input.shape)

# Run inference
try:
    output = session.run(None, {input_name: test_input})
    print("✅ ONNX Model Output:", output)
except Exception as e:
    print("❌ ONNX Model Inference Failed:", e)
    # A verification step must fail loudly rather than swallow the error.
    raise


🔍 Verifying ONNX Model:  models/transaction_classifier.onnx
✅ Model IR Version: 7
✅ Opset Version: 1
✅ ONNX Model Loaded Successfully!
✅ Model Inputs:
   - Name: float_input, Shape: [None, 384], Type: tensor(float)
🔍 Test Input Shape: (1, 384)
✅ ONNX Model Output: [array(['Housing > Maintenance'], dtype=object), array([[0.02170445, 0.00107949, 0.02747615, 0.00539835, 0.0013078 ,
        0.15810925, 0.00266737, 0.00092421, 0.00547876, 0.01079663,
        0.00241947, 0.02365948, 0.00265162, 0.00097289, 0.03642365,
        0.00102673, 0.00336087, 0.00293931, 0.00264765, 0.00064904,
        0.00159974, 0.0009787 , 0.00265874, 0.07670357, 0.00633957,
        0.00108561, 0.06142418, 0.0398148 , 0.00073475, 0.00084074,
        0.25360036, 0.00339779, 0.00467516, 0.00545127, 0.00115727,
        0.00112946, 0.19985287, 0.00192685, 0.00902656, 0.0114347 ,
        0.00447419]], dtype=float32)]
