## Step 1: Train a model for transaction categorization

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Ask for the file path (strip stray whitespace from typed or pasted paths)
file_path = input("Enter the path to the dataset file: ").strip()

# Load the transaction data.
# The CSV must contain 'Payee' and 'Category' columns (case-sensitive).
df = pd.read_csv(file_path)

# Rows missing a payee or a category can neither be embedded nor used as
# training labels, so drop them before extracting the columns.
df = df.dropna(subset=['Payee', 'Category'])

# Extract payees and categories as parallel lists; payees are coerced to str
# because the encoder below only accepts text.
payees = df['Payee'].astype(str).tolist()
categories = df['Category'].tolist()

# Use a sentence transformer model to generate one embedding per payee
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(payees)

# Split data into training and testing sets (80/20, fixed seed so the split
# is reproducible between runs)
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, categories, test_size=0.2, random_state=42
)

# Train a simple logistic regression model on the payee embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Save the classifier together with the embeddings and categories of ALL rows
# (not just the training split). The output directory is created first
# because joblib.dump / np.save do not create missing directories.
os.makedirs('models', exist_ok=True)
joblib.dump(clf, 'models/transaction_classifier.pkl')
np.save('models/transaction_embeddings.npy', embeddings)
joblib.dump(categories, 'models/transaction_categories.pkl')

# Evaluate the model on the held-out split
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
# zero_division=1: score undefined metrics as 1 instead of emitting a warning
print(classification_report(y_test, y_pred, zero_division=1))


## Step 2: Use the trained model to categorize transactions

In [None]:
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Load Model and Data
# NOTE: joblib files are pickles and can execute code when loaded; only load
# artifacts written by the training step or another trusted source.
clf = joblib.load('models/transaction_classifier.pkl')
embeddings = np.load('models/transaction_embeddings.npy')
categories = joblib.load('models/transaction_categories.pkl')

# Load the sentence encoder used by categorize_transaction. It must be the
# same model that produced the saved embeddings, otherwise query vectors and
# stored vectors are not comparable.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Set Up FAISS for Similarity Search
# FAISS expects C-contiguous float32 rows; coerce in case the saved array
# was written with a different dtype or layout.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
d = embeddings.shape[1]  # embedding dimensionality
index = faiss.IndexFlatL2(d)
index.add(embeddings)

# Function to categorize new transactions
def categorize_transaction(payee, threshold=0.5):
    """Suggest a category for a single payee name.

    The payee is embedded with the module-level ``model``, its nearest stored
    embedding is looked up in the FAISS ``index``, and the category recorded
    for that neighbour is used as the suggestion. The classifier ``clf``
    supplies a confidence value that gates the answer.

    Args:
        payee: Payee name to categorize. Surrounding whitespace is ignored.
        threshold: Minimum classifier probability required to return the
            category without the "please categorize manually" wrapper.

    Returns:
        ``"Uncategorized"`` when the payee is ``None``/blank or the index
        returns no neighbour; a "Not too sure" message containing the
        suggested category when the confidence is below ``threshold``;
        otherwise the suggested category itself.
    """
    # A blank name carries no information; embedding it would only return an
    # arbitrary neighbour, so short-circuit before touching the models.
    if payee is None or not payee.strip():
        return "Uncategorized"

    vector = model.encode([payee.strip()])

    # Only the neighbour index is needed; the distance is not used.
    _, neighbours = index.search(vector, k=1)  # Get the closest match
    nearest = neighbours[0][0]
    if nearest == -1:  # FAISS reports -1 when no neighbour was found
        return "Uncategorized"

    predicted_category = categories[nearest]
    # NOTE(review): the confidence is the classifier's highest class
    # probability, which may belong to a different class than the
    # nearest-neighbour category returned here -- confirm this is intended.
    probability = max(clf.predict_proba(vector)[0])  # Get model confidence

    if probability < threshold:
        return f"Not too sure: Please categorize manually ({predicted_category} suggested)"

    return predicted_category

# Provide Payees
raw_payees = input("Enter payees separated by commas: ")

# Trim whitespace around each name and drop empty fragments (e.g. from a
# trailing comma or an empty answer) so only real names are categorized and
# the printed name matches what was looked up.
payees = [name.strip() for name in raw_payees.split(',') if name.strip()]

for payee in payees:
    print(f"Category for {payee}: {categorize_transaction(payee)}")