In [None]:
pip install pandas scikit-learn joblib


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import joblib

# Load the labelled expenses; the 'description' and 'category' columns are
# the only ones read below.
df = pd.read_csv('expenses.csv')

# Drop rows missing either field: the text vectorizer fitted later in the
# notebook rejects NaN documents, and a NaN category is not a usable label.
df = df.dropna(subset=['description', 'category'])

# Basic check
print(df.head())

# Train-test split: hold out 20% of the rows for evaluation; the fixed
# random_state keeps the split reproducible between runs.
X = df['description']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


                   description       category
0  Ordered pizza from Domino's           Food
1          Uber ride to office      Transport
2         Bought movie tickets  Entertainment
3             Monthly bus pass      Transport
4                Dinner at KFC           Food


In [3]:
# Bag-of-words counts feeding a multinomial Naive Bayes classifier, chained
# into one estimator so raw description strings can be passed straight in.
vectorizer = CountVectorizer()
classifier = MultinomialNB()
model = make_pipeline(vectorizer, classifier)

# Fit on the training split only
model.fit(X_train, y_train)

# Mean accuracy on the held-out split
test_accuracy = model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Test Accuracy: 0.5


In [4]:
# Persist the fitted pipeline. The path is defined once so the file that is
# written and the confirmation message cannot drift apart.
MODEL_PATH = 'expense_category_predictor.pkl'
joblib.dump(model, MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")


Model saved as expense_category_predictor.pkl


In [5]:
# Quick test: run a couple of unseen descriptions through the fitted pipeline
# and print each prediction on its own line.
sample_descriptions = [
    "Bought biryani from restaurant",  # should predict 'Food'
    "Cab to railway station",          # should predict 'Transport'
]
for sample in sample_descriptions:
    print(model.predict([sample]))


['Food']
['Transport']


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# 1. Load your data
df = pd.read_csv("expenses.csv")

# 2. Preprocess the data
# Rows missing either the text or the label cannot be used for training.
df = df.dropna(subset=["description", "category"])
X = df["description"]  # Text data for description
y = df["category"]     # Labels for categories

# 3. Split data into train and test sets (80% training, 20% testing);
#    the fixed random_state keeps the split reproducible.
# NOTE(review): the recorded run evaluated on only 4 rows, so per-class scores
# are very noisy. Consider stratify=y or cross-validation — confirm first that
# every category has enough rows for a stratified split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Build a text classification pipeline
model = make_pipeline(
    TfidfVectorizer(),     # Converts text to TF-IDF weighted numerical features
    LogisticRegression(),  # Linear classifier trained on those features
)

# 5. Train the model
model.fit(X_train, y_train)

# 6. Evaluate the model (check performance on test set)
y_pred = model.predict(X_test)
print("Model Performance:")
# zero_division=0 reports 0.0 for classes that were never predicted (same
# numbers as the default) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

# 7. Save the trained model
joblib.dump(model, "expense_category_model.joblib")


Model Performance:
               precision    recall  f1-score   support

Entertainment       0.00      0.00      0.00         1
         Food       1.00      1.00      1.00         2
    Transport       0.00      0.00      0.00         1

     accuracy                           0.50         4
    macro avg       0.33      0.33      0.33         4
 weighted avg       0.50      0.50      0.50         4



['expense_category_model.joblib']