In [1]:
# ===================================================================
# PHASE 1 & 2: SETUP AND LOAD DATA
# ===================================================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print("--- Loading Data ---")
try:
    df = pd.read_csv('transactions.csv')
    print("Successfully loaded transactions.csv")
except FileNotFoundError:
    print("Error: 'transactions.csv' not found. Make sure it's in the same folder.")

# ===================================================================
# PHASE 3: PREPARE THE DATA
# ===================================================================
print("\n--- Cleaning Text ---")
df['cleaned_text'] = df['text'].str.lower()
df['cleaned_text'] = df['cleaned_text'].str.replace(r'[^a-z\s]', '', regex=True)

print("\n--- Splitting Data ---")
X = df['cleaned_text']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data splitting complete!")

# ===================================================================
# PHASE 4: BUILD AND TRAIN THE MODEL
# ===================================================================
print("\n--- Converting text to numbers (Vectorizing) ---")
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("Text converted successfully.")

print("\n--- Training the model ---")
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
print("\nModel training complete!")

--- Loading Data ---
Successfully loaded transactions.csv

--- Cleaning Text ---

--- Splitting Data ---
Data splitting complete!

--- Converting text to numbers (Vectorizing) ---
Text converted successfully.

--- Training the model ---

Model training complete!


In [2]:
# ===================================================================
# IMPORTS: All our tools
# ===================================================================
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB  # <-- NEW MODEL
from sklearn.metrics import accuracy_score

# ===================================================================
# LOAD AND PREPARE DATA
# ===================================================================
print("--- Loading and Preparing Data ---")
df = pd.read_csv('transactions.csv')
df['cleaned_text'] = df['text'].str.lower()
df['cleaned_text'] = df['cleaned_text'].str.replace(r'[^a-z\s]', '', regex=True)

X = df['cleaned_text']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ===================================================================
# BUILD AND TRAIN THE IMPROVED MODEL
# ===================================================================
print("\n--- Building and Training Improved Model ---")

# Vectorizer now ignores common "stop words"
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# We are now using the MultinomialNB model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)
print("Model training complete!")

# ===================================================================
# EVALUATE AND PREDICT
# ===================================================================
print("\n--- Evaluating Model Performance ---")
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

print("\n--- Making New Predictions ---")
new_transactions = [
    'Coffee at Starbucks',
    'AMZN PAY',
    'IRCTC Ticket Booking',
    'Monthly rent transfer'
]
cleaned_new = [re.sub(r'[^a-z\s]', '', text.lower()) for text in new_transactions]
new_transactions_tfidf = vectorizer.transform(cleaned_new)
new_predictions = model.predict(new_transactions_tfidf)

for text, category in zip(new_transactions, new_predictions):
    print(f"'{text}' ==> Predicted Category: {category}")

--- Loading and Preparing Data ---

--- Building and Training Improved Model ---
Model training complete!

--- Evaluating Model Performance ---
Model Accuracy: 16.67%

--- Making New Predictions ---
'Coffee at Starbucks' ==> Predicted Category: Food
'AMZN PAY' ==> Predicted Category: Bills & Utilities
'IRCTC Ticket Booking' ==> Predicted Category: Travel
'Monthly rent transfer' ==> Predicted Category: Bills & Utilities


In [3]:
# ===================================================================
# IMPORTS: All our tools
# ===================================================================
import pandas as pd
import re
from sklearn.model_selection import train_test_split, cross_val_score # <-- Import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC # <-- NEW, MORE POWERFUL MODEL
from sklearn.metrics import accuracy_score

# ===================================================================
# LOAD AND PREPARE DATA
# ===================================================================
print("--- Loading and Preparing Data ---")
df = pd.read_csv('transactions.csv')
# Drop any empty rows that might have been created
df.dropna(subset=['text', 'category'], inplace=True)
df['cleaned_text'] = df['text'].str.lower()
df['cleaned_text'] = df['cleaned_text'].str.replace(r'[^a-z\s]', '', regex=True)

X = df['cleaned_text']
y = df['category']

# ===================================================================
# BUILD AND TRAIN THE FINAL MODEL
# ===================================================================
print("\n--- Building and Training Final Model ---")

# Vectorizer now ignores common "stop words"
vectorizer = TfidfVectorizer(stop_words='english')

# We transform ALL the data now, as cross-validation will handle splitting
X_tfidf = vectorizer.fit_transform(X)

# We are now using the LinearSVC model
model = LinearSVC(random_state=42) # Using random_state for reproducibility

# ===================================================================
# EVALUATE WITH CROSS-VALIDATION
# ===================================================================
print("\n--- Evaluating Model with Cross-Validation ---")
# This performs 5 rounds of training and testing and gives us the scores
scores = cross_val_score(model, X_tfidf, y, cv=5)

print(f"Scores for each of the 5 folds: {scores}")
print(f"Average Cross-Validation Accuracy: {scores.mean() * 100:.2f}%")

# ===================================================================
# TRAIN FINAL MODEL ON ALL DATA & MAKE PREDICTIONS
# ===================================================================
# Now we train our final model on ALL the data so it's as smart as possible
model.fit(X_tfidf, y)
print("\n--- Making Final Predictions ---")

new_transactions = [
    'Coffee at Starbucks',
    'AMZN PAY',
    'IRCTC Ticket Booking',
    'Monthly rent transfer',
    'Swiggy order'
]
cleaned_new = [re.sub(r'[^a-z\s]', '', text.lower()) for text in new_transactions]
new_transactions_tfidf = vectorizer.transform(cleaned_new)
new_predictions = model.predict(new_transactions_tfidf)

for text, category in zip(new_transactions, new_predictions):
    print(f"'{text}' ==> Predicted Category: {category}")

--- Loading and Preparing Data ---

--- Building and Training Final Model ---

--- Evaluating Model with Cross-Validation ---
Scores for each of the 5 folds: [0.16666667 0.5        0.16666667 0.23529412 0.29411765]
Average Cross-Validation Accuracy: 27.25%

--- Making Final Predictions ---
'Coffee at Starbucks' ==> Predicted Category: Food
'AMZN PAY' ==> Predicted Category: Travel
'IRCTC Ticket Booking' ==> Predicted Category: Travel
'Monthly rent transfer' ==> Predicted Category: Bills & Utilities
'Swiggy order' ==> Predicted Category: Food


In [4]:
# ===================================================================
# IMPORTS: All our tools
# ===================================================================
import pandas as pd
import re
from sklearn.model_selection import train_test_split, cross_val_score # <-- Import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC # <-- NEW, MORE POWERFUL MODEL
from sklearn.metrics import accuracy_score

# ===================================================================
# LOAD AND PREPARE DATA
# ===================================================================
print("--- Loading and Preparing Data ---")
df = pd.read_csv('transactions.csv')
# Drop any empty rows that might have been created
df.dropna(subset=['text', 'category'], inplace=True)
df['cleaned_text'] = df['text'].str.lower()
df['cleaned_text'] = df['cleaned_text'].str.replace(r'[^a-z\s]', '', regex=True)

X = df['cleaned_text']
y = df['category']

# ===================================================================
# BUILD AND TRAIN THE FINAL MODEL
# ===================================================================
print("\n--- Building and Training Final Model ---")

# Vectorizer now ignores common "stop words"
vectorizer = TfidfVectorizer(stop_words='english')

# We transform ALL the data now, as cross-validation will handle splitting
X_tfidf = vectorizer.fit_transform(X)

# We are now using the LinearSVC model
model = LinearSVC(random_state=42) # Using random_state for reproducibility

# ===================================================================
# EVALUATE WITH CROSS-VALIDATION
# ===================================================================
print("\n--- Evaluating Model with Cross-Validation ---")
# This performs 5 rounds of training and testing and gives us the scores
scores = cross_val_score(model, X_tfidf, y, cv=5)

print(f"Scores for each of the 5 folds: {scores}")
print(f"Average Cross-Validation Accuracy: {scores.mean() * 100:.2f}%")

# ===================================================================
# TRAIN FINAL MODEL ON ALL DATA & MAKE PREDICTIONS
# ===================================================================
# Now we train our final model on ALL the data so it's as smart as possible
model.fit(X_tfidf, y)
print("\n--- Making Final Predictions ---")

new_transactions = [
    'Coffee at Starbucks',
    'AMZN PAY',
    'IRCTC Ticket Booking',
    'Monthly rent transfer',
    'Swiggy order'
]
cleaned_new = [re.sub(r'[^a-z\s]', '', text.lower()) for text in new_transactions]
new_transactions_tfidf = vectorizer.transform(cleaned_new)
new_predictions = model.predict(new_transactions_tfidf)

for text, category in zip(new_transactions, new_predictions):
    print(f"'{text}' ==> Predicted Category: {category}")

--- Loading and Preparing Data ---

--- Building and Training Final Model ---

--- Evaluating Model with Cross-Validation ---
Scores for each of the 5 folds: [0.16666667 0.5        0.16666667 0.23529412 0.29411765]
Average Cross-Validation Accuracy: 27.25%

--- Making Final Predictions ---
'Coffee at Starbucks' ==> Predicted Category: Food
'AMZN PAY' ==> Predicted Category: Travel
'IRCTC Ticket Booking' ==> Predicted Category: Travel
'Monthly rent transfer' ==> Predicted Category: Bills & Utilities
'Swiggy order' ==> Predicted Category: Food
