<a href="https://colab.research.google.com/github/sagarprince/expense_income_train_model/blob/main/ExpenseIncomeModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
pip install tensorflow pandas scikit-learn



In [55]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the raw transactions; four of its columns are used below.
df = pd.read_csv("transaction_data.csv")

# Model inputs: the transaction type and the free-text description.
transaction_type = df["Transaction_Type"]
description = df["Description"]

# Targets: one encoder per target column, each mapping class names to integer ids.
# The fitted encoders stay in scope so predicted ids can be mapped back to names.
label_encoder_category = LabelEncoder()
category = label_encoder_category.fit_transform(df["Category"])

label_encoder_label = LabelEncoder()
label = label_encoder_label.fit_transform(df["Label"])

# Hold out 20% of the rows for testing. Splitting all four arrays in a single
# call keeps inputs and targets row-aligned; the seed makes the split repeatable.
(
    transaction_type_train,
    transaction_type_test,
    description_train,
    description_test,
    category_train,
    category_test,
    label_train,
    label_test,
) = train_test_split(
    transaction_type,
    description,
    category,
    label,
    test_size=0.2,
    random_state=42,
)

# Every description is padded or truncated to this many tokens.
max_seq_length = 100

# Build the vocabulary from the training descriptions only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(description_train)

# Training split: words -> integer ids -> fixed-length rows.
train_sequences = tokenizer.texts_to_sequences(description_train)
train_data_pad = pad_sequences(train_sequences, maxlen=max_seq_length)

# Test split: same tokenizer, same target length.
test_sequences = tokenizer.texts_to_sequences(description_test)
test_data_pad = pad_sequences(test_sequences, maxlen=max_seq_length)

# Define the model: two inputs (description text, transaction type) feeding a
# shared dense trunk with two softmax heads (category and label).

# Description branch: token ids -> 128-d embeddings -> LSTM summary vector.
input_description = Input(shape=(max_seq_length,))
# Vocabulary size is word_index + 1 because Tokenizer ids start at 1 (0 is the
# padding value). `input_length` is deliberately not passed: the sequence length
# is already fixed by the Input layer, and the argument is deprecated in Keras 3.
embedding_description = Embedding(len(tokenizer.word_index) + 1, 128)(input_description)
lstm_description = LSTM(128)(embedding_description)

# Transaction-type branch: a single integer id -> 8-d embedding -> flat vector.
input_transaction_type = Input(shape=(1,))
embedding_transaction_type = Embedding(len(set(transaction_type)), 8)(input_transaction_type)
flatten_transaction_type = tf.keras.layers.Flatten()(embedding_transaction_type)

# Shared trunk over the concatenated features.
concatenated_features = Concatenate()([lstm_description, flatten_transaction_type])
dense1 = Dense(64, activation="relu")(concatenated_features)
dense2 = Dense(32, activation="relu")(dense1)

# One softmax head per target, sized from the classes each encoder learned.
# The layer names are the keys used for per-output losses and metrics.
output_category = Dense(len(label_encoder_category.classes_), activation="softmax", name="category")(dense2)
output_label = Dense(len(label_encoder_label.classes_), activation="softmax", name="label")(dense2)

model = Model(inputs=[input_description, input_transaction_type], outputs=[output_category, output_label])

# Compile the model.
# Targets are integer class ids, hence the sparse cross-entropy on both heads.
# Metrics are given per output name: a flat list such as ["accuracy"] is
# ambiguous for a multi-output model and newer Keras releases reject it.
model.compile(
    loss={"category": "sparse_categorical_crossentropy", "label": "sparse_categorical_crossentropy"},
    optimizer="adam",
    metrics={"category": ["accuracy"], "label": ["accuracy"]},
)

# Encode transaction type.
# Fit on the full column rather than only the training split: the embedding for
# this input is sized from the full column, and a type that happens to occur
# only in the test split would otherwise make `transform` raise ValueError.
label_encoder_transaction_type = LabelEncoder()
label_encoder_transaction_type.fit(transaction_type)
transaction_type_train_encoded = label_encoder_transaction_type.transform(transaction_type_train)
transaction_type_test_encoded = label_encoder_transaction_type.transform(transaction_type_test)

# Train the model.
# `epochs` is only an upper bound: training stops once the validation loss has
# not improved for 10 consecutive epochs, and the best weights seen are restored.
# NOTE(review): the held-out test split doubles as validation data here, so the
# test metrics reported afterwards are optimistic - consider a separate
# validation split.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)

model.fit(
    [train_data_pad, transaction_type_train_encoded],
    [category_train, label_train],
    epochs=1000,
    batch_size=64,
    validation_data=(
        [test_data_pad, transaction_type_test_encoded],
        [category_test, label_test]
    ),
    callbacks=[early_stopping],
)

# Evaluate the model on the held-out split.
# Results are requested as a dict and looked up by name instead of being
# unpacked positionally, so the cell does not depend on how many values (or in
# which order) the installed Keras version returns. A value that is not
# reported is printed as None rather than raising.
eval_results = model.evaluate(
    [test_data_pad, transaction_type_test_encoded],
    [category_test, label_test],
    return_dict=True,
)
loss = eval_results.get("loss")
category_loss = eval_results.get("category_loss")
label_loss = eval_results.get("label_loss")
category_accuracy = eval_results.get("category_accuracy")
label_accuracy = eval_results.get("label_accuracy")

print("Test Loss:", loss)
print("Category Loss:", category_loss)
print("Label Loss:", label_loss)
print("Category Accuracy:", category_accuracy)
print("Label Accuracy:", label_accuracy)

# Persist the trained network to an HDF5 file in the working directory.
# Only the Keras model is written here; the tokenizer and label encoders are not.
model.save(filepath="transaction_model.h5")
print("Trained Model Saved")


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [61]:
# Restore the trained network from the file written by the training cell.
loaded_model = tf.keras.models.load_model("transaction_model.h5")
print("Model Loaded")

# Example transaction messages; change the index below to classify another one.
sample_messages = [
    "INR 9602.00 debited to HDFC Bank A/C No XXXXXXXXXX3078 towards BAJAJ FINANCE LTD / 25038107  with UMRN HDFC7020910220007476",
    "HDFC Bank: Rs 850.00 debited from a/c **3078 on 28-02-23 to VPA paytmqr2810050501011ivc7tjxi39r@paytm(UPI Ref No 305969683136). Not you? Call on 18002586161 to report",
    "You've withdrawn Rs.1000 On HDFC Bank Debit Card xx0229 At YBL LAXMIPURI On 2023-07-01:17:20:47 Avl bal: Rs.180761.6 Not you?Call 18002586161",
    "Alert You've spent Rs 3478.00 On HDFC Bank Debit Card xx229 at Spicey on 2023-07-16 Avl Bal 232323",
]
new_data = sample_messages[0]

# Encode the message with the tokenizer fitted during training, then pad it to
# the same fixed length the model was trained on.
new_data_sequence = tokenizer.texts_to_sequences([new_data])
new_data_pad = pad_sequences(new_data_sequence, maxlen=max_seq_length)

# The transaction type of a new message is not known up front, so try each
# distinct type from the dataset (in order of first appearance) and stop at the
# first one for which neither head predicts the "unknown" class. If every type
# yields "unknown", the prediction from the last type tried is kept.
#
# Iterating distinct values instead of every dataset row avoids re-running
# predict for duplicate types; types the encoder never saw are skipped because
# `transform` would raise ValueError on them.
known_transaction_types = set(label_encoder_transaction_type.classes_)

# Defined up front so the prints below still work if no type could be tried.
predicted_category = None
predicted_label = None

for tt in dict.fromkeys(transaction_type):
    if tt not in known_transaction_types:
        continue

    tt_encoded = label_encoder_transaction_type.transform([tt])
    predictions_category, predictions_label = loaded_model.predict(
        [new_data_pad, tt_encoded], verbose=0
    )

    # The batch holds a single sample, so the flat argmax is the class id.
    predicted_category = label_encoder_category.inverse_transform([predictions_category.argmax()])[0]
    predicted_label = label_encoder_label.inverse_transform([predictions_label.argmax()])[0]

    if predicted_category != "unknown" and predicted_label != "unknown":
        break

print("Predicted Category:", predicted_category)
print("Predicted Label:", predicted_label)

Model Loaded
Predicted Category: Health
Predicted Label: Expense
