In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier


# Load and Preprocess Data from .csv

In [20]:
# Load data from datasets/data.csv
data = pd.read_csv('datasets/data.csv', delimiter=';')

# Drop columns that are not relevant for the model
data = data.drop(columns=['Date','Value_date','Recipient/Sender', 'Balance'])

# Shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

# Split the data into features X and targets y
X = data.drop(columns=['Category'])

y = data['Category']

print(y.value_counts(normalize=True))
print("##################")
print(X.head())
print("##################")
print(y.head())


Category
Revenue                     0.4777
Administrative Expenses     0.0927
Marketing                   0.0888
Taxes                       0.0877
Interest Expense            0.0866
Cost of Goods Sold          0.0852
Research and Development    0.0813
Name: proportion, dtype: float64
##################
  Reference                   IBAN    Amount Transaction_Type
0   REF4511  DE5075213287439027999    714.22          Deposit
1   INV2186  DE1860197269680432003  13777.66          Payment
2    PO5607  DE3390620094997185945  -2246.75          Payment
3   REF9026  DE1610482117117913758     51.64       Withdrawal
4   REF9119  DE2434045291625091977   -391.17          Payment
##################
0               Revenue
1               Revenue
2               Revenue
3    Cost of Goods Sold
4               Revenue
Name: Category, dtype: object


In [69]:
import tensorflow as tf
import keras
from keras.layers import Input, Dense, Embedding, LSTM, Concatenate, Dropout
from keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import class_weight


# Prepare data
X_ohe = pd.get_dummies(X, columns=['Transaction_Type'],dtype=int)

label_encoder = LabelEncoder()
y_ohe = tf.keras.utils.to_categorical(label_encoder.fit_transform(y))

# Scale the amount column
scaler = MinMaxScaler(feature_range=(-1, 1))
X_ohe['Amount'] = scaler.fit_transform(X_ohe[['Amount']])

# # create new column with the sign of the amount
# X_ohe['Amount_sign'] = np.sign(X_ohe['Amount'])

# Split data into training, test and validation set
X_train, X_test_val, y_train, y_test_val = train_test_split(X_ohe, y_ohe, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.3333333, random_state=42)


# Split data into text and numerical data
text_data_train = X_train['Reference'].values + " " + X_train['IBAN'].values
numerical_data_train = X_train.drop(columns=['Reference','IBAN']).values

text_data_test = X_test['Reference'].values + " " + X_test['IBAN'].values
numerical_data_test = X_test.drop(columns=['Reference','IBAN']).values

text_data_val = X_val['Reference'].values + " " + X_val['IBAN'].values
numerical_data_val = X_val.drop(columns=['Reference','IBAN']).values


# Parameters for the model
num_features = numerical_data_train.shape[1]
max_text_length = max([len(i) for i in text_data_train]) + 1  # +1 for padding
vocab_size = len(set(''.join(text_data_train))) + 1
num_classes = y.nunique()

# Prepare text data for model
tokenizer = Tokenizer(char_level=True, num_words=vocab_size)
tokenizer.fit_on_texts(text_data_train)

# Prepare text data for model
text_sequences_train = tokenizer.texts_to_sequences(text_data_train)
padded_text_sequences_train = pad_sequences(text_sequences_train, maxlen=max_text_length, padding='post')

text_sequences_test = tokenizer.texts_to_sequences(text_data_test)
padded_text_sequences_test = pad_sequences(text_sequences_test, maxlen=max_text_length, padding='post')

text_sequences_val = tokenizer.texts_to_sequences(text_data_val)
padded_text_sequences_val = pad_sequences(text_sequences_val, maxlen=max_text_length, padding='post')

# Numerical input
numerical_input = Input(shape=(num_features,), name="numerical_input")
x_num = Dense(64, activation="relu")(numerical_input)
x_num = tf.keras.layers.BatchNormalization()(x_num)
x_num = Dropout(0.2)(x_num)
x_num = Dense(64, activation="relu")(x_num)
x_num = tf.keras.layers.BatchNormalization()(x_num)
x_num = Dropout(0.2)(x_num)

# Text input
text_input = Input(shape=(max_text_length,), name="text_input")
x_text = Embedding(input_dim=vocab_size, output_dim=50)(text_input)
x_text = LSTM(64, return_sequences=True)(x_text)
x_text = tf.keras.layers.BatchNormalization()(x_text)
x_text = Dropout(0.2)(x_text)
x_text = LSTM(64)(x_text)
x_text = tf.keras.layers.BatchNormalization()(x_text)
x_text = Dropout(0.2)(x_text)

# Concatenate both inputs
combined = Concatenate()([x_num, x_text])
x = Dense(64, activation="relu")(combined)
x = tf.keras.layers.BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(64, activation="relu")(x)
x = tf.keras.layers.BatchNormalization()(x)
x = Dropout(0.2)(x)
output = Dense(num_classes, activation="softmax")(x)

# Create model
model = Model(inputs=[numerical_input, text_input], outputs=output)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
              loss="categorical_crossentropy",
              metrics=["accuracy"])

model.summary()


In [70]:
# Train model
# Calculate class weights
class_weights = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(y), y=y)))

# Train model with class weights
model.fit(x=[numerical_data_train, padded_text_sequences_train],
          y=y_train,
          epochs=10,
          batch_size=32,
          validation_data=([numerical_data_test, padded_text_sequences_test], y_test),
          class_weight=class_weights)


Epoch 1/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 27ms/step - accuracy: 0.3114 - loss: 2.3346 - val_accuracy: 0.5085 - val_loss: 1.4275
Epoch 2/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.4912 - loss: 1.8218 - val_accuracy: 0.5225 - val_loss: 1.1453
Epoch 3/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.5078 - loss: 1.7102 - val_accuracy: 0.5245 - val_loss: 1.1739
Epoch 4/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.4961 - loss: 1.7409 - val_accuracy: 0.5045 - val_loss: 1.2088
Epoch 5/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.5259 - loss: 1.6390 - val_accuracy: 0.5235 - val_loss: 1.1752
Epoch 6/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 32ms/step - accuracy: 0.5175 - loss: 1.6576 - val_accuracy: 0.5085 - val_loss: 1.1613
Epoch 7/10
[1m219/21

<keras.src.callbacks.history.History at 0x19f210d4f20>

In [71]:
# Evaluate model
pred = model.predict([numerical_data_val, padded_text_sequences_val])
pred_max = np.argmax(pred, axis=1)
prob = np.max(pred, axis=1)
print("Probabilities for each class: ")
print(pred[:5].round(3))
print("Highest probability for each sample: ")
print(prob[:5])
print("Predicted class: ")
print(label_encoder.inverse_transform(pred_max)[:5])
print("True class: ")
print(label_encoder.inverse_transform(np.argmax(y_test, axis=1))[:5])


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
Probabilities for each class: 
[[0.    0.001 0.001 0.001 0.001 0.997 0.001]
 [0.151 0.168 0.223 0.105 0.156 0.027 0.17 ]
 [0.    0.    0.    0.    0.    0.999 0.   ]
 [0.141 0.155 0.216 0.128 0.167 0.02  0.173]
 [0.136 0.149 0.224 0.114 0.194 0.001 0.181]]
Highest probability for each sample: 
[0.9967257  0.22349377 0.9994795  0.21636403 0.22431539]
Predicted class: 
['Revenue' 'Interest Expense' 'Revenue' 'Interest Expense'
 'Interest Expense']
True class: 
['Administrative Expenses' 'Revenue' 'Revenue' 'Revenue' 'Revenue']


# Design and Train a sklearn model. (Logistic Regression, RandomForests...)

In [4]:
# Combine text features
data['text'] = data['Reference'] + " " + data['Recipient/Sender']

# Use CountVectorizer to convert text features into a matrix of token counts
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(data['text'])

X_amount_numeric = data[['Amount']].values

# combine features
X_combined = hstack([X_text, X_amount_numeric])

# Split the data into training, validation and test set
X_train, X_test_val, y_train, y_test_val = train_test_split(X_combined, y, test_size=0.30, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.33333, random_state=42)

# Initialize the model
model = RandomForestClassifier()

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)

right_predictions = predictions == y_test
accuracy = np.sum(right_predictions) / len(y_test)
print("Accuracy:", accuracy)
print("Number of right predictions:", np.sum(right_predictions))
print("Number of wrong predictions:", len(y_test) - np.sum(right_predictions))


Accuracy: 0.4705
Number of right predictions: 941
Number of wrong predictions: 1059


In [5]:
def classify_transaction(dataline):
    text = vectorizer.transform([data_line['Reference']+ " " + data_line['Recipient/Sender']])
    amount = data_line[['Amount']].values.astype(np.float64)
    data_line_processed = hstack([text, amount])
    return model.predict(data_line_processed)

data_line = data.iloc[0]
categorie = classify_transaction(data_line)
print(categorie)





['Revenue']
