# Add src folder to Python path

In [7]:
import sys
import os
ROOT_DIR = os.path.realpath(os.path.join(os.path.abspath(''), '..'))
FOLDER_PATH = os.path.join(ROOT_DIR, "src/")
sys.path.append(FOLDER_PATH)

In [54]:
from app import Application
import pandas as pd
import numpy as np
import joblib
import json
from pprint import pprint
import xgboost as xgb
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Parameters

This module fits an optimized Random forest classifier 
to predict a category of banking transaction.
Data is loaded from the path and transformed to used format
in the app.DataFrame.load_data(), which may be
extended to accept new file types for your own purposes.
The model will be saved into the current folder.


In [46]:
DATA_PATH           = "/Users/rasmus/Desktop/training_data.csv"
TRAINING_SPLIT      = 0.2
RANDOM_SEARCH_N     = 500
CROSS_VALIDATION    = 3
SAVE_MODEL          = False
MODEL_NAME          = "trained_test_model.pkl"

# Used dataset

In [47]:
app = Application()
df = app.data_frame
df.load_data(DATA_PATH)
print(df.get_info_str())

Local path: /Users/rasmus/Desktop/training_data.csv
Data loaded from AIDF_GS-file
Used separator: (,) endoding: utf-8

Shape of the DataFrame: (927, 4)
         Date Receiver  Amount         Category
0  2019-05-01          -3168.0             FOOD
1  2019-05-01          -4680.0           LIVING
2  2019-05-01          -1212.0       TECHNOLOGY
3  2019-05-01          -1128.0          HOBBIES
4  2019-05-01           -684.0    ENTERTAINMENT
5  2019-05-01           -708.0        COMMUTING
6  2019-05-01           -804.0         CLOTHING
7  2019-05-01           -684.0           HEALTH
8  2019-05-01           -204.0  HOUSEHOLD ITEMS
9  2019-05-01          -7656.0        INVESTING

Rows with NaNs:
0

Rows with Empty strings:
267




# Dataset after removing nulls and empty strings

In [48]:
df.remove_empties()
print(df.get_info_str())

Local path: /Users/rasmus/Desktop/training_data.csv
Data loaded from AIDF_GS-file
Used separator: (,) endoding: utf-8

Shape of the DataFrame: (660, 4)
           Date                     Receiver   Amount      Category
91   2022-07-31  MOBILEPAY KATARIINA SEPPÄLÄ    50.00  OTHER INCOME
92   2022-07-31                 VR-YHTYMÄ OY   -49.90     COMMUTING
93   2022-07-29                     ICEYE OY  1836.55        SALARY
94   2022-07-28         DRESSMANN 707 KAMPPI   -19.95      CLOTHING
95   2022-07-28                          H&M   -19.99      CLOTHING
96   2022-07-27                 VR-YHTYMÄ OY   -27.00     COMMUTING
97   2022-07-27            STOCKMANN TAPIOLA   -53.40      CLOTHING
98   2022-07-25                   BESTSELLER   -79.99      CLOTHING
99   2022-07-25             NISSEN ISO OMENA    -7.90        HEALTH
100  2022-07-25             PRISMA ISO OMENA   -34.66          FOOD

Rows with NaNs:
0

Rows with Empty strings:
0




# Encode category names

In [89]:
path = os.path.join(FOLDER_PATH, "app/files/_gategories.json")
f = open(path)
data = json.load(f)

encoding = data['transaction_types']['expenditure_list']
encoding.update(data['transaction_types']['income_list'])

df_encoding = pd.DataFrame.from_dict(encoding, columns=['value'] ,orient='index')
df_encoding['Encoding'] = df_encoding.index.astype('int')

dataset = df.get_df().copy()
dataset = pd.merge(dataset, df_encoding, left_on='Category', right_on='value')

display(dataset)

Unnamed: 0,Date,Receiver,Amount,Category,value,Encoding
0,2022-07-31,MOBILEPAY KATARIINA SEPPÄLÄ,50.00,OTHER INCOME,OTHER INCOME,12
1,2022-07-07,SEPPÄLÄ ANTTI SAMULI,100.00,OTHER INCOME,OTHER INCOME,12
2,2022-07-04,SEPPÄLÄ SEIJA TUULIKKI,25.00,OTHER INCOME,OTHER INCOME,12
3,2022-06-27,MOBILEPAY KALLE HERMAN KEINONEN,400.00,OTHER INCOME,OTHER INCOME,12
4,2022-06-22,SEPPÄLÄ SEIJA TUULIKKI,400.00,OTHER INCOME,OTHER INCOME,12
...,...,...,...,...,...,...
655,2022-01-21,KANSANELÄKELAITOS,-35.80,UNCATEGORIZED,UNCATEGORIZED,9
656,2022-01-20,"POSTI OY, DIGIT",-22.01,UNCATEGORIZED,UNCATEGORIZED,9
657,2022-09-19,PAYTRAIL OYJ,-24.90,UNCATEGORIZED,UNCATEGORIZED,9
658,2022-10-14,"TEKNIIKAN AKATEEMISET, 00520, HELSIN",-46.12,UNCATEGORIZED,UNCATEGORIZED,9


# Splitting data into training and validation sets

In [81]:
dataset = dataset.loc[dataset['Date'] < '2023-01-01']
training_data = dataset.iloc[:, [1, 2]]
class_data = dataset.iloc[:, 5]
X_train, X_test, y_train, y_test = train_test_split(training_data, class_data, 
                                                    test_size=TRAINING_SPLIT, 
                                                    random_state=21, 
                                                    stratify=class_data)

print("Training X:" + str(X_train.shape) + " y:" + str(y_train.shape) + " (Rows, Columns)")
print("Testing  X:" + str(X_test.shape) + " y:" + str(y_test.shape) + " (Rows, Columns)")
print("\nHead of training X:")
print(X_train.head(5))
print("\nHead of training y:")
print(y_train.head(5))
print("\n\n\n\n")

Training X:(488, 2) y:(488,) (Rows, Columns)
Testing  X:(122, 2) y:(122,) (Rows, Columns)

Head of training X:
                      Receiver  Amount
329        ALEPA OTANIEMI UUSI  -38.20
514      IKEA VANTAA RAVINTOLA  -11.98
172     M Room Panorama Tower,  -28.50
517  0448 NESTE HEINOLA VIERUM   -5.80
279  K-CITYMARKET ESPOO ISO OM  -47.55

Head of training y:
329    1
514    7
172    2
517    7
279    1
Name: Encoding, dtype: int64







# Pipeline

In [82]:
'''
Text vectorizer
'''  
text_transformer = Pipeline(
    steps=[
       ('textVectorizer', CountVectorizer()),
       ('wordBankDimRed', SelectKBest(chi2, k='all'))
    ]
)
'''
Preprocessor of pipeline
'''
preprocessor = ColumnTransformer(
   transformers=[
       ('textTransformer', text_transformer, 0)
       
    ], remainder = 'passthrough'
) 
'''
Head of pipeline
'''
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ('randomForest', xgb.XGBClassifier())
    ]
)

# Fitting the model

In [83]:

# ======================= HYPERPARAMETERS TO BE TESTED ==========================
'''
Fitting functions tests randomly some of theses hyperparameters to find 
the best model. Changes can be made according to the "Best found parameters:"
''' 
n_estimators        = [int(x) for x in np.linspace(start = 1, stop = 120, num = 120)]
max_depth           = [int(x) for x in np.linspace(1, 50, num = 50)]
min_samples_split   = [int(x) for x in np.linspace(1, 20, num = 20)]
min_samples_leaf    = [int(x) for x in np.linspace(1, 10, num = 10)]
bootstrap           = [True, False]
chi2_k              = [int(x) for x in np.linspace(start = 50, stop = 200, num = 150)]


random_grid =  {'randomForest__n_estimators': n_estimators,
                'randomForest__max_depth': max_depth,
                'randomForest__min_samples_split': min_samples_split,
                'randomForest__min_samples_leaf': min_samples_leaf,
                'randomForest__bootstrap': bootstrap,
                'preprocessor__textTransformer__wordBankDimRed__k': chi2_k}


base_model = pipeline
base_model.fit(X_train, y_train)

tuned_model = RandomizedSearchCV(estimator=pipeline, 
                               param_distributions=random_grid, 
                               n_iter=RANDOM_SEARCH_N, 
                               cv=CROSS_VALIDATION, 
                               verbose=1, 
                               random_state=42, 
                               n_jobs =-1)

tuned_model.fit(X_train, y_train)
best_model = tuned_model.best_estimator_
print("\nBest found parameters:")
pprint(tuned_model.best_params_)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits
Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "bootstrap", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: 

# Validating classifier

In [86]:
def evaluate(model, test_features, test_labels):
    y_pred = model.predict(test_features)
    print(pd.crosstab(test_labels, y_pred, rownames=['Actual\u2193'], colnames=['Predicted\u2192']))
    accuracy = accuracy_score(test_labels, y_pred) 
    print('\nAccuracy {:0.2f}%.'.format(100*accuracy))
    return accuracy


print("Base model:")
base_accuracy = evaluate(base_model, X_test, y_test)

print("\nTuned model:")
random_accuracy = evaluate(best_model, X_test, y_test)

print('\nImprovement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Base model:
Predicted→  0   1   2   3   4   5   6   7   8   9   10  11  12
Actual↓                                                       
0            1   0   0   0   0   2   0   0   1   0   0   0   0
1            0  44   0   0   1   0   0   0   0   0   0   0   0
2            0   1   3   0   1   0   0   0   0   0   0   0   0
3            0   0   0   9   0   0   0   0   0   0   0   0   0
4            0   0   0   0  11   1   2   0   0   0   0   0   0
5            0   0   0   0   2   4   0   0   2   0   0   0   0
6            0   0   0   1   1   0   1   0   0   0   0   0   0
7            0   4   0   0   1   0   0   3   0   1   0   0   0
8            0   1   0   0   0   0   0   1   3   0   0   0   0
9            0   0   0   0   0   0   0   0   1   1   0   0   0
10           0   0   0   0   0   0   0   0   0   0   2   0   0
11           0   0   0   0   0   0   0   0   0   0   0   2   1
12           0   0   0   0   0   0   0   0   0   0   0   0  13

Accuracy 79.51%.

Tuned model:
Predicted→ 

In [92]:
dataset = dataset.loc[dataset['Date'] >= '2023-01-01']

def evaluate(model, test_features, test_labels):
    y_pred = model.predict(test_features)
    print(pd.crosstab(test_labels, y_pred, rownames=['Actual\u2193'], colnames=['Predicted\u2192']))
    accuracy = accuracy_score(test_labels, y_pred) 
    print('\nAccuracy {:0.2f}%.'.format(100*accuracy))
    return accuracy


print("\nTuned model:")
random_accuracy = evaluate(best_model, dataset.iloc[:, [1, 2]], dataset.iloc[:, 5])



Tuned model:
Predicted→  0   1   2   3   4   5   7   8   9   10  11  12
Actual↓                                                   
0            1   0   0   0   0   0   0   0   1   0   0   0
1            0   7   1   0   0   1  10   0   0   0   0   0
2            0   0   1   1   0   0   0   0   0   0   0   0
3            0   0   0   7   0   0   0   0   0   0   0   0
4            0   0   0   0   5   0   0   0   0   0   0   0
5            0   0   0   0   0   1   0   0   0   0   0   0
7            0   0   0   0   1   0   4   1   0   0   0   0
8            0   0   0   0   0   0   0   2   0   0   0   0
10           0   0   0   0   0   0   0   0   0   1   0   0
11           0   0   0   0   0   0   0   0   0   0   1   0
12           0   0   0   0   0   0   0   0   0   0   0   4

Accuracy 68.00%.


# Save the model

In [None]:
if SAVE_MODEL:
    with open(MODEL_NAME , 'wb') as file:
        joblib.dump(best_model, file)
        print("\nModel saved as: " + MODEL_NAME + " to current folder")