# Add src folder to Python path

In [15]:
import sys
import os
ROOT_DIR = os.path.realpath(os.path.join(os.path.abspath(''), '..'))
FOLDER_PATH = os.path.join(ROOT_DIR, "src/")
sys.path.append(FOLDER_PATH)

In [21]:
from app import Application
import pandas as pd
import numpy as np
import joblib
from pprint import pprint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Parameters

This module fits an optimized Random forest classifier 
to predict a category of banking transaction.
Data is loaded from the path and transformed to used format
in the app.DataFrame.load_data(), which may be
extended to accept new file types for your own purposes.
The model will be saved into the current folder.


In [22]:
DATA_PATH           = "/Users/rasmus/Ohjelmointi/visual_studio/AI_finance/tools/testing_file.csv"
TRAINING_SPLIT      = 0.2
RANDOM_SEARCH_N     = 500
CROSS_VALIDATION    = 3
SAVE_MODEL          = True
MODEL_NAME          = "trained_test_model.pkl"

# Used dataset

In [26]:
app = Application()
df = app.data_frame
df.load_data(DATA_PATH)
print(df.get_info_str())

Local path: /Users/rasmus/Ohjelmointi/visual_studio/AI_finance/tools/testing_file.csv
Data loaded from AIDF-file
Used separator: (,) endoding: ISO-8859-1

Shape of the DataFrame: (112, 4)
         Date                   Receiver  Amount       Category
0  2022-10-31                    DNA Oyj  -18.90     TECHNOLOGY
1  2022-10-31                HSL Mobiili   -2.50      COMMUTING
2  2022-10-31                DISNEY PLUS   -8.99  ENTERTAINMENT
3  2022-10-31                HSL Mobiili   -2.50      COMMUTING
4  2022-10-31        K-Supermarket Derby  -13.24           FOOD
5  2022-10-31                KANRESTA OY   -3.30           FOOD
6  2022-10-31  K-CITYMARKET ESPOO LEPPAV  -60.98           FOOD
7  2022-10-30       CENTAURUS FINLAND KY -982.00         LIVING
8  2022-10-28                     KIASMA  -18.00           FOOD
9  2022-10-28            C HOTEL HELSINK  -15.00           FOOD

Rows with NaNs:
Empty DataFrame
Columns: [Date, Receiver, Amount, Category]
Index: []

Rows with Empty stri

# Dataset after removing nulls and empty strings

In [27]:
df.remove_empties()
print(df.get_info_str())

Local path: /Users/rasmus/Ohjelmointi/visual_studio/AI_finance/tools/testing_file.csv
Data loaded from AIDF-file
Used separator: (,) endoding: ISO-8859-1

Shape of the DataFrame: (110, 4)
         Date                   Receiver  Amount       Category
0  2022-10-31                    DNA Oyj  -18.90     TECHNOLOGY
1  2022-10-31                HSL Mobiili   -2.50      COMMUTING
2  2022-10-31                DISNEY PLUS   -8.99  ENTERTAINMENT
3  2022-10-31                HSL Mobiili   -2.50      COMMUTING
4  2022-10-31        K-Supermarket Derby  -13.24           FOOD
5  2022-10-31                KANRESTA OY   -3.30           FOOD
6  2022-10-31  K-CITYMARKET ESPOO LEPPAV  -60.98           FOOD
7  2022-10-30       CENTAURUS FINLAND KY -982.00         LIVING
8  2022-10-28                     KIASMA  -18.00           FOOD
9  2022-10-28            C HOTEL HELSINK  -15.00           FOOD

Rows with NaNs:
Empty DataFrame
Columns: [Date, Receiver, Amount, Category]
Index: []

Rows with Empty stri

# Splitting data into training and validation sets

In [28]:
dataset = df.get_df()
training_data = dataset.iloc[:, [1, 2]]
class_data = dataset.iloc[:, 3]
X_train, X_test, y_train, y_test = train_test_split(training_data, class_data, 
                                                    test_size=TRAINING_SPLIT, 
                                                    random_state=21, 
                                                    stratify=class_data)

print("Training X:" + str(X_train.shape) + " y:" + str(y_train.shape) + " (Rows, Columns)")
print("Testing  X:" + str(X_test.shape) + " y:" + str(y_test.shape) + " (Rows, Columns)")
print("\nHead of training X:")
print(X_train.head(5))
print("\nHead of training y:")
print(y_train.head(5))
print("\n\n\n\n")

Training X:(88, 2) y:(88,) (Rows, Columns)
Testing  X:(22, 2) y:(22,) (Rows, Columns)

Head of training X:
                                 Receiver  Amount
33   POLYTEKNIKKOJEN PARTIOKLUBI TEEPAKKI  -12.00
29                    PAYPAL *ETSYIRELAND   -6.10
108             ALKO HKI KANNELMAKI PRISM  -22.49
34   TEKNIIKAN AKATEEMISET, 00520, HELSIN  -46.12
7                    CENTAURUS FINLAND KY -982.00
61                             CHF*PNTpay   -5.00
30                            HSL Mobiili  -52.30
20              ALKO HKI KANNELMAKI PRISM  -22.49
102           MOBILEPAY KATARIINA SEPPÄLÄ   20.00
22                    K-market Maununneva   -1.79

Head of training y:
33           HOBBIES
29              FOOD
108    ENTERTAINMENT
34              FOOD
7             LIVING
61              FOOD
30         COMMUTING
20     ENTERTAINMENT
102     OTHER INCOME
22              FOOD
Name: Category, dtype: object







# Pipeline

In [29]:
'''
Text vectorizer
'''  
text_transformer = Pipeline(
    steps=[
       ('textVectorizer', CountVectorizer()),
       ('wordBankDimRed', SelectKBest(chi2, k='all'))
    ]
)
'''
Preprocessor of pipeline
'''
preprocessor = ColumnTransformer(
   transformers=[
       ('textTransformer', text_transformer, 0)
       
    ], remainder = 'passthrough'
) 
'''
Head of pipeline
'''
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ('randomForest', RandomForestClassifier())
    ]
)

# Fitting the model

In [31]:

# ======================= HYPERPARAMETERS TO BE TESTED ==========================
'''
Fitting functions tests randomly some of theses hyperparameters to find 
the best model. Changes can be made according to the "Best found parameters:"
''' 
n_estimators        = [int(x) for x in np.linspace(start = 1, stop = 120, num = 120)]
max_depth           = [int(x) for x in np.linspace(1, 50, num = 50)]
min_samples_split   = [int(x) for x in np.linspace(1, 20, num = 20)]
min_samples_leaf    = [int(x) for x in np.linspace(1, 10, num = 10)]
bootstrap           = [True, False]
chi2_k              = [int(x) for x in np.linspace(start = 50, stop = 200, num = 150)]


random_grid =  {'randomForest__n_estimators': n_estimators,
                'randomForest__max_depth': max_depth,
                'randomForest__min_samples_split': min_samples_split,
                'randomForest__min_samples_leaf': min_samples_leaf,
                'randomForest__bootstrap': bootstrap,
                'preprocessor__textTransformer__wordBankDimRed__k': chi2_k}


base_model = pipeline
base_model.fit(X_train, y_train)

tuned_model = RandomizedSearchCV(estimator=pipeline, 
                               param_distributions=random_grid, 
                               n_iter=RANDOM_SEARCH_N, 
                               cv=CROSS_VALIDATION, 
                               verbose=1, 
                               random_state=42, 
                               n_jobs =-1)

tuned_model.fit(X_train, y_train)
best_model = tuned_model.best_estimator_
print("\nBest found parameters:")
pprint(tuned_model.best_params_)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits





Best found parameters:
{'preprocessor__textTransformer__wordBankDimRed__k': 63,
 'randomForest__bootstrap': False,
 'randomForest__max_depth': 37,
 'randomForest__min_samples_leaf': 1,
 'randomForest__min_samples_split': 20,
 'randomForest__n_estimators': 40}


1244 fits failed out of a total of 1500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rasmus/opt/miniconda3/envs/rosetta_3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rasmus/opt/miniconda3/envs/rosetta_3.9/lib/python3.9/site-packages/sklearn/pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/rasmus/opt/miniconda3/envs/rosetta_3.9/lib/python3.9/site-packages/sklearn/pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/rasmus/opt/miniconda3/env

# Validating classifier

In [32]:
def evaluate(model, test_features, test_labels):
    y_pred = model.predict(test_features)
    print(pd.crosstab(test_labels, y_pred, rownames=['Actual\u2193'], colnames=['Predicted\u2192']))
    accuracy = accuracy_score(test_labels, y_pred) 
    print('\nAccuracy {:0.2f}%.'.format(100*accuracy))
    return accuracy


print("Base model:")
base_accuracy = evaluate(base_model, X_test, y_test)

print("\nTuned model:")
random_accuracy = evaluate(best_model, X_test, y_test)

print('\nImprovement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Base model:
Predicted→       COMMUTING  FOOD  HOBBIES  HOUSEHOLD ITEMS  LIVING  \
Actual↓                                                              
COMMUTING                1     0        0                0       0   
ENTERTAINMENT            0     2        0                0       0   
FOOD                     0     9        0                0       0   
HEALTH                   0     1        0                0       0   
HOBBIES                  0     0        1                0       0   
HOUSEHOLD ITEMS          0     0        0                1       0   
LIVING                   0     0        0                0       1   
OTHER INCOME             0     0        0                0       0   
TECHNOLOGY               0     0        0                0       0   
UNCATEGORIZED            0     0        0                0       0   

Predicted→       OTHER INCOME  TECHNOLOGY  UNCATEGORIZED  
Actual↓                                                   
COMMUTING                   0

# Save the model

In [33]:
if SAVE_MODEL:
    with open(MODEL_NAME , 'wb') as file:
        joblib.dump(best_model, file)
        print("\nModel saved as: " + MODEL_NAME + " to current folder")


Model saved as: trained_test_model.pkl to current folder
