# Add src folder to Python path

In [1]:
import sys
import os
# Make the project's src/ package importable from this notebook.
# abspath('') resolves to the current working directory; the project root is
# taken to be its parent directory.
_notebook_dir = os.path.abspath('')
ROOT_DIR = os.path.realpath(os.path.join(_notebook_dir, os.pardir))
FOLDER_PATH = os.path.join(ROOT_DIR, "src/")
sys.path += [FOLDER_PATH]

In [9]:
from app import Application
import pandas as pd
import numpy as np
import joblib
import timeit
from pprint import pprint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Parameters
The trained model can be tested manually in this notebook.
Data is loaded from DATA_PATH and transformed into the expected format
by functionality.DataFrame.load_data(), which may be
extended to accept new file formats for your own purposes.
LIMIT is the minimum predicted probability required for a prediction to be shown, i.e. how certain the model must be about its prediction.

In [10]:
# Path of the file to classify. The default is the original author's local
# file; set the AI_FINANCE_TEST_DATA environment variable to point the notebook
# at another file without editing this cell.
DATA_PATH   = os.environ.get(
    "AI_FINANCE_TEST_DATA",
    "/Users/rasmus/Ohjelmointi/visual_studio/AI_finance/tools/testing_file.csv",
)
# File name of the serialized model pipeline, opened relative to the working directory.
MODEL_NAME  = "trained_test_model.pkl"
# Minimum top-class probability for a prediction to be printed; rows below
# this threshold are shown with a blank category.
LIMIT       = 0.8

In [11]:
# Deserialize the trained pipeline from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted source.
with open(MODEL_NAME, mode='rb') as file:
    model_pipeline = joblib.load(file)

# Reached only when loading succeeded.
print("Loaded model: ", MODEL_NAME, "\n", sep="")

Loaded model: trained_test_model.pkl



# Used dataset

In [12]:
# Create the project application object and take its data-frame attribute.
# NOTE(review): `df` is the project's own object (it exposes load_data,
# get_info_str, remove_empties and get_df in this notebook), presumably a
# wrapper around a pandas DataFrame rather than a DataFrame itself; confirm.
app = Application()
df = app.data_frame
# Read the file at DATA_PATH into the object, then print its summary text.
df.load_data(DATA_PATH)
print(df.get_info_str())

Local path: /Users/rasmus/Ohjelmointi/visual_studio/AI_finance/tools/testing_file.csv
Data loaded from AIDF-file
Used separator: (,) endoding: ISO-8859-1

Shape of the DataFrame: (112, 4)
         Date                   Receiver  Amount       Category
0  2022-10-31                    DNA Oyj  -18.90     TECHNOLOGY
1  2022-10-31                HSL Mobiili   -2.50      COMMUTING
2  2022-10-31                DISNEY PLUS   -8.99  ENTERTAINMENT
3  2022-10-31                HSL Mobiili   -2.50      COMMUTING
4  2022-10-31        K-Supermarket Derby  -13.24           FOOD
5  2022-10-31                KANRESTA OY   -3.30           FOOD
6  2022-10-31  K-CITYMARKET ESPOO LEPPAV  -60.98           FOOD
7  2022-10-30       CENTAURUS FINLAND KY -982.00         LIVING
8  2022-10-28                     KIASMA  -18.00           FOOD
9  2022-10-28            C HOTEL HELSINK  -15.00           FOOD

Rows with NaNs:
0

Rows with Empty strings:
2




# Dataset after removing nulls and empty strings

In [13]:
# Drop incomplete rows. The return value is not used, so this call is relied
# on to modify `df` itself; in the recorded output the shape goes from
# (112, 4) to (110, 4) and the empty-string count from 2 to 0.
df.remove_empties()
# Print the summary again to show the effect of the cleanup.
print(df.get_info_str())

Local path: /Users/rasmus/Ohjelmointi/visual_studio/AI_finance/tools/testing_file.csv
Data loaded from AIDF-file
Used separator: (,) endoding: ISO-8859-1

Shape of the DataFrame: (110, 4)
         Date                   Receiver  Amount       Category
0  2022-10-31                    DNA Oyj  -18.90     TECHNOLOGY
1  2022-10-31                HSL Mobiili   -2.50      COMMUTING
2  2022-10-31                DISNEY PLUS   -8.99  ENTERTAINMENT
3  2022-10-31                HSL Mobiili   -2.50      COMMUTING
4  2022-10-31        K-Supermarket Derby  -13.24           FOOD
5  2022-10-31                KANRESTA OY   -3.30           FOOD
6  2022-10-31  K-CITYMARKET ESPOO LEPPAV  -60.98           FOOD
7  2022-10-30       CENTAURUS FINLAND KY -982.00         LIVING
8  2022-10-28                     KIASMA  -18.00           FOOD
9  2022-10-28            C HOTEL HELSINK  -15.00           FOOD

Rows with NaNs:
0

Rows with Empty strings:
0




# Actual testing data, new to the model

In [14]:
# Fetch the cleaned table from the project object (presumably a pandas
# DataFrame, given the .iloc/.shape/.head usage below).
real_data = df.get_df()

# Model input: the columns at positions 1 and 2, which are Receiver and
# Amount in the dataset printed above.
feature_positions = [1, 2]
X = real_data.iloc[:, feature_positions]

print('Shape of the X: ', X.shape, " (Rows, Columns)", sep="")
print(X.head(10))

Shape of the X: (110, 2) (Rows, Columns)
                    Receiver  Amount
0                    DNA Oyj  -18.90
1                HSL Mobiili   -2.50
2                DISNEY PLUS   -8.99
3                HSL Mobiili   -2.50
4        K-Supermarket Derby  -13.24
5                KANRESTA OY   -3.30
6  K-CITYMARKET ESPOO LEPPAV  -60.98
7       CENTAURUS FINLAND KY -982.00
8                     KIASMA  -18.00
9            C HOTEL HELSINK  -15.00


# Validating classifier

In [16]:
# Run the loaded pipeline on the unseen data and time both inference calls
# together (class labels and per-class probabilities).
start   = timeit.default_timer()
y_pred  = model_pipeline.predict(X)
probas  = model_pipeline.predict_proba(X)
stop    = timeit.default_timer()

print("\n\n")
print("     Date:                           Receiver:    Amount:     Prediction:")

# One report line per sample. A prediction is shown only when its highest
# class probability reaches LIMIT; otherwise the category is left blank.
for i, (category, class_probas) in enumerate(zip(y_pred, probas)):
    if class_probas.max() < LIMIT:
        category = " "
    # Scalar positional access with iloc[row, col]: avoids building a row
    # Series per lookup and avoids integer-key `[]` indexing on a
    # string-labelled Series, which pandas has deprecated.
    print("%10s %35s %10.2f %15s" % (real_data.iloc[i, 0],
                                     real_data.iloc[i, 1],
                                     real_data.iloc[i, 2],
                                     category))

print("\n\nPredicted: {:d} cases and threshold was: {:0.1f}".format(len(y_pred), LIMIT))
print("Total running time of predictions: {:f} seconds.".format(stop - start))




     Date:                           Receiver:    Amount:     Prediction:
2022-10-31                             DNA Oyj     -18.90                
2022-10-31                         HSL Mobiili      -2.50       COMMUTING
2022-10-31                         DISNEY PLUS      -8.99                
2022-10-31                         HSL Mobiili      -2.50       COMMUTING
2022-10-31                 K-Supermarket Derby     -13.24            FOOD
2022-10-31                         KANRESTA OY      -3.30            FOOD
2022-10-31           K-CITYMARKET ESPOO LEPPAV     -60.98                
2022-10-30                CENTAURUS FINLAND KY    -982.00          LIVING
2022-10-28                              KIASMA     -18.00            FOOD
2022-10-28                     C HOTEL HELSINK     -15.00            FOOD
2022-10-27                        PAYTRAIL OYJ     -15.00                
2022-10-27                        VFI*Baoyu Oy     -12.50   ENTERTAINMENT
2022-10-25                     Espo