# Add src folder to Python path

In [1]:
import sys
import os
# Project root is the parent of the current working directory:
# os.path.abspath('') expands to the directory the kernel is running in.
ROOT_DIR = os.path.realpath(os.path.join(os.path.abspath(''), os.pardir))
FOLDER_PATH = os.path.join(ROOT_DIR, "src/")

# Make the modules under src/ importable. The membership check keeps the
# cell idempotent, so re-running it does not stack duplicate sys.path entries.
if FOLDER_PATH not in sys.path:
    sys.path.append(FOLDER_PATH)

In [2]:
from app import Application
import pandas as pd
import numpy as np
import joblib
import timeit
from pprint import pprint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Parameters
A trained model can be tested manually in this notebook.
Data is loaded from DATA_PATH and transformed into the format the
application uses by functionality.DataFrame.load_data(), which may be
extended to accept new file formats for your own purposes.
LIMIT is the minimum confidence (highest predicted class probability)
the model must reach for a prediction to be shown; predictions the model
is less certain about are left blank.

In [3]:
# CSV file whose rows are fed to the model.
# NOTE(review): this is an absolute path on one developer's machine; it has
# to be edited before the notebook can run anywhere else.
DATA_PATH = "/Users/rasmus/Ohjelmointi/visual_studio/AI_finance/tools/testing_file.csv"

# Serialized model file, opened relative to the current working directory.
MODEL_NAME = "trained_test_model.pkl"

# Confidence threshold: a prediction is printed only when its highest class
# probability is at least this value.
LIMIT = 0.8

In [4]:
# Deserialize the trained pipeline from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
with open(MODEL_NAME, 'rb') as file:
    model_pipeline = joblib.load(file)

# Reached only if loading succeeded, so the message doubles as confirmation.
print("Loaded model: {}\n".format(MODEL_NAME))

Loaded model: trained_test_model.pkl



# Used dataset

In [5]:
# Create the application and take its data container. `df` is the project's
# own wrapper object (the underlying table is fetched from it later via
# get_df()), not a bare pandas DataFrame.
app = Application()
df = app.data_frame

# Read the test file into the container.
df.load_data(DATA_PATH)

# Show the container's textual summary of what was loaded.
loaded_info = df.get_info_str()
print(loaded_info)

Local path: /Users/rasmus/Ohjelmointi/visual_studio/AI_finance/tools/testing_file.csv
Data loaded from AIDF-file
Used separator: (,) endoding: utf-8

Shape of the DataFrame: (146, 4)
         Date              Receiver  Amount   Category
0  2022-07-31          VR-YHTYMÄ OY  -49.90  COMMUTING
1  2022-07-28  DRESSMANN 707 KAMPPI  -19.95   CLOTHING
2  2022-07-28                   H&M  -19.99   CLOTHING
3  2022-07-27          VR-YHTYMÄ OY  -27.00  COMMUTING
4  2022-07-27     STOCKMANN TAPIOLA  -53.40   CLOTHING
5  2022-07-25            BESTSELLER  -79.99   CLOTHING
6  2022-07-25      NISSEN ISO OMENA   -7.90     HEALTH
7  2022-07-25      PRISMA ISO OMENA  -34.66       FOOD
8  2022-07-25          YA Iso Omena  -11.44     HEALTH
9  2022-07-25   ALEPA OTANIEMI UUSI  -14.56       FOOD

Rows with NaNs:
0

Rows with Empty strings:
2




# Dataset after removing nulls and empty strings

In [6]:
# Clean the loaded data. The call's return value is not used and the summary
# below reflects the change, so `df` itself is modified: re-running the
# earlier load cell is the way to get the uncleaned data back.
df.remove_empties()

# Print the summary again to confirm the effect of the cleaning step.
cleaned_info = df.get_info_str()
print(cleaned_info)

Local path: /Users/rasmus/Ohjelmointi/visual_studio/AI_finance/tools/testing_file.csv
Data loaded from AIDF-file
Used separator: (,) endoding: utf-8

Shape of the DataFrame: (144, 4)
         Date              Receiver  Amount   Category
0  2022-07-31          VR-YHTYMÄ OY  -49.90  COMMUTING
1  2022-07-28  DRESSMANN 707 KAMPPI  -19.95   CLOTHING
2  2022-07-28                   H&M  -19.99   CLOTHING
3  2022-07-27          VR-YHTYMÄ OY  -27.00  COMMUTING
4  2022-07-27     STOCKMANN TAPIOLA  -53.40   CLOTHING
5  2022-07-25            BESTSELLER  -79.99   CLOTHING
6  2022-07-25      NISSEN ISO OMENA   -7.90     HEALTH
7  2022-07-25      PRISMA ISO OMENA  -34.66       FOOD
8  2022-07-25          YA Iso Omena  -11.44     HEALTH
9  2022-07-25   ALEPA OTANIEMI UUSI  -14.56       FOOD

Rows with NaNs:
0

Rows with Empty strings:
0




# Actual testing data, new to the model

In [7]:
# Fetch the underlying table from the data container
# (presumably a pandas DataFrame, given the .iloc / .head usage below).
real_data = df.get_df()

# The model inputs are the columns at positions 1 and 2, which are
# Receiver and Amount in the recorded output of this notebook.
X = real_data.iloc[:, [1, 2]]

print("Shape of the X: {} (Rows, Columns)".format(X.shape))
print(X.head(10))

Shape of the X: (144, 2) (Rows, Columns)
               Receiver  Amount
0          VR-YHTYMÄ OY  -49.90
1  DRESSMANN 707 KAMPPI  -19.95
2                   H&M  -19.99
3          VR-YHTYMÄ OY  -27.00
4     STOCKMANN TAPIOLA  -53.40
5            BESTSELLER  -79.99
6      NISSEN ISO OMENA   -7.90
7      PRISMA ISO OMENA  -34.66
8          YA Iso Omena  -11.44
9   ALEPA OTANIEMI UUSI  -14.56


# Validating classifier

In [8]:
# Run the loaded pipeline over the unseen rows and time both calls together:
# predict() yields the predicted label per row, predict_proba() the
# per-class probabilities used for the confidence threshold below.
start = timeit.default_timer()
y_pred = model_pipeline.predict(X)
probas = model_pipeline.predict_proba(X)
stop = timeit.default_timer()

print("\n\n")
print("     Date:                           Receiver:    Amount:     Prediction:")

# One report line per row. Columns 0-2 of `real_data` are printed as
# date, receiver and amount, followed by the predicted category.
for i, (predicted, row_probas) in enumerate(zip(y_pred, probas)):
    # Blank out the prediction when the model's best class probability
    # is below the configured confidence threshold.
    category = " " if row_probas.max() < LIMIT else predicted
    # .iat reads each cell by (row, column) position directly. The previous
    # `real_data.iloc[i][0]` form built a row Series and indexed it with an
    # integer key, which is deprecated in recent pandas because the row's
    # labels are column names, not integers.
    print("%10s %35s %10.2f %15s" % (real_data.iat[i, 0],
                                     real_data.iat[i, 1],
                                     real_data.iat[i, 2],
                                     category))

print("\n\nPredicted: {:d} cases and threshold was: {:0.1f}".format(len(y_pred), LIMIT))
print("Total running time of predictions: {:f} seconds.".format(stop - start))




     Date:                           Receiver:    Amount:     Prediction:
2022-07-31                        VR-YHTYMÄ OY     -49.90       COMMUTING
2022-07-28                DRESSMANN 707 KAMPPI     -19.95                
2022-07-28                                 H&M     -19.99                
2022-07-27                        VR-YHTYMÄ OY     -27.00       COMMUTING
2022-07-27                   STOCKMANN TAPIOLA     -53.40                
2022-07-25                          BESTSELLER     -79.99                
2022-07-25                    NISSEN ISO OMENA      -7.90                
2022-07-25                    PRISMA ISO OMENA     -34.66                
2022-07-25                        YA Iso Omena     -11.44                
2022-07-25                 ALEPA OTANIEMI UUSI     -14.56            FOOD
2022-07-25            Compass Group Finland Oy      -2.70            FOOD
2022-07-25                 ALEPA OTANIEMI UUSI      -6.25            FOOD
2022-07-25                     AALT