Imports

In [17]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

Reading data from CSV

In [2]:
# Load the prepared dataset; the path is relative, so it is resolved against
# the directory the notebook kernel is running in.
df = pd.read_csv(filepath_or_buffer='../data/prepared_data.csv')

Split into training and test sets

In [5]:
# Separate the "isFraud" label column from the remaining (feature) columns.
y = df["isFraud"]
X = df.drop(columns="isFraud")

# Hold out 20% of the rows as a test set. Stratifying on y keeps the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

Random Forest

In [9]:
# Random Forest: grid-search the hyper-parameters on the training split
# (cv=3, scored by F1), then evaluate the refit best estimator on the test split.
rf = RandomForestClassifier(random_state=42)

# 2 x 2 x 2 candidate combinations; class_weight is pinned to 'balanced'.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    class_weight=['balanced'],
)

# fit() returns the fitted search object itself, so construction and fitting can be chained.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best params:", grid_search.best_params_)

# best_model is the estimator refit with the winning parameters; it is kept
# under this name because the model-saving cell serializes it.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Test-set metrics for the tuned model.
acc_rf = accuracy_score(y_true=y_test, y_pred=y_pred)
f1_rf = f1_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {acc_rf:.2f}")
print(f"F1-score: {f1_rf:.2f}")

Best params: {'class_weight': 'balanced', 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 1.00
F1-score: 0.74


LightGBM

In [12]:
# LightGBM: grid-search the hyper-parameters on the training split
# (cv=3, scored by F1), then evaluate the refit best estimator on the test split.
lgb = LGBMClassifier(random_state=42)

# 2 x 2 x 2 candidate combinations; class_weight is pinned to 'balanced'.
param_grid_lgb = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    num_leaves=[31, 50],
    class_weight=['balanced'],
)

grid_search_lgb = GridSearchCV(
    estimator=lgb,
    param_grid=param_grid_lgb,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_search_lgb.fit(X_train, y_train)

print("LightGBM best params:", grid_search_lgb.best_params_)

# Predict on the held-out rows with the estimator refit on the winning parameters.
best_lgb = grid_search_lgb.best_estimator_
y_pred_lgb = best_lgb.predict(X_test)

# Test-set metrics for the tuned model.
acc_lgb = accuracy_score(y_true=y_test, y_pred=y_pred_lgb)
f1_lgb = f1_score(y_true=y_test, y_pred=y_pred_lgb)

print(f"LightGBM Accuracy: {acc_lgb:.2f}")
print(f"LightGBM F1-score: {f1_lgb:.2f}")

[WinError 2] Nie można odnaleźć określonego pliku
  File "C:\Users\Szymon\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\Szymon\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Szymon\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                     

[LightGBM] [Info] Number of positive: 6570, number of negative: 5083526
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023615 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1285
[LightGBM] [Info] Number of data points in the train set: 5090096, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LightGBM best params: {'class_weight': 'balanced', 'max_depth': 20, 'n_estimators': 200, 'num_leaves': 50}
LightGBM Accuracy: 1.00
LightGBM F1-score: 0.69


Logistic Regression

In [13]:
# Logistic Regression: grid-search the hyper-parameters on the training split
# (cv=3, scored by F1), then evaluate the refit best estimator on the test split.
logreg = LogisticRegression(max_iter=500, random_state=42)

# 3 values of C x 2 solvers; penalty and class_weight are each pinned to one value.
param_grid_logreg = dict(
    penalty=['l2'],
    C=[0.1, 1.0, 10.0],
    class_weight=['balanced'],
    solver=['liblinear', 'lbfgs'],
)

grid_search_logreg = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid_logreg,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
grid_search_logreg.fit(X_train, y_train)

print("LogisticRegression best params:", grid_search_logreg.best_params_)

# Predict on the held-out rows with the estimator refit on the winning parameters.
best_logreg = grid_search_logreg.best_estimator_
y_pred_logreg = best_logreg.predict(X_test)

# Test-set metrics for the tuned model.
acc_logreg = accuracy_score(y_true=y_test, y_pred=y_pred_logreg)
f1_logreg = f1_score(y_true=y_test, y_pred=y_pred_logreg)

print(f"LogReg Accuracy: {acc_logreg:.2f}")
print(f"LogReg F1-score: {f1_logreg:.2f}")

LogisticRegression best params: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}
LogReg Accuracy: 0.96
LogReg F1-score: 0.05


Model persistence (saving the tuned Random Forest for deployment)

In [21]:
# Serialize the tuned Random Forest (best_model) to disk with joblib.
# The path is kept as a plain string so the value displayed by joblib.dump
# stays exactly '../model/randomforest_model.pkl' on every platform.
MODEL_PATH = "../model/randomforest_model.pkl"

# joblib.dump does not create missing directories: make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# NOTE: the resulting file is a pickle; only load it back from a trusted source.
joblib.dump(best_model, MODEL_PATH)

['../model/randomforest_model.pkl']