In [None]:
import pandas as pd
import pickle

with open("train_test_data.pkl", "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

In [None]:
# pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.5.0


In [None]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

param_grid = {
    'num_leaves': [31, 50, 70],  # More leaves = better fraud detection
    'learning_rate': [0.01, 0.05, 0.1],  # Lower rate = better separation
    'n_estimators': [500, 1000]  # More estimators = better detection
}

model = lgb.LGBMClassifier(class_weight={0: 1, 1: 2})

grid = GridSearchCV(model, param_grid, cv=3, scoring='recall')  # Optimize for recall
grid.fit(X_train, y_train)
print(grid.best_params_)

[LightGBM] [Info] Number of positive: 782590, number of negative: 782591
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.123548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1409
[LightGBM] [Info] Number of data points in the train set: 1565181, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666666 -> initscore=0.693146
[LightGBM] [Info] Start training from score 0.693146
[LightGBM] [Info] Number of positive: 782591, number of negative: 782590
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.135641 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1409
[LightGBM] [Info] Number of data points in the train set: 1565181, number of used features: 21
[LightGBM] [

In [None]:
# Best parameters: {'learning_rate': 0.1, 'n_estimators': 1000, 'num_leaves': 70}

best_clf = grid.best_estimator_

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(best_clf, X_train, y_train, cv=3, scoring='recall')
print(f'Cross-validation scores: {scores}')
print(f'Mean cross-validation score: {scores.mean()}')

[LightGBM] [Info] Number of positive: 782590, number of negative: 782591
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.114910 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1409
[LightGBM] [Info] Number of data points in the train set: 1565181, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.666666 -> initscore=0.693146
[LightGBM] [Info] Start training from score 0.693146
[LightGBM] [Info] Number of positive: 782591, number of negative: 782590
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.121816 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1409
[LightGBM] [Info] Number of data points in the train set: 1565181, number of used features: 21
[LightGBM] [

In [None]:
# Cross-validation scores: [0.96089917 0.9610064  0.96093996]
# Mean cross-validation score: 0.9609485078187677

preds = best_clf.predict(X_test)
pd.Series(preds).value_counts()

Unnamed: 0,count
0,299708
1,287236


In [None]:
pd.crosstab(y_test, preds)

col_0,0,1
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,288583,4889
1,11125,282347


In [None]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97    293472
           1       0.98      0.96      0.97    293472

    accuracy                           0.97    586944
   macro avg       0.97      0.97      0.97    586944
weighted avg       0.97      0.97      0.97    586944



In [None]:
import joblib

# joblib.dump(best_clf, "frauddetection.pkl")

['frauddetection.pkl']

In [None]:
# Get predicted probabilities
y_pred_proba = best_clf.predict_proba(X_test)[:, 1]  # Probability of class 1

# Adjust threshold to reduce False Negatives
threshold = 0.4  # Try 0.4 or lower like 0.35
y_pred_adjusted = (y_pred_proba > threshold).astype(int)

# Check new confusion matrix
print(pd.crosstab(y_test, y_pred_adjusted))
print(classification_report(y_test, y_pred_adjusted))

col_0       0       1
Label                
0      284468    9004
1        8534  284938
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    293472
           1       0.97      0.97      0.97    293472

    accuracy                           0.97    586944
   macro avg       0.97      0.97      0.97    586944
weighted avg       0.97      0.97      0.97    586944

