In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import (
    PowerTransformer,
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
)
from skopt import BayesSearchCV
from skopt.space import Integer, Real
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pickle
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE

from scipy import stats
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
import re
from sklearn.impute import SimpleImputer

from ydata_profiling import ProfileReport
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Load data
excel_file_path = "./creditcard1.csv"
df = pd.read_csv(excel_file_path, encoding="latin-1")

In [3]:
df.sample(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
67108,52501.0,1.22989,0.148845,0.146808,0.521021,-0.325817,-0.60466,-0.148411,0.071443,0.082409,...,-0.277678,-0.934802,0.096725,-0.088763,0.164583,0.103386,-0.040241,0.01371,4.49,0
282992,172122.0,0.822027,-2.36518,-2.013855,0.61591,-0.607972,-0.262464,0.77301,-0.283916,1.148864,...,0.191159,-0.736414,-0.347905,0.497661,-0.269836,-0.172189,-0.157457,0.059176,652.72,0
182758,125794.0,-2.718769,0.665312,-1.750642,0.345594,2.143153,-0.400648,1.742902,-0.954034,0.493019,...,-0.203492,0.633018,0.002111,0.139716,-0.343731,-0.858093,-1.64062,-0.417682,52.98,0
151719,97248.0,1.994146,-0.218142,-1.062654,0.293147,0.100398,-0.267599,-0.193873,-0.153338,2.108615,...,-0.012331,0.4597,0.060589,0.773688,0.167847,0.097727,-0.061668,-0.06893,15.99,0
68790,53247.0,-0.749912,0.44879,1.533313,-1.403425,-0.041087,-0.644681,0.397774,0.203916,-0.03683,...,-0.046391,-0.260775,-0.038609,-0.010083,-0.362843,0.726389,0.233238,0.161877,22.84,0


In [4]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,...,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0
mean,94811.0776,0.005917,-0.004135,0.001613,-0.002966,0.001828,-0.001139,0.001801,-0.000854,-0.001596,...,-0.000371,-1.5e-05,0.000198,0.000214,-0.000232,0.000149,0.001763,0.000547,88.472687,0.001667
std,47481.047891,1.948026,1.646703,1.508682,1.414184,1.377008,1.331931,1.227664,1.179054,1.095492,...,0.723909,0.72455,0.623702,0.605627,0.52122,0.482053,0.395744,0.328027,250.399437,0.040796
min,0.0,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-43.557242,-73.216718,-13.434066,...,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-15.430084,0.0,0.0
25%,54204.75,-0.915951,-0.600321,-0.889682,-0.850134,-0.68983,-0.769031,-0.552509,-0.208828,-0.644221,...,-0.228305,-0.5427,-0.161703,-0.354453,-0.317485,-0.326763,-0.070641,-0.052818,5.6,0.0
50%,84692.5,0.020384,0.063949,0.179963,-0.022248,-0.053468,-0.275168,0.040859,0.021898,-0.052596,...,-0.029441,0.006675,-0.011159,0.041016,0.016278,-0.052172,0.001479,0.011288,22.0,0.0
75%,139298.0,1.316068,0.800283,1.02696,0.739647,0.612218,0.396792,0.570474,0.325704,0.595977,...,0.186194,0.528245,0.147748,0.439738,0.350667,0.240261,0.091208,0.078276,77.51,0.0
max,172792.0,2.45493,22.057729,9.382558,16.875344,34.801666,73.301626,120.589494,20.007208,15.594995,...,27.202839,10.50309,22.528412,4.584549,7.519589,3.517346,31.612198,33.847808,25691.16,1.0


In [5]:
def remove_outliers(df, outlier_dict):
    for distribution, category in outlier_dict.items():
        if distribution == "normal":
            for cat in category:
                upper_limit = df[cat].mean() + 3 * df[cat].std()
                lower_limit = df[cat].mean() - 3 * df[cat].std()
                print(cat, upper_limit, lower_limit)
                # capping
                # df[cat] = np.where(df[cat] > upper_limit,upper_limit,np.where(df[cat] < lower_limit, lower_limit, df[cat]))
                # Trimming
                df = df[(df[cat] < upper_limit) & (df[cat] > lower_limit)]
        elif distribution == "skew":
            for cat in category:
                percentile25 = df[cat].quantile(0.25)
                percentile75 = df[cat].quantile(0.75)
                iqr = percentile75 - percentile25
                upper_limit = percentile75 + 1.5 * iqr
                lower_limit = percentile25 - 1.5 * iqr
                print(cat, upper_limit, lower_limit)
                # capping
                # df[cat] = np.where(
                #     df[cat] > upper_limit,
                #     upper_limit,
                #     np.where(df[cat] < lower_limit, lower_limit, df[cat]),
                # )
                # Trimming
                df = df[(df[cat] < upper_limit) & (df[cat] > lower_limit)]
    return df

In [6]:
outlier_dict = {
    "normal": [],
    "skew": [],
}

def pre_process(df):
    
    return df

df = pre_process(df)
df = remove_outliers(df, outlier_dict)

In [7]:
# df.to_csv("df.csv", index=False)

In [8]:
# Define features and target
def get_X_Y(df):
    X = df.drop(columns=["Class", "Time"])
    Y = df["Class"]
    return X, Y


X, Y = get_X_Y(df)
# Split data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=5
)
print(X_train.shape)

(226980, 29)


In [9]:
# Get the list of categorical column names
numerical_features = X_train.columns

In [10]:
# Separate transformers for categorical and numerical features

# trf = FunctionTransformer(np.log1p, validate=True)
# trf = PowerTransformer()
# trf = FunctionTransformer(np.sqrt, validate=True)
# trf = FunctionTransformer(np.sin)
trf = StandardScaler()
poly = PolynomialFeatures(degree=3,include_bias=False)
# trf = MinMaxScaler()

numerical_transformer_1 = Pipeline(
    steps=[
        ("log", trf),
        ("poly", poly),
    ]
)

In [None]:
from sklearn.linear_model import LogisticRegression


preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer_1, numerical_features),
    ]
)

model = LogisticRegression(
    verbose=0,
    max_iter=1000,
    class_weight={0:0.5,1:8},
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.15,
)

# Define the pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

pipeline = ImbPipeline([
    ("preprocessor", preprocessor), 
    # ('smote', SMOTE(random_state=42, k_neighbors=5, sampling_strategy=0.20)),
    ("model", model)])

pipeline.fit(X_train, Y_train)

In [None]:
# Evaluate the tuned model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(
    "Cross-validation accuracy:",
    cross_val_score(pipeline, X_test, Y_test, cv=3, scoring="accuracy").mean(),
)

cm = confusion_matrix(Y_test, y_pred)
print(cm)

report = classification_report(Y_test, y_pred, target_names=['0', '1'])
print(report)

Accuracy: 0.9992598597257957




Cross-validation accuracy: 0.9994360801842822
[[56625    29]
 [   13    79]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56654
           1       0.73      0.86      0.79        92

    accuracy                           1.00     56746
   macro avg       0.87      0.93      0.89     56746
weighted avg       1.00      1.00      1.00     56746

