# **Fraudulent Card Transaction**

In [2]:
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning, module='lightgbm')

# ===============================
# 1. General Libraries
# ===============================
import pandas as pd
import numpy as np
import pickle
import random
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

# ===============================
# 2. Preprocessing Libraries
# ===============================
# Model selection and preprocessing
from sklearn.model_selection import train_test_split, cross_validate, RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA

# Imbalanced learning techniques
import imblearn
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

# ===============================
# 3. Model Building Libraries
# ===============================
# Clustering and Metrics
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Model evaluation and metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

# Cross-validation and hyperparameter tuning
from sklearn.model_selection import cross_val_predict, StratifiedKFold, RandomizedSearchCV

# Machine learning models
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [3]:
# Read the transaction table from the working directory and preview the
# first rows as the cell output.
data = pd.read_csv(filepath_or_buffer='data.csv')
data.head(n=5)

Unnamed: 0,transactionAmount,posEntryMode,posConditionCode,transactionType,cardPresent,isFraud,trans_month,trans_day,trans_hour,trans_isNight,...,trans_avg1days,trans_count1days,trans_avg7days,time_diff_transaction,cus_uid_transAmt_mean,cus_uid_avilMon_mean,cus_uid_curBal_mean,merch_uid_transAmt_mean,merch_uid_avilMon_mean,merch_uid_curBal_mean
0,44.09,9,1,PURCHASE,0,0,1,1,0,1,...,44.09,1.0,44.09,0.0,145.515455,49655.585909,344.414091,151.373146,7214.234306,6175.906019
1,329.57,9,8,PURCHASE,0,0,1,1,0,1,...,329.57,1.0,329.57,0.0,147.556265,1802.025077,3197.974923,145.508305,6351.587388,4821.217229
2,164.57,5,1,PURCHASE,0,0,1,1,0,1,...,164.57,1.0,164.57,0.0,144.130616,7402.268714,12597.731286,144.396592,6227.844715,4720.182508
3,122.83,2,8,PURCHASE,0,0,1,1,0,1,...,122.83,1.0,122.83,0.0,122.653,6505.911273,3494.088727,145.764513,6333.203546,4814.376271
4,0.0,5,1,ADDRESS_VERIFICATION,0,0,1,1,0,1,...,0.0,1.0,0.0,0.0,133.348058,1055.051591,1444.948409,144.028155,5682.116548,4701.445096


In [4]:
# Separate the predictors (X) from the binary fraud label (y).
X = data.drop(columns="isFraud")
y = data["isFraud"]

# Class balance in percent, displayed as the cell output.
y.value_counts(normalize=True).mul(100)

isFraud
0    98.420958
1     1.579042
Name: proportion, dtype: float64

In [5]:
# Hold out 20% of the rows as a test set. Stratifying on y keeps the fraud
# rate equal in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Wrap the training labels in a single-column DataFrame.
y_train = pd.DataFrame(data=y_train)

In [7]:
# Partition the training columns by dtype: 64-bit int/float columns are
# treated as numeric, object/bool columns as categorical.
numerical_features = X_train.select_dtypes(
    include=['float64', 'int64']
).columns
categorical_features = X_train.select_dtypes(
    include=['object', 'bool']
).columns

In [8]:
# Number of distinct levels per categorical column (cell output).
X_train.loc[:, categorical_features].nunique()

posEntryMode        6
posConditionCode    4
transactionType     4
dtype: int64

In [9]:
# Preview the numeric columns of the training split (cell output).
X_train.loc[:, numerical_features].head()

Unnamed: 0,transactionAmount,cardPresent,trans_month,trans_day,trans_hour,trans_isNight,trans_isWeekend,trans_avg1days,trans_count1days,trans_avg7days,time_diff_transaction,cus_uid_transAmt_mean,cus_uid_avilMon_mean,cus_uid_curBal_mean,merch_uid_transAmt_mean,merch_uid_avilMon_mean,merch_uid_curBal_mean
404324,213.21,0,7,13,7,0,0,213.21,1.0,213.21,10.0,174.394545,1016.919545,1483.080455,147.257636,4543.384243,4049.085748
392503,3.94,0,7,7,21,0,0,195.27,3.0,89.091667,0.0,143.86629,7119.471042,12880.528958,147.276838,7637.8039,6225.293273
274051,245.94,0,5,13,17,0,0,383.895,2.0,283.433,0.0,129.55142,2048.389148,2951.610852,145.764513,6333.203546,4814.376271
566486,365.76,1,9,25,3,1,1,156.995,6.0,161.178649,0.0,146.691288,10365.988192,9634.011808,145.449707,5905.24411,3506.341256
604546,78.65,0,10,12,3,1,0,78.65,1.0,130.1225,2.0,153.430921,6902.495987,8097.504013,31.161185,7999.957947,2125.800921


In [10]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. drop="first" removes one level per categorical feature so
# the resulting dummy columns are not perfectly collinear.
ct = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), numerical_features),
        (
            'onehot',
            OneHotEncoder(drop="first", dtype=int, sparse_output=False),
            categorical_features,
        ),
    ]
)

# Fit on the training split only, then rebuild a labelled DataFrame: the
# scaled numeric columns come first, followed by the generated dummy columns.
X_transformed = ct.fit_transform(X_train)
onehot_step = ct.named_transformers_["onehot"]
categorical_feature_names = onehot_step.get_feature_names_out(categorical_features).tolist()
feature_names = [*numerical_features, *categorical_feature_names]
X_train_trans = pd.DataFrame(
    data=X_transformed,
    index=X_train.index,
    columns=feature_names,
)

In [11]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    The frame is modified IN PLACE and also returned, so callers that ignore
    the return value still see the downcast columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64 and float16/float32/float64 columns are
        candidates for downcasting. Columns of any other dtype are untouched.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16 when their range fits.
        float16 keeps only about three significant decimal digits, so values
        are rounded; pass ``False`` to stop at float32 when that matters.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``df``.

    Notes
    -----
    Float downcasting checks only the value *range*, never the precision that
    is lost. Integer bounds are inclusive, so a column whose extremes sit
    exactly on a type's limits (e.g. -128 and 127) still gets that type.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            target = next(
                (
                    t for t in int_targets
                    if c_min >= np.iinfo(t).min and c_max <= np.iinfo(t).max
                ),
                None,
            )
            # No match only happens when min/max are not comparable (e.g. an
            # empty column); the column is then left as it is.
            if target is not None:
                df[col] = df[col].astype(target)
        else:
            # Fall back to float64 when the range fits no narrower float type
            # (this also covers all-NaN and infinite extremes).
            target = next(
                (
                    t for t in float_targets
                    if c_min > np.finfo(t).min and c_max < np.finfo(t).max
                ),
                np.float64,
            )
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [12]:
# reduce_mem_usage downcasts X_train_trans in place and returns that same
# frame, so both names below point at the downcast training features.
# NOTE: `data` is rebound here and no longer refers to the raw CSV frame.
X_train_trans = reduce_mem_usage(X_train_trans)
data = X_train_trans

Mem. usage decreased to 38.40 Mb (72.4% reduction)


In [13]:
import lightgbm as lgb

# Train/validation split of the preprocessed training features (stratified so
# the fraud ratio stays consistent in both parts).
#
# The labels are re-derived from the full label Series `y` through the feature
# index instead of reusing `y_train`: this cell rebinds `y_train` to the 80%
# subset, so re-running it would otherwise pair the full feature frame with an
# already-shrunken label frame and fail.
y_train_full = y.loc[X_train_trans.index].to_frame()

# A fixed seed (matching the earlier train/test split) makes the validation
# split, and therefore the reported validation score, reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_trans,
    y_train_full,
    test_size=0.2,
    stratify=y_train_full,
    random_state=42,
)

In [30]:
# Wrap the splits in LightGBM Dataset objects. The validation set references
# the training set so both are built with the same feature binning.
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# Booster configuration: binary objective scored by AUC, a very small learning
# rate with a budget of up to 80000 boosting rounds, and early stopping
# requested with a patience of 100 rounds. Rows and columns are each
# subsampled at 70% (row subsampling is applied every iteration).
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.001,
    'num_leaves': 2**8,
    'max_depth': -1,
    'tree_learner': 'serial',
    'colsample_bytree': 0.7,
    'subsample_freq': 1,
    'subsample': 0.7,
    'n_estimators': 80000,
    'max_bin': 255,
    'verbose': -1,
    'early_stopping_rounds': 100,
}

# Train while tracking the metric on both the training and validation sets.
model = lgb.train(
    lgb_params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
)

# Score the validation rows with the best iteration recorded during training.
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

# Stored as `valid_auc` rather than `auc` so the `auc` function imported from
# sklearn.metrics at the top of the notebook is not shadowed.
# NOTE(review): this score is computed on the same validation set that drives
# early stopping, so it is likely optimistic; the held-out X_test / y_test
# split is not used in this cell.
valid_auc = roc_auc_score(y_valid, y_pred)
print(f"Validation AUC: {valid_auc:.4f}")

Validation AUC: 0.8707
