In [1]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score,ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTENC
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import shap
import mlflow


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Load dataset
# Read the raw transactions file and print a quick structural overview of it.
df = pd.read_csv('carddata.csv')  # Replace with your actual file path
print("Shape of dataset:", df.shape)
print("\nData types:\n", df.dtypes)
print("\nMissing values per column:\n", df.isna().sum())
print(df.describe())
# DataFrame.info() writes its report directly and returns None, so it must not
# be passed to print() (that emitted a stray "Dataset info: None" line).
print("Dataset info:")
df.info()
print("\nTarget distribution:\n", df['is_fraud'].value_counts())
print("\nTarget distribution (normalized):\n", df['is_fraud'].value_counts(normalize=True))

Shape of dataset: (1296675, 24)

Data types:
 Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
merch_zipcode            float64
dtype: object

Missing values per column:
 Unnamed: 0                    0
trans_date_trans_time         0
cc_num                        0
merchant               

In [3]:
#Dropping unnecessary columns
# Columns removed before modelling. errors='ignore' skips names that are not
# present, so the cell can be re-run after the columns are already gone.
cols_to_drop = ['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'city', 'zip', 'dob', 'trans_num']
df = df.drop(columns=cols_to_drop, errors='ignore')
print("Columns after removal:", df.columns)
# DataFrame.info() prints its own report and returns None; call it on its own
# line instead of inside print() to avoid a trailing "Dataset info: None".
print("Dataset info:")
df.info()

Columns after removal: Index(['trans_date_trans_time', 'merchant', 'category', 'amt', 'gender',
       'state', 'lat', 'long', 'city_pop', 'job', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'merch_zipcode'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 15 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   merchant               1296675 non-null  object 
 2   category               1296675 non-null  object 
 3   amt                    1296675 non-null  float64
 4   gender                 1296675 non-null  object 
 5   state                  1296675 non-null  object 
 6   lat                    1296675 non-null  float64
 7   long                   1296675 non-null  float64
 8   city_pop               1296675 non-null  int64  
 9   job                    1296675 non-null  ob

In [6]:
#Handling missing values
# Gaps in merch_zipcode are filled with the column median.
# NOTE(review): the median is taken over the whole frame, i.e. before the
# train/test split performed further down - confirm this is acceptable.
merch_zip_median = df['merch_zipcode'].median()
df['merch_zipcode'] = df['merch_zipcode'].fillna(merch_zip_median)


In [7]:
#Feature engineering(extracting time of the day)
# Parse the timestamp column and keep the hour of day as its own feature.
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['trans_hour'] = df['trans_date_trans_time'].dt.hour


def time_of_day(hour):
    """Return the coarse part-of-day label for an hour value.

    Hours in [5, 12) map to 'morning', [12, 17) to 'day' and [17, 21) to
    'evening'. Any value that falls in none of these ranges yields 'night'.
    """
    day_parts = ((5, 12, 'morning'), (12, 17, 'day'), (17, 21, 'evening'))
    for start, stop, label in day_parts:
        if start <= hour < stop:
            return label
    return 'night'


# Label every transaction with its part of day and store it as a categorical.
df['trans_time_of_day'] = df['trans_hour'].apply(time_of_day).astype('category')
preview_cols = ['trans_date_trans_time', 'trans_hour', 'trans_time_of_day']
print(df[preview_cols].head(200))

# Bucket the transaction amount into four quantile-based bins.
amt_labels = ['low', 'medium', 'high', 'very_high']
df['amt_bin'] = pd.qcut(df['amt'], 4, labels=amt_labels)
print(df['amt_bin'].value_counts())


    trans_date_trans_time  trans_hour trans_time_of_day
0     2019-01-01 00:00:18           0             night
1     2019-01-01 00:00:44           0             night
2     2019-01-01 00:00:51           0             night
3     2019-01-01 00:01:16           0             night
4     2019-01-01 00:03:06           0             night
..                    ...         ...               ...
195   2019-01-01 02:23:41           2             night
196   2019-01-01 02:26:14           2             night
197   2019-01-01 02:26:16           2             night
198   2019-01-01 02:27:58           2             night
199   2019-01-01 02:28:24           2             night

[200 rows x 3 columns]
amt_bin
low          324325
high         324151
very_high    324112
medium       324087
Name: count, dtype: int64


In [8]:
#Sampling only 100k rows
# Every fraud row is kept; non-fraud rows are randomly sampled to fill the
# remainder of the 100000-row budget, so the fraud share of df_small is not the
# fraud share of df.
fraud_flag = df['is_fraud']
fraud_df = df.loc[fraud_flag == 1]
nonfraud_df = df.loc[fraud_flag == 0]
n_nonfraud = 100000 - len(fraud_df)
nonfraud_sampled = nonfraud_df.sample(n=n_nonfraud, random_state=42)

# Stack both parts, shuffle the rows and renumber the index from zero.
df_small = (
    pd.concat([fraud_df, nonfraud_sampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(df_small['is_fraud'].value_counts())

is_fraud
0    92494
1     7506
Name: count, dtype: int64


In [9]:
#Preparing features and target
# y is the fraud label column; X is every other column of the sampled frame.
target_col = 'is_fraud'
y = df_small[target_col]
X = df_small.drop(columns=[target_col])

In [10]:
#Train/test splitting
# 80/20 split, stratified on the label so both parts keep the same fraud ratio.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# The raw timestamp column is removed from both feature frames after the split.
X_train, X_test = (
    part.drop(columns=['trans_date_trans_time']) for part in split_parts[:2]
)
y_train, y_test = split_parts[2:]

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)
print("Fraud distribution in train:\n", y_train.value_counts())
print("Fraud distribution in test:\n", y_test.value_counts())

Train size: (80000, 16)
Test size: (20000, 16)
Fraud distribution in train:
 is_fraud
0    73995
1     6005
Name: count, dtype: int64
Fraud distribution in test:
 is_fraud
0    18499
1     1501
Name: count, dtype: int64


In [11]:
#Handling categorical features
# Names of the categorical feature columns, plus their positional indices in
# X_train (the form in which they are handed to SMOTENC below).
categorical_cols = ['merchant', 'category', 'gender', 'state', 'job', 'trans_time_of_day', 'amt_bin']
cat_features_indices = list(map(X_train.columns.get_loc, categorical_cols))


In [12]:
#Balancing with SMOTENC
# Only the training split is resampled; the test split is left untouched.
smote_nc = SMOTENC(
    categorical_features=cat_features_indices,
    random_state=42,
)
X_train_res, y_train_res = smote_nc.fit_resample(X_train, y_train)
print("After SMOTENC, class distribution:\n", y_train_res.value_counts())

After SMOTENC, class distribution:
 is_fraud
0    73995
1    73995
Name: count, dtype: int64


In [13]:
#Converting categorical columns to category dtype
# The test frame is cast to the *training* column's categorical dtype instead
# of inferring its own categories. That way a given label has the same integer
# code in both frames; test labels that never occur in the training frame
# become NaN rather than silently receiving an unrelated code.
for col in categorical_cols:
    X_train_res[col] = X_train_res[col].astype('category')
    X_test[col] = X_test[col].astype(X_train_res[col].dtype)

In [17]:
#Training LightGBM model
#Set mlflow experiment
mlflow.set_experiment("Fraud_Detection_Models")

# Hyper-parameters are kept in one dict so that the values logged to MLflow are
# exactly the values the model is built with.
lgbm_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": -1,
    "random_state": 42,
    "n_jobs": -1,
}

# The run is opened without a `with` block on purpose. A context manager that
# wraps only the constructor closes the run before fit() and before any
# log_param() call, and the later logging then lands in a second, implicitly
# created run. Leaving the run open lets the metric, artifact and model logging
# in the following cells attach to "LGBM-Model"; the notebook's final
# mlflow.end_run() closes it.
if mlflow.active_run() is not None:
    # A previous execution of this cell (or a failed one) left a run open;
    # start_run() would raise if it were still active.
    mlflow.end_run()
mlflow.start_run(run_name="LGBM-Model")

model = LGBMClassifier(**lgbm_params)
model.fit(X_train_res, y_train_res)

# Log parameters
mlflow.log_param("model_type", "LightGBM")
mlflow.log_params(lgbm_params)

2025/10/05 18:22:41 INFO mlflow.tracking.fluent: Experiment with name 'Fraud_Detection_Models' does not exist. Creating a new experiment.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



[LightGBM] [Info] Number of positive: 73995, number of negative: 73995
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3268
[LightGBM] [Info] Number of data points in the train set: 147990, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


-1

In [18]:
 # Prediction
# Hard class labels for the held-out test rows.
y_pred = model.predict(X_test)
# Predicted probability of the second class column (index 1) for each test row.
test_class_proba = model.predict_proba(X_test)
y_pred_proba = test_class_proba[:, 1]

In [26]:
#Evaluation
# Text report first, then the three headline metrics on the test split.
print("Classification Report:\n", classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Send the metrics to MLflow, then echo them in the notebook output.
for metric_key, metric_value in (
    ("roc_auc", roc_auc),
    ("precision", precision),
    ("recall", recall),
):
    mlflow.log_metric(metric_key, metric_value)

for metric_label, metric_value in (
    ("ROC AUC Score:", roc_auc),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(metric_label, metric_value)

# Confusion matrix: draw it, save it to disk, attach the PNG to MLflow, and
# close the figure without displaying it.
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm)
cm_display.plot()
cm_png_path = 'confusion_matrix_lgbm.png'
plt.savefig(cm_png_path)
mlflow.log_artifact(cm_png_path)
plt.close()

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     18499
           1       0.88      0.86      0.87      1501

    accuracy                           0.98     20000
   macro avg       0.93      0.93      0.93     20000
weighted avg       0.98      0.98      0.98     20000

ROC AUC Score: 0.9912063957649871
Precision: 0.8769438810006761
Recall: 0.8640906062624917


In [27]:
 #Feature Importance
# Bar chart of the fitted model's feature importances, highest first, saved to
# disk and attached to MLflow.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
# Column names come from X_train_res, the frame the model was fitted on.
feature_names = X_train_res.columns[indices]
positions = range(len(importances))

fig, ax = plt.subplots()
ax.bar(positions, importances[indices], align="center")
ax.set_title("Feature Importances (LGBM)")
ax.set_ylabel("Importance")
ax.set_xticks(positions)
ax.set_xticklabels(feature_names, rotation=90)
# The tick labels are rotated 90 degrees; without tight_layout() they extend
# past the default bottom margin and are cut off in the saved image.
fig.tight_layout()
fig.savefig('feature_importance_lgbm.png')
mlflow.log_artifact('feature_importance_lgbm.png')
#plt.show()
plt.close(fig)

In [28]:
#SHAP Explanations
# Compute SHAP values for the test rows once and reuse them for both plots.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Two summary plots: first the bar variant, then the default one. Each is
# rendered without displaying, written to a PNG, attached to MLflow, and closed.
shap_plots = (
    ({"plot_type": "bar"}, 'shap_bar_lgbm.png'),
    ({}, 'shap_summary_lgbm.png'),
)
for plot_options, png_path in shap_plots:
    shap.summary_plot(shap_values, X_test, show=False, **plot_options)
    plt.savefig(png_path)
    mlflow.log_artifact(png_path)
    plt.close()

# Log model
mlflow.lightgbm.log_model(model, "lgbm_model")



<mlflow.models.model.ModelInfo at 0x1cd34ca25f0>

In [29]:
# Optional spot check: print 10 randomly chosen test rows as actual vs. predicted.
# Currently disabled; uncomment the three lines below to re-enable it. The draw
# uses the `random` module without a seed, so the chosen rows differ per run.
#sample_indices = random.sample(range(len(X_test)), 10)
#for i in sample_indices:
#    print(f"Actual: {y_test.iloc[i]}, Predicted: {y_pred[i]}")

# Close the currently active MLflow run.
mlflow.end_run()

Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
