# step 1 : Import libraries


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE


# Step 2: Load the dataset

In [2]:
data = pd.read_csv("creditcard.csv")

# Step 3: Data preprocessing

## Check for missing values

In [3]:
missing_values = data.isnull().sum()
print("Missing values per feature:")
print(missing_values)


Missing values per feature:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


## Check for and handle duplicates

In [4]:
# Check for duplicates
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Drop duplicates (if needed)
data.drop_duplicates(inplace=True)

#Check for duplicate after handling
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows after handling: {duplicates}")

Number of duplicate rows: 1081
Number of duplicate rows after handling: 0


## Identify and handle outliers

In [5]:
# Z-score method for detecting outliers
from scipy import stats
z_scores = np.abs(stats.zscore(data))
outliers = (z_scores > 3).any(axis=1)
print(f"Number of outlier rows: {outliers.sum()}")

# In the context of fraud detection I decide to keep the outliers because they are more likely to be fraudulent transactions


Number of outlier rows: 37930


## Standardize numerical features

In [6]:
scaler = StandardScaler()
data[['Time', 'Amount']] = scaler.fit_transform(data[['Time', 'Amount']])


# Step 4: Data splitting

In [7]:
X = data.drop('Class', axis=1)
y = data['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Step 5: Handling imbalance data

In [8]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


# Step 6: Model Selection and Training

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# 1. Logistic Regression
# Reason: It's a simple, fast, and interpretable linear model that works well when the relationship between the features and the target is approximately linear.
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_resampled, y_train_resampled)
print("Finish training LR")

# 2. Random Forest
# Reason: It's an ensemble method that works well with high-dimensional data and can capture complex patterns by combining multiple decision trees.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_resampled, y_train_resampled)
print("Finish training RF")

# 3. K-Nearest Neighbors (KNN)
# Reason: It's a non-parametric method that can capture non-linear relationships in the data by considering the "neighborhood" of data points.
knn = KNeighborsClassifier()
knn.fit(X_train_resampled, y_train_resampled)
print("Finisn training KNN")

# 4. XGBoost
# Reason: It's an efficient gradient boosting algorithm that can handle a wide range of data and is known for its high performance and speed.
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_resampled, y_train_resampled)
print("Finish training XGBoost")

# 5. LightGBM
# Reason: It's a gradient boosting framework that uses tree-based learning algorithms and is designed for large datasets, offering better efficiency and speed than other gradient boosting methods.
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_resampled, y_train_resampled)
print("Finsih training lightBGM")


  from pandas import MultiIndex, Int64Index


Finish training LR
Finish training RF
Finisn training KNN


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Finish training XGBoost
Finsih training lightBGM
