## 1. importing all the libraries 


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score

### 2.Add csv file

In [2]:
# Load dataset
df = pd.read_csv('/kaggle/input/datasets/organizations/mlg-ulb/creditcardfraud/creditcard.csv')

### 3.Dataset Loading

In [3]:


# Change: Use RobustScaler instead of StandardScaler to handle outliers better
scaler = RobustScaler()
df['scaled_amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = scaler.fit_transform(df['Time'].values.reshape(-1,1))

df.drop(['Time','Amount'], axis=1, inplace=True)

#### Load Data

In [4]:
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,scaled_amount,scaled_time
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,1.783274,-0.994983
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.269825,-0.994983
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,4.983721,-0.994972
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,1.418291,-0.994972
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,0.670579,-0.99496


#### Checking Dataset information

In [5]:
df.shape


df.info()


df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   V1             284807 non-null  float64
 1   V2             284807 non-null  float64
 2   V3             284807 non-null  float64
 3   V4             284807 non-null  float64
 4   V5             284807 non-null  float64
 5   V6             284807 non-null  float64
 6   V7             284807 non-null  float64
 7   V8             284807 non-null  float64
 8   V9             284807 non-null  float64
 9   V10            284807 non-null  float64
 10  V11            284807 non-null  float64
 11  V12            284807 non-null  float64
 12  V13            284807 non-null  float64
 13  V14            284807 non-null  float64
 14  V15            284807 non-null  float64
 15  V16            284807 non-null  float64
 16  V17            284807 non-null  float64
 17  V18            284807 non-nul

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,scaled_amount,scaled_time
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,2.239053e-15,...,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,0.001727,0.927124,0.118914
std,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,1.08885,...,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,0.041527,3.495006,0.557903
min,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,-24.58826,...,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,-0.307413,-0.994983
25%,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,-0.5354257,...,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,0.0,-0.229162,-0.35821
50%,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,-0.09291738,...,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,0.0,0.0,0.0
75%,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,0.4539234,...,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,0.0,0.770838,0.64179
max,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,23.74514,...,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,1.0,358.683155,1.035022


#### Data cleaning

In [6]:
df.isnull().sum()

V1               0
V2               0
V3               0
V4               0
V5               0
V6               0
V7               0
V8               0
V9               0
V10              0
V11              0
V12              0
V13              0
V14              0
V15              0
V16              0
V17              0
V18              0
V19              0
V20              0
V21              0
V22              0
V23              0
V24              0
V25              0
V26              0
V27              0
V28              0
Class            0
scaled_amount    0
scaled_time      0
dtype: int64

### Analysing the Data

In [7]:
# Combine the resampled features and labels into one DataFrame
new_dataset = pd.DataFrame(X_train_res, columns=X.columns)
new_dataset['Class'] = y_train_res

# Now your original code will work!
sns.countplot(x='Class', data=new_dataset)
plt.title("Balanced Class Distribution")
plt.show()

NameError: name 'X_train_res' is not defined

### 4.Handling Data Imbalance (The "Different" Twist)
#### The original notebook likely used undersampling or simple class weights. We will use SMOTE to synthesize new fraud cases, ensuring the model doesn't just "guess" the majority class.

In [None]:
X = df.drop('Class', axis=1)
y = df['Class']

# Split data first to prevent data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversampling only the training data
sm = SMOTE(sampling_strategy='minority', random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print(f'Original dataset shape {y_train.shape}')
print(f'Resampled dataset shape {y_train_res.shape}')

### 5.Training a High-Performance Model
#### Instead of a simple Random Forest, we’ll use XGBoost with a focus on the Precision-Recall curve, which is more important for fraud than standard accuracy.

In [None]:
# XGBoost is highly efficient for imbalanced tabular data
model = XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.1, scale_pos_weight=1)
model.fit(X_train_res, y_train_res)

# Predictions
y_pred = model.predict(X_test)

### 6.Precision-Recall Curve (The "Gold Standard" for Fraud)
#### Standard Confusion Matrices are great, but the AUPRC (Area Under the Precision-Recall Curve) is  what professional banks actually use to evaluate models

In [None]:
from sklearn.metrics import PrecisionRecallDisplay

display = PrecisionRecallDisplay.from_estimator(
    model, X_test, y_test, name="XGBoost Fraud Detector"
)
_ = display.ax_.set_title("Precision-Recall Curve (Higher is Better)")

### 

### 7.Implementing Cost-Sensitive Learning
#### In the link you provided, the focus is often on simple accuracy. In a "different" and better project, you should use scale_pos_weight. This tells the model that fraud (Class 1) is much more important to get right than Normal (Class 0) transactions.

In [None]:
from xgboost import XGBClassifier

# Calculate the ratio for scale_pos_weight
ratio = float(df['Class'].value_counts()[0]) / df['Class'].value_counts()[1]

# Model with weighted classes
model = XGBClassifier(
    n_estimators=200, 
    max_depth=5, 
    scale_pos_weight=ratio, # This handles imbalance without needing SMOTE!
    learning_rate=0.05
)

model.fit(X_train, y_train)

### 8.Implementing Cost-Sensitive Learning
#### In the link you provided, the focus is often on simple accuracy. In a "different" and better project, you should use scale_pos_weight.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), cmap='coolwarm')
plt.show()

### 9.Advanced Evaluation¶
#### We will use a Precision-Recall Curve and a custom Heatmap for the Confusion Matrix to make the reporting look professional and distinct.

In [None]:
def plot_results(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=['Normal', 'Fraud'], yticklabels=['Normal', 'Fraud'])
    plt.title('Fraud Detection Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
    
    print(classification_report(y_true, y_pred))

plot_results(y_test, y_pred)