In [None]:
# Install all required libraries
!pip install shap lime scikit-learn imbalanced-learn xgboost lightgbm catboost plotly kaleido openpyxl -q

print("✓ Installation complete!")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.0/69.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for lime (setup.py) ... [?25l[?25hdone
✓ Installation complete!


In [None]:
# Import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Modeling libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Imbalanced data handling
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek

# Metrics
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, roc_curve, precision_recall_curve,
    f1_score, accuracy_score, recall_score, precision_score,
    average_precision_score
)

# Explainability
import shap
from lime import lime_tabular

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Utilities
import pickle
import time
from datetime import datetime

# Reproducibility: one shared seed, also applied to NumPy's global generator
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot appearance for every matplotlib / seaborn figure in the notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirm the setup cell ran and record when
print("✓ All libraries imported successfully!")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✓ All libraries imported successfully!
Timestamp: 2026-01-12 14:48:29


In [None]:
# Alternative: Direct download without Kaggle API
print("Downloading PaySim dataset directly...")

# Download from alternative source
!wget https://www.kaggle.com/api/v1/datasets/download/ealaxi/paysim1 -O paysim1.zip

# OR use gdown if the file is on Google Drive (I can provide a mirror)
# !pip install gdown
# !gdown <file_id> -O paysim1.zip

print("\nExtracting files...")
!unzip -q paysim1.zip

print("✓ Dataset downloaded and extracted!")

# Verify the file exists
import os
if os.path.exists('PS_20174392719_1491204439457_log.csv'):
    print("✓ Dataset file found!")
else:
    print("❌ Dataset file not found. Listing files:")
    !ls -lh

Downloading PaySim dataset directly...
--2026-01-12 14:53:03--  https://www.kaggle.com/api/v1/datasets/download/ealaxi/paysim1
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://storage.googleapis.com:443/kaggle-data-sets/1069/1940/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20260112%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20260112T140819Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=62968d604274c7e3cca2239b8e35d9b1e6d8b3ff800ce4bf129b5739a11c37d8691346508e66efbb0d216ae62f6cdcd853f66c4ce82415ee2cc2f21a0b0823c2b4fb7b83ba45b1fe34c3f10c7db83400cb0b1d972a8262778b7f5737d31ae849020b3f82f1b4af8f69c2eb5681cecb1ca10e09da99893378b1b091f22e7bbca883078fb7d20126feb0531a20239512d098dccaded627ac7ac49e051f2e1b7ceaf9e72d789abcc7d1c376de4b46755

In [None]:
# Read the PaySim transaction log into a DataFrame
print("Loading PaySim dataset...")
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
print("✓ Dataset loaded successfully!")

# Report dimensions and the deep (object-inclusive) memory footprint in MB
total_bytes = df.memory_usage(deep=True).sum()
print(f"\nDataset Shape: {df.shape}")
print(f"Memory Usage: {total_bytes / 1024**2:.2f} MB")

Loading PaySim dataset...
✓ Dataset loaded successfully!

Dataset Shape: (6362620, 11)
Memory Usage: 1452.57 MB


In [None]:
# Preview the first ten transactions under a banner
banner = "=" * 80
print("\n" + banner)
print("FIRST 10 TRANSACTIONS")
print(banner)
print(df.head(10))


FIRST 10 TRANSACTIONS
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815      170136.00       160296.36   
1     1   PAYMENT   1864.28  C1666544295       21249.00        19384.72   
2     1  TRANSFER    181.00  C1305486145         181.00            0.00   
3     1  CASH_OUT    181.00   C840083671         181.00            0.00   
4     1   PAYMENT  11668.14  C2048537720       41554.00        29885.86   
5     1   PAYMENT   7817.71    C90045638       53860.00        46042.29   
6     1   PAYMENT   7107.77   C154988899      183195.00       176087.23   
7     1   PAYMENT   7861.64  C1912850431      176087.23       168225.59   
8     1   PAYMENT   4024.36  C1265012928        2671.00            0.00   
9     1     DEBIT   5337.77   C712410124       41720.00        36382.23   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0            0.00        0               0  
1  

In [None]:
# Dataset information
print("\n" + "="*80)
print("DATASET INFORMATION")
print("="*80)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()


DATASET INFORMATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None


In [None]:
# Count nulls per column, then in total
rule = "=" * 80
print("\n" + rule)
print("MISSING VALUES")
print(rule)

missing_values = df.isna().sum()
print(missing_values)

total_missing = missing_values.sum()
print(f"\nTotal missing values: {total_missing}")


MISSING VALUES
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

Total missing values: 0


In [None]:
# Class balance of the target column `isFraud`
rule = "=" * 80
print("\n" + rule)
print("FRAUD DISTRIBUTION")
print(rule)

# Per-class row counts (indexed by label 0 / 1) and the fraud share in percent
fraud_counts = df['isFraud'].value_counts()
fraud_percentage = df['isFraud'].mean() * 100

n_legit, n_fraud = fraud_counts[0], fraud_counts[1]
print(f"Legitimate Transactions: {n_legit:,} ({100-fraud_percentage:.4f}%)")
print(f"Fraudulent Transactions: {n_fraud:,} ({fraud_percentage:.4f}%)")
print(f"\nImbalance Ratio: 1:{n_legit/n_fraud:.0f}")


FRAUD DISTRIBUTION
Legitimate Transactions: 6,354,407 (99.8709%)
Fraudulent Transactions: 8,213 (0.1291%)

Imbalance Ratio: 1:774


In [None]:
# Bar chart of the two class counts, annotated with count and percentage
class_names = ['Legitimate', 'Fraudulent']
class_totals = [fraud_counts[0], fraud_counts[1]]
bar_labels = [
    f"{fraud_counts[0]:,}<br>({100-fraud_percentage:.2f}%)",
    f"{fraud_counts[1]:,}<br>({fraud_percentage:.4f}%)",
]

fig = go.Figure(
    data=[
        go.Bar(
            x=class_names,
            y=class_totals,
            text=bar_labels,
            textposition='auto',
            marker_color=['green', 'red'],
        )
    ]
)

fig.update_layout(
    title='Transaction Distribution: Legitimate vs Fraudulent',
    xaxis_title='Transaction Type',
    yaxis_title='Count',
    height=500,
    showlegend=False,
)

fig.show()

In [None]:
# How many transactions fall under each `type` value
rule = "=" * 80
print("\n" + rule)
print("TRANSACTION TYPE DISTRIBUTION")
print(rule)

type_counts = df['type'].value_counts()
print(type_counts)

n_types = df['type'].nunique()
print(f"\nTotal transaction types: {n_types}")


TRANSACTION TYPE DISTRIBUTION
type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

Total transaction types: 5


In [None]:
# Row-normalised cross-tab: share of each class within every transaction type (%)
fraud_by_type = pd.crosstab(df['type'], df['isFraud'], normalize='index') * 100

rule = "=" * 80
print("\n" + rule)
print("FRAUD RATE BY TRANSACTION TYPE")
print(rule)
print(fraud_by_type)

# Per-type fraud count, row count and fraud rate (%) for the chart
fraud_rate_by_type = df.groupby('type')['isFraud'].agg(['sum', 'count'])
fraud_rate_by_type['rate'] = (
    fraud_rate_by_type['sum'].div(fraud_rate_by_type['count']).mul(100)
)

rates = fraud_rate_by_type['rate']
# Any type with a non-zero fraud rate is drawn in red, the others in green
bar_colors = ['red' if rate > 0 else 'green' for rate in rates]
bar_labels = [f"{rate:.2f}%" for rate in rates]

fig = go.Figure(
    data=[
        go.Bar(
            x=fraud_rate_by_type.index,
            y=rates,
            text=bar_labels,
            textposition='auto',
            marker_color=bar_colors,
        )
    ]
)

fig.update_layout(
    title='Fraud Rate by Transaction Type',
    xaxis_title='Transaction Type',
    yaxis_title='Fraud Rate (%)',
    height=500,
)

fig.show()


FRAUD RATE BY TRANSACTION TYPE
isFraud            0         1
type                          
CASH_IN   100.000000  0.000000
CASH_OUT   99.816045  0.183955
DEBIT     100.000000  0.000000
PAYMENT   100.000000  0.000000
TRANSFER   99.231201  0.768799


In [None]:
# Summary statistics of `amount`: overall and per class
rule = "=" * 80
print("\n" + rule)
print("TRANSACTION AMOUNT STATISTICS")
print(rule)

amount_views = (
    ("\nAll Transactions:", df['amount']),
    ("\nLegitimate Transactions:", df.loc[df['isFraud'] == 0, 'amount']),
    ("\nFraudulent Transactions:", df.loc[df['isFraud'] == 1, 'amount']),
)
for heading, amounts in amount_views:
    print(heading)
    print(amounts.describe())


TRANSACTION AMOUNT STATISTICS

All Transactions:
count    6.362620e+06
mean     1.798619e+05
std      6.038582e+05
min      0.000000e+00
25%      1.338957e+04
50%      7.487194e+04
75%      2.087215e+05
max      9.244552e+07
Name: amount, dtype: float64

Legitimate Transactions:
count    6.354407e+06
mean     1.781970e+05
std      5.962370e+05
min      1.000000e-02
25%      1.336840e+04
50%      7.468472e+04
75%      2.083648e+05
max      9.244552e+07
Name: amount, dtype: float64

Fraudulent Transactions:
count    8.213000e+03
mean     1.467967e+06
std      2.404253e+06
min      0.000000e+00
25%      1.270913e+05
50%      4.414234e+05
75%      1.517771e+06
max      1.000000e+07
Name: amount, dtype: float64


In [None]:
# Visualize amount distribution (legitimate rows are sampled for performance)
print("Sampling data for visualization...")
sample_size = 50000
# NOTE(review): df_sample is not used by this figure; it is kept in case
# later cells rely on it — confirm and remove if not.
df_sample = df.sample(n=min(sample_size, len(df)), random_state=RANDOM_STATE)

# Split the amount column by class once instead of re-filtering the full
# frame for every use
legit_all_amounts = df.loc[df['isFraud'] == 0, 'amount']
fraud_amounts = df.loc[df['isFraud'] == 1, 'amount']

# Seed the draw so the histogram is the same on every run, regardless of
# which cells consumed NumPy's global random state beforehand
legitimate_amounts = legit_all_amounts.sample(
    n=min(10000, len(legit_all_amounts)),
    random_state=RANDOM_STATE,
)

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Legitimate Transactions', 'Fraudulent Transactions')
)

# Legitimate (sampled)
fig.add_trace(
    go.Histogram(x=legitimate_amounts, nbinsx=50, name='Legitimate', marker_color='green'),
    row=1, col=1
)

# Fraudulent (all rows)
fig.add_trace(
    go.Histogram(x=fraud_amounts, nbinsx=50, name='Fraudulent', marker_color='red'),
    row=1, col=2
)

# Label both panels so each one can be read on its own
for panel_col in (1, 2):
    fig.update_xaxes(title_text="Amount", row=1, col=panel_col)
    fig.update_yaxes(title_text="Frequency", row=1, col=panel_col)

fig.update_layout(
    title_text='Transaction Amount Distribution',
    height=500,
    showlegend=False
)

fig.show()

Sampling data for visualization...


In [None]:
rule = "=" * 80
print(rule)
print("FEATURE ENGINEERING")
print(rule)

# Work on a separate copy so the raw frame `df` stays untouched
df_featured = df.copy()

n_rows, n_cols = df_featured.shape
print(f"Starting with {n_cols} original features")
print(f"Total transactions: {n_rows:,}")

FEATURE ENGINEERING
Starting with 11 original features
Total transactions: 6,362,620


In [None]:
# 1. Transaction amount features
print("\n1. Creating amount-based features...")

# Monotone transforms of the raw amount; log1p keeps zero amounts finite
df_featured['amount_log'] = np.log1p(df_featured['amount'])
df_featured['amount_sqrt'] = np.sqrt(df_featured['amount'])

# Amount bins.
# include_lowest=True closes the first interval on the left so that rows with
# amount == 0 land in 'very_low'; without it pd.cut leaves them as NaN because
# its intervals are right-closed and exclude the lowest edge.
amount_bin_edges = [0, 1000, 10000, 50000, 200000, float('inf')]
amount_bin_labels = ['very_low', 'low', 'medium', 'high', 'very_high']
df_featured['amount_bin'] = pd.cut(df_featured['amount'],
                                    bins=amount_bin_edges,
                                    labels=amount_bin_labels,
                                    include_lowest=True)

print("   ✓ Created: amount_log, amount_sqrt, amount_bin")


1. Creating amount-based features...
   ✓ Created: amount_log, amount_sqrt, amount_bin


In [None]:
# 2. Balance-based features
print("\n2. Creating balance-based features...")

# Short aliases for the raw columns used below
amt = df_featured['amount']
orig_before = df_featured['oldbalanceOrg']
orig_after = df_featured['newbalanceOrig']
dest_before = df_featured['oldbalanceDest']
dest_after = df_featured['newbalanceDest']

# Amount relative to the pre-transaction balance (+1 avoids division by zero)
df_featured['orig_balance_ratio'] = amt / (orig_before + 1)
df_featured['dest_balance_ratio'] = amt / (dest_before + 1)

# Observed balance movement on each side of the transaction
df_featured['orig_balance_change'] = orig_before - orig_after
df_featured['dest_balance_change'] = dest_after - dest_before

# Flag rows where the sender's balance moved by something other than the amount
# (tolerance 0.01)
mismatch = (amt - df_featured['orig_balance_change']).abs() > 0.01
df_featured['is_amount_mismatch'] = mismatch.astype(int)

# Flags for accounts that held nothing before the transaction
df_featured['is_orig_zero_balance'] = orig_before.eq(0).astype(int)
df_featured['is_dest_zero_balance'] = dest_before.eq(0).astype(int)

print("   ✓ Created: 7 balance-based features")


2. Creating balance-based features...
   ✓ Created: 7 balance-based features


In [None]:
# 3. Time-based features
print("\n3. Creating time-based features...")

# Split the running `step` counter into a 24-cycle position and a cycle index
step = df_featured['step']
df_featured['hour_of_day'] = step.mod(24)
df_featured['day_of_month'] = step.floordiv(24)

# Bucket the hour into four named periods; the first bucket includes hour 0
period_edges = [0, 6, 12, 18, 24]
period_names = ['night', 'morning', 'afternoon', 'evening']
df_featured['time_period'] = pd.cut(
    df_featured['hour_of_day'],
    bins=period_edges,
    labels=period_names,
    include_lowest=True,
)

print("   ✓ Created: hour_of_day, day_of_month, time_period")


3. Creating time-based features...
   ✓ Created: hour_of_day, day_of_month, time_period


In [None]:
# 4. Transaction type encoding
print("\n4. Encoding transaction types...")

# Integer code per transaction type
type_encoder = LabelEncoder()
df_featured['type_encoded'] = type_encoder.fit_transform(df_featured['type'])

# One indicator column per transaction type, appended to the frame
type_dummies = pd.get_dummies(df_featured['type'], prefix='type')
df_featured = pd.concat((df_featured, type_dummies), axis='columns')

n_dummy_cols = type_dummies.shape[1]
print(f"   ✓ Created: type_encoded + {n_dummy_cols} one-hot encoded features")


4. Encoding transaction types...
   ✓ Created: type_encoded + 5 one-hot encoded features


In [None]:
# 5. Account activity indicators
print("\n5. Creating account activity features...")

# 1 when the sender's balance after the transaction is exactly zero
drained = df_featured['newbalanceOrig'].eq(0)
df_featured['is_orig_drained'] = drained.astype(int)

# 1 when the transferred amount equals the sender's full prior balance
exact_match = df_featured['amount'].eq(df_featured['oldbalanceOrg'])
df_featured['is_exact_amount'] = exact_match.astype(int)

print("   ✓ Created: is_orig_drained, is_exact_amount")


5. Creating account activity features...
   ✓ Created: is_orig_drained, is_exact_amount


In [None]:
# 6. Merchant account indicators
print("\n6. Creating merchant indicators...")

# Account names beginning with 'M' are treated as merchants (PaySim convention)
dest_starts_with_m = df_featured['nameDest'].str.startswith('M')
orig_starts_with_m = df_featured['nameOrig'].str.startswith('M')

df_featured['is_dest_merchant'] = dest_starts_with_m.astype(int)
df_featured['is_orig_merchant'] = orig_starts_with_m.astype(int)

print("   ✓ Created: is_dest_merchant, is_orig_merchant")


6. Creating merchant indicators...
   ✓ Created: is_dest_merchant, is_orig_merchant


In [None]:
# 7. Statistical features (by transaction type)
print("\n7. Creating statistical aggregations...")

# Percentile rank of each amount within its own transaction type.
# GroupBy.rank is the built-in equivalent of transform(lambda x: x.rank(pct=True)):
# same values and index alignment, without a Python-level call per group.
amount_percentiles = df_featured.groupby('type')['amount'].rank(pct=True)
df_featured['amount_percentile_by_type'] = amount_percentiles

print("   ✓ Created: amount_percentile_by_type")


7. Creating statistical aggregations...
   ✓ Created: amount_percentile_by_type


In [None]:
# 8. Interaction features
print("\n8. Creating interaction features...")

# Element-wise products of existing columns
amount_times_ratio = df_featured['amount'].mul(df_featured['orig_balance_ratio'])
hour_times_log_amount = df_featured['hour_of_day'].mul(df_featured['amount_log'])

df_featured['amount_x_orig_balance_ratio'] = amount_times_ratio
df_featured['hour_x_amount'] = hour_times_log_amount

print("   ✓ Created: 2 interaction features")


8. Creating interaction features...
   ✓ Created: 2 interaction features


In [None]:
# 9. Label encoding for remaining categorical features
print("\n9. Encoding remaining categorical features...")

# One encoder per column, kept under its own name so it can be reused later
le_amount_bin = LabelEncoder()
le_time_period = LabelEncoder()

# NOTE(review): LabelEncoder presumably numbers classes in sorted label order,
# so the integer codes need not follow bin magnitude or time order — confirm
# this is acceptable for the downstream models.
encoded_amount_bins = le_amount_bin.fit_transform(df_featured['amount_bin'])
encoded_time_periods = le_time_period.fit_transform(df_featured['time_period'])

df_featured['amount_bin_encoded'] = encoded_amount_bins
df_featured['time_period_encoded'] = encoded_time_periods

print("   ✓ Encoded: amount_bin, time_period")


9. Encoding remaining categorical features...
   ✓ Encoded: amount_bin, time_period


In [None]:
# Summary of what feature engineering added on top of the raw frame
rule = "=" * 80
print("\n" + rule)
print("FEATURE ENGINEERING SUMMARY")
print(rule)

# Columns present in df_featured but not in the raw frame, in column order
original_columns = set(df.columns)
new_features = [name for name in df_featured.columns if name not in original_columns]
n_new = len(new_features)

print(f"\nOriginal features: {df.shape[1]}")
print(f"Total features after engineering: {df_featured.shape[1]}")
print(f"New features created: {n_new}")

print(f"\nNew feature list ({n_new} features):")
for position, feature_name in enumerate(new_features, start=1):
    print(f"{position:2d}. {feature_name}")

print("\n✓ Feature engineering complete!")


FEATURE ENGINEERING SUMMARY

Original features: 11
Total features after engineering: 39
New features created: 28

New feature list (28 features):
 1. amount_log
 2. amount_sqrt
 3. amount_bin
 4. orig_balance_ratio
 5. dest_balance_ratio
 6. orig_balance_change
 7. dest_balance_change
 8. is_amount_mismatch
 9. is_orig_zero_balance
10. is_dest_zero_balance
11. hour_of_day
12. day_of_month
13. time_period
14. type_encoded
15. type_CASH_IN
16. type_CASH_OUT
17. type_DEBIT
18. type_PAYMENT
19. type_TRANSFER
20. is_orig_drained
21. is_exact_amount
22. is_dest_merchant
23. is_orig_merchant
24. amount_percentile_by_type
25. amount_x_orig_balance_ratio
26. hour_x_amount
27. amount_bin_encoded
28. time_period_encoded

✓ Feature engineering complete!


In [None]:
rule = "=" * 80
print(rule)
print("DATA PREPROCESSING")
print(rule)

# Preprocessing works on its own copy of the engineered frame
df_processed = df_featured.copy()

print("\n1. Dropping unnecessary columns...")
# Columns excluded from modelling:
#   nameOrig / nameDest                       - account IDs (high cardinality)
#   isFlaggedFraud                            - the existing system's own flag
#   orig_balance_change / dest_balance_change - covered by other features
#   amount_bin / time_period                  - categorical originals of the encoded columns
columns_to_drop = [
    'nameOrig',
    'nameDest',
    'isFlaggedFraud',
    'orig_balance_change',
    'dest_balance_change',
    'amount_bin',
    'time_period',
]

# errors='ignore' skips any listed column that is not present
df_processed = df_processed.drop(columns_to_drop, axis=1, errors='ignore')

n_listed = len(columns_to_drop)
n_remaining = len(df_processed.columns)
print(f"   ✓ Dropped {n_listed} columns")
print(f"   Remaining columns: {n_remaining}")

DATA PREPROCESSING

1. Dropping unnecessary columns...
   ✓ Dropped 7 columns
   Remaining columns: 32


In [None]:
# 2. Separate features and target
print("\n2. Separating features and target...")

# Target is the `isFraud` column; every other column is a feature
y = df_processed['isFraud']
X = df_processed.drop(columns='isFraud')

print(f"   Feature matrix shape: {X.shape}")
print(f"   Target vector shape: {y.shape}")

n_legit_rows = y.eq(0).sum()
n_fraud_rows = y.eq(1).sum()
print(f"\n   Target distribution:")
print(f"   - Legitimate: {n_legit_rows:,}")
print(f"   - Fraudulent: {n_fraud_rows:,}")


2. Separating features and target...
   Feature matrix shape: (6362620, 31)
   Target vector shape: (6362620,)

   Target distribution:
   - Legitimate: 6,354,407
   - Fraudulent: 8,213


In [None]:
# 3. Drop original 'type' column (encoded versions already exist)
print("\n3. Dropping original 'type' column...")

has_raw_type = 'type' in X.columns
if has_raw_type:
    X = X.drop(columns=['type'])
    print("   ✓ Dropped 'type' column")

n_features = X.shape[1]
print(f"   Final feature count: {n_features}")


3. Dropping original 'type' column...
   ✓ Dropped 'type' column
   Final feature count: 30


In [None]:
# 4. Check for missing values
print("\n4. Checking for missing values...")
missing_check = X.isna().sum()
columns_with_gaps = missing_check[missing_check > 0]

if columns_with_gaps.empty:
    print("   ✓ No missing values found")
else:
    print("   Missing values found:")
    print(columns_with_gaps)
    # Impute each column with its own median
    X = X.fillna(X.median())
    print("   ✓ Missing values filled with median")

# 5. Handle infinite values
print("\n5. Handling infinite values...")
# Turn +/-inf into NaN first so they can be imputed like ordinary gaps.
# Step 4 already filled genuine NaNs, so any NaN left here came from an infinity.
X = X.replace([np.inf, -np.inf], np.nan)
has_gaps_after_replace = X.isna().any().any()

if has_gaps_after_replace:
    X = X.fillna(X.median())
    print("   ✓ Infinite values handled")
else:
    print("   ✓ No infinite values found")

print("\n✓ Data preprocessing complete!")
print(f"Final feature matrix shape: {X.shape}")


4. Checking for missing values...
   ✓ No missing values found

5. Handling infinite values...
   ✓ No infinite values found

✓ Data preprocessing complete!
Final feature matrix shape: (6362620, 30)
