In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Load the raw credit-card transactions CSV and report the frame's dimensions.
print("All libraries imported successfully!\n\n")
print(">PART 1: LOADING DATASET")
csv_path = 'A:\\fraud_detection\\data\\creditcard.csv'
df = pd.read_csv(csv_path)
n_rows, n_cols = df.shape
print("\nDataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Rows: {n_rows}, Columns: {n_cols}")

All libraries imported successfully!


>PART 1: LOADING DATASET

Dataset loaded successfully!
Shape: (284807, 31)
Rows: 284807, Columns: 31


In [42]:
# Quick look at the raw frame: sample rows, column dtypes, summary statistics.
# Each entry pairs the heading to print with a callable producing the object shown.
overview_sections = (
    ("\nFirst 5 rows:", df.head),
    ("\nData types:", lambda: df.dtypes),
    ("\nBasic Statistics:", df.describe),
)
for heading, build_view in overview_sections:
    print(heading)
    print(build_view())


First 5 rows:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26      

In [43]:
# Section banner for the data-quality checks in the following cells.
part_title = "PART 2: DATA QUALITY CHECK"
print(part_title)

PART 2: DATA QUALITY CHECK


In [44]:
# Count NaN cells per column and list only the columns that actually have gaps.
print("\nMissing values:")
missing_count = df.isna().sum()
columns_with_gaps = missing_count.loc[missing_count > 0]
print(columns_with_gaps)
# Per-column counts are non-negative, so "no column has gaps" is the same
# condition as "the total number of missing cells is zero".
if columns_with_gaps.empty:
    print(" No missing values!")


Missing values:
Series([], dtype: int64)
 No missing values!


In [45]:
# Count fully duplicated rows once and reuse the result: DataFrame.duplicated()
# has to compare every row of the frame, so evaluating it a second time just
# for the `if` doubles the cost of this cell for no benefit.
n_duplicate_rows = int(df.duplicated().sum())
print(f"\nDuplicate rows: {n_duplicate_rows}")
if n_duplicate_rows == 0:
    print("No duplicates!")


Duplicate rows: 1081


In [46]:
# Show how many rows fall in each target class and the resulting percentages.
print("\nClass Distribution (Target Variable):")
class_counts = df['Class'].value_counts()
print(class_counts)

# Share of rows whose Class sums into the positive count (Class is summed
# directly, so this treats the label as 0/1).
fraud_fraction = df['Class'].sum() / len(df)
print(f"\nFraud percentage: {fraud_fraction * 100:.2f}%")
print(f"Legitimate percentage: {(1 - fraud_fraction) * 100:.2f}%")


Class Distribution (Target Variable):
Class
0    284315
1       492
Name: count, dtype: int64

Fraud percentage: 0.17%
Legitimate percentage: 99.83%


In [47]:
# Section banner for the cells that deliberately corrupt a copy of the data.
part_title = "PART 3: ADDING REAL-WORLD DATA CHALLENGES"
print(part_title)

PART 3: ADDING REAL-WORLD DATA CHALLENGES


In [48]:
# Work on an independent deep copy so the corruption steps below never touch `df`.
df_messy = df.copy(deep=True)

In [49]:
# Challenge 1: blank out a random 5% of the rows in every column except the last.
print("\nAdding missing values (5%)...")
np.random.seed(42)
missing_percentage = 0.05

# The frame's length does not change inside the loop, so the per-column sample
# size is computed once.
n_blank_per_column = int(len(df_messy)*missing_percentage)
columns_to_corrupt = df_messy.columns[:-1]  # last column is skipped

for col in columns_to_corrupt:
    # A fresh draw per column (without replacement) from the global NumPy RNG.
    missing_indices = np.random.choice(
        df_messy.index, size=n_blank_per_column, replace=False
    )
    df_messy.loc[missing_indices, col] = np.nan

total_missing = df_messy.isnull().sum().sum()
print(f"Total missing values added: {total_missing}")
print("Challenge 1 complete!")


Adding missing values (5%)...
Total missing values added: 427200
Challenge 1 complete!


In [50]:
# Challenge 2: append a random 2% sample of the existing rows as exact duplicates.
print("\nAdding duplicate transactions (2%)...")
duplicate_count = int(len(df_messy) * 0.02)
duplicates = df_messy.sample(n=duplicate_count, random_state=42)

# ignore_index=True renumbers the result 0..n-1, so index labels stay unique.
df_messy = pd.concat((df_messy, duplicates), axis=0, ignore_index=True)

print(f"Duplicate rows added: {duplicate_count}")
print(f"New dataset size: {len(df_messy)}")
print("Challenge 2 complete!")


Adding duplicate transactions (2%)...
Duplicate rows added: 5696
New dataset size: 290503
Challenge 2 complete!


In [51]:
# Challenge 3: overwrite Amount and Time with extreme values on a random 1% of rows.
print("\nAdding outliers (extreme values)...")
n_outlier_rows = int(len(df_messy)*0.01)
outlier_indices = np.random.choice(df_messy.index, size=n_outlier_rows, replace=False)

amount_extremes = [999999, -1000, 0.01]
time_extremes = [99999, -100]

for row_label in outlier_indices:
    # Amount is drawn before Time for every row; the draws come from the global
    # NumPy RNG, so this order determines which values each row receives.
    df_messy.loc[row_label, 'Amount'] = np.random.choice(amount_extremes)
    df_messy.loc[row_label, 'Time'] = np.random.choice(time_extremes)

print(f"Outliers added to {len(outlier_indices)} rows")
print("Challenge 3 complete!")



Adding outliers (extreme values)...
Outliers added to 2905 rows
Challenge 3 complete!


In [53]:
# Challenge 4: turn a random 0.5% of the Amount values into their string form so
# the column ends up with mixed float/str content.
print("\n4️Adding data type inconsistencies...")
string_indices = np.random.choice(
    df_messy.index, 
    size=int(len(df_messy)*0.005), 
    replace=False
)

# Writing a str into a float64 column relies on pandas silently upcasting the
# column to object. That implicit upcast is deprecated (it only "works" here
# because warnings are globally suppressed), so make the column object-typed
# explicitly before storing strings in it.
df_messy['Amount'] = df_messy['Amount'].astype(object)

# One vectorised assignment instead of a Python loop over rows; `map(str)`
# applies the same built-in str() conversion the loop used (NaN becomes 'nan').
df_messy.loc[string_indices, 'Amount'] = df_messy.loc[string_indices, 'Amount'].map(str)

print(f"   String type added to Amount column: {len(string_indices)} values")
print("    Challenge 4 complete!")

print("\n Messy dataset created with real-world challenges!")


4️Adding data type inconsistencies...
   String type added to Amount column: 1452 values
    Challenge 4 complete!

 Messy dataset created with real-world challenges!


In [54]:
# Section banner for the cleaning pipeline.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("PART 4: PREPROCESSING PIPELINE")
print(banner_rule)


PART 4: PREPROCESSING PIPELINE


In [55]:
# Step 1: start the cleaned frame from a copy of the corrupted one and force
# every column back to a numeric dtype.
df_clean = df_messy.copy()
print("\n[STEP 1] Fixing Data Type Issues FIRST...")
print(f"Before - Amount dtype: {df_clean['Amount'].dtype}")

# errors='coerce' turns anything that cannot be parsed as a number into NaN
# instead of raising.
columns_to_coerce = ['Amount'] + [f'V{i}' for i in range(1, 29)] + ['Time', 'Class']
for name in columns_to_coerce:
    df_clean[name] = pd.to_numeric(df_clean[name], errors='coerce')

# Nullable integer dtype: keeps the label integral even if coercion produced NaN.
df_clean['Class'] = df_clean['Class'].astype('Int64')

print("After - All columns converted to numeric")
print("Data types fixed!")


[STEP 1] Fixing Data Type Issues FIRST...
Before - Amount dtype: object
After - All columns converted to numeric
Data types fixed!


In [56]:
# Step 2: mean-impute the V1..V28 features.
#
# Only the V columns are filled here. Time and Amount still contain NaN after
# this cell, which is why the "After" count printed below is not zero; the
# SMOTE cell later in the notebook fills the remaining gaps.
print("\n[STEP 2] Handling Missing Values...")
print(f"Before: {df_clean.isnull().sum().sum()} missing cells")

v_columns = [f'V{i}' for i in range(1, 29)]

# Assign the filled result back instead of `df_clean[col].fillna(..., inplace=True)`:
# in-place fillna on a column selected from the frame is chained assignment,
# which pandas has deprecated and which has no effect under copy-on-write.
# DataFrame.fillna with a Series fills each column with the value stored under
# that column's label, i.e. each column gets its own mean.
df_clean[v_columns] = df_clean[v_columns].fillna(df_clean[v_columns].mean())

print(f"After: {df_clean.isnull().sum().sum()} missing cells")
print(" Missing values handled!")

After: 34100 missing cells
 Missing values handled!


In [57]:
def _iqr_fences(values):
    """Return (lower, upper) fences: Q1 - 1.5*IQR and Q3 + 1.5*IQR of `values`."""
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    return first_quartile - 1.5*spread, third_quartile + 1.5*spread


# Step 3: drop rows that are exact copies of an earlier row.
print("\n[STEP 3] Removing Duplicates...")
rows_before = len(df_clean)
dupes_before = df_clean.duplicated().sum()
print(f"Before: {rows_before} rows, {dupes_before} duplicates")

df_clean = df_clean.drop_duplicates()

rows_after = len(df_clean)
dupes_after = df_clean.duplicated().sum()
print(f"After: {rows_after} rows, {dupes_after} duplicates")
print(" Duplicates removed!")

# Step 4: clip Amount and Time to their IQR fences (values are capped, rows are kept).
print("\n[STEP 4] Handling Outliers (Capping with IQR method)...")
print(f"Before - Amount range: {df_clean['Amount'].min():.2f} to {df_clean['Amount'].max():.2f}")

lower_bound, upper_bound = _iqr_fences(df_clean['Amount'])
df_clean['Amount'] = df_clean['Amount'].clip(lower_bound, upper_bound)

time_lower, time_upper = _iqr_fences(df_clean['Time'])
df_clean['Time'] = df_clean['Time'].clip(time_lower, time_upper)

print(f"After - Amount range: {df_clean['Amount'].min():.2f} to {df_clean['Amount'].max():.2f}")
print("Outliers capped!")


[STEP 3] Removing Duplicates...
Before: 290503 rows, 5649 duplicates
After: 284854 rows, 0 duplicates
 Duplicates removed!

[STEP 4] Handling Outliers (Capping with IQR method)...
Before - Amount range: -1000.00 to 999999.00
After - Amount range: -103.18 to 186.23
Outliers capped!


In [58]:
# Part 5: standardise every feature column (zero mean, unit variance).
print("\n" + "="*60)
print("PART 5: FEATURE SCALING (NORMALIZATION)")
print("="*60)

X = df_clean.drop('Class', axis=1)
y = df_clean['Class']

print("\nBefore Scaling:")
print(f"V1 mean: {X['V1'].mean():.4f}, std: {X['V1'].std():.4f}")
print(f"Amount mean: {X['Amount'].mean():.4f}, std: {X['Amount'].std():.4f}")
print(f"Time mean: {X['Time'].mean():.4f}, std: {X['Time'].std():.4f}")

# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the transform. Fitting on the training
# split only would avoid that, but requires reordering the notebook's cells.
scaler = StandardScaler()

# fit_transform returns a bare ndarray. Re-attach BOTH the column names and the
# original row index: `y` keeps df_clean's index (which has gaps after
# drop_duplicates), so without `index=X.index` the features would get a fresh
# 0..n-1 index and X/y would no longer agree on row labels, breaking any later
# label-based alignment between them.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

print("\nAfter Scaling:")
print(f"V1 mean: {X_scaled['V1'].mean():.6f}, std: {X_scaled['V1'].std():.4f}")
print(f"Amount mean: {X_scaled['Amount'].mean():.6f}, std: {X_scaled['Amount'].std():.4f}")
print(f"Time mean: {X_scaled['Time'].mean():.6f}, std: {X_scaled['Time'].std():.4f}")

print("\nAll features normalized! (mean ≈ 0, std ≈ 1)")



PART 5: FEATURE SCALING (NORMALIZATION)

Before Scaling:
V1 mean: 0.0001, std: 1.9102
Amount mean: 51.6304, std: 63.0037
Time mean: 94349.0493, std: 47755.1629

After Scaling:
V1 mean: -0.000000, std: 1.0000
Amount mean: -0.000000, std: 1.0000
Time mean: 0.000000, std: 1.0000

All features normalized! (mean ≈ 0, std ≈ 1)


In [59]:
# Part 6: 70/30 split, stratified on the label so both parts keep the same
# class proportions.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("PART 6: TRAIN-TEST SPLIT")
print(banner_rule)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

n_total = len(X_scaled)
n_train, n_test = len(X_train), len(X_test)
print(f"\nTraining set: {n_train} samples ({n_train/n_total*100:.1f}%)")
print(f"Test set: {n_test} samples ({n_test/n_total*100:.1f}%)")

fraud_in_train = y_train.sum()
fraud_in_test = y_test.sum()
print(f"\nFraud in training: {fraud_in_train} ({fraud_in_train/len(y_train)*100:.2f}%)")
print(f"Fraud in test: {fraud_in_test} ({fraud_in_test/len(y_test)*100:.2f}%)")

print(" Data properly split!")


PART 6: TRAIN-TEST SPLIT

Training set: 199397 samples (70.0%)
Test set: 85457 samples (30.0%)

Fraud in training: 344 (0.17%)
Fraud in test: 147 (0.17%)
 Data properly split!


In [60]:
# Part 7: fill the remaining NaN cells in the features, then oversample the
# minority class of the TRAINING split with SMOTE.
print("\n" + "="*60)
print("PART 7: HANDLING CLASS IMBALANCE WITH SMOTE")
print("="*60)

print("\nChecking for NaN values before SMOTE...")
train_nan_per_column = X_train.isnull().sum()
train_nan_total = train_nan_per_column.sum()
test_nan_total = X_test.isnull().sum().sum()
print(f"NaN in X_train: {train_nan_total}")
print(f"NaN in X_test: {test_nan_total}")
print(f"NaN in y_train: {y_train.isnull().sum()}")

if train_nan_total > 0 or test_nan_total > 0:
    print("\n Found NaN values. Filling them now...")
    for col, n_missing in train_nan_per_column[train_nan_per_column > 0].items():
        print(f"   Filling {n_missing} NaN in {col}")

    # Fill values are computed from the training split only: the median for
    # Amount, the mean for every other column.
    fill_values = X_train.mean()
    if 'Amount' in fill_values.index:
        fill_values['Amount'] = X_train['Amount'].median()

    # Assign the filled frames back rather than calling
    # `X_train[col].fillna(..., inplace=True)`: that form is chained in-place
    # assignment, deprecated in pandas and a no-op under copy-on-write.
    X_train = X_train.fillna(fill_values)
    # X_test has NaN in the same columns. Previously it was never imputed and was
    # saved with NaN still in it; fill it with the TRAINING statistics so no
    # information from the test split is used.
    X_test = X_test.fillna(fill_values)

    print("All NaN values filled!")

if y_train.isnull().sum() > 0:
    print(f"Found {y_train.isnull().sum()} NaN in y_train. Dropping these rows...")
    # Use a positional boolean array: X_train and y_train correspond row by row,
    # but their index labels are not guaranteed to match, and a label-aligned
    # boolean Series could select the wrong rows or raise.
    valid_indices = y_train.notna().to_numpy()
    X_train = X_train[valid_indices]
    y_train = y_train[valid_indices]
    print(" NaN rows dropped from y_train!")

print(f"\nFinal check - NaN in X_train: {X_train.isnull().sum().sum()}")
print(f"Final check - NaN in X_test: {X_test.isnull().sum().sum()}")
print(f"Final check - NaN in y_train: {y_train.isnull().sum()}")

print("\nBefore SMOTE:")
print(f"Legitimate transactions: {(y_train == 0).sum()}")
print(f"Fraudulent transactions: {(y_train == 1).sum()}")
ratio_before = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Ratio: {ratio_before:.1f}:1")

fraud_count = int((y_train == 1).sum())
print(f"\nFraud samples available: {fraud_count}")

# SMOTE needs more minority samples than neighbours, so shrink k below the
# default of 5 when fewer than 6 fraud rows are available (never below 1).
k_neighbors = min(5, max(1, fraud_count - 1))
if fraud_count < 6:
    print(f"Few fraud samples detected. Using k_neighbors={k_neighbors}")
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)

# Only the training split is resampled; X_test / y_test keep the real class ratio.
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE:")
print(f"Legitimate transactions: {(y_train_balanced == 0).sum()}")
print(f"Fraudulent transactions: {(y_train_balanced == 1).sum()}")
ratio_after = (y_train_balanced == 0).sum() / (y_train_balanced == 1).sum()
print(f"Ratio: {ratio_after:.1f}:1")

print("\nClass imbalance balanced!")


PART 7: HANDLING CLASS IMBALANCE WITH SMOTE

Checking for NaN values before SMOTE...
NaN in X_train: 19808
NaN in y_train: 0

 Found NaN values. Filling them now...
   Filling 9886 NaN in Time
   Filling 9922 NaN in Amount
All NaN values filled!

Final check - NaN in X_train: 0
Final check - NaN in y_train: 0

Before SMOTE:
Legitimate transactions: 199053
Fraudulent transactions: 344
Ratio: 578.6:1

Fraud samples available: 344

After SMOTE:
Legitimate transactions: 199053
Fraudulent transactions: 199053
Ratio: 1.0:1

Class imbalance balanced!


In [61]:
# Part 8: bundle the split data and the fitted preprocessing objects into one
# pickle file on disk.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("PART 8: SAVING PROCESSED DATA")
print(banner_rule)

import pickle

output_path = 'A:\\fraud_detection\\data\\final_processed_data.pkl'

# Training entries are the SMOTE-balanced versions; test entries are as split.
processed_data = dict(
    X_train=X_train_balanced,
    X_test=X_test,
    y_train=y_train_balanced,
    y_test=y_test,
    scaler=scaler,
    smote=smote,
)

with open(output_path, 'wb') as pickle_file:
    pickle.dump(processed_data, pickle_file)

print("Processed data saved to: final_processed_data.pkl")


PART 8: SAVING PROCESSED DATA
Processed data saved to: final_processed_data.pkl


In [62]:
# Closing banner plus a short summary of the artefacts produced above.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("✅ PREPROCESSING COMPLETE!")
print(banner_rule)

print("\nFinal Dataset Statistics:")
summary_lines = (
    f"Training set: {len(X_train_balanced)} samples",
    f"Test set: {len(X_test)} samples",
    f"Features: {X_train_balanced.shape[1]}",
    f"Fraud in training: {(y_train_balanced == 1).sum()} samples",
    f"Fraud in test: {(y_test == 1).sum()} samples",
)
for line in summary_lines:
    print(line)


✅ PREPROCESSING COMPLETE!

Final Dataset Statistics:
Training set: 398106 samples
Test set: 85457 samples
Features: 30
Fraud in training: 199053 samples
Fraud in test: 147 samples
