In [None]:
# Import the required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, average_precision_score
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from collections import Counter

In [None]:
# Step 1: Data Understanding
print("\nStep 1: Data Understanding")
# Load the dataset. The CSV can be downloaded from
# https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/data
# and is read via a relative path, i.e. from the current working directory.
print("Loading dataset...")
data = pd.read_csv('creditcard.csv')
print("Dataset loaded successfully.")
# data.shape is (number of rows, number of columns).
print(f"Dataset contains {data.shape[0]} rows and {data.shape[1]} columns.")

In [None]:
# Display information about the dataset
print("\nDataset Information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
data.info()

In [None]:
# Check for missing values: isnull().sum() yields the number of null entries
# in each column.
print("\nChecking for missing values...")
print(data.isnull().sum())

In [None]:
# Display the first few rows (head() with its default row count) as plain text.
print("\nPreview of the dataset:")
print(data.head())

In [None]:
# Step 2: Exploratory Data Analysis (EDA)
print("\nStep 2: Exploratory Data Analysis")
# Observe the different feature types present in the data:
# data.dtypes lists the dtype of every column.
print("\nFeature Types:")
print(data.dtypes)

In [None]:
# Class distribution: number of rows per distinct value of the 'Class' column.
# class_counts is kept as a notebook-level variable; the fraud-percentage cell
# further down reads it.
print("\nClass Distribution:")
class_counts = data['Class'].value_counts()
print(class_counts)

In [None]:
# Bar chart with one bar per 'Class' value showing how many rows carry it.
plt.figure(figsize=(6, 4))
sns.countplot(data=data, x='Class')
plt.xlabel("Class")
plt.ylabel("Count")
plt.title("Number and Percentage of Fraudulent vs Non-Fraudulent Transactions")
plt.show()

In [None]:
# Share of rows labelled Class == 1 (the fraudulent ones) in the whole dataset.
# .get(1, 0) keeps this cell working when the loaded data contains no class-1
# rows at all; plain class_counts[1] would raise KeyError in that case.
fraud_percentage = (class_counts.get(1, 0) / len(data)) * 100
print(f"Percentage of Fraudulent Transactions: {fraud_percentage:.2f}%")

In [None]:
# Scatter plot of 'Class' against 'Time', coloured by class.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=data,
    x='Time',
    y='Class',
    hue='Class',
    palette=['red', 'blue'],
)
plt.xlabel("Time")
plt.ylabel("Class")
plt.title("Distribution of Classes with Time")
plt.show()

In [None]:
# Scatter plot of 'Class' against 'Amount', coloured by class.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=data,
    x='Amount',
    y='Class',
    hue='Class',
    palette=['red', 'blue'],
)
plt.xlabel("Amount")
plt.ylabel("Class")
plt.title("Distribution of Classes with Amount")
plt.show()

In [None]:
# Histogram of the raw 'Amount' column (50 bins) to inspect its skewness.
print("\nPlotting histogram for 'Amount' to observe skewness...")
plt.figure(figsize=(8, 6))
plt.hist(data['Amount'], color='blue', alpha=0.7, bins=50)
plt.xlabel("Transaction Amount")
plt.ylabel("Frequency")
plt.title("Histogram of Transaction Amount")
plt.show()

In [None]:
# Apply Power Transformer to make distribution more Gaussian
print("\nApplying PowerTransformer to handle skewness...")
power_transformer = PowerTransformer(copy=False)
# The result overwrites data['Amount'], so re-running this cell transforms the
# already-transformed values a second time.
# NOTE(review): the transformer is fitted on the full dataset here, before the
# train/test split performed later in the notebook, so its fitted parameters
# also reflect rows that end up in the test set. Consider fitting it on the
# training split only.
data[['Amount']] = power_transformer.fit_transform(data[['Amount']])

In [None]:
# Same histogram as before, drawn again now that 'Amount' holds the
# power-transformed values.
plt.figure(figsize=(8, 6))
plt.hist(data['Amount'], color='green', alpha=0.7, bins=50)
plt.xlabel("Transaction Amount")
plt.ylabel("Frequency")
plt.title("Histogram of Transaction Amount after Power Transformation")
plt.show()

In [None]:
# Remove columns that should not be used for modelling.
print("\nDropping unnecessary columns...")
# Example list of columns to drop; adjust if necessary.
columns_to_drop = ['Unnamed: 0']
# errors='ignore' makes this a no-op for listed columns that are not present.
data = data.drop(columns_to_drop, axis=1, errors='ignore')
print(f"Remaining columns: {data.columns.tolist()}\n")

In [None]:
# Step 3: Train/Test Split
print("\nStep 3: Train/Test Split")
# The target is the 'Class' column; every remaining column is a feature.
y = data['Class']
X = data.drop('Class', axis=1)

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Initialize Models
# Candidate classifiers keyed by a short name; the same names are used as keys
# of the hyperparameter grids. Every estimator that accepts a seed is given
# random_state=42 so runs are repeatable.
models = {
    'LogisticRegression': LogisticRegression(max_iter=2000, solver='saga', C=0.1, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    # probability=True so that SVC offers predict_proba, which the evaluation
    # helpers call to compute ROC-AUC.
    'SVC': SVC(probability=True, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'KNeighbors': KNeighborsClassifier(),
    # use_label_encoder is intentionally not passed: it is a deprecated
    # XGBoost argument that recent releases ignore with a warning.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
}

In [None]:
# Model Building and Evaluation on Imbalanced Data
def evaluate_model_imbalanced(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test-set metrics.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``. ``predict_proba`` (or,
        failing that, ``decision_function``) is used for ROC-AUC when present.
        The estimator is fitted in place.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels the metrics are computed on.

    Returns
    -------
    dict
        ``{'roc_auc', 'precision', 'recall', 'f1'}``. ``'roc_auc'`` is ``None``
        when the model offers no continuous score to rank samples with.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC-AUC needs a continuous score per sample, not hard class labels.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = None
    roc_auc = roc_auc_score(y_test, y_score) if y_score is not None else None

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Model: {model.__class__.__name__}")
    # Formatting None with ':.4f' raises TypeError, so report N/A instead.
    if roc_auc is None:
        print("ROC-AUC: N/A")
    else:
        print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return {'roc_auc': roc_auc, 'precision': precision, 'recall': recall, 'f1': f1}

In [None]:
# Baseline run: every candidate model is fitted on the training split as it is
# (no resampling applied) and its test-set metrics are printed.
print("\nEvaluating models on imbalanced data...")
for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...")
    # Fits the estimator stored in `models` in place.
    evaluate_model_imbalanced(model, X_train, y_train, X_test, y_test)

In [None]:
# Define a function for cross-validation and hyperparameter tuning
def evaluate_model(model, param_grid, X, y):
    """Tune ``model`` over ``param_grid`` with cross-validated grid search.

    The search uses 5 shuffled stratified folds (seed 42), scores candidates by
    ROC-AUC and runs on all available cores. The best parameters and the best
    cross-validated score are printed.

    Returns
    -------
    The ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring='roc_auc',
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X, y)

    print("Best Parameters:", search.best_params_)
    print("Best ROC-AUC:", search.best_score_)
    return search.best_estimator_

In [None]:
# Hyperparameter grids, keyed by the same names as the `models` dict.
# Each value is the grid of candidate settings searched for that model.
param_grids = {
    'LogisticRegression': {
        'C': [0.1, 1, 10],
    },
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
    },
    'DecisionTree': {
        'max_depth': [None, 10, 20],
    },
    'KNeighbors': {
        'n_neighbors': [3, 5, 7],
    },
    'XGBoost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
    },
}

In [None]:
# Step 5: Handling Class Imbalance
# The following cells build three resampled versions of the training split
# (random oversampling, SMOTE, ADASYN).
print("\nStep 5: Handling Class Imbalance")

In [None]:
# 1. Random Oversampling
# Only the (already scaled) training split is resampled; the test split is not
# touched. Counter shows the resulting number of samples per class.
print("\nApplying Random Oversampling...")
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
print("Random Oversampling Class Distribution:", Counter(y_train_ros))

In [None]:
# 2. SMOTE
# Only the (already scaled) training split is resampled; the test split is not
# touched. Counter shows the resulting number of samples per class.
print("\nApplying SMOTE...")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("SMOTE Class Distribution:", Counter(y_train_smote))

In [None]:
# 3. ADASYN
# Only the (already scaled) training split is resampled; the test split is not
# touched. Counter shows the resulting number of samples per class.
print("\nApplying ADASYN...")
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)
print("ADASYN Class Distribution:", Counter(y_train_adasyn))

In [None]:
# Evaluate models on balanced data (Random Oversampling, SMOTE, ADASYN)
def evaluate_balanced_data(models, param_grids, X_train_balanced, y_train_balanced, X_test, y_test, method):
    """Tune every model on a resampled training set and score it on the test set.

    Each model is tuned with ``evaluate_model`` using the grid stored under the
    same name in ``param_grids``; the resulting best estimator is then scored
    on ``X_test`` / ``y_test``.

    Parameters
    ----------
    models : dict
        Maps a model name to an estimator.
    param_grids : dict
        Maps each model name to its parameter grid (a missing name raises
        ``KeyError``).
    X_train_balanced, y_train_balanced
        Resampled training features and labels used for tuning.
    X_test, y_test
        Held-out features and labels used for the reported metrics.
    method : str
        Label of the resampling technique, used only in printed messages.

    Returns
    -------
    pandas.DataFrame
        One row per model, in the iteration order of ``models``, with columns
        ``Model``, ``ROC-AUC``, ``Precision``, ``Recall`` and ``F1-Score``.
        The printed table is sorted by ROC-AUC; the returned frame is not.
    """
    print(f"\nEvaluating models on {method} balanced data...")
    results = []
    for model_name, model in models.items():
        print(f"\nEvaluating {model_name} on {method} data...")
        best_model = evaluate_model(model, param_grids[model_name], X_train_balanced, y_train_balanced)
        y_pred = best_model.predict(X_test)
        # Column 1 of predict_proba is the probability of the positive class.
        y_prob = best_model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_prob)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        results.append((model_name, roc_auc, precision, recall, f1))
    results_df = pd.DataFrame(results, columns=['Model', 'ROC-AUC', 'Precision', 'Recall', 'F1-Score'])
    print(f"Results on {method} data:")
    print(results_df.sort_values(by='ROC-AUC', ascending=False))
    # Hand the table back so callers can compare resampling methods instead of
    # only reading the printed output.
    return results_df


In [None]:
# Evaluate on Random Oversampling
# NOTE(review): evaluate_balanced_data is redefined further down in this
# notebook and then called again with these same arguments, so this grid
# search is effectively run twice — consider keeping only one of the two runs.
evaluate_balanced_data(models, param_grids, X_train_ros, y_train_ros, X_test, y_test, "Random Oversampling")


In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
import pandas as pd

def evaluate_balanced_data(models, param_grids, X_train_balanced, y_train_balanced, X_test, y_test, method):
    """Grid-search every model on a resampled training set and score it on the test set.

    Parameters
    ----------
    models : dict
        Maps a model name to an estimator. The estimators are handed to
        ``GridSearchCV`` as they are and are never reconfigured here, so the
        same dict can safely be reused across calls.
    param_grids : dict
        Maps a model name to its parameter grid. A missing name falls back to
        an empty grid, i.e. the estimator is cross-validated with its current
        settings only.
    X_train_balanced, y_train_balanced
        Resampled training features and labels used for the grid search.
    X_test, y_test
        Held-out features and labels. They are used only for the final
        metrics, never during model fitting or selection.
    method : str
        Label of the resampling technique, used only in printed messages.

    Returns
    -------
    pandas.DataFrame
        One row per model, in the iteration order of ``models``, with columns
        ``Model``, ``ROC-AUC``, ``Precision``, ``Recall`` and ``F1-Score``.
        The printed table is sorted by ROC-AUC; the returned frame is not.
    """
    print(f"\nEvaluating models on {method} balanced data...")
    results = []

    # Use StratifiedKFold with fewer splits for faster evaluation
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    for model_name, model in models.items():
        print(f"\nEvaluating {model_name} on {method} data...")

        # Deliberately no early-stopping setup for boosting models:
        #  * eval_set / verbose are fit-time arguments, not estimator
        #    parameters, and early stopping needs a validation set that
        #    GridSearchCV does not supply to fit();
        #  * using (X_test, y_test) as that validation set would leak the
        #    held-out data into model fitting;
        #  * set_params() would permanently alter the estimator shared through
        #    `models`, changing every later evaluation.

        # Get parameter grid for the current model
        param_grid = param_grids.get(model_name, {})

        # Parallelize GridSearchCV and minimize verbosity
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=skf,
            scoring='roc_auc',
            verbose=0,
            n_jobs=-1  # Utilize all available cores
        )

        # Fit the model
        grid_search.fit(X_train_balanced, y_train_balanced)

        # Retrieve best model and metrics
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        # Column 1 of predict_proba is the probability of the positive class.
        y_prob = best_model.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_prob)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        results.append((model_name, roc_auc, precision, recall, f1))
        print(f"Completed evaluation for {model_name}: ROC-AUC = {roc_auc:.4f}")

    # Results DataFrame
    results_df = pd.DataFrame(results, columns=['Model', 'ROC-AUC', 'Precision', 'Recall', 'F1-Score'])
    print(f"\nResults on {method} balanced data:")
    print(results_df.sort_values(by='ROC-AUC', ascending=False))

    return results_df


In [None]:
# Evaluate on Random Oversampling: models are tuned on the randomly oversampled
# training data and scored on the untouched test split.
evaluate_balanced_data(models, param_grids, X_train_ros, y_train_ros, X_test, y_test, "Random Oversampling")

In [None]:
# Evaluate on SMOTE: models are tuned on the SMOTE-resampled training data and
# scored on the untouched test split.
evaluate_balanced_data(models, param_grids, X_train_smote, y_train_smote, X_test, y_test, "SMOTE")

In [None]:
# Evaluate on ADASYN: models are tuned on the ADASYN-resampled training data
# and scored on the untouched test split.
evaluate_balanced_data(models, param_grids, X_train_adasyn, y_train_adasyn, X_test, y_test, "ADASYN")

In [None]:
# Step 6: Final Notes
print("\nStep 6: Model Evaluation Completed")
print("Compare performance across Random Oversampling, SMOTE, and ADASYN to determine the best class balancing technique.")