In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

### Load dataset

In [9]:
def load_data(file_path):
    """
    Loads the dataset from a CSV file.
    
    Parameters:
    - file_path: Path to the dataset file.
    
    Returns:
    - dataset: Loaded dataset.
    """
    try:
        dataset = pd.read_csv(file_path)
        return dataset
    except Exception as e:
        print(f"Failed to load data: {e}")

### Preprocess data

In [6]:
def preprocess_data(dataset):
    """
    Preprocesses the data by checking for missing values and scaling.
    
    Parameters:
    - dataset: Dataset to preprocess.
    
    Returns:
    - dataset: Preprocessed dataset.
    """
    # Check for missing values
    print("Missing values count:")
    print(dataset.isna().sum())
    
    # Scale numerical features
    scaler = StandardScaler()
    numerical_cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
    dataset[numerical_cols] = scaler.fit_transform(dataset[numerical_cols])
    
    return dataset

### Split data into features and target

In [10]:
def split_data(dataset):
    """
    Splits the dataset into features (X) and target (y).
    
    Parameters:
    - dataset: Dataset to split.
    
    Returns:
    - X: Features.
    - y: Target variable.
    """
    X = dataset.drop(['Class'], axis=1)
    y = dataset['Class']
    return X, y

#### Train and evaluate model

In [11]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """
    Trains a Random Forest classifier and evaluates its performance.
    
    Parameters:
    - X_train: Training features.
    - X_test: Testing features.
    - y_train: Training target.
    - y_test: Testing target.
    
    Returns:
    - None
    """
    # Apply SMOTE to balance the training set
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
    
    # Train a Random Forest classifier
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train_balanced, y_train_balanced)
    
    # Predict on the test set
    y_pred = rf.predict(X_test)
    
    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
    
    print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}, AUC-ROC: {auc:.3f}")

#### Main function

In [12]:
# Main function
def main():
    file_path = 'creditcard.csv'  # Update with your dataset path
    dataset = load_data(file_path)
    
    # Preprocess data
    dataset = preprocess_data(dataset)
    
    # Split data into training and testing sets
    X, y = split_data(dataset)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train and evaluate model
    train_and_evaluate(X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    main()

Missing values count:
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Accuracy: 1.000, Precision: 0.882, Recall: 0.837, F1-score: 0.859, AUC-ROC: 0.975
