In [None]:
# FRAUDULENT TRANSACTION DETECTION MODEL

# --------------------------------------------------------------------------
# 1. SETUP AND DATA SOURCE
# --------------------------------------------------------------------------
# This script builds a machine learning model to detect fraudulent credit card transactions.
# We will use the "Credit Card Fraud Detection Dataset 2023" from Kaggle.
#
# --- SOURCE ---
# Dataset: Credit Card Fraud Detection Dataset 2023
# Kaggle URL: https://www.kaggle.com/datasets/nelgiriyewithana/credit-card-fraud-detection-dataset-2023
#
# --- INSTRUCTIONS ---
# 1. Download the dataset from the URL above.
# 2. Unzip the file and place 'creditcard_2023.csv' in the same directory as this script.
# 3. Run the script.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib # Add this import at the top of your script

print("Fraud Detection Model Script - Initialized")

# --------------------------------------------------------------------------
# 2. DATA LOADING AND PREPARATION
# --------------------------------------------------------------------------
# Paths are defined before the try block so the error handlers and the save
# step can refer to them even when loading fails.
file_path = 'creditcard_2023.csv'
model_filename = 'fraud_model.joblib'
scaler_filename = 'scaler.joblib'

try:
    # Load the dataset using pandas
    df = pd.read_csv(file_path)

    print(f"Successfully loaded dataset: {file_path}")
    print("----------------------------------------")

    # --- Data Cleaning ---
    # Rows containing any NaN are removed before modelling.
    print(f"Original shape of dataset: {df.shape}")
    df.dropna(inplace=True)
    print(f"Shape after dropping missing values: {df.shape}\n")

    # 'id' is a unique identifier and carries no predictive signal.
    if 'id' in df.columns:
        df = df.drop('id', axis=1)
        print("Dropped 'id' column.")

    # --------------------------------------------------------------------------
    # 3. MODEL TRAINING
    # --------------------------------------------------------------------------

    # 'y' is the target variable we want to predict (0 for legitimate, 1 for fraud).
    y = df['Class']

    print(f"Target Class Distribution:\n{y.value_counts(normalize=True) * 100}")
    print("\nSplitting data into training (80%) and testing (20%) sets...")

    # --- Splitting the Data ---
    # The split happens BEFORE scaling so that the scaler never sees test rows.
    # 'stratify=y' keeps the fraud/non-fraud proportion the same in both sets.
    raw_train, raw_test, y_train, y_test = train_test_split(
        df.drop('Class', axis=1), y, test_size=0.2, random_state=42, stratify=y
    )
    print("Data splitting complete.")

    # --- Scaling 'Amount' ---
    # The scaler is fit on the training rows only (avoids leaking test-set
    # statistics into preprocessing) and then applied to every row. It is fit
    # on a plain (n, 1) array so that later cells can call
    # scaler.transform(values.reshape(-1, 1)) without feature-name warnings.
    scaler = StandardScaler()
    scaler.fit(raw_train[['Amount']].to_numpy())
    df['scaled_amount'] = scaler.transform(df[['Amount']].to_numpy()).ravel()

    # Drop the original 'Amount' column.
    df = df.drop('Amount', axis=1)
    print("'Amount' feature has been scaled and original column dropped.")
    print("----------------------------------------")

    # 'X' holds the final feature columns (same names and order as before:
    # the original features followed by 'scaled_amount'). The train/test
    # frames are selected by index label so they stay aligned with
    # y_train / y_test.
    X = df.drop('Class', axis=1)
    X_train = X.loc[raw_train.index]
    X_test = X.loc[raw_test.index]

    # --- Training the Logistic Regression Model ---
    # Logistic Regression is a good, interpretable baseline model for classification.
    print("\nTraining the Logistic Regression model...")
    model = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
    model.fit(X_train, y_train)
    print("Model training complete.")
    print("----------------------------------------")

    # --------------------------------------------------------------------------
    # 4. MODEL EVALUATION
    # --------------------------------------------------------------------------
    print("\nEvaluating the model on the test set...")

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # --- Performance Metrics ---
    # Accuracy: Overall percentage of correct predictions.
    # Confusion Matrix: A table showing correct vs. incorrect predictions for each class.
    # Classification Report: Shows precision, recall, and F1-score for each class.
    #   - Precision: Of all transactions predicted as fraud, how many were actually fraud?
    #   - Recall: Of all actual fraud transactions, how many did the model correctly identify?
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(
        y_test, y_pred, target_names=['Legitimate (0)', 'Fraud (1)']
    )

    print(f"\nModel Accuracy: {accuracy:.4f}")

    print("\nConfusion Matrix:")
    print(conf_matrix)
    # The matrix is structured as:
    # [[True Negative, False Positive],
    #  [False Negative, True Positive]]

    print("\nClassification Report:")
    print(class_report)
    print("----------------------------------------")

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    print("Please make sure the dataset is downloaded from Kaggle and placed in the correct directory.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
else:
    # --------------------------------------------------------------------------
    # 5. SAVE THE TRAINED MODEL AND SCALER
    # --------------------------------------------------------------------------
    # Runs only when training and evaluation succeeded, so 'model' and 'scaler'
    # are guaranteed to exist here. Errors while saving are not swallowed.
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)

    print(f"\nModel has been saved to '{model_filename}'")
    print(f"Scaler has been saved to '{scaler_filename}'")
    print("----------------------------------------")
    print("Script finished.")

Fraud Detection Model Script - Initialized
Successfully loaded dataset: creditcard_2023.csv
----------------------------------------
Original shape of dataset: (227538, 31)
Shape after dropping missing values: (227537, 31)

Dropped 'id' column.
'Amount' feature has been scaled and original column dropped.
----------------------------------------
Target Class Distribution:
Class
0.0    99.816733
1.0     0.183267
Name: proportion, dtype: float64

Splitting data into training (80%) and testing (20%) sets...
Data splitting complete.

Training the Logistic Regression model...
Model training complete.
----------------------------------------

Evaluating the model on the test set...

Model Accuracy: 0.9990

Confusion Matrix:
[[45415    10]
 [   35    48]]

Classification Report:
                precision    recall  f1-score   support

Legitimate (0)       1.00      1.00      1.00     45425
     Fraud (1)       0.83      0.58      0.68        83

      accuracy                           1.00  

In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# --- 1. Load the Saved Model and Scaler ---
# The scaler fitted during training is loaded from disk so that new data gets
# the *exact same* 'Amount' transformation the model was trained on.
# Re-fitting a fresh scaler on the new data would produce different values.
print("Loading the saved fraud detection model...")
model = joblib.load('fraud_model.joblib')
scaler = joblib.load('scaler.joblib')
print("Model loaded successfully.")

# --- 2. Load and Prepare Your New Dataset ---
# Replace 'your_new_dataset.csv' with the actual file name.
try:
    new_data = pd.read_csv('your_new_dataset.csv')
    print(f"\nSuccessfully loaded new dataset with {new_data.shape[0]} transactions.")

    # IMPORTANT: Apply the EXACT same preprocessing steps as the training data
    # a. Drop the 'id' column if it exists
    if 'id' in new_data.columns:
        new_data = new_data.drop('id', axis=1)

    # Fail with a readable message instead of a bare KeyError when the header
    # row was not parsed as expected.
    if 'Amount' not in new_data.columns:
        raise ValueError(
            "The 'Amount' column is missing from the new dataset. "
            f"Columns found: {new_data.columns.tolist()}"
        )

    # b. Store the original 'Amount' for reference before scaling
    original_amounts = new_data['Amount'].copy()

    # c. Scale the 'Amount' feature with the training-time scaler.
    #    Use .transform() only -- never .fit_transform() -- at prediction time.
    new_data['scaled_amount'] = scaler.transform(
        new_data['Amount'].values.reshape(-1, 1)
    ).ravel()
    new_data = new_data.drop('Amount', axis=1)

    # d. Remove the label column if the file happens to contain it.
    features_for_prediction = new_data.drop('Class', axis=1, errors='ignore')

    # e. Put the columns in the order the model was trained with. The model
    #    exposes 'feature_names_in_' when it was fitted on a DataFrame; if the
    #    attribute is absent the columns are passed through unchanged.
    expected_columns = getattr(model, 'feature_names_in_', None)
    if expected_columns is not None:
        expected_columns = list(expected_columns)
        missing_columns = [
            col for col in expected_columns
            if col not in features_for_prediction.columns
        ]
        if missing_columns:
            raise ValueError(
                f"The new dataset is missing required feature columns: {missing_columns}"
            )
        features_for_prediction = features_for_prediction[expected_columns]

    # --- 3. Make Predictions ---
    print("\nMaking predictions on the new data...")
    predictions = model.predict(features_for_prediction)

    # --- 4. Display the Results ---
    print("Predictions complete.")
    results_df = pd.DataFrame({
        'Original_Amount': original_amounts,
        'Prediction': predictions
    })

    # Map prediction from 0/1 to meaningful labels
    results_df['Result'] = results_df['Prediction'].apply(
        lambda x: 'Fraud' if x == 1 else 'Legitimate'
    )

    print("\n--- Prediction Results ---")
    print(results_df)

    print("\n--- Summary ---")
    print(results_df['Result'].value_counts())

except FileNotFoundError:
    print("\nError: Make sure 'your_new_dataset.csv' is in the same directory.")
except Exception as e:
    print(f"\nAn error occurred: {e}")

Loading the saved fraud detection model...
Model loaded successfully.

Successfully loaded new dataset with 6 transactions.

An error occurred: 'Amount'


In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# --- 1. Load the Saved Model and Scaler ---
print("Loading the saved fraud detection model and scaler...")
model = joblib.load('fraud_model.joblib')
scaler = joblib.load('scaler.joblib')  # the scaler fitted during training
print("Model and scaler loaded successfully.")


# --- 2. Load and Prepare Your New Dataset ---
try:
    # The first line of the file is skipped; the line after it is the header.
    new_data = pd.read_csv('your_new_dataset.csv', skiprows=1, header=0)
    print(f"\nSuccessfully loaded new dataset with {new_data.shape[0]} transactions.")

    # --- DIAGNOSTIC STEP: Print column names ---
    print("\nColumns in the loaded new dataset:")
    print(new_data.columns.tolist())
    print("------------------------------------")
    # --- END DIAGNOSTIC STEP ---

    # IMPORTANT: Apply the EXACT same preprocessing steps as the training data
    # a. Drop the 'id' column if it exists
    if 'id' in new_data.columns:
        new_data = new_data.drop('id', axis=1)

    # --- Specific Handling for 'Amount' column ---
    if 'Amount' not in new_data.columns:
        raise ValueError("The 'Amount' column is missing from the new dataset.")

    # Coerce 'Amount' to numeric and drop rows where that is not possible.
    new_data['Amount'] = pd.to_numeric(new_data['Amount'], errors='coerce')
    new_data.dropna(subset=['Amount'], inplace=True)

    # b. Keep the unscaled amounts for the results table. Copied AFTER the
    #    cleaning above so the values are numeric and non-null.
    original_amounts = new_data['Amount'].copy()

    # c. Scale the 'Amount' feature using the LOADED scaler.
    #    Use .transform() ONLY, NOT .fit_transform(), so the training-time
    #    scaling is reproduced exactly.
    new_data['scaled_amount'] = scaler.transform(new_data['Amount'].values.reshape(-1, 1))

    # Drop the original 'Amount' column.
    new_data = new_data.drop('Amount', axis=1)

    # d. Align columns with the features the model was trained on.
    #    The names are read from the loaded model itself, so this cell does
    #    not depend on variables left over from the training cell.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        raise ValueError(
            "The loaded model does not record its training feature names; "
            "cannot align the new dataset's columns."
        )
    training_columns = list(feature_names)

    # A missing feature is an input error. Filling it with a constant would
    # make the model score fabricated data, so fail loudly instead.
    missing_columns = [col for col in training_columns if col not in new_data.columns]
    if missing_columns:
        raise ValueError(
            f"The new dataset is missing required feature columns: {missing_columns}"
        )

    # Select the features in training order (this also leaves out 'Class' and
    # any extra columns), coerce them to numeric, and drop rows that contain
    # values which could not be converted.
    features_for_prediction = new_data[training_columns].apply(
        pd.to_numeric, errors='coerce'
    )
    rows_before = len(features_for_prediction)
    features_for_prediction = features_for_prediction.dropna()
    rows_dropped = rows_before - len(features_for_prediction)
    if rows_dropped:
        print(f"Dropped {rows_dropped} row(s) with missing or non-numeric feature values.")

    # Use one floating-point dtype for every feature.
    # NOTE(review): assumes the training features were all floats -- confirm.
    features_for_prediction = features_for_prediction.astype('float64')

    # --- 3. Make Predictions ---
    # Check if features_for_prediction is empty after cleaning
    if features_for_prediction.empty:
        print("\nNo valid data remaining after preprocessing. Cannot make predictions.")
    else:
        print("\nMaking predictions on the new data...")
        predictions = model.predict(features_for_prediction)

        # --- 4. Display the Results ---
        print("Predictions complete.")
        results_df = pd.DataFrame({
            # Select the amounts of the rows that survived cleaning.
            'Original_Amount': original_amounts.loc[features_for_prediction.index],
            'Prediction': predictions
        })

        # Map prediction from 0/1 to meaningful labels
        results_df['Result'] = results_df['Prediction'].apply(
            lambda x: 'Fraud' if x == 1 else 'Legitimate'
        )

        print("\n--- Prediction Results ---")
        print(results_df)

        print("\n--- Summary ---")
        print(results_df['Result'].value_counts())

except FileNotFoundError:
    print("\nError: Make sure 'your_new_dataset.csv' is in the same directory.")
except ValueError as ve:  # Input/data problems raised above
    print(f"\nData Error: {ve}")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")

Loading the saved fraud detection model and scaler...
Model and scaler loaded successfully.

Successfully loaded new dataset with 5 transactions.

Columns in the loaded new dataset:
['id', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']
------------------------------------

Making predictions on the new data...
Predictions complete.

--- Prediction Results ---
   Original_Amount  Prediction      Result
0           150.75         1.0       Fraud
1          8500.00         1.0       Fraud
2            45.50         0.0  Legitimate
3          1230.20         1.0       Fraud
4            89.99         0.0  Legitimate

--- Summary ---
Result
Fraud         3
Legitimate    2
Name: count, dtype: int64
