Synthetic Data Generation for Fraud Detection in Financial Transactions
This project will guide you through creating a synthetic dataset for fraud detection, training a machine learning model, and evaluating its performance.

In [20]:
#1. Data Generation

import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Define number of samples
n_samples = 10000

# Simulate transaction features
transaction_amount = np.random.exponential(scale=100, size=n_samples)
merchant_category = np.random.randint(1, 10, size=n_samples)
time_of_day = np.random.randint(0, 24, size=n_samples)
location = np.random.rand(n_samples) * 100
is_weekend = np.random.randint(0, 2, size=n_samples)
previous_transactions = np.random.randint(0, 20, size=n_samples)
average_transaction_amount = np.random.exponential(scale=50, size=n_samples)

# Simulate fraud labels (0 for non-fraud, 1 for fraud)
fraud_probability = 0.05
is_fraud = np.random.choice([0, 1], size=n_samples, p=[1 - fraud_probability, fraud_probability])

# Introduce correlations to make the data more realistic
# Example: Larger transaction amounts are more likely to be fraudulent
fraud_indices = np.where(is_fraud == 1)
transaction_amount[fraud_indices] = np.random.exponential(scale=300, size=len(fraud_indices[0]))

# Create a Pandas DataFrame
data = {
    'transaction_amount': transaction_amount,
    'merchant_category': merchant_category,
    'time_of_day': time_of_day,
    'location': location,
    'is_weekend': is_weekend,
    'previous_transactions': previous_transactions,
    'average_transaction_amount': average_transaction_amount,
    'is_fraud': is_fraud
}
df = pd.DataFrame(data)

In [21]:
#2. Data Preprocessing

from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
#3. Address Class Imbalance (SMOTE)

from imblearn.over_sampling import SMOTE

# Apply SMOTE to oversample the minority class (fraudulent transactions)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [23]:
#4. Model Training and Evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Initialize and train a RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

              precision    recall  f1-score   support

           0       0.96      0.93      0.95      1897
           1       0.22      0.35      0.27       103

    accuracy                           0.90      2000
   macro avg       0.59      0.64      0.61      2000
weighted avg       0.93      0.90      0.91      2000

ROC AUC Score: 0.6955489249760736


In [26]:
     import xgboost as xgb

     # Initialize and train an XGBoost classifier
     xgb_model = xgb.XGBClassifier(random_state=42)
     xgb_model.fit(X_resampled, y_resampled)

     # Make predictions on the test set
     y_pred_xgb = xgb_model.predict(X_test)

     # Evaluate the XGBoost model
     print(classification_report(y_test, y_pred_xgb))
     print("XGBoost ROC AUC Score:", roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1]))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93      1897
           1       0.16      0.38      0.23       103

    accuracy                           0.87      2000
   macro avg       0.56      0.64      0.58      2000
weighted avg       0.92      0.87      0.89      2000

XGBoost ROC AUC Score: 0.6662179936639865


In [27]:
#  Feature Engineering:

import pandas as pd
# Feature Engineering
df['transaction_hour'] = df['time_of_day'].apply(lambda x: x // 4) #Group time into 6 hour buckets
df['transaction_amount_binned'] = pd.cut(df['transaction_amount'], bins=10, labels=False)
df['high_value_transaction'] = (df['transaction_amount'] > 200).astype(int) # High value transaction
df['transaction_ratio'] = df['transaction_amount'] / df['average_transaction_amount'] # ratio
df['recent_transactions_flag'] = (df['previous_transactions'] > 5).astype(int)
#Interaction Features
df['merchant_hour_interaction'] = df['merchant_category'] * df['transaction_hour']

# Drop original time of day
df = df.drop('time_of_day', axis=1)

# Separate features (X) and target (y)
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to oversample the minority class (fraudulent transactions)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


In [30]:
# Logistic Regression, Support Vector,neural networks

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


# 4. Model Building and Evaluation (Logistic Regression)
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_resampled, y_resampled)
y_pred_logreg = logreg_model.predict(X_test)

print("Logistic Regression:")
print(classification_report(y_test, y_pred_logreg))
print("ROC AUC Score:", roc_auc_score(y_test, logreg_model.predict_proba(X_test)[:, 1]))


# 4. Model Building and Evaluation (Support Vector Machine)
svm_model = SVC(random_state=42, probability=True) # probability=True for ROC AUC
svm_model.fit(X_resampled, y_resampled)
y_pred_svm = svm_model.predict(X_test)

print("\nSupport Vector Machine:")
print(classification_report(y_test, y_pred_svm))
print("ROC AUC Score:", roc_auc_score(y_test, svm_model.predict_proba(X_test)[:, 1]))


# 4. Model Building and Evaluation (Neural Network)
nn_model = MLPClassifier(random_state=42, max_iter=500) # Increased max_iter for convergence
nn_model.fit(X_resampled, y_resampled)
y_pred_nn = nn_model.predict(X_test)

print("\nNeural Network:")
print(classification_report(y_test, y_pred_nn))
print("ROC AUC Score:", roc_auc_score(y_test, nn_model.predict_proba(X_test)[:, 1]))


Logistic Regression:
              precision    recall  f1-score   support

           0       0.74      0.64      0.69      1928
           1       0.68      0.78      0.72      1881

    accuracy                           0.70      3809
   macro avg       0.71      0.71      0.70      3809
weighted avg       0.71      0.70      0.70      3809

ROC AUC Score: 0.8027771711436268

Support Vector Machine:
              precision    recall  f1-score   support

           0       0.66      0.73      0.69      1928
           1       0.69      0.61      0.65      1881

    accuracy                           0.67      3809
   macro avg       0.67      0.67      0.67      3809
weighted avg       0.67      0.67      0.67      3809

ROC AUC Score: 0.7271013531250483

Neural Network:
              precision    recall  f1-score   support

           0       0.72      0.91      0.81      1928
           1       0.88      0.64      0.74      1881

    accuracy                           0.78      38

In [31]:
# Ensemble Methods

from sklearn.ensemble import VotingClassifier

# Assuming X_resampled, y_resampled, X_test, y_test are defined from the previous code

# Create individual models
rf_model = RandomForestClassifier(random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42)
logreg_model = LogisticRegression(random_state=42)

# Create the voting classifier
voting_clf = VotingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgb_model), ('lr', logreg_model)],
    voting='soft'  # Use 'soft' voting for better performance with probabilities
)

# Train the voting classifier
voting_clf.fit(X_resampled, y_resampled)

# Make predictions
y_pred_voting = voting_clf.predict(X_test)

# Evaluate the voting classifier
print("\nVoting Classifier:")
print(classification_report(y_test, y_pred_voting))
print("ROC AUC Score:", roc_auc_score(y_test, voting_clf.predict_proba(X_test)[:, 1]))



Voting Classifier:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1928
           1       0.98      0.96      0.97      1881

    accuracy                           0.97      3809
   macro avg       0.97      0.97      0.97      3809
weighted avg       0.97      0.97      0.97      3809

ROC AUC Score: 0.997275661176076


In [37]:


import pickle
# Save the trained model
filename = 'voting_classifier_model.pkl'
pickle.dump(voting_clf, open(filename, 'wb'))


In [38]:
# prompt:  deploy ensembled model with flask

!pip install flask
!pip install gunicorn

import pickle
from flask import Flask, request, jsonify

# Load the trained model
filename = 'voting_classifier_model.pkl'
loaded_model = pickle.load(open(filename, 'rb'))

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    try:
        data = request.get_json()
        # Assuming the input data is a list of features
        features = data['features']  # Access features using the key 'features'

        # Make a prediction
        prediction = loaded_model.predict([features])[0]

        # Prepare the response
        response = {'prediction': int(prediction)} # Convert to integer for clarity
        return jsonify(response)

    except Exception as e:
        return jsonify({'error': str(e)}), 500  # Return an error response


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=int(5000)) # Specify port explicitly


Collecting gunicorn
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Downloading gunicorn-23.0.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gunicorn
Successfully installed gunicorn-23.0.0
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
