In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Step 1: Generate Synthetic Fraud Dataset with More Features
def generate_synthetic_data(n_samples=10000, fraud_ratio=0.01):
    np.random.seed(42)
    
    n_fraud = int(n_samples * fraud_ratio)
    n_legit = n_samples - n_fraud
    
    # Legitimate transactions
    legit = np.random.normal(loc=[50, 1], scale=[15, 0.5], size=(n_legit, 2))
    legit_labels = np.zeros(n_legit)
    
    # Fraudulent transactions with added noise
    fraud = np.random.normal(loc=[200, 5], scale=[50, 2], size=(n_fraud, 2))
    fraud[:, 0] += np.random.normal(0, 20, n_fraud)  # Add noise to TransactionAmount
    fraud[:, 1] += np.random.normal(0, 1, n_fraud)  # Add noise to TransactionTime
    fraud_labels = np.ones(n_fraud)
    
    data = np.vstack((legit, fraud))
    labels = np.concatenate((legit_labels, fraud_labels))
    
    df = pd.DataFrame(data, columns=['TransactionAmount', 'TransactionTime'])
    df['TransactionHour'] = np.random.randint(0, 24, df.shape[0])  # Random transaction hour
    df['UserRiskScore'] = np.random.uniform(0, 1, df.shape[0])  # Simulated user risk score
    df['IsFraud'] = labels
    
    return df

# Generate data
data = generate_synthetic_data()

# Step 2: Preprocessing & Splitting
y = data['IsFraud']
X = data.drop(columns=['IsFraud'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Train Machine Learning Model (XGBoost) with Tuned Hyperparameters
xgb = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, eval_metric='logloss')
xgb.fit(X_train_scaled, y_train)
y_pred = xgb.predict(X_test_scaled)

# Step 4: Evaluate Model Performance
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))

# Step 5: Anomaly Detection with Isolation Forest
iso_forest = IsolationForest(contamination=0.01, random_state=42)
iso_forest.fit(X_train_scaled)
anomaly_scores = iso_forest.decision_function(X_test_scaled)
y_anomaly = iso_forest.predict(X_test_scaled)
y_anomaly = np.where(y_anomaly == -1, 1, 0)  # Convert to fraud labels

print("\nAnomaly Detection Report:")
print(classification_report(y_test, y_anomaly))





Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1980
         1.0       1.00      0.95      0.97        20

    accuracy                           1.00      2000
   macro avg       1.00      0.97      0.99      2000
weighted avg       1.00      1.00      1.00      2000

ROC AUC Score: 0.975

Anomaly Detection Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1980
         1.0       0.86      0.95      0.90        20

    accuracy                           1.00      2000
   macro avg       0.93      0.97      0.95      2000
weighted avg       1.00      1.00      1.00      2000

