In [None]:
# Fraud Transaction Detection Analysis

## 1. Introduction
# This project aims to detect fraudulent transactions using machine learning techniques.
# The dataset contains simulated transaction data with three types of fraud patterns.

## 2. Data Loading and Exploration
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
%matplotlib inline

# Load the data
df = pd.read_csv('../data/raw/transactions.csv')

# Display basic information
print("Dataset shape:", df.shape)
print("\nColumn names:", df.columns.tolist())
print("\nData types:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())

# Display first few rows
df.head()

## 3. Data Preprocessing
# Parse the transaction timestamp so the .dt accessor can be used on it
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])

# Derive calendar features from the parsed timestamp
tx_time = df['TX_DATETIME'].dt
df['Hour'] = tx_time.hour
df['DayOfWeek'] = tx_time.dayofweek
df['Day'] = tx_time.day

# Preview the frame with the new columns
df.head()

## 4. Feature Engineering
# Create features based on domain knowledge.
# Every feature below is written with a plain column assignment (no merge), so
# re-running this cell does not create duplicated *_x / *_y columns.

# High amount flag (scenario 1)
df['HIGH_AMOUNT'] = (df['TX_AMOUNT'] > 220).astype(int)

# Customer behavior features: the customer's mean amount, broadcast to each row.
# NOTE(review): the mean covers the customer's whole history, so it also sees
# transactions later than the current row — consider an expanding mean.
df['CUST_AVG_AMOUNT'] = df.groupby('CUSTOMER_ID')['TX_AMOUNT'].transform('mean')

# Relative deviation from the customer's average amount.
# A customer whose average is 0 would produce inf/NaN here; report 0 deviation
# for those rows instead. Rows with a missing average stay NaN.
amount_gap = (df['TX_AMOUNT'] - df['CUST_AVG_AMOUNT']).abs()
df['AMOUNT_DEVIATION'] = (amount_gap / df['CUST_AVG_AMOUNT']).where(
    df['CUST_AVG_AMOUNT'] != 0, 0.0
)

# Risk features must only use frauds that happened BEFORE the current
# transaction. Summing TX_FRAUD over the whole dataset would include the label
# of the row being predicted (target leakage) and future labels.
# Stable sort keeps the incoming order for transactions with equal timestamps.
df = df.sort_values('TX_DATETIME', kind='stable')

# Terminal risk features (scenario 2): earlier frauds seen on the same terminal.
# cumsum() counts up to and including the current row, so subtract the row's own label.
df['TERMINAL_FRAUD_COUNT'] = (
    df.groupby('TERMINAL_ID')['TX_FRAUD'].cumsum() - df['TX_FRAUD']
)

# Customer risk features (scenario 3): earlier frauds seen for the same customer
df['CUSTOMER_FRAUD_COUNT'] = (
    df.groupby('CUSTOMER_ID')['TX_FRAUD'].cumsum() - df['TX_FRAUD']
)

# Time since last transaction for each customer, in minutes
df = df.sort_values(by=['CUSTOMER_ID', 'TX_DATETIME'])
minutes_since_last = (
    df.groupby('CUSTOMER_ID')['TX_DATETIME'].diff().dt.total_seconds() / 60
)
# A customer's first transaction has no predecessor; default to one day.
# Assigning the filled Series (instead of fillna(inplace=True) on a column
# selection) keeps this working under pandas Copy-on-Write.
df['TIME_SINCE_LAST_TX'] = minutes_since_last.fillna(24 * 60)

# Display engineered features
df[['TX_AMOUNT', 'HIGH_AMOUNT', 'CUST_AVG_AMOUNT', 'AMOUNT_DEVIATION',
    'TERMINAL_FRAUD_COUNT', 'CUSTOMER_FRAUD_COUNT', 'TIME_SINCE_LAST_TX']].head()

## 5. Data Visualization
# Class balance of the target column
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='TX_FRAUD')
plt.title('Distribution of Fraudulent Transactions')
plt.xlabel('Fraud Status (0: Legitimate, 1: Fraudulent)')
plt.ylabel('Count')
plt.show()

# Overlaid amount histograms, one per class (legitimate first, then fraudulent)
plt.figure(figsize=(10, 6))
for fraud_flag, hist_color, hist_label in (
    (0, 'green', 'Legitimate'),
    (1, 'red', 'Fraudulent'),
):
    class_amounts = df.loc[df['TX_FRAUD'] == fraud_flag, 'TX_AMOUNT']
    sns.histplot(class_amounts, bins=50, color=hist_color, alpha=0.6, label=hist_label)
plt.legend()
plt.title('Transaction Amount Distribution by Fraud Status')
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.show()

# Mean of TX_FRAUD per hour of day, i.e. the hourly fraud rate
hourly_fraud_rate = df.groupby('Hour')['TX_FRAUD'].mean()
plt.figure(figsize=(10, 6))
plt.plot(hourly_fraud_rate.index, hourly_fraud_rate.values, marker='o')
plt.title('Hourly Fraud Rate')
plt.xlabel('Hour of Day')
plt.ylabel('Fraud Rate')
plt.xticks(range(24))
plt.grid(True)
plt.show()

# Pairwise correlations between the engineered features and the target
features = [
    'TX_AMOUNT', 'HIGH_AMOUNT', 'DayOfWeek', 'Hour',
    'CUST_AVG_AMOUNT', 'AMOUNT_DEVIATION',
    'TERMINAL_FRAUD_COUNT', 'CUSTOMER_FRAUD_COUNT',
    'TIME_SINCE_LAST_TX', 'TX_FRAUD',
]
correlation_matrix = df[features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.show()

## 6. Data Preparation for Modeling
# Select features for modeling
features = ['TX_AMOUNT', 'HIGH_AMOUNT', 'DayOfWeek', 'Hour',
            'CUST_AVG_AMOUNT', 'AMOUNT_DEVIATION',
            'TERMINAL_FRAUD_COUNT', 'CUSTOMER_FRAUD_COUNT',
            'TIME_SINCE_LAST_TX']
target = 'TX_FRAUD'

X = df[features]
y = df[target]

# Split BEFORE resampling. SMOTE creates synthetic minority rows by
# interpolating between neighbours; resampling the full dataset first would let
# points derived from test rows into training and would evaluate the models on
# an artificially balanced test set.
# NOTE(review): this is a random split; a time-based split may reflect
# deployment better for transaction data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Handle class imbalance using SMOTE, on the training portion only.
# The test set keeps the original class ratio.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)
X_train, y_train = X_res, y_res

print(f"Original dataset shape: {X.shape}")
print(f"Resampled dataset shape: {X_res.shape}")
print(f"Class distribution after SMOTE: {pd.Series(y_res).value_counts()}")

# Scale the features: fit on the training data, reuse the same scaling on test
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

## 7. Model Training
# Candidate classifiers, keyed by the display name used in reports and plots
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
}

# Train every model on the scaled training data and evaluate it on the test data.
# results maps model name -> fitted model, metrics and test-set predictions.
results = {}
for name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Hard labels and the predicted probability of the positive class (column 1)
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Accuracy is derived from y_pred directly; model.score() would run the
    # same prediction over the test set a second time.
    accuracy = float(np.mean(y_pred == np.asarray(y_test)))
    # roc_auc_score comes from sklearn.metrics (see the import block)
    auc_roc = roc_auc_score(y_test, y_pred_proba)

    # Store results
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'auc_roc': auc_roc,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    # Print results
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Find the best model based on AUC-ROC (on a tie, max() keeps the first entry)
best_model_name = max(results, key=lambda x: results[x]['auc_roc'])
best_model = results[best_model_name]['model']
print(f"\nBest model: {best_model_name} with AUC-ROC: {results[best_model_name]['auc_roc']:.4f}")

## 8. Model Evaluation
# ROC curve of every trained model on one set of axes
plt.figure(figsize=(10, 8))
for model_name, model_result in results.items():
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, model_result['y_pred_proba'])
    curve_label = f'{model_name} (AUC = {model_result["auc_roc"]:.4f})'
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)

# Dashed diagonal as the reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Different Models')
plt.legend(loc='lower right')
plt.show()

# Confusion matrix of the best model's test-set predictions
best_result = results[best_model_name]
cm = confusion_matrix(y_test, best_result['y_pred'])
class_labels = ['Legitimate', 'Fraudulent']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Precision-recall curve of every trained model
plt.figure(figsize=(10, 8))
for model_name, model_result in results.items():
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, model_result['y_pred_proba'])
    plt.plot(pr_recall, pr_precision, label=model_name)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()

# Feature importance bar chart, only for models exposing feature_importances_
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame(
        {'feature': features, 'importance': best_model.feature_importances_}
    )
    feature_importance = feature_importance.sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='importance', y='feature')
    plt.title(f'Feature Importance - {best_model_name}')
    plt.tight_layout()
    plt.show()

## 9. Business Impact Analysis
# Score every transaction with the best model. The scaled matrix is built once
# and reused for both the hard labels and the fraud probabilities.
# NOTE(review): X covers the full dataset, including rows the model may have
# been trained on, so the figures below are optimistic compared with the
# held-out test metrics.
X_all_scaled = scaler.transform(X)
df['PREDICTED_FRAUD'] = best_model.predict(X_all_scaled)
df['FRAUD_PROBABILITY'] = best_model.predict_proba(X_all_scaled)[:, 1]

# Split the scored transactions by outcome
flagged = df['PREDICTED_FRAUD'] == 1
true_frauds = df[(df['TX_FRAUD'] == 1) & flagged]
false_positives = df[(df['TX_FRAUD'] == 0) & flagged]
false_negatives = df[(df['TX_FRAUD'] == 1) & (df['PREDICTED_FRAUD'] == 0)]

# Monetary totals per outcome
total_fraud_amount = df[df['TX_FRAUD'] == 1]['TX_AMOUNT'].sum()
detected_fraud_amount = true_frauds['TX_AMOUNT'].sum()
missed_fraud_amount = false_negatives['TX_AMOUNT'].sum()
false_positive_amount = false_positives['TX_AMOUNT'].sum()

# Shares of the total fraud amount; reported as 0 when there is no fraud amount
# at all, instead of dividing by zero.
detected_pct = detected_fraud_amount / total_fraud_amount * 100 if total_fraud_amount else 0.0
missed_pct = missed_fraud_amount / total_fraud_amount * 100 if total_fraud_amount else 0.0

print(f"Total fraud amount: ${total_fraud_amount:,.2f}")
print(f"Detected fraud amount: ${detected_fraud_amount:,.2f} ({detected_pct:.2f}%)")
print(f"Missed fraud amount: ${missed_fraud_amount:,.2f} ({missed_pct:.2f}%)")
print(f"False positive amount: ${false_positive_amount:,.2f}")

# Calculate precision and recall for business context.
# Both fall back to 0.0 when their denominator is empty (nothing flagged, or
# no actual fraud) rather than raising ZeroDivisionError.
n_flagged = len(true_frauds) + len(false_positives)
n_actual_fraud = len(true_frauds) + len(false_negatives)
precision = len(true_frauds) / n_flagged if n_flagged else 0.0
recall = len(true_frauds) / n_actual_fraud if n_actual_fraud else 0.0

print(f"\nPrecision: {precision:.4f} (Percentage of detected frauds that are actual frauds)")
print(f"Recall: {recall:.4f} (Percentage of actual frauds that are detected)")

## 10. Conclusion and Next Steps
# Rank the features for the summary directly from the best model. Relying on a
# variable that is only created for tree-based models would raise NameError
# when a model without feature_importances_ (e.g. Logistic Regression) wins.
if hasattr(best_model, 'feature_importances_'):
    feature_scores = best_model.feature_importances_
elif hasattr(best_model, 'coef_'):
    # Linear model: use absolute coefficient size as the ranking score
    # (the inputs were standardized before fitting).
    feature_scores = np.abs(best_model.coef_).ravel()
else:
    feature_scores = None

if feature_scores is not None:
    top_features = (
        pd.Series(feature_scores, index=features)
        .sort_values(ascending=False)
        .head(3)
        .index.tolist()
    )
else:
    top_features = "n/a (model exposes no feature ranking)"

# Summary of findings
print("Key Findings:")
print(f"1. Best performing model: {best_model_name}")
print(f"2. AUC-ROC score: {results[best_model_name]['auc_roc']:.4f}")
print(f"3. Most important features: {top_features}")
print(f"4. Business impact: Could prevent ${detected_fraud_amount:,.2f} in fraudulent transactions")

# Recommendations for deployment
print("\nRecommendations:")
print("1. Implement real-time monitoring of high-risk transactions")
print("2. Set up alerts for transactions with high fraud probability")
print("3. Regularly update the model with new transaction data")
print("4. Combine machine learning with rule-based systems for better coverage")

# Next steps
print("\nNext Steps:")
print("1. Deploy the model as an API for real-time predictions")
print("2. Implement a feedback loop to improve model performance")
print("3. Explore deep learning approaches for improved detection")
print("4. Add more features like geolocation and merchant category")