# In [2]:
# ======================
# REQUIRED IMPORTS
# ======================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, 
                           confusion_matrix, 
                           accuracy_score,
                           roc_auc_score)
import os  # For file path handling

# ======================
# 1. DATA LOADING
# ======================
# Read the transactions CSV into `data`. If the file is missing, print some
# troubleshooting hints and stop the script with a failing exit status.
try:
    # Load dataset - CHANGE FILENAME IF NEEDED
    data = pd.read_csv('online_payments_fraud_dataset.csv')
except FileNotFoundError:
    print("ERROR: File not found. Please check:")
    print(f"- Current directory: {os.getcwd()}")
    print(f"- Files available: {os.listdir()}")
    print("- The CSV filename is case-sensitive")
    # The bare `exit()` helper is injected by the `site` module for
    # interactive sessions and, called without arguments, reports success
    # (status 0). Raise SystemExit explicitly with a non-zero status so the
    # failure is visible to whatever launched the script.
    raise SystemExit(1)
else:
    # Only reached when the read succeeded; kept out of the `try` body so the
    # handler above guards nothing but the file read itself.
    print("Data loaded successfully!\nFirst 5 rows:")
    print(data.head())

# ======================
# 2. DATA PREPROCESSING
# ======================
# Replace the categorical 'type' column with integer codes. The fitted
# encoder stays available as `le` for mapping codes back to labels.
le = LabelEncoder()
le.fit(data['type'])
data['type'] = le.transform(data['type'])

# Drop the columns this script treats as unnecessary; errors='ignore' means
# a column that is already absent is skipped rather than raising.
data = data.drop(
    columns=['nameOrig', 'nameDest', 'isFlaggedFraud'],
    errors='ignore',
)

# Standardise the amount and balance columns in place.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in this file, so test-set statistics
# leak into the transform. Confirm whether that is acceptable here.
numeric_cols = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
scaler = StandardScaler()
scaler.fit(data[numeric_cols])
data[numeric_cols] = scaler.transform(data[numeric_cols])

# ======================
# 3. EXPLORATORY ANALYSIS
# ======================
# Two side-by-side count plots: the overall isFraud class counts, and the
# isFraud counts broken down by transaction type.
plt.figure(figsize=(12, 5))

# Fraud distribution plot
plt.subplot(1, 2, 1)
sns.countplot(x='isFraud', data=data)
plt.title('Fraud vs Non-Fraud Transactions')

# Transaction type distribution
# By this point data['type'] holds the integer codes produced by `le`, so
# plotting it directly labels the x-axis with opaque numbers. Decode the
# codes back to the original labels on a throwaway frame (leaving `data`
# untouched) so the chart is readable.
plt.subplot(1, 2, 2)
type_plot_data = data[['type', 'isFraud']].assign(
    type=le.inverse_transform(data['type'])
)
sns.countplot(
    x='type',
    hue='isFraud',
    data=type_plot_data,
    # le.classes_ is indexed by code, so this keeps the bars in the same
    # left-to-right order as the integer-coded plot had.
    order=list(le.classes_),
)
plt.title('Fraud Distribution by Transaction Type')
plt.tight_layout()
plt.show()

# ======================
# 4. MODEL TRAINING
# ======================
# Separate the 'isFraud' target from the remaining feature columns.
y = data['isFraud']
X = data.drop(columns='isFraud')

# Hold out 30% of the rows for testing. The split is stratified on the
# target and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Build and fit a 100-tree random forest; class_weight='balanced' asks
# scikit-learn to weight classes inversely to their frequency.
model = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    n_estimators=100,
).fit(X_train, y_train)

# ======================
# 5. MODEL EVALUATION
# ======================
# Score the held-out rows: hard class labels for the label-based metrics,
# and the second predict_proba column for ROC-AUC.
# NOTE(review): column 1 is assumed to be the fraud (positive) class —
# confirm against model.classes_.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Print the label-based reports.
print("\n=== Model Performance ===")
print("Confusion Matrix:")
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

print("\nClassification Report:")
test_report = classification_report(y_test, y_pred)
print(test_report)

# Scalar summary scores, each printed to four decimal places.
test_roc_auc = roc_auc_score(y_test, y_prob)
print(f"\nROC-AUC Score: {test_roc_auc:.4f}")
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")

# Pair every feature column with the fitted forest's importance score and
# rank them from most to least important.
features = (
    pd.DataFrame(
        {
            'Feature': X.columns,
            'Importance': model.feature_importances_,
        }
    )
    .sort_values(by='Importance', ascending=False)
)

print("\n=== Feature Importance ===")
print(features)

# Horizontal bar chart of the ranked importances.
plt.figure(figsize=(10, 5))
sns.barplot(data=features, x='Importance', y='Feature')
plt.title('Random Forest Feature Importance')
plt.tight_layout()
plt.show()

# ======================
# 6. SAVE RESULTS
# ======================
# Write the preprocessed dataset to disk, without the row index.
data.to_csv('processed_fraud_data.csv', index=False)

# Write the test-set labels next to the model's predictions. y_pred is a
# plain array, so it lines up with y_test by position.
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
predictions.to_csv('predictions.csv', index=False)

print("\n=== Execution Complete ===")
print("Saved: processed_fraud_data.csv, predictions.csv")

# Output: ModuleNotFoundError: No module named 'pandas'