# Fraud Detection Dataset - Data Analysis

<!--
    Developer: Molla Samser
    Designer & Tester: Rima Khatun
    Website: https://rskworld.in
    Email: help@rskworld.in, support@rskworld.in, info@rskworld.com
    Phone: +91 93305 39277
    Company: RSK World
    Description: Comprehensive analysis of fraud detection dataset
-->


In [None]:
# Developer: Molla Samser
# Designer & Tester: Rima Khatun
# Website: https://rskworld.in
# Email: help@rskworld.in, support@rskworld.in, info@rskworld.com
# Phone: +91 93305 39277
# Company: RSK World

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Set style
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# matplotlib >= 3.6 — confirm against the environment this notebook runs in.
plt.style.use('seaborn-v0_8-darkgrid')
# Seaborn palette used by plots that do not pass explicit colors.
sns.set_palette('husl')
# IPython magic (not plain Python): this line only works when the cell is run
# inside a Jupyter/IPython kernel.
%matplotlib inline


## 1. Load Data


In [None]:
# Read the transactions CSV (path is relative to the notebook's working
# directory) and report its dimensions.
dataset_path = 'fraud_detection_dataset.csv'
df = pd.read_csv(dataset_path)
print(f"Dataset shape: {df.shape}")

# Last expression of the cell: the notebook renders the first rows as a table.
df.head()


## 2. Data Exploration


In [None]:
# Basic information
# df.info() writes its summary directly to stdout, so it is called for its
# side effect rather than wrapped in print().
print("Dataset Info:")
df.info()
# Visual separator between the two summaries.
print("\n" + "="*50)
print("Dataset Statistics:")
# Left as the cell's last expression so the notebook renders the resulting
# table; in a plain script this value would be silently discarded.
df.describe()


In [None]:
# Count nulls per column once, then reuse the result for both the per-column
# listing and the grand total.
missing_per_column = df.isnull().sum()

print("Missing Values:")
print(missing_per_column)
print(f"\nTotal missing values: {missing_per_column.sum()}")


In [None]:
# Fraud distribution
# value_counts() orders rows by frequency, not by label. Sorting by label
# guarantees that class 0 (normal) always comes before class 1 (fraud), so the
# bar order matches the axis label below regardless of which class is larger.
fraud_counts = df['is_fraud'].value_counts().sort_index()
print("Fraud Distribution:")
print(fraud_counts)
print(f"\nFraud percentage: {df['is_fraud'].mean()*100:.2f}%")

# Visualize
# Colors are looked up by class label instead of by position, so "red" can
# never end up on the normal-transaction bar. Unexpected labels fall back to
# gray rather than silently borrowing one of the two class colors.
color_by_label = {0: 'green', 1: 'red'}
bar_colors = [color_by_label.get(label, 'gray') for label in fraud_counts.index]

plt.figure(figsize=(8, 6))
fraud_counts.plot(kind='bar', color=bar_colors)
plt.title('Fraud vs Normal Transactions')
plt.xlabel('Is Fraud (0=Normal, 1=Fraud)')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()


## 3. Feature Analysis


In [None]:
# Transaction amount analysis: overlaid per-class histograms on the left,
# a per-class box plot on the right.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: amount distribution for each class, drawn on the same axes.
normal_amounts = df.loc[df['is_fraud'] == 0, 'amount']
fraud_amounts = df.loc[df['is_fraud'] == 1, 'amount']
normal_amounts.hist(bins=50, alpha=0.7, label='Normal', color='green', ax=ax_hist)
fraud_amounts.hist(bins=50, alpha=0.7, label='Fraud', color='red', ax=ax_hist)
ax_hist.set_xlabel('Transaction Amount')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Transaction Amount Distribution')
ax_hist.legend()
ax_hist.grid(alpha=0.3)

# Right panel: amount spread grouped by fraud flag.
df.boxplot(column='amount', by='is_fraud', ax=ax_box)
ax_box.set_xlabel('Is Fraud')
ax_box.set_ylabel('Transaction Amount')
ax_box.set_title('Transaction Amount by Fraud Status')
# Clear the figure-level title that the grouped boxplot call adds.
fig.suptitle('')

plt.tight_layout()
plt.show()


In [None]:
# Categorical features analysis: a 2x2 grid showing how the class mix changes
# across merchant category, device type, foreign flag and hour of day.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))


def _class_share_by(column):
    """Return the row-normalized percentage of each is_fraud class per value of *column*."""
    return pd.crosstab(df[column], df['is_fraud'], normalize='index') * 100


merchant_fraud = _class_share_by('merchant_category')
device_fraud = _class_share_by('device_type')
foreign_fraud = _class_share_by('is_foreign_transaction')
hour_fraud = _class_share_by('hour_of_day')

# The first three panels share the same grouped-bar layout; only the data,
# target axes and labels differ.
bar_panels = [
    (merchant_fraud, axes[0, 0], 'Fraud Rate by Merchant Category', 'Merchant Category'),
    (device_fraud, axes[0, 1], 'Fraud Rate by Device Type', 'Device Type'),
    (foreign_fraud, axes[1, 0], 'Fraud Rate by Foreign Transaction', 'Is Foreign Transaction'),
]
for share_table, panel_ax, panel_title, x_label in bar_panels:
    share_table.plot(kind='bar', ax=panel_ax, color=['green', 'red'])
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Percentage')
    panel_ax.legend(['Normal', 'Fraud'])

# Tick styling is panel-specific: slanted category names on the top row,
# explicit No/Yes labels for the binary foreign-transaction flag.
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 1].tick_params(axis='x', rotation=45)
axes[1, 0].set_xticklabels(['No', 'Yes'], rotation=0)

# Hour of day: only the fraud column (label 1) is drawn, as a line.
hour_ax = axes[1, 1]
hour_fraud[1].plot(kind='line', ax=hour_ax, marker='o', color='red')
hour_ax.set_title('Fraud Rate by Hour of Day')
hour_ax.set_xlabel('Hour of Day')
hour_ax.set_ylabel('Fraud Percentage')
hour_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 4. Correlation Analysis


In [None]:
# Pairwise correlation between all numeric columns of the dataset.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
corr_matrix = df[numeric_cols].corr()

# Heatmap styling collected in one place; center=0 keeps the diverging
# colormap symmetric around "no correlation".
heatmap_style = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {"shrink": .8},
}

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_style)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()


## 5. Model Building


In [None]:
# Work on a copy so the exploratory dataframe `df` keeps its original values.
df_model = df.copy()

# Integer-encode the categorical columns. Each fitted encoder is kept, keyed
# by column name, so the codes can be mapped back to the original labels.
# NOTE(review): 'user_id' is encoded and kept as a feature here — confirm that
# an identifier column is really meant to be a model input.
categorical_cols = ['merchant_category', 'location', 'device_type', 'user_id']
label_encoders = {}

for column in categorical_cols:
    encoder = LabelEncoder()
    # Values are cast to str first so every entry is encoded as text.
    column_as_text = df_model[column].astype(str)
    df_model[column] = encoder.fit_transform(column_as_text)
    label_encoders[column] = encoder

# Target is the fraud flag; features are everything except the target and the
# transaction identifier (errors='ignore' tolerates a missing id column).
y = df_model['is_fraud']
X = df_model.drop(columns=['transaction_id', 'is_fraud'], errors='ignore')

print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")


In [None]:
# Handle imbalanced data with SMOTE
X_balanced, y_balanced = SMOTE(random_state=42).fit_resample(X, y)

print(f"After SMOTE - Features shape: {X_balanced.shape}")
print(f"After SMOTE - Target distribution:\n{pd.Series(y_balanced).value_counts()}")


In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


In [None]:
# Random Forest hyperparameters gathered in one mapping so they are easy to
# inspect and tweak. random_state fixes the seed; n_jobs=-1 uses all cores.
forest_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}

# Fit on the training partition.
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# Hard class predictions plus the probability of the second class
# (column index 1 of predict_proba) for the held-out set.
y_pred = model.predict(X_test)
test_probabilities = model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

print("Model trained successfully!")


In [None]:
# Model evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Threshold-independent ranking quality, computed from the predicted fraud
# probabilities (the report above only reflects the default decision threshold).
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

print("\nConfusion Matrix:")
# Pin the label order so rows/columns are always [0 (normal), 1 (fraud)] and
# the matrix is always 2x2, matching the hard-coded tick labels below.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Normal', 'Fraud'],
            yticklabels=['Normal', 'Fraud'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()


In [None]:
# Pair each feature name with the importance reported by the fitted forest and
# rank them from most to least important.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# The 15 highest-ranked features feed both the printed table and the chart.
top_features = feature_importance.head(15)

print("Top 15 Most Important Features:")
print(top_features)

# Horizontal bar chart, most important feature at the top.
plt.figure(figsize=(10, 8))
sns.barplot(data=top_features, y='feature', x='importance')
plt.title('Top 15 Feature Importances')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()


## 6. Conclusion

This analysis demonstrates:
- Data exploration and visualization
- Handling of imbalanced datasets
- Model training and evaluation
- Feature importance analysis

For more information, visit: https://rskworld.in
