# 💼 Fraud Detection Using Machine Learning

This project aims to detect fraudulent financial transactions using classification models. We explore the data, clean it, engineer features, and train several machine learning models (logistic regression, random forest, and XGBoost) to identify fraudulent activity effectively.


In [None]:
# 📦 Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')


In [None]:
# 📂 Load the Dataset
# The CSV file is expected to sit in the same directory as this notebook.
df = pd.read_csv(filepath_or_buffer="fraud_dataset.csv")
# Preview the first five rows as the cell's rich output.
df.head(n=5)


In [None]:
# 🧾 Data Overview
# Print the frame's dimensions, per-column dtypes and per-column null counts,
# then show summary statistics as the cell's rich output.
for heading, summary in (
    ("Shape of dataset:", df.shape),
    ("\nData Types:\n", df.dtypes),
    ("\nMissing values:\n", df.isna().sum()),
):
    print(heading, summary)
df.describe()


In [None]:
# 🧹 Data Cleaning and Feature Engineering
#
# This cell rebinds `df` instead of mutating it in place and tolerates being
# executed more than once: the original in-place drop raised KeyError on a
# second run because 'nameOrig'/'nameDest' were already gone.

# Create transaction difference features:
#   diffOrig = balance that left the origin account
#   diffDest = balance that arrived at the destination account
df = df.assign(
    diffOrig=df['oldbalanceOrg'] - df['newbalanceOrig'],
    diffDest=df['newbalanceDest'] - df['oldbalanceDest'],
)

# Encode 'type' column as integer codes. The fitted encoder is kept so the
# codes can be mapped back to the original labels via `type_encoder.classes_`.
# Skip the step when the column is already numeric (i.e. on a re-run) so the
# encoder is never refitted on its own output.
if not pd.api.types.is_numeric_dtype(df['type']):
    type_encoder = LabelEncoder()
    df['type'] = type_encoder.fit_transform(df['type'])

# Drop less useful columns (presumably account identifiers -- verify against
# the dataset description). errors='ignore' keeps the cell re-runnable.
df = df.drop(columns=['nameOrig', 'nameDest'], errors='ignore')

df.head()


In [None]:
# 📊 Exploratory Data Analysis
# Each plot is titled through the Axes object seaborn returns and then shown
# on its own, so the three figures render one after another.

# How often each transaction type occurs
type_ax = sns.countplot(data=df, x='type')
type_ax.set_title('Transaction Type Distribution')
plt.show()

# Row counts per value of the 'isFraud' target
fraud_ax = sns.countplot(data=df, x='isFraud')
fraud_ax.set_title('Fraudulent vs Non-Fraudulent Transactions')
plt.show()

# Spread of 'amount' within each 'isFraud' group
amount_ax = sns.boxplot(data=df, x='isFraud', y='amount')
amount_ax.set_title('Transaction Amounts by Fraud Status')
plt.show()


In [None]:
# ⚖️ Train-Test Split and Class-Imbalance Handling

# Features are every column except the target and the 'isFlaggedFraud' flag.
X = df.drop(['isFraud', 'isFlaggedFraud'], axis=1)
y = df['isFraud']

# Split BEFORE resampling. Oversampling the full dataset first would
# (a) leak information: SMOTE synthesises points by interpolating between
#     neighbours, so training rows could be built from rows that later land
#     in the test set, and
# (b) make the test set artificially balanced and partly synthetic, so the
#     reported metrics would not reflect real-world class proportions.
# stratify=y keeps the fraud ratio the same in both partitions.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# SMOTE for class imbalance -- fitted on the training partition only.
# The test partition keeps its original, untouched class distribution.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_raw, y_train_raw)

# The models downstream train on the resampled training data.
X_train, y_train = X_res, y_res


In [None]:
# 🤖 Train Models
# Every model is fitted on (X_train, y_train) and then asked for hard class
# predictions on X_test; the *_pred arrays feed the evaluation cell.

# Logistic Regression
lr = LogisticRegression()
# Fixed: the target must be y_train. The original passed X_test (the test
# feature matrix) as the label argument.
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Random Forest (seeded for reproducibility)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# XGBoost
# NOTE(review): `use_label_encoder` is only meaningful on older xgboost
# releases -- confirm against the installed version before removing it.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)


In [None]:
# 📈 Model Evaluation

def evaluate_model(name, y_true, y_pred):
    """Print a classification report and a confusion matrix for one model.

    Args:
        name: Label used in the report heading.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"\n{name} Classification Report:")
    report_text = classification_report(y_true, y_pred)
    print(report_text)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

# Report every trained model against the same held-out labels, in the order
# the models were trained.
for model_name, predictions in (
    ("Logistic Regression", lr_pred),
    ("Random Forest", rf_pred),
    ("XGBoost", xgb_pred),
):
    evaluate_model(model_name, y_test, predictions)


In [None]:
# 🔍 ROC Curve
# One curve per model on shared axes, with the area under each curve (AUC)
# shown in the legend so the models can be compared at a glance.

# ROC analysis needs continuous scores rather than hard labels, so the
# predict() outputs are not used here. Column 1 of predict_proba is the
# probability of the second class (isFraud == 1, assuming 0/1 labels).
for model, label in [(lr, "LogReg"), (rf, "RF"), (xgb, "XGB")]:
    fraud_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, fraud_scores)
    auc = roc_auc_score(y_test, fraud_scores)
    plt.plot(fpr, tpr, label=f"{label} (AUC = {auc:.3f})")

# Dashed diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid()
plt.show()


## 🔍 Key Insights

- Fraudulent transactions are more likely in `TRANSFER` and `CASH_OUT` types.
- High amounts, zero balances, and an unchanged recipient balance can indicate fraud.
- Top predictors (from Random Forest/XGBoost) include: `amount`, `oldbalanceOrg`, `newbalanceDest`, `diffOrig`.

---

## 🛡️ Recommendations

- Flag high-value transfers for multi-level verification.
- Introduce time-based limits on suspicious transactions.
- Use predictive analytics to monitor unusual account behavior in real time.

---

## ✅ Next Steps

- Integrate model in transaction pipeline for real-time scoring.
- Monitor performance over time using the fraud detection rate (recall), the false-positive rate, and the AUC score.
