# Predicting Fraudulent Transactions

### IMPORTING LIBRARIES

In [None]:
import pandas
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

: 

In [None]:
# Suppress warnings and set plot style.
# NOTE(review): a blanket "ignore" filter also hides deprecation notices
# raised by pandas / seaborn / scikit-learn — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")
sns.set(style = "whitegrid")

### LOADING DATA

In [None]:
# Read "Fraud.csv" (path relative to the working directory) into a DataFrame.
df = pd.read_csv("Fraud.csv")

#### ADDING FLAGGED FRAUD BASED ON BUSINESS RULE

In [None]:
# Business-rule flag: 1 where type == 'TRANSFER' and amount > 200, else 0.
# Built from vectorised boolean masks instead of a row-wise `apply`, which
# calls a Python lambda once per row and is very slow on large frames.
# NOTE(review): this overwrites any "isFlaggedFraud" column already present
# in the CSV, and the 200 threshold should be confirmed with the rule owner.
is_transfer = df["type"] == "TRANSFER"
is_above_threshold = df["amount"] > 200
df["isFlaggedFraud"] = (is_transfer & is_above_threshold).astype("int64")

### EXPLORATORY ANALYSIS

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Show the column labels.
df.columns

In [None]:
# Row count per distinct isFraud value (class balance of the target).
df["isFraud"].value_counts()

In [None]:
# Row count per distinct isFlaggedFraud value.
df["isFlaggedFraud"].value_counts()

In [None]:
# Total number of missing cells across all columns.
df.isnull().sum().sum()

In [None]:
# Number of rows in the frame.
df.shape[0]

In [None]:
# Percentage of rows with isFraud == 1, rounded to two decimals.
# The mean of a boolean mask is (matching rows / all rows); unlike
# `value_counts()[1]` it does not raise KeyError when no row is fraudulent.
round((df["isFraud"] == 1).mean() * 100, 2)

### VISUALIZATION

In [None]:
sns.set(style="whitegrid")
plt.figure(figsize=(8, 5))

# One bar per transaction type, ordered from most to least frequent
ax = sns.countplot(data=df, x="type", order=df["type"].value_counts().index, palette="viridis")
ax.set_title("Distribution of Transaction Types", fontsize=14, fontweight='bold')
ax.set_xlabel("Transaction Type", fontsize=12)
ax.set_ylabel("Number of Transactions", fontsize=12)

# Annotate bars with values
for bar in ax.patches:
    height = bar.get_height()
    # Bar heights may come back as floats; ",.0f" keeps the thousands
    # separator but drops the spurious ".0" a bare "," format would print.
    ax.annotate(f'{height:,.0f}', (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='bottom', fontsize=10, color='black')

# Show plot
plt.tight_layout()
plt.show()


In [None]:
# Mean of isFraud per transaction type (the fraud rate), highest rate first
fraud_by_type = (
    df.groupby("type")["isFraud"]
    .mean()
    .sort_values(ascending=False)
)

# seaborn is given a DataFrame with "type" and "isFraud" columns
fraud_df = fraud_by_type.reset_index()

sns.set(style="whitegrid")
plt.figure(figsize=(8, 5))

# One bar per transaction type, in the sorted order of fraud_df
ax = sns.barplot(data=fraud_df, x="type", y="isFraud", palette="Reds_r")
ax.set_title("Fraud Rate by Transaction Type", fontsize=14, fontweight='bold')
ax.set_xlabel("Transaction Type", fontsize=12)
ax.set_ylabel("Fraud Rate", fontsize=12)

# Write each bar's value as a percentage, centred just above the bar
for bar in ax.patches:
    rate = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{rate:.2%}', (bar_centre, rate),
                ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
# Summary statistics of the amount column, cast to integers (fractions are
# truncated) so the large values are easier to read.
df["amount"].describe().astype(int)

### Outliers

In [None]:
import numpy as np
sns.set(style="whitegrid")
plt.figure(figsize=(8, 4))

# Histogram of log(1 + amount) with a KDE overlay.
# stat="density" makes the y-axis match its "Density" label; histplot's
# default statistic is the raw count per bin.
log_amount = np.log1p(df["amount"])
sns.histplot(log_amount, bins=1000, kde=True, stat="density",
             color="seagreen", edgecolor=None)
plt.title("Log-Scaled Transaction Amount Distribution", fontsize=14, fontweight='bold')
plt.xlabel("Log(Amount + 1)", fontsize=12)
plt.ylabel("Density", fontsize=12)

# Mark the median amount, transformed to the same log scale as the x-axis
median_log = np.log1p(df["amount"].median())
plt.axvline(median_log, color='black', linestyle='--', linewidth=1.5, label='Median')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.set(style="whitegrid")

# Only rows with amount below 30,000 are plotted (presumably so that very
# large amounts do not compress the boxes — confirm the cutoff).
amounts_under_30k = df.loc[df["amount"] < 30000]

# One box per isFraud value
ax = sns.boxplot(data=amounts_under_30k, x="isFraud", y="amount",
                 palette=["skyblue", "salmon"])
ax.set_title("Transaction Amount by Fraud Status (Filtered under ₹30k)",
             fontsize=14, fontweight='bold')
ax.set_xlabel("Fraudulent (0 = No, 1 = Yes)", fontsize=12)
ax.set_ylabel("Transaction Amount", fontsize=12)

ax.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()

plt.show()


### FEATURE ENGINEERING

In [None]:
# Balance change implied by each transaction:
#   origin account:      old balance - new balance
#   destination account: new balance - old balance
# The destination line must subtract. With `=` in place of `-` it became a
# chained assignment that overwrote "newbalanceDest" with "oldbalanceDest"
# and stored no difference at all.
df["balanceDiffOrig"] = df["oldbalanceOrg"] - df["newbalanceOrig"]
df["balanceDiffDest"] = df["newbalanceDest"] - df["oldbalanceDest"]

In [None]:
# Number of rows whose balanceDiffOrig value is negative.
(df["balanceDiffOrig"] <0).sum()

In [None]:
# Number of rows whose balanceDiffDest value is negative.
(df["balanceDiffDest"] <0).sum()

In [None]:
# Inspect the first five rows, which now include the two balanceDiff columns.
df.head(5)

#### FRAUD OVER TIME

In [None]:
# Count of rows with isFraud == 1 at each step value, ordered by step
frauds_per_step = df[df["isFraud"] == 1]["step"].value_counts().sort_index()

# Line plot of the counts; axis label and title spelling corrected
# ("Fruads" -> "Frauds")
plt.plot(frauds_per_step.index, frauds_per_step.values, label="Frauds per Step")
plt.xlabel("Step(Time)")
plt.ylabel("Number of Frauds")
plt.title("Frauds Over Time")
plt.grid(True)
plt.show()

In [None]:
# Remove the "step" column in place; it is not used after the time plot.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df.drop(columns="step", inplace=True, errors="ignore")

In [None]:
# Preview the first rows after removing the "step" column.
df.head()

In [None]:
# Ten most frequent nameOrig values with their row counts.
top_senders = df["nameOrig"].value_counts().head(10)

In [None]:
# Display the ten most frequent nameOrig values.
print("Top Senders:\n", top_senders)

In [None]:
# Ten most frequent nameDest values with their row counts.
top_receivers = df["nameDest"].value_counts().head(10)

In [None]:
# Display the ten most frequent nameDest values.
print("Top Receivers:\n", top_receivers)

In [None]:
# Among rows with isFraud == 1, the ten most frequent nameOrig values.
fraud_users = df[df["isFraud"]==1] ["nameOrig"].value_counts().head(10)

In [None]:
# Display the nameOrig values that appear most often in fraudulent rows.
print("Top Fraudulent Users:\n", fraud_users)

#### FOCUS ON TRANSFER & CASH_OUT

In [None]:
# Subset of rows whose type is TRANSFER or CASH_OUT
is_transfer_or_cash_out = df["type"].isin(["TRANSFER", "CASH_OUT"])
fraud_types = df.loc[is_transfer_or_cash_out]

In [None]:
# Row count per transaction type within the TRANSFER / CASH_OUT subset.
fraud_types["type"].value_counts()

In [None]:
plt.figure(figsize=(8, 5))

# Transaction counts per type, split into one bar per isFraud value
ax = sns.countplot(data=fraud_types, x="type", hue="isFraud", palette="Set2")

ax.set_title("Fraud Distribution in Transfer & Cash_Out", fontsize=14, fontweight='bold')
ax.set_xlabel("Transaction Type", fontsize=12)
ax.set_ylabel("Transaction Count", fontsize=12)

# Relabel the hue legend using the handles seaborn registered for the hue
# levels. Passing only `labels=` makes matplotlib pair the labels with the
# first artists found on the axes, which need not be one per hue level, so
# the legend colours can end up wrong.
# Assumes the hue levels are ordered 0 then 1 — TODO confirm.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=["Not Fraud", "Fraud"], title="Is Fraud?")
ax.grid(axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlation matrix (pandas default method) of the amount, the
# four balance columns and the fraud label
corr_columns = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'isFraud',
]
corr = df[corr_columns].corr()

In [None]:
# Display the correlation matrix as a table.
corr

### CORRELATION HEATMAP

In [None]:
# Heatmap of the correlation matrix, each cell annotated to two decimals
heatmap_ax = sns.heatmap(corr, annot=True, fmt=".2f", cmap="BuPu")
heatmap_ax.set_title("Correlation Matrix")
plt.show()

#### ZERO BALANCE AFTER TRANSFER

In [None]:
# TRANSFER / CASH_OUT rows where the origin balance was positive before the
# transaction and exactly zero after it
had_funds_before = df["oldbalanceOrg"] > 0
emptied_after = df["newbalanceOrig"] == 0
is_transfer_or_cash_out = df["type"].isin(["TRANSFER", "CASH_OUT"])
zero_after_transfer = df[had_funds_before & emptied_after & is_transfer_or_cash_out]

In [None]:
# Number of rows matching the zero-balance-after-transaction filter.
len(zero_after_transfer)

In [None]:
# Preview the first rows of the filtered subset.
zero_after_transfer.head()

In [None]:
# Row count per isFraud value, re-displayed before modelling.
df["isFraud"].value_counts()

### MACHINE LEARNING MODELING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Preview the frame that the modelling table is built from.
df.head()

In [None]:
# Modelling table: a copy of df without the two account-name columns and
# without the rule-based flag
columns_to_exclude = ["nameOrig", "nameDest", "isFlaggedFraud"]
df_model = df.drop(columns=columns_to_exclude)

In [None]:
# Preview the first rows of the modelling table.
df_model.head()

In [None]:
# Column names handed to the preprocessing step, grouped by treatment.
# NOTE(review): "balanceDiffOrig" and "balanceDiffDest" are not listed here,
# so a transformer that drops unlisted columns will discard them — confirm
# that leaving the engineered features out is intended.
categorical = ["type"]
numeric = ["amount","oldbalanceOrg", "newbalanceOrig", "oldbalanceDest","newbalanceDest"]

## Training & Testing

In [None]:
# Split the modelling table into the label (y) and the predictors (X)
target_column = "isFraud"
X = df_model.drop(columns=[target_column])
y = df_model[target_column]

In [None]:
# Train-test split with stratification: 70% train / 30% test, keeping the
# isFraud class proportions equal in both parts. The fixed random_state makes
# the split, and every metric computed on it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Column-wise preprocessing: standardise the numeric columns, one-hot encode
# the categorical ones (first level dropped), and discard every other column
numeric_step = ("num", StandardScaler(), numeric)
categorical_step = ("cat", OneHotEncoder(drop="first"), categorical)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",
)

In [None]:
# Full model pipeline: the column preprocessor followed by a logistic
# regression with class_weight="balanced" and up to 1000 solver iterations.
pipeline = Pipeline([
    ("prep",preprocessor),
    ("clf", LogisticRegression(class_weight= "balanced", max_iter=1000))
])

In [None]:
# Train the model: fit the preprocessing steps and the classifier on the
# training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predictions: class labels for the held-out test split.
y_pred = pipeline.predict(X_test)

In [None]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix on the test split (scikit-learn layout: rows are true
# classes, columns are predicted classes).
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# Test-split score of the pipeline, scaled to a percentage (for a classifier
# the score is mean accuracy).
# NOTE(review): when one class is rare, accuracy can look high for a weak
# model — prefer the classification report above for judging fraud recall.
pipeline.score(X_test, y_test)*100

In [None]:
import joblib

In [None]:
# Persist the fitted pipeline to "Fraud_detection_pipeline.pkl" in the
# working directory.
# NOTE: joblib files are pickle-based — only load them from trusted sources.
joblib.dump(pipeline,"Fraud_detection_pipeline.pkl")