# Online Payment Fraud Detection

Data Source: https://www.kaggle.com/datasets/jainilcoder/online-payment-fraud-detection/data

### Importing Packages

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import warnings
import missingno as msno
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

### Suppressing all warning messages

In [2]:
warnings.filterwarnings("ignore")

### Reading the csv file

In [None]:
# Load the transactions from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="onlinefraud.csv")
# Preview the first five rows.
df.head(n=5)

In [None]:
# Displaying the number of rows and columns as a (rows, columns) tuple
df.shape

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# List the column labels
df.columns

In [None]:
# Transposed preview of the first rows: one line per column.
df.head().transpose()

## Data Cleaning

In [None]:
# Displaying the datatype pandas assigned to each column when reading the CSV
df.dtypes

In [None]:
# Converting columns to the best dtypes pandas can infer (convert_dtypes),
# then displaying the dtypes again to see what changed.
# NOTE: this rebinds `df`; re-running the cell is harmless because the
# conversion is applied to an already-converted frame.
df = df.convert_dtypes()
df.dtypes

#### Analysing missing values

In [None]:
# Displaying data completeness per column as a missingno bar chart
msno.bar(df)
plt.show()

In [None]:
# Displaying the missingno nullity matrix, which shows where in the rows
# missing values occur
msno.matrix(df)
plt.show()

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Checking for outliers

In [None]:
# Open a 12x8 inch figure and draw the frame's box plots on its current axes.
plt.figure(figsize=(12, 8))
df.boxplot(ax=plt.gca())

In [None]:
# One histogram per numeric column.
# Let pandas build the subplot grid itself: handing a single Axes to
# DataFrame.hist for several columns only makes pandas clear that figure
# (with a warning) and rebuild the grid anyway.
df.hist(figsize=(8, 6))
plt.tight_layout()
plt.show()

## Exploratory Data Analysis

In [None]:
# Displaying the number of Transactions using bar plot
fig = px.histogram(df, x='isFraud', color='isFraud',
                   title='Count Plot of Fraud Transactions',
                   labels={'isFraud': 'Is Fraud'},
                   text_auto=True,
                   color_discrete_sequence=px.colors.sequential.PuBu)
fig.update_layout(
    yaxis_title='Number of Transactions',
    xaxis_title='Is Fraud',
    bargap=0.2,
)
fig.show()

> Very few transactions are identified as fraud, so the classes are highly imbalanced. They will need to be balanced (e.g. by oversampling or undersampling) before modelling.

In [None]:
# Displaying the number of Transactions using pie plot
fraud_counts = df['isFraud'].value_counts()
fraud_df = fraud_counts.reset_index()
fraud_df.columns = ['isFraud', 'Counts']

# Map the 'isFraud' numerical values to more descriptive labels
fraud_df['Type'] = fraud_df['isFraud'].map({0: 'Non-Fraudulent', 1: 'Fraudulent'})

# Now, plot the pie chart using Plotly Express
import plotly.express as px

fig = px.pie(fraud_df, names='Type', values='Counts',
             title='Proportion of Fraud vs. Non-Fraud Transactions',
             color='Type', color_discrete_sequence=['green', 'lightcoral'])

fig.update_traces(textinfo='percent+label')
fig.show()


> The pie chart confirms the imbalance: fraudulent transactions make up only a tiny share of the data, so the classes need to be balanced using oversampling or undersampling.

In [None]:
# Displaying the correlation Heatmap
numeric_df = df.select_dtypes(include=[np.number])
# Calculate the correlation matrix on numeric data only
correlation_matrix = numeric_df.corr()
fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,  # Correlation values
    x=correlation_matrix.columns,  # Feature names for x-axis
    y=correlation_matrix.index,  # Feature names for y-axis
    colorscale='BrBG',  # Valid colorscale for correlation
    colorbar=dict(title='Correlation'),
))

# Update the layout
fig.update_layout(
    title='Correlation Heatmap',
    xaxis=dict(tickmode='linear'),
    yaxis=dict(tickmode='linear'),
    width=800,
    height=600,
)

# Show the plot
fig.show()


> There is a strong correlation between newbalanceOrg and oldbalanceOrg.

In [None]:
# Total amount processed per transaction type, largest first.
grouped_df = df.groupby('type')['amount'].sum().reset_index()
sorted_grouped_df = grouped_df.sort_values('amount', ascending=False)

# The title names the quantity actually plotted (summed amount), which also
# keeps this chart distinguishable from the per-type count chart.
fig = px.bar(sorted_grouped_df, x='type', y='amount',
             labels={'type': 'Transaction Type', 'amount': 'Total Amount'},
             title='Total Transaction Amount by Type',
             color_discrete_sequence=['green'])  # Sets the bars to green

# Rotate the x-axis labels for better readability
fig.update_layout(xaxis_title='Transaction Type',
                  yaxis_title='Total Amount',
                  xaxis=dict(tickangle=45))

fig.show()


> 'Transfer' transactions account for the largest total amount processed, while 'Debit' transactions account for the smallest.

In [None]:
# Number of transactions per transaction type.
transaction_type_counts = df['type'].value_counts()

# Convert the Series to a DataFrame for Plotly; name the columns explicitly
# so the result does not depend on the pandas version's value_counts naming.
transaction_type_counts_df = (
    transaction_type_counts
    .rename_axis('Transaction Type')
    .reset_index(name='Count')
)

fig = px.bar(transaction_type_counts_df, x='Transaction Type', y='Count',
             title='Transaction Type Distribution',
             labels={'Count': 'Count', 'Transaction Type': 'Transaction Type'},
             color_discrete_sequence=['green'])  # Sets the bar color

# Rotate the x-axis labels for better readability
fig.update_layout(xaxis_title='Transaction Type',
                  yaxis_title='Count',
                  xaxis=dict(tickangle=45))

fig.show()


> 'Cash_out' is the most frequent transaction type, while 'Debit' is the least frequent.

### Analysing which transaction types contain fraudulent transactions

In [None]:
# Box plots of the transaction amount per type, split by fraud status.
plt.figure(figsize=(12, 8))
sns.boxplot(data=df, x='type', y='amount', hue='isFraud', palette='Set1')
# Log-scale the amount axis and label the current axes in one call.
plt.gca().set(
    yscale='log',
    title='Box Plots of Transaction Amounts by Type and Fraud Status',
    xlabel='Transaction Type',
    ylabel='Transaction Amount (log scale)',
)
plt.legend(loc='upper right', title='Fraud')
plt.show()

> There are five transaction types: Payment, Transfer, Cash_out, Debit and Cash_in. Of these, only 'Transfer' and 'Cash_out' contain fraudulent transactions.

In [None]:
# Contingency table: transaction type (rows) against isFraud (columns).
Result = pd.crosstab(df['type'], df['isFraud'])
Result

In [None]:
# Percentage of transfer transactions that are fraudulent.
# Computed from the data rather than from hand-copied crosstab counts, so the
# figure stays correct if the dataset changes.
# NOTE(review): assumes transfers are labelled 'TRANSFER' in the 'type'
# column — confirm against the crosstab index.
transfer_is_fraud = df.loc[df['type'] == 'TRANSFER', 'isFraud']
transfer_total = len(transfer_is_fraud)
transfer_fraud = int((transfer_is_fraud == 1).sum()) / transfer_total * 100
transfer_fraud

In [None]:
# Percentage of cash-out transactions that are fraudulent.
# Computed from the data rather than from hand-copied crosstab counts, so the
# figure stays correct if the dataset changes.
# NOTE(review): assumes cash-outs are labelled 'CASH_OUT' in the 'type'
# column — confirm against the crosstab index.
cashout_is_fraud = df.loc[df['type'] == 'CASH_OUT', 'isFraud']
cashout_total = len(cashout_is_fraud)
cashout_fraud = int((cashout_is_fraud == 1).sum()) / cashout_total * 100
cashout_fraud

> About 0.77% of 'Transfer' transactions and about 0.18% of 'Cash_out' transactions are fraudulent. (These are fraud rates within each transaction type, not shares of all fraud.)

## Calculating the % of Fraud transactions

In [None]:
# How many transactions carry each isFlaggedFraud value.
df['isFlaggedFraud'].value_counts()

In [None]:
# Rows that are fraud and were also flagged as fraud.
is_fraud_mask = df['isFraud'] == 1
is_flagged_mask = df['isFlaggedFraud'] == 1
isFraud_flagged_fraud_records = df.loc[is_fraud_mask & is_flagged_mask]
isFraud_flagged_fraud_records

In [None]:
# (rows, columns) of the transactions that are both fraud and flagged
isFraud_flagged_fraud_records.shape

In [None]:
# Number of transactions with isFlaggedFraud == 1.
# Kept under its own name: storing this in `total_fraud` would mislabel a
# flagged count as a fraud count, and that name is reserved for the count of
# isFraud == 1 used as the denominator of the percentages further down.
total_flagged = df[df.isFlaggedFraud == 1].shape[0]
total_flagged

In [None]:
# Number of transactions with isFraud == 1.
total_fraud = len(df[df['isFraud'] == 1])
total_fraud

In [None]:
# Number of transactions that are both fraud and flagged.
total_isflaggedFraud = len(isFraud_flagged_fraud_records)
total_isflaggedFraud

In [None]:
# Share of the fraudulent transactions that were also flagged.
flagged_percent = total_isflaggedFraud / total_fraud * 100
print('Percentage of fraud that was flagged: ', round(flagged_percent, 3))

# The remainder is fraud that was never flagged (missed fraud), which is why
# the label says "not flagged" rather than "incorrectly flagged".
unflagged_percent = (total_fraud - total_isflaggedFraud) / total_fraud * 100
print('Percentage of fraud that was not flagged: ', round(unflagged_percent, 3))

> The data reveals a critical weakness in the existing fraud flag: only 0.195% of the fraudulent transactions were flagged (`isFlaggedFraud`), while 99.805% of the fraud went unflagged. The flag therefore misses almost all fraud, which means an overwhelming number of false negatives rather than false positives. Undetected fraud of this scale exposes customers and the business to direct losses and erodes trust, which motivates building a better detection model.

## Fraud amount

In [None]:
# Fraud as a percentage of all transactions.
total_transactions = len(df)
fraud_transaction = len(df[df['isFraud'] == 1])
fraud_percent = 100 * fraud_transaction / total_transactions
fraud_percent

In [None]:
# Print the headline figures, one "label value" pair per line.
for summary_label, summary_value in (
    ('Total transactions: ', total_transactions),
    ('Total fraud transactions happened: ', fraud_transaction),
    ("Total fraud transaction percent: ", round(fraud_percent, 2)),
):
    print(summary_label, summary_value)

In [None]:
# Fraudulent transactions only, ordered from the largest amount down.
fraud_amount = df[df['isFraud'] == 1].sort_values(by='amount', ascending=False)
fraud_amount

In [None]:
import plotly.express as px

# Assuming 'fraud_amount' is a DataFrame with a column named 'amount'
# that you want to plot

# Create a histogram using Plotly Express
fig = px.histogram(fraud_amount, x='amount', nbins=7,
                   title='Distribution of Fraud Amount',
                   labels={'amount': 'Amount'},  # Change 'amount' to your specific column name if different
                   color_discrete_sequence=['orange'])  # Sets the bars to orange

# Customize the histogram
fig.update_traces(marker_line_color='black', marker_line_width=1.5)  # Sets the edge color and width
fig.update_layout(xaxis_title='Amount', yaxis_title='Count', 
                  width=800, height=400)  # Adjusts the size, similar to figsize in matplotlib

# Show the plot
fig.show()


> Most fraudulent transactions involve amounts of up to roughly 1 million.

## Calculating max frequency of Steps 

In [None]:
import plotly.express as px

# Assuming df is your DataFrame and 'step' is the column you want to plot

# Create a histogram using Plotly Express
fig = px.histogram(df, x='step', nbins=50,
                   title='Distribution of Step',
                   labels={'step': 'Step'},  # Change 'step' to your specific column name if different
                   opacity=0.75,
                   marginal='box')  # Optional: adds a boxplot alongside the histogram for another view of the distribution

# Customize the histogram appearance
fig.update_layout(xaxis_title='Step', yaxis_title='Count',
                  width=900, height=400)  # Adjusts the size

# Show the plot
fig.show()


> Most transactions occur between steps 150 and 350.