In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [2]:
# Load and prepare the dataset
df = pd.read_csv('Anti_Fraud_Centre_Data.csv')

# Clean dollar loss data.
# The raw column holds currency strings (e.g. "$298.00"). Strip the "$" and ","
# characters, then convert to a numeric column; anything that still fails to
# parse becomes NaN (errors='coerce').
#
# regex=False is passed explicitly: "$" is a regex metacharacter (end-of-string
# anchor) and the default of `regex` differs between pandas versions, so relying
# on the default either emits a FutureWarning or risks replacing nothing.
# Building the column in a single assignment also keeps the cell idempotent.
df['Dollar Loss Clean'] = pd.to_numeric(
    df['Dollar Loss /pertes financires']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',
)
df

Unnamed: 0,Numro d'identification / Number ID,Date Received / Date reue,Complaint Received Type,Type de plainte reue,Country,Pays,Province/State,Province/tat,Fraud and Cybercrime Thematic Categories,Catgories thmatiques sur la fraude et la cybercriminalit,...,Gender,Genre,Language of Correspondence,Langue de correspondance,Victim Age Range / Tranche d'ge des victimes,Complaint Type,Type de plainte,Number of Victims / Nombre de victimes,Dollar Loss /pertes financires,Dollar Loss Clean
0,1,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,Nova Scotia,Nouvelle-cosse,Phishing,Hameonnage,...,Female,Femme,English,Anglais,'30 - 39,Attempt,Tentative,0,$0.00,0.0
1,2,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,British Columbia,Colombie-Britanique,Identity Fraud,Fraude l'identit,...,Female,Femme,English,Anglais,'70 - 79,Victim,Victime,1,$0.00,0.0
2,3,2021-01-02,CAFC Website,CAFC site web,Not Specified,Non spcifi,Not Specified,Non spcifi,Romance,Romance,...,Not Available,non disponible,Not Available,non disponible,'Not Available / non disponible,Victim,Victime,1,$298.00,298.0
3,4,2021-01-02,CAFC Website,CAFC site web,United States,tats-Unis,California,Californie,Foreign Money Offer,Offre dargent de ltranger,...,Male,Homme,English,Anglais,'60 - 69,Attempt,Tentative,0,$0.00,0.0
4,5,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,Ontario,Ontario,Merchandise,Marchandise,...,Female,Femme,English,Anglais,'20 - 29,Victim,Victime,1,$50.00,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313971,313972,2024-12-31,Phone,Tlphone,Canada,Canada,Alberta,Alberta,Service,Service,...,Not Available,non disponible,English,Anglais,'70 - 79,Victim,Victime,1,$0.00,0.0
313972,313973,2024-12-31,Phone,Tlphone,Canada,Canada,Ontario,Ontario,Identity Fraud,Fraude l'identit,...,Not Available,non disponible,English,Anglais,'30 - 39,Victim,Victime,1,$0.00,0.0
313973,313974,2024-12-31,Phone,Tlphone,Canada,Canada,Quebec,Qubec,Service,Service,...,Male,Homme,French,Franais,'60 - 69,Victim,Victime,1,$0.00,0.0
313974,313975,2024-12-31,Phone,Tlphone,Canada,Canada,Quebec,Qubec,Extortion,Extorsion,...,Female,Femme,French,Franais,'70 - 79,Victim,Victime,1,$0.00,0.0


In [3]:
# 1. Distribution of dollar losses (histogram with logarithmically spaced bins)
# Keep only strictly positive losses: zero and missing values have no logarithm
# and would otherwise dominate the chart.
dollar_loss_data = df.loc[df['Dollar Loss Clean'] > 0, 'Dollar Loss Clean']

# Bin the losses in log10 space. go.Histogram computes equal-width bins on the
# raw values even when the axis is switched to type='log', so for heavy-tailed
# data nearly every record ends up in the first bin. Histogramming log10(loss)
# yields bins of equal width on a logarithmic dollar scale instead.
log_losses = np.log10(dollar_loss_data)

fig1 = go.Figure()

# Add histogram (x values are log10 of the dollar loss)
fig1.add_trace(go.Histogram(
    x=log_losses,
    name='Dollar Loss',
    nbinsx=50,
    marker_color='rgb(67, 147, 195)'
))

fig1.update_layout(
    title={
        'text': 'Distribution of Dollar Losses',
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title='Dollar Loss ($)',
    yaxis_title='Frequency',
    template='plotly_white',
    showlegend=False,
    plot_bgcolor='white',
    height=500,
    bargap=0.1
)

# The axis itself is linear in log10 units, so place one tick per power of ten
# and label it with the corresponding dollar amount. Sub-dollar decades get
# enough decimals to stay distinguishable (e.g. $0.01).
# NOTE: hover labels still show the bin range in log10 units.
if not log_losses.empty:
    tick_exponents = np.arange(
        np.floor(log_losses.min()), np.ceil(log_losses.max()) + 1
    ).tolist()
    fig1.update_xaxes(
        tickmode='array',
        tickvals=tick_exponents,
        ticktext=[
            f'${10.0 ** exponent:,.{max(0, int(-exponent))}f}'
            for exponent in tick_exponents
        ],
    )

fig1.show()

In [4]:
# 2. Average dollar loss per complaint type (Bar Chart)
# Aggregate the cleaned loss column per complaint type; named aggregation
# produces the final column labels directly, so no flattening step is needed.
complaint_stats = (
    df.groupby('Complaint Type')['Dollar Loss Clean']
    .agg(**{'Average Loss': 'mean', 'Count': 'count', 'Total Loss': 'sum'})
    .reset_index()
)

# Drop types with fewer than 5 recorded losses, then order ascending by mean so
# the largest average ends up at the top of the horizontal bar chart.
complaint_stats = (
    complaint_stats
    .loc[lambda stats: stats['Count'] >= 5]
    .sort_values('Average Loss', ascending=True)
)

# Horizontal bars coloured by their own value; each bar is annotated with the
# number of observations behind the average.
fig2 = go.Figure(go.Bar(
    x=complaint_stats['Average Loss'],
    y=complaint_stats['Complaint Type'],
    orientation='h',
    marker_color=complaint_stats['Average Loss'],
    marker_colorscale='Viridis',
    text=complaint_stats['Count'].map('n={}'.format),
    textposition='inside'
))

fig2.update_layout(
    title_text='Average Dollar Loss by Complaint Type',
    title_x=0.5,
    title_xanchor='center',
    xaxis_title='Average Dollar Loss ($)',
    xaxis_tickformat='$,.0f',
    yaxis_title='Complaint Type',
    template='plotly_white',
    showlegend=False,
    plot_bgcolor='white',
    height=600
)

fig2.show()

In [5]:
# 3. Number of victims vs. Dollar loss (Scatter Plot)
# Only rows with a strictly positive loss are plotted: the y-axis is
# logarithmic and the marker size is driven by the loss amount.
fig3 = px.scatter(
    df.loc[df['Dollar Loss Clean'].gt(0)],
    x='Number of Victims / Nombre de victimes',
    y='Dollar Loss Clean',
    color='Complaint Type',
    size='Dollar Loss Clean',
    size_max=30,
    opacity=0.7,
    title='Number of Victims vs. Dollar Loss',
    template='plotly_white',
    hover_data=dict([
        ('Number of Victims / Nombre de victimes', True),
        ('Dollar Loss Clean', ':$,.2f'),
        ('Complaint Type', True),
    ])
)

# Centre the (more descriptive) title, relabel the axes, park the legend just
# outside the plotting area and switch the loss axis to a log scale.
fig3.update_layout(
    title_text='Relationship between Number of Victims and Dollar Loss',
    title_x=0.5,
    title_xanchor='center',
    xaxis_title='Number of Victims',
    yaxis_title='Dollar Loss ($)',
    yaxis_type='log',
    yaxis_tickformat='$,.0f',
    plot_bgcolor='white',
    height=600,
    legend_title='Complaint Type',
    legend_yanchor='top',
    legend_y=0.99,
    legend_xanchor='left',
    legend_x=1.02
)

# Thin dark outline so overlapping markers remain distinguishable
fig3.update_traces(marker_line_width=1, marker_line_color='DarkSlateGrey')

fig3.show()