In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Read the Anti-Fraud Centre CSV export; the path is relative to the
# notebook's working directory.
csv_path = "Anti_Fraud_Centre_Data.csv"
df = pd.read_csv(csv_path)

# Leave the frame as the cell's last expression so the notebook renders a preview.
df

Unnamed: 0,Numro d'identification / Number ID,Date Received / Date reue,Complaint Received Type,Type de plainte reue,Country,Pays,Province/State,Province/tat,Fraud and Cybercrime Thematic Categories,Catgories thmatiques sur la fraude et la cybercriminalit,...,Mthode de sollicitation,Gender,Genre,Language of Correspondence,Langue de correspondance,Victim Age Range / Tranche d'ge des victimes,Complaint Type,Type de plainte,Number of Victims / Nombre de victimes,Dollar Loss /pertes financires
0,1,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,Nova Scotia,Nouvelle-cosse,Phishing,Hameonnage,...,Messages texte,Female,Femme,English,Anglais,'30 - 39,Attempt,Tentative,0,$0.00
1,2,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,British Columbia,Colombie-Britanique,Identity Fraud,Fraude l'identit,...,Autre/inconnu,Female,Femme,English,Anglais,'70 - 79,Victim,Victime,1,$0.00
2,3,2021-01-02,CAFC Website,CAFC site web,Not Specified,Non spcifi,Not Specified,Non spcifi,Romance,Romance,...,Autre/inconnu,Not Available,non disponible,Not Available,non disponible,'Not Available / non disponible,Victim,Victime,1,$298.00
3,4,2021-01-02,CAFC Website,CAFC site web,United States,tats-Unis,California,Californie,Foreign Money Offer,Offre dargent de ltranger,...,Courrier,Male,Homme,English,Anglais,'60 - 69,Attempt,Tentative,0,$0.00
4,5,2021-01-02,CAFC Website,CAFC site web,Canada,Canada,Ontario,Ontario,Merchandise,Marchandise,...,Internet,Female,Femme,English,Anglais,'20 - 29,Victim,Victime,1,$50.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313971,313972,2024-12-31,Phone,Tlphone,Canada,Canada,Alberta,Alberta,Service,Service,...,Appel direct,Not Available,non disponible,English,Anglais,'70 - 79,Victim,Victime,1,$0.00
313972,313973,2024-12-31,Phone,Tlphone,Canada,Canada,Ontario,Ontario,Identity Fraud,Fraude l'identit,...,Autre/inconnu,Not Available,non disponible,English,Anglais,'30 - 39,Victim,Victime,1,$0.00
313973,313974,2024-12-31,Phone,Tlphone,Canada,Canada,Quebec,Qubec,Service,Service,...,Appel direct,Male,Homme,French,Franais,'60 - 69,Victim,Victime,1,$0.00
313974,313975,2024-12-31,Phone,Tlphone,Canada,Canada,Quebec,Qubec,Extortion,Extorsion,...,Courriel,Female,Femme,French,Franais,'70 - 79,Victim,Victime,1,$0.00


In [3]:
# Show every column label exactly as pandas parsed it, so later cells can
# reference the labels verbatim.
column_names = df.columns.tolist()
print('Column names in the dataset:', column_names, sep='\n')

Column names in the dataset:
["Numro d'identification / Number ID", 'Date Received / Date reue', 'Complaint Received Type', 'Type de plainte reue', 'Country', 'Pays', 'Province/State', 'Province/tat', 'Fraud and Cybercrime Thematic Categories', 'Catgories thmatiques sur la fraude et la cybercriminalit', 'Solicitation Method', 'Mthode de sollicitation', 'Gender', 'Genre', 'Language of Correspondence', 'Langue de correspondance', "Victim Age Range / Tranche d'ge des victimes", 'Complaint Type', 'Type de plainte', 'Number of Victims / Nombre de victimes', 'Dollar Loss /pertes financires']


In [4]:
# Inspect the dataset structure: dtypes / non-null counts, a short preview,
# and the column labels.
print('Dataset Info:')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print('\nFirst few rows:')
print(df.head())
print('\nColumn names:')
print(df.columns.tolist())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313976 entries, 0 to 313975
Data columns (total 21 columns):
 #   Column                                                    Non-Null Count   Dtype 
---  ------                                                    --------------   ----- 
 0   Numro d'identification / Number ID                        313976 non-null  int64 
 1   Date Received / Date reue                                 313976 non-null  object
 2   Complaint Received Type                                   313976 non-null  object
 3   Type de plainte reue                                      313976 non-null  object
 4   Country                                                   313976 non-null  object
 5   Pays                                                      313976 non-null  object
 6   Province/State                                            313976 non-null  object
 7   Province/tat                                              313976 non-null  object
 8   

In [5]:
# 1. Cases by age group (Bar Chart)
age_col = "Victim Age Range / Tranche d'ge des victimes"

# Row count for each distinct age-range value, as a two-column frame.
age_group_counts = (
    df.groupby(age_col)
    .size()
    .reset_index(name='Count')
)

fig1 = px.bar(
    age_group_counts,
    x=age_col,
    y='Count',
    title='Cases by Age Group',
    template='plotly_white',
    color='Count',
    color_continuous_scale='Viridis',
)

# Centre the title and replace the long bilingual column label on the axis.
fig1_layout = dict(
    title_x=0.5,
    xaxis_title='Age Group',
    yaxis_title='Number of Cases',
    showlegend=False,
    plot_bgcolor='white',
)
fig1.update_layout(**fig1_layout)

fig1.show()

In [6]:
# 2. Count by gender (Pie Chart)
gender_col = 'Gender'

# Row count for each distinct value of the Gender column.
gender_counts = (
    df.groupby(gender_col)
    .size()
    .reset_index(name='Count')
)

# hole=0.4 renders the pie as a donut.
fig2 = px.pie(
    gender_counts,
    values='Count',
    names=gender_col,
    title='Distribution of Cases by Gender',
    hole=0.4,
    color_discrete_sequence=px.colors.qualitative.Set3,
)

fig2_layout = dict(title_x=0.5, showlegend=True)
fig2.update_layout(**fig2_layout)

fig2.show()

In [7]:
# Clean and convert the Dollar Loss column to numeric.
# The raw column holds text such as '$24,466.34', so the currency symbol and
# the thousands separators are stripped before parsing.
dollar_loss_text = df['Dollar Loss /pertes financires'].astype(str)

# regex=False makes both replacements literal. '$' is a regex end-of-string
# anchor, and the default of `regex` differs between pandas versions, so the
# literal intent is stated explicitly instead of relying on that default.
# errors='coerce' turns anything that still fails to parse into NaN.
df['Dollar Loss Clean'] = pd.to_numeric(
    dollar_loss_text
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',
)

# 3. Cross-tab: Age group vs. Dollar loss (Heatmap)
age_col = "Victim Age Range / Tranche d'ge des victimes"

# Mean cleaned dollar loss per age-range value, rounded to cents; the result
# is a single-column table indexed by age range.
mean_loss_by_age = df.pivot_table(
    index=age_col,
    values='Dollar Loss Clean',
    aggfunc='mean',
    fill_value=0,
)
pivot_table = mean_loss_by_age.round(2)

fig3 = px.imshow(
    pivot_table,
    aspect='auto',
    title='Average Dollar Loss by Age Group',
    color_continuous_scale='RdYlBu_r',
)

fig3_layout = dict(
    title_x=0.5,
    xaxis_title='',
    yaxis_title='Age Group',
    coloraxis_colorbar_title='Avg Dollar Loss ($)',
    height=400,  # Make the heatmap taller for better visibility
)
fig3.update_layout(**fig3_layout)

fig3.show()

In [8]:
# Look at the raw Dollar Loss column: its dtype, the first rows, and the
# first ten distinct values.
loss_raw = df['Dollar Loss /pertes financires']

print('Data type of Dollar Loss column:', loss_raw.dtype, sep='\n')
print('\nSample values:', loss_raw.head(), sep='\n')
print('\nUnique values:', loss_raw.unique()[:10], sep='\n')

Data type of Dollar Loss column:
object

Sample values:
0      $0.00
1      $0.00
2    $298.00
3      $0.00
4     $50.00
Name: Dollar Loss /pertes financires, dtype: object

Unique values:
['$0.00' '$298.00' '$50.00' '$95.40' '$66.28' '$640.00' '$24,466.34'
 '$1,000.00' '$439.94' '$400.00']


In [9]:
# 4. Language Distribution (Pie Chart)
lang_col = 'Language of Correspondence'

# Row count for each distinct correspondence language.
lang_counts = (
    df.groupby(lang_col)
    .size()
    .reset_index(name='Count')
)

fig4 = px.pie(
    lang_counts,
    values='Count',
    names=lang_col,
    title='Distribution by Language of Correspondence',
    color_discrete_sequence=px.colors.qualitative.Set2,
)

fig4_layout = dict(title_x=0.5, showlegend=True)
fig4.update_layout(**fig4_layout)

fig4.show()

In [10]:
# 5. Age Group vs Gender Distribution (Stacked Bar Chart)
age_col = "Victim Age Range / Tranche d'ge des victimes"

# Contingency table: one row per age range, one column per Gender value.
age_gender_dist = pd.crosstab(index=df[age_col], columns=df['Gender'])

# The wide-form table is passed straight to px.bar; barmode='stack' stacks the
# per-gender bars within each age range.
fig5 = px.bar(
    age_gender_dist,
    title='Age Group Distribution by Gender',
    template='plotly_white',
    barmode='stack',
    color_discrete_sequence=px.colors.qualitative.Safe,
)

fig5_layout = dict(
    title_x=0.5,
    xaxis_title='Age Group',
    yaxis_title='Count',
    showlegend=True,
    legend_title='Gender',
    plot_bgcolor='white',
)
fig5.update_layout(**fig5_layout)

fig5.show()

In [11]:
# 6. Average Number of Victims by Age Group (Bar Chart)
age_col = "Victim Age Range / Tranche d'ge des victimes"
victims_col = 'Number of Victims / Nombre de victimes'

# Mean of the victim-count column within each age-range value.
victims_by_age = df.groupby(age_col)[victims_col]
avg_victims = victims_by_age.mean().reset_index()

# The same column drives both bar height and bar colour.
fig6 = px.bar(
    avg_victims,
    x=age_col,
    y=victims_col,
    title='Average Number of Victims by Age Group',
    template='plotly_white',
    color=victims_col,
    color_continuous_scale='Viridis',
)

fig6_layout = dict(
    title_x=0.5,
    xaxis_title='Age Group',
    yaxis_title='Average Number of Victims',
    showlegend=False,
    plot_bgcolor='white',
)
fig6.update_layout(**fig6_layout)

fig6.show()

In [12]:
# 7. Language and Gender Relationship (Grouped Bar Chart)
lang_col = 'Language of Correspondence'

# Contingency table: one row per language, one column per Gender value.
lang_gender_dist = pd.crosstab(index=df[lang_col], columns=df['Gender'])

# barmode='group' places the per-gender bars side by side for each language.
fig7 = px.bar(
    lang_gender_dist,
    title='Distribution of Cases by Language and Gender',
    template='plotly_white',
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Set3,
)

fig7_layout = dict(
    title_x=0.5,
    xaxis_title='Language of Correspondence',
    yaxis_title='Count',
    showlegend=True,
    legend_title='Gender',
    plot_bgcolor='white',
)
fig7.update_layout(**fig7_layout)

fig7.show()