# Predictive Analytics for Detecting Fraudulent Transactions in Credit Card Data

# Import necessary libraries

In [1]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import scipy
from sklearn.preprocessing import LabelEncoder



# Importing the dataset

In [2]:
df=pd.read_csv("fraudtest.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,21-06-2020 12:14,2290000000000000.0,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,19-03-1968,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,21-06-2020 12:14,3570000000000000.0,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",17-01-1990,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,21-06-2020 12:14,3600000000000000.0,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",21-10-1970,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,21-06-2020 12:15,3590000000000000.0,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,25-07-1987,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,21-06-2020 12:15,3530000000000000.0,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,06-07-1955,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


# Displaying Last Entries of the Dataset

In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
555714,555714,31-12-2020 23:59,30600000000000.0,fraud_Reilly and Sons,health_fitness,43.77,Michael,Olson,M,558 Michael Estates,...,40.4931,-91.8912,519,Town planner,13-02-1966,9b1f753c79894c9f4b71f04581835ada,1388534347,39.946837,-91.333331,0
555715,555715,31-12-2020 23:59,3560000000000000.0,fraud_Hoppe-Parisian,kids_pets,111.84,Jose,Vasquez,M,572 Davis Mountains,...,29.0393,-95.4401,28739,Futures trader,27-12-1999,2090647dac2c89a1d86c514c427f5b91,1388534349,29.661049,-96.186633,0
555716,555716,31-12-2020 23:59,6010000000000000.0,fraud_Rau-Robel,kids_pets,86.88,Ann,Lawson,F,144 Evans Islands Apt. 683,...,46.1966,-118.9017,3684,Musician,29-11-1981,6c5b7c8add471975aa0fec023b2e8408,1388534355,46.65834,-119.715054,0
555717,555717,31-12-2020 23:59,4080000000000.0,fraud_Breitenberg LLC,travel,7.99,Eric,Preston,M,7020 Doyle Stream Apt. 951,...,44.6255,-116.4493,129,Cartographer,15-12-1965,14392d723bb7737606b2700ac791b7aa,1388534364,44.470525,-117.080888,0
555718,555718,31-12-2020 23:59,4170000000000000.0,fraud_Dare-Marvin,entertainment,38.13,Samuel,Frey,M,830 Myers Plaza Apt. 384,...,35.6665,-97.4798,116001,Media buyer,10-05-1993,1765bb45b3aa3224b4cdcb6e7a96cee3,1388534374,36.210097,-97.036372,0


# General Information about the Dataset

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  float64
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [6]:
df.corr

<bound method DataFrame.corr of         Unnamed: 0 trans_date_trans_time        cc_num  \
0                0      21-06-2020 12:14  2.290000e+15   
1                1      21-06-2020 12:14  3.570000e+15   
2                2      21-06-2020 12:14  3.600000e+15   
3                3      21-06-2020 12:15  3.590000e+15   
4                4      21-06-2020 12:15  3.530000e+15   
...            ...                   ...           ...   
555714      555714      31-12-2020 23:59  3.060000e+13   
555715      555715      31-12-2020 23:59  3.560000e+15   
555716      555716      31-12-2020 23:59  6.010000e+15   
555717      555717      31-12-2020 23:59  4.080000e+12   
555718      555718      31-12-2020 23:59  4.170000e+15   

                                    merchant        category     amt    first  \
0                      fraud_Kirlin and Sons   personal_care    2.86     Jeff   
1                       fraud_Sporer-Keebler   personal_care   29.84   Joanne   
2       fraud_Swaniawski, Ni

In [7]:
df.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,277859.0,4.178627e+17,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138,0.00386
std,160422.401459,1.30992e+18,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071,0.062008
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575,0.0
25%,138929.5,180000000000000.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129,0.0
50%,277859.0,3520000000000000.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204,0.0
75%,416788.5,4640000000000000.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637,0.0
max,555718.0,4.99e+18,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026,1.0


In [8]:
df.shape

(555719, 23)

In [9]:
df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [10]:
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                   float64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [11]:
df['is_fraud'].value_counts() # Count of non-fraud and fraud transactions

0    553574
1      2145
Name: is_fraud, dtype: int64

In [12]:
df['job'].value_counts() # count of unique job categories

Film/video editor                4119
Exhibition designer              3968
Surveyor, land/geomatics         3756
Naval architect                  3750
Designer, ceramics/pottery       3463
                                 ... 
Estate manager/land agent         195
Engineer, civil (consulting)      194
Operational investment banker      11
Software engineer                  11
Engineer, water                     8
Name: job, Length: 478, dtype: int64

In [13]:
df['merchant'].value_counts() #how many transactions are associated with each merchant.

fraud_Kilback LLC                        1859
fraud_Cormier LLC                        1597
fraud_Schumm PLC                         1561
fraud_Kuhn LLC                           1521
fraud_Dickinson Ltd                      1519
                                         ... 
fraud_Treutel-King                        323
fraud_Satterfield-Lowe                    319
fraud_Kessler Group                       318
fraud_Jerde-Hermann                       312
fraud_Ritchie, Bradtke and Stiedemann     304
Name: merchant, Length: 693, dtype: int64

In [14]:
df.nunique()

Unnamed: 0               555719
trans_date_trans_time    226976
cc_num                      319
merchant                    693
category                     14
amt                       37256
first                       341
last                        471
gender                        2
street                      924
city                        849
state                        50
zip                         912
lat                         910
long                        910
city_pop                    835
job                         478
dob                         910
trans_num                555719
unix_time                544760
merch_lat                546490
merch_long               551770
is_fraud                      2
dtype: int64

# Data Cleaning

In [15]:
print(len(df[df.duplicated()])) # Check for duplicate

0


In [16]:
duplicate_rows = df.duplicated()
duplicate_rows

0         False
1         False
2         False
3         False
4         False
          ...  
555714    False
555715    False
555716    False
555717    False
555718    False
Length: 555719, dtype: bool

In [17]:
duplicate_columns = df.columns.duplicated()
duplicate_columns

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [18]:
df.isnull().sum() # Check for null values

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [19]:
df.rename(columns={"Unnamed: 0":"id"}, inplace=True) # rename the column

In [20]:
df['merchant'] = df['merchant'].str.replace("fraud_", "") # Remove the 'fraud_' prefix from the strings in the 'merchant' column

In [21]:
df[['trans_date', 'trans_time']] = df['trans_date_trans_time'].str.split(' ', expand=True) # Split the 'trans_date_trans_time' column into 'trans_date' and 'trans_time'

df.head(5) # Display the first 5 rows of the DataFrame to verify the split

Unnamed: 0,id,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_date,trans_time
0,0,21-06-2020 12:14,2290000000000000.0,Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,333497,Mechanical engineer,19-03-1968,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0,21-06-2020,12:14
1,1,21-06-2020 12:14,3570000000000000.0,Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,302,"Sales professional, IT",17-01-1990,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0,21-06-2020,12:14
2,2,21-06-2020 12:14,3600000000000000.0,"Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,34496,"Librarian, public",21-10-1970,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0,21-06-2020,12:14
3,3,21-06-2020 12:15,3590000000000000.0,Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,54767,Set designer,25-07-1987,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0,21-06-2020,12:15
4,4,21-06-2020 12:15,3530000000000000.0,Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,1126,Furniture designer,06-07-1955,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0,21-06-2020,12:15


In [None]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time']) # Convert the 'trans_date_trans_time' column to datetime format

df['trans_date'] = pd.to_datetime(df['trans_date']) # Convert the 'trans_date' column to datetime format (only the date part)

df['trans_time'] = pd.to_datetime(df['trans_time'], format='%H:%M').dt.time # Convert the 'trans_time' column to time format using the '%H:%M' format (hour and minute)

df['trans_time_group'] = df['trans_date_trans_time'].dt.hour # Create a new column 'trans_time_group' that extracts the hour from the 'trans_date_trans_time'

df['trans_month'] = df['trans_date'].dt.to_period('M').astype("str") # Create a new column 'trans_month' to extract the month in 'YYYY-MM' format as a string

df['trans_dayOfWeek'] = df['trans_date'].dt.day_name() # Create a new column 'trans_dayOfWeek' to extract the day of the week (e.g., Monday, Tuesday)

In [None]:
df['dob'] = pd.to_datetime(df['dob']) #calculate age
df['age'] = (pd.Timestamp.now().year  - df['dob'].dt.year)

In [None]:
df['age'].unique()

In [None]:
print("Min age", df['age'].min())
print("Max age", df['age'].max())

In [None]:
def apply_age_group(age):
    if(age <= 18):
        return 'Teenager'
    elif (age <= 25):
        return "Young Adult"
    elif (age <= 64):
        return "Adult"
    else:
        return "Elder"

In [None]:
df['age_group'] = df['age'].apply(apply_age_group)

In [None]:
df['is_not_fraud']= df['is_fraud'].apply(lambda x: 1 if x == 0 else 0)

In [None]:
df.info()

In [None]:
# Drop unnecessary columns
df = df.drop(['street', 'zip', 'city_pop', 'trans_num', 'unix_time', 'merch_lat', 'merch_long','first','last','dob','lat','long'], axis=1)

In [None]:
df.sample(5)

# Analysis

In [None]:
def annotate_bar(ax, custom_y_func, font_size = 14):
    for p in ax.patches:
        # Calculate annotation
        value = str(round(p.get_height(), 1))
        x = (p.get_x() + p.get_width() / 2) * 0.99
        y = ((p.get_y() + p.get_height() / 2) * 0.99)
        
        y = custom_y_func(y)
        ax.annotate(
            value,
            (x,y),
            color="black",
            size= font_size, ha='center', va='center'
        )

In [None]:
# Ensure correct calculation for fraud and non-fraud counts
fraud_transaction_count = df['is_fraud'].value_counts().reset_index()
fraud_transaction_count.columns = ['is_fraud', 'transaction_count']
fraud_transaction_count['fraud_status'] = fraud_transaction_count['is_fraud'].apply(lambda x: "Fraud" if x == 1 else "Not Fraud")

# Plot 1: Total Number of Transactions (Fraud vs. Not Fraud)
fig1 = px.bar(
    fraud_transaction_count,
    x='fraud_status',
    y='transaction_count',
    title='Total Number of Transactions (Fraud vs. Not Fraud)',
    labels={'fraud_status': 'Fraud Status', 'transaction_count': 'Transaction Count'},
    color='fraud_status',
    color_discrete_map={'Fraud': '#f28b82', 'Not Fraud': '#c6def8'}
)

fig1.update_layout(
    title={'x': 0.5},
    yaxis_title='Transaction Count',
    xaxis_title='Fraud Status'
)

# Ensure correct calculation for total transaction amount by fraud status
fraud_transaction_amount = df.groupby('is_fraud')['amt'].sum().reset_index()
fraud_transaction_amount['fraud_status'] = fraud_transaction_amount['is_fraud'].apply(lambda x: "Fraud" if x == 1 else "Not Fraud")

# Plot 2: Total Transaction Amount (Fraud vs. Not Fraud)
fig2 = px.bar(
    fraud_transaction_amount,
    x='fraud_status',
    y='amt',
    title='Total Transaction Amount (Fraud vs. Not Fraud)',
    labels={'fraud_status': 'Fraud Status', 'amt': 'Transaction Amount'},
    color='fraud_status',
    color_discrete_map={'Fraud': '#f28b82', 'Not Fraud': '#c6def8'}
)

fig2.update_layout(
    title={'x': 0.5},
    yaxis_title='Transaction Amount',
    xaxis_title='Fraud Status'
)

# Show the plots
fig1.show()
fig2.show()

There are a total of 555,719 transactions. Of these, 553,574 transactions are valid, accounting for 99.61% of the total. The remaining 0.39% represents fraudulent transactions.

In [None]:
# Calculate the average transaction amount for fraudulent and non-fraudulent transactions
average_transaction_amount = df.groupby('is_fraud')['amt'].mean().reset_index()
average_transaction_amount['fraud_status'] = average_transaction_amount['is_fraud'].apply(lambda x: "Fraud" if x == 1 else "Not Fraud")

# Create the bar plot using Plotly Express
fig = px.bar(
    average_transaction_amount,
    x='fraud_status',
    y='amt',
    title='Average Transaction Amount (Fraud vs. Not Fraud)',
    labels={'fraud_status': 'Fraud Status', 'amt': 'Average Transaction Amount'},
    color='fraud_status',
    color_discrete_map={'Fraud': '#f28b82', 'Not Fraud': '#c6def8'}
)

# Update layout for better visualization
fig.update_layout(
    title={'x': 0.5},
    xaxis_tickangle=0,
    yaxis_title='Average Transaction Amount',
    xaxis_title='Fraud Status'
)

# Show the plot
fig.show()

# Ploting the overview of dataset by month, Dayofweek, gender, age and category

In [None]:
# Adding a fraud label for visualization purposes
df['fraud_label'] = df['is_fraud'].apply(lambda x: "Fraud" if x == 1 else 'Not Fraud')

# Attributes to iterate over and their corresponding names
attributes = ['trans_month', 'trans_dayOfWeek', 'gender', 'category', 'age', 'age_group']
attribute_names = ['Month', 'Day of Week', 'Gender', 'Category', 'Age', 'Age Group']

# Define distinct color palettes for fraud and non-fraud charts
colors_for_fraud = ['#FF4500', '#FF7F50']  # Shades of red/orange for Fraud
colors_for_non_fraud = ['#32CD32', '#7CFC00']  # Shades of green for Non-Fraud

# Iterate over each attribute to create side-by-side visualizations for Fraud and Non-Fraud
for index, attribute in enumerate(attributes):
    attribute_name = attribute_names[index]

    # Determine whether the subplot type is pie or xy based on the attribute
    if attribute in ['gender', 'age_group']:
        # Use domain type for pie charts
        specs = [[{'type': 'domain'}, {'type': 'domain'}]]
    else:
        # Use xy type for histograms
        specs = [[{'type': 'xy'}, {'type': 'xy'}]]

    # Create subplots to display Fraud and Non-Fraud side by side
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(f"Non-Fraud Transactions by {attribute_name}", f"Fraud Transactions by {attribute_name}"),
        horizontal_spacing=0.15,
        specs=specs
    )

    # Non-Fraud Plot
    non_fraud_data = df[df['fraud_label'] == 'Not Fraud']
    if attribute in ['trans_month', 'trans_dayOfWeek', 'category', 'age']:
        non_fraud_fig = px.histogram(
            non_fraud_data,
            x=attribute,
            labels={attribute: attribute_name, 'count': 'Transaction Count'},
            color_discrete_sequence=[colors_for_non_fraud[0]],
            opacity=0.85
        )
        for trace in non_fraud_fig.data:
            fig.add_trace(trace, row=1, col=1)

    elif attribute == 'gender' or attribute == 'age_group':
        non_fraud_fig = px.pie(
            non_fraud_data,
            names=attribute,
            color_discrete_sequence=[colors_for_non_fraud[1]],
            hole=0.4
        )
        fig.add_trace(
            go.Pie(
                labels=non_fraud_fig.data[0]['labels'],
                values=non_fraud_fig.data[0]['values'],
                marker=dict(colors=[colors_for_non_fraud[1]]),
                hole=0.4,
                textinfo='percent+label'
            ),
            row=1, col=1
        )

    # Fraud Plot
    fraud_data = df[df['fraud_label'] == 'Fraud']
    if attribute in ['trans_month', 'trans_dayOfWeek', 'category', 'age']:
        fraud_fig = px.histogram(
            fraud_data,
            x=attribute,
            labels={attribute: attribute_name, 'count': 'Transaction Count'},
            color_discrete_sequence=[colors_for_fraud[0]],
            opacity=0.85
        )
        for trace in fraud_fig.data:
            fig.add_trace(trace, row=1, col=2)

    elif attribute == 'gender' or attribute == 'age_group':
        fraud_fig = px.pie(
            fraud_data,
            names=attribute,
            color_discrete_sequence=[colors_for_fraud[1]],
            hole=0.4
        )
        fig.add_trace(
            go.Pie(
                labels=fraud_fig.data[0]['labels'],
                values=fraud_fig.data[0]['values'],
                marker=dict(colors=[colors_for_fraud[1]]),
                hole=0.4,
                textinfo='percent+label'
            ),
            row=1, col=2
        )

    # Update layout to make graphs side-by-side and visually distinct
    fig.update_layout(
        title_text=f"Distribution of Transactions by {attribute_name}",
        title_x=0.5,
        height=600,
        width=1000,
        template='plotly_white',
        font=dict(size=14)
    )

    # Update axis titles and adjust layouts for subplots
    if attribute not in ['gender', 'age_group']:
        fig.update_xaxes(title_text=attribute_name, row=1, col=1)
        fig.update_yaxes(title_text='Transaction Count', row=1, col=1)
        fig.update_xaxes(title_text=attribute_name, row=1, col=2)
        fig.update_yaxes(title_text='Transaction Count', row=1, col=2)

    # Show the Plotly figure
    fig.show()

## By months
For 'Not Fraud' cases, June 2020 had the lowest number of transactions, followed by a steady increase, peaking in December. The significant rise in December aligns with the holiday season, particularly Christmas, and the typical year-end boost in consumer demand.

For fraudulent transactions, July had the fewest cases, with a steady increase reaching a peak in August. After August, fraud transactions gradually declined, though remained relatively high from August to October.

## Day of week
The two charts show a similar pattern, with Sunday, Monday, and Tuesday having the highest number of transactions for 
both fraud and non-fraud cases.This indicates we should pay more attentions to transactions happen on these at as 
they are more likely to be fraud.

## By Gender
In both fraudulent and non-fraudulent transactions, females conduct more transactions than males, although the difference is not very large.

## By Category
For non-fraudulent transactions, the top three categories are gas_transport, grocery_pos, and home, with gas_transport being the highest.
For fraudulent transactions, the leading categories are grocery_pos, shopping_net, and misc_net.
Notably, grocery_pos appears in both categories, indicating it warrants closer scrutiny.

## By Age
The distribution of transactions by age is similar for both fraudulent and non-fraudulent cases. Adults (26-65 years) have the highest transaction count, while teenagers have the fewest.
This is expected, as teenagers typically lack a stable income and rely on family support, while adults are in their prime earning years.

In [None]:
# Adding a fraud label for visualization purposes
df['fraud_label'] = df['is_fraud'].apply(lambda x: "Fraud" if x == 1 else 'Not Fraud')

# Iterate over each category to create a line plot
for category_name in ['Not Fraud', 'Fraud']:
    # Filter data for each category
    filtered_data = df[df['fraud_label'] == category_name].groupby('trans_time_group').size().reset_index(name='transaction_count')
    
    # Create line plot using Plotly Express
    fig = px.line(
        filtered_data,
        x='trans_time_group',
        y='transaction_count',
        title=f"Distribution of Transactions by Time: {category_name}",
        labels={'trans_time_group': 'Time', 'transaction_count': 'Transaction Count'},
        template='plotly_white',
        markers=True  # Adding markers to make points more visible
    )
    
    # Customize layout for better visibility
    fig.update_layout(
        title={'x': 0.5},
        xaxis_title='Time',
        yaxis_title='Transaction Count',
        xaxis=dict(tickmode='linear'),  # Show each time group as a tick
        font=dict(size=14)
    )

    # Customize the line attributes for better aesthetics
    fig.update_traces(
        line=dict(width=3),  # Set line width
        marker=dict(size=8)  # Set marker size
    )

    # Show the Plotly figure
    fig.show()

From 12 AM to 11 AM, the number of transactions remains relatively stable.
From 11 AM to midnight, there's a noticeable increase in transaction activity, indicating that people are more active during this period.
For fraudulent transactions, most occur late at night (10 PM to midnight) or early in the morning (12 AM to 4 AM), suggesting that individuals with malicious intent are more likely to act during times of reduced human monitoring.

# Plotting the data by transaction for Potential features like job, state, city, merchant 

In [None]:
# Adding a fraud label for visualization purposes
df['fraud_label'] = df['is_fraud'].apply(lambda x: "Fraudulent" if x == 1 else "Non-Fraudulent")

# Columns and their corresponding names for visualization
columns = ['job', 'state', 'city', 'merchant']
columns_name = ['Job', 'State', 'City', 'Merchant']
palette = {
    'Non-Fraudulent': '#89CFF0',  # Light blue for Non-Fraudulent
    'Fraudulent': '#FFCCCB'       # Light red for Fraudulent
}

# Iterate over each column to create side by side visualizations
for idx, col in enumerate(columns):
    column_title = columns_name[idx]

    # Create a subplot figure with 1 row and 2 columns
    fig = make_subplots(rows=1, cols=2, subplot_titles=(f"Non-Fraudulent - {column_title}", f"Fraudulent - {column_title}"))

    for col_idx, label in enumerate(['Non-Fraudulent', 'Fraudulent']):
        # Filter the DataFrame for Fraudulent or Non-Fraudulent transactions
        filtered_df = df[df['fraud_label'] == label]

        # Get top 10 values for the current column
        top_transactions = filtered_df[col].value_counts().nlargest(10).reset_index()
        top_transactions.columns = [col, 'transaction_count']

        # Create the bar trace for each category
        trace = go.Bar(
            x=top_transactions[col],
            y=top_transactions['transaction_count'],
            name=label,
            marker=dict(color=palette[label])
        )

        # Add the trace to the appropriate subplot
        fig.add_trace(trace, row=1, col=col_idx + 1)

    # Update layout to enhance appearance
    fig.update_layout(
        title_text=f"Top 10 Transactions by {column_title}",
        title_x=0.5,
        height=600,
        width=1000,
        template='plotly_white',
        xaxis_tickangle=45,
        showlegend=False
    )

    # Show the Plotly figure
    fig.show()

# Plotting the data by transaction amount for Potential features like job, state, city, merchant

In [None]:
# Adding a fraud label for visualization purposes
df['fraud_label'] = df['is_fraud'].apply(lambda x: "Fraudulent" if x == 1 else "Non-Fraudulent")

# Columns and their corresponding labels for visualization
columns = ['job', 'state', 'city', 'merchant']
column_labels = ['Job', 'State', 'City', 'Merchant']
color_palette = {
    'Non-Fraudulent': '#7FB3D5',  # Light blue for Non-Fraud
    'Fraudulent': '#E74C3C'       # Red for Fraud
}

# Iterate over each column to create visualizations for top transaction amounts
for idx, col in enumerate(columns):
    category_label = column_labels[idx]

    # Create a subplot figure with 1 row and 2 columns for side-by-side comparison
    fig = make_subplots(rows=1, cols=2, subplot_titles=(f"Non-Fraudulent - {category_label}", f"Fraudulent - {category_label}"))

    for col_idx, label in enumerate(['Non-Fraudulent', 'Fraudulent']):
        # Filter the DataFrame for Fraudulent or Non-Fraudulent transactions
        filtered_df = df[df['fraud_label'] == label]

        # Get top 10 transaction amounts for the current column
        top_transactions = filtered_df.groupby(col)['amt'].sum().nlargest(10).reset_index()

        # Create the bar trace for each category
        trace = go.Bar(
            x=top_transactions[col],
            y=top_transactions['amt'],
            name=label,
            marker=dict(color=color_palette[label])
        )

        # Add the trace to the appropriate subplot
        fig.add_trace(trace, row=1, col=col_idx + 1)

    # Update layout to enhance appearance
    fig.update_layout(
        title_text=f"Top 10 Transaction Amount by {category_label}",
        title_x=0.5,
        height=600,
        width=1000,
        template='plotly_white',
        xaxis_tickangle=45,
        showlegend=False,
        yaxis_title='Transaction Amount (USD)',
        font=dict(size=14)
    )

    # Show the Plotly figure
    fig.show()

When observing 'Top 10 transaction' and 'Top 10 transaction amount' charts, we can see that that share similar pattern. High number of transaction also have high number of amount

# Analysis of potentially suspicious transactions

In [None]:
df[df['age'] >= 80]['is_fraud'].value_counts()

In [None]:
df_over80 = df[(df['age'] >= 80) & (df['is_fraud'] == 0)].sort_values(by="trans_time", ascending=False)
df_over80

In [None]:
df_over80['category'].value_counts()

In [None]:
df_over80_category = df_over80.pivot_table(index="trans_time_group",columns="category",aggfunc='count').fillna(0)

In [None]:
df_over80['trans_time_group'].value_counts()

In [None]:
# Group and sort the data for transactions by time group
transaction_data_over80 = df_over80.groupby('trans_time_group').size().reset_index(name='transaction_count').sort_values(by='trans_time_group')

# Create a line plot using Plotly Express
fig = px.line(
    transaction_data_over80,
    x='trans_time_group',
    y='transaction_count',
    title='Distribution of Transactions by Time',
    labels={'trans_time_group': 'Time', 'transaction_count': 'Transaction Count'},
    template='plotly_white',
    markers=True,  # Adding markers for better visibility of data points
    line_shape='spline'  # Smoothen line for better aesthetics
)

# Customize layout for better visualization
fig.update_layout(
    title={'x': 0.5},
    xaxis_title='Time',
    yaxis_title='Transaction Count',
    xaxis=dict(tickmode='linear'),  # Show each time group as a tick
    font=dict(size=14)
)

# Show the Plotly figure
fig.show()

# Correlation

It's quite unusual for elder >80 to make transaction at around 0-2h or 22-23h. We should consider taking a deeper look a these.

In [None]:
# Dropping unnecessary columns
df.drop(columns=['trans_date','trans_time','age_group'],inplace=True)
df_train = df.copy()

le = LabelEncoder()
# Encoding categorical columns in the training dataset
for columns in df.columns:
    if df_train[columns].dtype == 'object':
        df_train[columns] = le.fit_transform(df_train[columns])

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(df_train.corr(),annot=True)
plt.show()