In [None]:
# ============================================================
# INTERACTIVE EDA - ATO Risk Profiler
# ============================================================
# Goal: Explore fraud patterns using interactive visualizations
#   - Altair for interactive charts with tooltips/filters
#   - Plotly for hover details and zoom capabilities
#   - ipywidgets for dynamic filtering
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import altair as alt
from ipywidgets import interact, widgets, Layout
from IPython.display import display
import warnings

# Notebook-wide settings
warnings.filterwarnings('ignore')          # keep library warnings out of the rendered output
sns.set_theme(style="whitegrid")
alt.data_transformers.disable_max_rows()   # lift Altair's row limit for embedded data

# Read the transactions and derive calendar features from the timestamp
df = (
    pd.read_csv('../data/simulated_transactions.csv')
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .assign(
        hour=lambda frame: frame['timestamp'].dt.hour,
        day_of_week=lambda frame: frame['timestamp'].dt.dayofweek,
        day_name=lambda frame: frame['timestamp'].dt.day_name(),
    )
)

# Quick sanity report on what was loaded
print(
    "Dataset loaded successfully",
    f"Shape: {df.shape}",
    f"Date Range: {df['timestamp'].min()} to {df['timestamp'].max()}",
    f"\nColumns: {list(df.columns)}",
    sep="\n",
)

In [None]:
# ============================================================
# BASIC STATISTICS OVERVIEW
# ============================================================

def _print_section(title, leading_blank=True):
    """Print ``title`` framed by two 60-character '=' rules.

    When ``leading_blank`` is true the top rule is preceded by an empty line.
    """
    rule = "=" * 60
    print(("\n" if leading_blank else "") + rule)
    print(title)
    print(rule)


# Class balance
_print_section("FRAUD DISTRIBUTION", leading_blank=False)
print(df['is_fraud'].value_counts())
overall_fraud_rate = df['is_fraud'].mean()
print(f"\nFraud Rate: {overall_fraud_rate:.4f} ({overall_fraud_rate*100:.2f}%)")

# Fraud types, counted over fraudulent rows only
_print_section("FRAUD TYPES BREAKDOWN")
fraud_types = df.loc[df['is_fraud'] == 1, 'fraud_type'].value_counts()
print(fraud_types)
print(f"\nTotal Fraudulent Transactions: {fraud_types.sum()}")

# Amount summary per class
_print_section("AMOUNT STATISTICS (EUR)")
print(df.groupby('is_fraud')['amount'].describe())

# Device mix within each class (each column sums to 1)
_print_section("DEVICE TYPE DISTRIBUTION")
print(pd.crosstab(index=df['device_type'], columns=df['is_fraud'], normalize='columns'))

In [None]:
# ============================================================
# AMOUNT DISTRIBUTION ANALYSIS
# ============================================================

# Split the data by label
df_legit = df.loc[df['is_fraud'] == 0]
df_fraud = df.loc[df['is_fraud'] == 1]

# Two semi-transparent histograms drawn on top of each other
fig = go.Figure(
    data=[
        go.Histogram(
            x=df_legit['amount'],
            name='Legitimate',
            opacity=0.7,
            marker_color='green',
            nbinsx=50,
        ),
        go.Histogram(
            x=df_fraud['amount'],
            name='Fraudulent',
            opacity=0.7,
            marker_color='red',
            nbinsx=50,
        ),
    ]
)

fig.update_layout(
    barmode='overlay',
    hovermode='x unified',
    height=500,
    title='Transaction Amount Distribution: Fraud vs Legitimate',
    xaxis_title='Amount (EUR)',
    yaxis_title='Frequency',
)

fig.show()

In [None]:
# ============================================================
# TEMPORAL PATTERNS: HOUR OF DAY
# ============================================================

# Number of transactions for every (hour, fraud flag) combination
hourly_fraud = (
    df.groupby(['hour', 'is_fraud'])
      .size()
      .reset_index(name='count')
)

# Interactive (pan/zoom) Altair bar chart, coloured by fraud flag
chart = (
    alt.Chart(hourly_fraud)
    .mark_bar()
    .encode(
        x=alt.X('hour:O', title='Hour of Day'),
        y=alt.Y('count:Q', title='Number of Transactions'),
        color=alt.Color(
            'is_fraud:N',
            scale=alt.Scale(domain=[0, 1], range=['green', 'red']),
            legend=alt.Legend(title='Fraud Status'),
        ),
        tooltip=[
            alt.Tooltip('hour:O'),
            alt.Tooltip('is_fraud:N'),
            alt.Tooltip('count:Q'),
        ],
    )
    .properties(
        title='Transaction Volume by Hour of Day',
        width=700,
        height=400,
    )
    .interactive()
)

chart

In [None]:
# ============================================================
# GEOGRAPHIC ANALYSIS
# ============================================================

# Transaction counts per (country, fraud status) pair
country_stats = df.groupby(['merchant_country', 'is_fraud']).size().reset_index(name='count')

# Per-country fraud totals and fraud rate
country_fraud_rate = (
    df.groupby('merchant_country')['is_fraud']
      .agg(fraud_count='sum', total_count='count', fraud_rate='mean')
      .rename_axis('country')
      .reset_index()
)

# Fraud rate as a percentage, used for the colour scale
country_fraud_rate['fraud_rate_pct'] = country_fraud_rate['fraud_rate'] * 100

# The choropleth below matches locations by 3-letter ISO code, while the
# notebook's country column is expected to hold 2-letter codes (DE, US, ...),
# so translate them first. Extend this dictionary when new countries appear.
# Both 'UK' (as used by this mapping originally) and the ISO alpha-2 code 'GB'
# resolve to Great Britain.
iso_map = {
    'DE': 'DEU', 'US': 'USA', 'MX': 'MEX', 'AR': 'ARG', 'BR': 'BRA',
    'ES': 'ESP', 'IT': 'ITA', 'UK': 'GBR', 'GB': 'GBR', 'FR': 'FRA',
    'CA': 'CAN'
}
country_fraud_rate['iso_alpha'] = country_fraud_rate['country'].map(iso_map)

# Codes missing from iso_map become NaN and cannot be drawn; report them
# instead of letting them vanish from the map silently.
unmapped_countries = country_fraud_rate.loc[
    country_fraud_rate['iso_alpha'].isna(), 'country'
].tolist()
if unmapped_countries:
    print(f"WARNING: no ISO-3 mapping for {unmapped_countries}; "
          "these countries are not shown on the map.")

# Single choropleth coloured by fraud rate (the earlier throw-away figures
# that were built and immediately overwritten have been removed).
fig = px.choropleth(
    country_fraud_rate.dropna(subset=['iso_alpha']),
    locations="iso_alpha",
    locationmode="ISO-3",
    color="fraud_rate_pct",
    hover_name="country",
    hover_data=["fraud_count", "total_count"],
    color_continuous_scale="Reds",
    title="Global Fraud Risk Map (%)"
)

fig.update_layout(
    height=500,
    geo=dict(showframe=False, showcoastlines=True, projection_type='equirectangular')
)

fig.show()

In [None]:
# ============================================================
# FRAUD TYPE ANALYSIS - TREEMAP
# ============================================================

# Count fraudulent transactions per (fraud type, device type) pair and
# express each pair as a share of all fraud.
fraud_breakdown = df[df['is_fraud']==1].groupby(['fraud_type', 'device_type']).size().reset_index(name='count')
fraud_breakdown['pct'] = fraud_breakdown['count'] / fraud_breakdown['count'].sum() * 100

# Three-level hierarchy: root -> fraud type -> device type
fig = px.treemap(
    fraud_breakdown,
    path=[px.Constant("All Fraud"), 'fraud_type', 'device_type'],
    values='count',
    color='fraud_type',
    title='Fraud Taxonomy: Type & Device Hierarchical View',
    color_discrete_sequence=px.colors.qualitative.Prism,
    hover_data=['pct']  # leaf-level share stays available as customdata[0]
)

fig.update_traces(
    textinfo="label+value+percent entry", # Show rich info on tiles
    # Use the treemap's own percentRoot for the share: the 'pct' column is
    # only defined for leaf rows, so formatting customdata[0] as a number
    # breaks on the aggregated parent tiles (fraud type / root).
    hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Share: %{percentRoot:.1%}'
)

fig.update_layout(height=600, margin=dict(t=50, l=25, r=25, b=25))
fig.show()

In [None]:
# ============================================================
# CORRELATION ANALYSIS
# ============================================================

# Work on a copy of the data with 0/1 indicator columns for the device type
df_corr = df.assign(
    device_mobile=(df['device_type'] == 'mobile').astype(int),
    device_desktop=(df['device_type'] == 'desktop').astype(int),
)

# Columns that take part in the correlation matrix
numeric_cols = [
    'amount',
    'hour',
    'day_of_week',
    'is_fraud',
    'device_mobile',
    'device_desktop',
]
corr_matrix = df_corr.loc[:, numeric_cols].corr()

# Annotated heatmap on a fixed [-1, 1] colour range
fig = px.imshow(
    corr_matrix,
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu_r',
    text_auto='.2f',
    aspect='auto',
    title='Feature Correlation Matrix',
)

fig.update_layout(width=700, height=600)
fig.show()

In [None]:
# ============================================================
# INTERACTIVE FILTERING WITH WIDGETS
# ============================================================

def filter_and_plot(amount_range, country, fraud_status):
    """
    Filter the global ``df`` by the widget selections, print a short summary
    and show a bar chart of transactions per hour for the matching rows.

    Parameters
    ----------
    amount_range : tuple
        ``(minimum, maximum)`` amount; both bounds are inclusive.
    country : str
        A ``merchant_country`` value, or ``'All'`` to skip the country filter.
    fraud_status : str
        ``'All'`` skips the filter, ``'Fraudulent'`` keeps ``is_fraud == 1``,
        any other value keeps ``is_fraud == 0``.

    Returns
    -------
    None
        Output is printed; the chart is shown only when rows match.
    """
    lower, upper = amount_range

    # Combine every active filter into one boolean mask
    keep = df['amount'].between(lower, upper)
    if country != 'All':
        keep &= df['merchant_country'] == country
    if fraud_status != 'All':
        keep &= df['is_fraud'] == (1 if fraud_status == 'Fraudulent' else 0)
    selection = df[keep]

    print(f"Filtered Transactions: {len(selection)}")

    # Nothing matched: report it and skip the statistics and the chart
    if selection.empty:
        print("No transactions found with these filters.")
        return

    print(f"Fraud Rate: {selection['is_fraud'].mean():.4f}")
    print(f"Avg Amount: {selection['amount'].mean():.2f} EUR")

    # Transactions per hour for the selected rows
    hourly = selection.groupby('hour').size().reset_index(name='count')
    px.bar(
        hourly,
        x='hour',
        y='count',
        title='Hourly Distribution (Filtered)',
        labels={'hour': 'Hour of Day', 'count': 'Transaction Count'},
    ).show()

# Create widgets

# Slider granularity in EUR
AMOUNT_STEP = 10

# Round the upper bound UP to the next multiple of the step so the largest
# transactions in the data can always be included in the selected range.
amount_slider_max = float(np.ceil(df['amount'].max() / AMOUNT_STEP) * AMOUNT_STEP)

amount_slider = widgets.FloatRangeSlider(
    # Initial window is 0-100 EUR, capped in case every amount is below 100
    value=[0, min(100, amount_slider_max)],
    min=0,
    max=amount_slider_max,
    step=AMOUNT_STEP,
    description='Amount (EUR):',
    layout=Layout(width='500px'),
    continuous_update=False # Re-run the callback only when the handle is released
)

country_dropdown = widgets.Dropdown(
    # dropna(): a missing country would make sorted() fail when comparing
    # NaN (float) with country strings
    options=['All'] + sorted(df['merchant_country'].dropna().unique().tolist()),
    value='All',
    description='Country:'
)

fraud_dropdown = widgets.Dropdown(
    options=['All', 'Fraudulent', 'Legitimate'],
    value='All',
    description='Status:'
)

# Wire the widgets to filter_and_plot; keyword names must match its
# parameters. The trailing ';' hides the returned function's repr.
interact(filter_and_plot,
         amount_range=amount_slider,
         country=country_dropdown,
         fraud_status=fraud_dropdown);

In [None]:
# ============================================================
# TIME SERIES ANALYSIS - ADVANCED (CLEAN LINES)
# ============================================================

# One row per calendar day: fraud count, transaction count and fraud rate
daily_stats = (
    df.groupby(df['timestamp'].dt.date)['is_fraud']
      .agg(fraud_count='sum', total_count='count', fraud_rate='mean')
      .rename_axis('date')
      .reset_index()
)
daily_stats['date'] = pd.to_datetime(daily_stats['date'])

# 7-day rolling mean of the daily fraud rate (empty until 7 days are available)
daily_stats['fraud_rate_ma'] = daily_stats['fraud_rate'].rolling(window=7).mean()

# Background: filled area of daily volume (light transparent blue, no outline)
volume_trace = go.Scatter(
    x=daily_stats['date'],
    y=daily_stats['total_count'],
    name='Transaction Volume',
    mode='none',
    fill='tozeroy',
    fillcolor='rgba(31, 119, 180, 0.15)',
)

# Thin, semi-transparent orange line: raw daily fraud rate
raw_rate_trace = go.Scatter(
    x=daily_stats['date'],
    y=daily_stats['fraud_rate'],
    name='Daily Rate (Raw)',
    mode='lines',
    line={'color': 'rgba(255, 165, 0, 0.4)', 'width': 1},
)

# Bold red line: smoothed 7-day trend
trend_trace = go.Scatter(
    x=daily_stats['date'],
    y=daily_stats['fraud_rate_ma'],
    name='7-Day Trend',
    mode='lines',
    line={'color': '#D62728', 'width': 3},
)

# Volume on the primary y-axis, both rate lines on the secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(volume_trace, secondary_y=False)
fig.add_trace(raw_rate_trace, secondary_y=True)
fig.add_trace(trend_trace, secondary_y=True)

fig.update_layout(
    title='<b>Transaction Volume vs. Fraud Risk Trend</b>',
    legend={'orientation': "h", 'y': 1.1, 'x': 0.5, 'xanchor': 'center'},
    margin={'t': 80},
    hovermode='x unified',
    plot_bgcolor='white',
    height=550,
)

# Primary axis: volume, with headroom above the tallest day
fig.update_yaxes(
    secondary_y=False,
    title_text="Daily Volume",
    range=[0, daily_stats['total_count'].max() * 1.2],
    showgrid=True,
    gridcolor='rgba(0,0,0,0.05)',
)
# Secondary axis: fraud rate shown as a percentage, no grid
fig.update_yaxes(
    secondary_y=True,
    title_text="Fraud Rate",
    range=[0, daily_stats['fraud_rate'].max() * 1.3],
    showgrid=False,
    tickformat='.1%',
)

fig.show()

In [None]:
# ============================================================
# SUMMARY STATISTICS TABLE
# ============================================================

# Reused intermediates
fraud_rows = df[df['is_fraud'] == 1]
country_risk = df.groupby('merchant_country')['is_fraud'].mean()

# (metric, formatted value) pairs in display order
summary_rows = [
    ('Total Transactions', f"{len(df):,}"),
    ('Fraudulent Transactions', f"{df['is_fraud'].sum():,}"),
    ('Legitimate Transactions', f"{(df['is_fraud']==0).sum():,}"),
    ('Fraud Rate (%)', f"{df['is_fraud'].mean()*100:.2f}"),
    ('Unique Users', f"{df['user_id'].nunique():,}"),
    ('Compromised Users', f"{fraud_rows['user_id'].nunique():,}"),
    ('Avg Transaction Amount (EUR)', f"{df['amount'].mean():.2f}"),
    ('Avg Fraud Amount (EUR)', f"{fraud_rows['amount'].mean():.2f}"),
    ('Most Common Device', df['device_type'].mode()[0]),
    ('Riskiest Country', country_risk.idxmax()),
]
summary = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])

# Plain-text table framed by a banner
print("=" * 60, "EDA SUMMARY STATISTICS", "=" * 60, sep="\n")
print(summary.to_string(index=False))