In [4]:
# import required modules
import pandas as pd
import numpy as np
from dask import dataframe as dd
import holidays
from datetime import datetime
from typing import Dict, List, Optional
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

In [5]:
#######  REMOVE ######
# TODO(review): machine-specific absolute path - this cell only works on the
# author's workstation (see the REMOVE marker above).
sample_csv_path = r'C:\Course\Repo\Files\Fraud detection\credit_card_fraud_2020_sample.csv'
fraud_2020_sample = pd.read_csv(sample_csv_path)

# Work on a copy so the frame as loaded stays available for comparison.
df = fraud_2020_sample.copy()

#### <font color='Indigo'>Add card_brand and card_type from the credit card number</font>

In [None]:
##### helper function to get issuer and card type from credit card number 

# BIN (leading-digit) table.  A key is either a literal prefix ("4026") or an
# inclusive range of equal-length prefixes ("2221-2720").  Table order no
# longer decides precedence: the most specific (longest) prefix always wins.
_CARD_PATTERNS = {
    "4": {"brand": "Visa", "card_type": "Credit"},
    "4026": {"brand": "Visa Electron", "card_type": "Debit"},
    "414170": {"brand": "Visa Rewards", "card_type": "Credit"},
    "4571": {"brand": "Visa Debit", "card_type": "Debit"},
    "4929": {"brand": "Visa Purchase", "card_type": "Credit"},
    "4908": {"brand": "Visa Platinum", "card_type": "Credit"},
    "4910": {"brand": "Visa Gold", "card_type": "Credit"},
    "50": {"brand": "Maestro", "card_type": "Debit"},
    "56": {"brand": "Maestro", "card_type": "Debit"},
    "57": {"brand": "Maestro", "card_type": "Debit"},
    "58": {"brand": "Maestro", "card_type": "Debit"},
    "51": {"card_type": "Credit", "brand": "Mastercard"},
    "52": {"card_type": "Credit", "brand": "Mastercard"},
    "53": {"card_type": "Credit", "brand": "Mastercard"},
    "54": {"card_type": "Credit", "brand": "Mastercard"},
    "55": {"card_type": "Credit", "brand": "Mastercard"},
    "2221-2720": {"card_type": "Credit", "brand": "Mastercard"},
    "34": {"card_type": "Charge", "brand": "American Express"},
    "37": {"card_type": "Charge", "brand": "American Express"},
    "6011": {"card_type": "Credit", "brand": "Discover"},
    "644-649": {"card_type": "Credit", "brand": "Discover"},
    "65": {"card_type": "Credit", "brand": "Discover"},
    "35": {"brand": "JCB", "card_type": "Credit"},
    "30": {"brand": "Diners Club", "card_type": "Credit"},
    "36": {"brand": "Diners Club", "card_type": "Credit"},
    "38": {"brand": "Diners Club", "card_type": "Credit"},
    "39": {"brand": "Diners Club", "card_type": "Credit"},
    "300-305": {"card_type": "Credit", "brand": "Diners Club"},
    "3095": {"card_type": "Credit", "brand": "Diners Club"},
    "2131": {"brand": "JCB", "card_type": "Credit"},
    "1800": {"brand": "JCB", "card_type": "Credit"},
    "62": {"card_type": "Credit", "brand": "UnionPay"},
    "5018": {"card_type": "Debit", "brand": "Maestro"},
    "5020": {"card_type": "Debit", "brand": "Maestro"},
    "5038": {"card_type": "Debit", "brand": "Maestro"},
    "5893": {"card_type": "Debit", "brand": "Maestro"},
    "6304": {"card_type": "Debit", "brand": "Maestro"},
    "6759": {"card_type": "Debit", "brand": "Maestro"},
    "6761": {"card_type": "Debit", "brand": "Maestro"},
    "6762": {"card_type": "Debit", "brand": "Maestro"},
    "6763": {"card_type": "Debit", "brand": "Maestro"},
    "6": {"brand": "Maestro", "card_type": "Debit"},
    "637": {"brand": "InstaPayment", "card_type": "Credit"},
    "638": {"brand": "InstaPayment", "card_type": "Credit"},
    "639": {"brand": "InstaPayment", "card_type": "Credit"},
    "622126-622925": {"brand": "Discover", "card_type": "Credit"},
}

# Prepaid prefixes are consulted before the general table.
_PREPAID_PATTERNS = {
    "604": {"card_type": "Prepaid", "brand": "Visa Gift Card"},
    "5100": {"card_type": "Prepaid", "brand": "Mastercard"},
}


def _compile_bin_rules(patterns: dict) -> list:
    """Convert a prefix table into (width, low, high, info) rules, longest prefix first."""
    rules = []
    for prefix, info in patterns.items():
        low, _, high = prefix.partition("-")
        # A literal prefix is treated as the one-element range [low, low].
        rules.append((len(low), low, high or low, info))
    # Stable sort: longer prefixes are tried first, ties keep table order.
    rules.sort(key=lambda rule: rule[0], reverse=True)
    return rules


# Compiled once at definition time instead of on every call; identify_card is
# applied to every row of the transaction frame.
_PREPAID_RULES = _compile_bin_rules(_PREPAID_PATTERNS)
_CARD_RULES = _compile_bin_rules(_CARD_PATTERNS)


def _match_bin(digits: str, rules: list) -> Optional[dict]:
    """Return a copy of the info dict of the first rule whose range contains the leading digits."""
    for width, low, high, info in rules:
        # All operands are digit strings of identical length, so lexicographic
        # comparison is equivalent to numeric comparison.
        if low <= digits[:width] <= high:
            return dict(info)
    return None


def identify_card(card_number: str) -> dict:
    """
    Identify card type and brand from a card number using BIN prefixes.

    Non-digit characters (spaces, dashes, ...) are stripped first.  Prepaid
    prefixes are checked before the general table, and within each table the
    longest matching prefix wins, so a specific BIN such as "4026"
    (Visa Electron) is no longer shadowed by a generic one such as "4" (Visa).

    Parameters
    ----------
    card_number : str
        Card number; any value is accepted and converted with ``str()``.

    Returns
    -------
    dict
        A new dictionary with the keys ``"card_type"`` and ``"brand"``.
        ``{"card_type": "Invalid", "brand": "Unknown"}`` when the number does
        not contain 12-19 digits, and
        ``{"card_type": "Unknown", "brand": "Unknown"}`` when no prefix matches.
    """
    # Remove any spaces or non-digit characters
    digits = ''.join(filter(str.isdigit, str(card_number)))

    # Check if card number is valid length
    if not (12 <= len(digits) <= 19):
        return {"card_type": "Invalid", "brand": "Unknown"}

    for rules in (_PREPAID_RULES, _CARD_RULES):
        match = _match_bin(digits, rules)
        if match is not None:
            return match

    return {"card_type": "Unknown", "brand": "Unknown"}

def validate_card(card_number: str) -> bool:
    """
    Check a card number against the Luhn checksum.

    Only the digit characters of ``str(card_number)`` are considered; input
    without any digit is reported as invalid.
    """
    digits = [int(ch) for ch in str(card_number) if ch.isdigit()]
    if len(digits) == 0:
        return False

    total = 0
    # Walk from the rightmost digit; every second digit is doubled, and a
    # two-digit product is reduced by 9 (same as summing its digits).
    for position, digit in enumerate(reversed(digits)):
        if position % 2 == 1:
            digit *= 2
            if digit > 9:
                digit -= 9
        total += digit

    return total % 10 == 0

def add_cc_info(df: pd.DataFrame, cc_column: str = 'cc_num') -> pd.DataFrame:
    """
    Add ``card_type`` and ``card_brand`` columns derived from the card number.

    The frame is modified in place (``cc_column`` is converted to ``str`` and
    the two columns are added) and the same object is returned.  No validity
    column is produced; ``validate_card`` is not called here.
    """
    # identify_card works on text, so normalise the column first
    df[cc_column] = df[cc_column].astype(str)

    # One info dict per row
    identified = df[cc_column].apply(identify_card)

    # Spread the dict fields over their target columns (card_type first)
    for target_column, info_key in (('card_type', 'card_type'), ('card_brand', 'brand')):
        df[target_column] = identified.apply(lambda info, key=info_key: info[key])

    return df


In [None]:
# Derive card_type / card_brand from the cc_num column. add_cc_info also
# converts cc_num to str and returns the same frame it was given.
df = add_cc_info(df, 'cc_num')

##### <font color='blue'>View card brand and card type distribution</font>

In [None]:
# Count rows for every (card_brand, card_type) pair.  ``fill_value=0`` fills
# the missing combinations directly, so the counts stay integers instead of
# becoming floats through NaN + fillna.
card_counts = df.groupby(['card_brand', 'card_type']).size().unstack(fill_value=0)

# Create the figure
fig, ax = plt.subplots(figsize=(14, 4))

# Plot the grouped bar chart
card_counts.plot(kind='bar', stacked=False, cmap='tab10', ax=ax, width=0.8, edgecolor='black')

# Apply logarithmic scale to y-axis (counts span several orders of magnitude)
ax.set_yscale('log')

# Add labels and title
# NOTE(review): the counts are rows of df; if df holds one row per transaction
# the y-label should read "Number of Transactions" - confirm.
ax.set_xlabel('Card Brand', fontsize=14, fontweight='bold')
ax.set_ylabel('Number of Cards (Log Scale)', fontsize=14, fontweight='bold')
ax.set_title('Distribution of Card Type by Card Brand', fontsize=16, fontweight='bold', pad=15)

# Improve tick labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right', fontsize=12)

# Add grid lines for better readability
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Add vertical separators between brands
xticks = ax.get_xticks()
for i in range(len(xticks) - 1):
    ax.axvline(x=(xticks[i] + xticks[i + 1]) / 2, color='gray', linestyle='--', alpha=0.5)

# Add legend with better placement
ax.legend(title='Card Type', fontsize=12, title_fontsize=13, loc='upper right', frameon=True)

# Remove unnecessary border
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Add value labels on bars.  Empty (brand, type) combinations have height 0,
# which has no finite position on a log axis, so they get a blank label
# instead of a "0".
for container in ax.containers:
    bar_labels = [f'{height:.0f}' if height > 0 else '' for height in container.datavalues]
    ax.bar_label(container, labels=bar_labels, fontsize=10, label_type='edge', padding=3, color='black')

# Show the plot
plt.tight_layout()
plt.show()


##### <font color='blue'>Plot fraud by card_brand</font>

In [None]:
# Number of fraud transactions per card brand (brands without fraud are absent)
fraud_counts = df.loc[df['is_fraud'] == 1, 'card_brand'].value_counts()

# Total number of transactions (fraud + non-fraud) per card brand
total_counts = df['card_brand'].value_counts()

# Fraud share per brand: fraud transactions / all transactions of that brand * 100.
# fraud_counts is first aligned to every brand with a fill of 0; dividing the
# raw Series would yield NaN (a missing bar and a "nan%" label) for any brand
# that has no fraud rows.  Sorted by brand name for a stable bar order.
fraud_percentages = (
    fraud_counts.reindex(total_counts.index, fill_value=0) / total_counts * 100
).sort_index()

# Create a figure with two subplots side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot the number of fraud transactions
sns.barplot(x=fraud_counts.index, y=fraud_counts.values, palette='viridis', ax=axes[0])
axes[0].set_xlabel('Card Brand', fontsize=12)
axes[0].set_ylabel('Number of Fraud Transactions', fontsize=12)
axes[0].set_title('Number of Fraud Transactions by Card Brand', fontsize=14)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', linestyle='--', alpha=0.7)

# Show values on bars for the first plot (bar i sits at x position i)
for index, value in enumerate(fraud_counts.values):
    axes[0].text(index, value + 10, f"{value:,}", ha='center', fontsize=10)

# Plot the fraud percentage for each card brand
sns.barplot(x=fraud_percentages.index, y=fraud_percentages.values, palette='magma', ax=axes[1])
axes[1].set_xlabel('Card Brand', fontsize=12)
axes[1].set_ylabel('Fraud Percentage (%)', fontsize=12)
axes[1].set_title('Fraud Percentage by Card Brand (Relative to All Transactions)', fontsize=14)
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(axis='y', linestyle='--', alpha=0.7)

# Show values on bars for the second plot
for index, value in enumerate(fraud_percentages.values):
    axes[1].text(index, value + 0.5, f"{value:.2f}%", ha='center', fontsize=10)

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()


#### <font color='Indigo'>Group distance from merchant</font>

In [None]:
# NOTE(review): calculate_distances is not defined in the visible part of this
# notebook; presumably it returns a frame aligned with df that carries a
# 'distance_km' column - confirm it is defined before this cell runs.
df_tmp = calculate_distances(df)
# Use pd.cut() to group 'distance_km' into 10 equal-width bins and label them directly
# (labels are listed from the nearest bin to the farthest one).
# NOTE: `labels` is a notebook-level name that the city-population cells rebind
# later, so the distance countplot must run before those cells.
labels = ["Very Close", "Close", "Somewhat Close", "Moderate", "Intermediate", 
          "Far", "Very Far", "Extremely Far", "Distant", "Very Distant"]

# The bins come from df_tmp, the result is stored on df (assignment aligns on the index).
df['distance_category'] = pd.cut(df_tmp['distance_km'], bins=10, labels=labels)

In [None]:
# One "Set2" colour per category label
palette_colors = sns.color_palette("Set2", len(labels))

# Count of rows per distance category, bars in label order
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='distance_category', data=df, order=labels, palette=palette_colors, ax=ax)

# Titles and axis labels
ax.set_title('Count of Distance Groups', fontsize=14, fontweight='bold', pad=10)
ax.set_xlabel('Distance Range', fontsize=12, fontweight='bold')
ax.set_ylabel('Count', fontsize=12, fontweight='bold')

# Tilt the category names so they do not overlap
ax.tick_params(axis='x', labelrotation=20)

# Show the plot
plt.show()

#### <font color='Indigo'>Group city population</font>

In [6]:
# Extremes of the city_pop column, inspected before binning it
city_pop_min, city_pop_max = df['city_pop'].min(), df['city_pop'].max()

print(f"City Population Range: {city_pop_min} - {city_pop_max}")
# Number of distinct population values
print(df['city_pop'].nunique())

NameError: name 'df' is not defined

In [None]:
# Ten names, listed from the lowest to the highest population decile.
# NOTE: this rebinds the notebook-level `labels` that the distance cells also
# use; run the distance countplot before this cell.
labels = [
    "Very Small Town", "Small Town", "Midsize Town", "Large Town",
    "Small City", "Midsize City", "Large City", "Major City",
    "Metropolitan Area", "Large Metropolitan Area"
]

# Quantile-based bins (q=10), so each range holds roughly the same number of rows.
# NOTE(review): pd.qcut raises ValueError when heavy ties make two decile
# edges equal - confirm this cannot happen on the full data set.
df['city_pop_range'] = pd.qcut(df['city_pop'], q=10, labels=labels)

In [None]:
# One "Set2" colour per category label
palette_colors = sns.color_palette("Set2", len(labels))

# Count of rows per city-population range, bars in label order
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='city_pop_range', data=df, order=labels, palette=palette_colors, ax=ax)

# Titles and axis labels
ax.set_title('Count of city population Groups', fontsize=14, fontweight='bold', pad=10)
ax.set_xlabel('city population Range', fontsize=12, fontweight='bold')
ax.set_ylabel('Count', fontsize=12, fontweight='bold')

# Tilt the category names so they do not overlap
ax.tick_params(axis='x', labelrotation=20)

# Show the plot
plt.show()

#### <font color='Indigo'>Group age</font>

In [None]:
# Bucket ages into named, left-closed intervals:
# [0, 18) Teen, [18, 22) Student, [22, 35) Young Adult,
# [35, 50) Midlife, [50, 65) Mature Adult, [65, inf) Senior
df['age_range'] = pd.cut(
    df['age'],
    bins=[0, 18, 22, 35, 50, 65, float('inf')],
    labels=['Teen', 'Student', 'Young Adult', 'Midlife', 'Mature Adult', 'Senior'],
    right=False,
)

In [None]:
# Count rows per age group.  age_range is categorical (built with pd.cut), so
# observed=False is passed explicitly: it pins the current behaviour of
# listing every age group, including empty ones with a count of 0, instead of
# relying on the groupby default, which pandas has deprecated and will change.
age_group_aggregation = (
    df.groupby('age_range', observed=False)
    .size()
    .reset_index(name='count')
)

age_group_aggregation

In [None]:
# Count of rows per age group, bars ordered from youngest to oldest
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    x='age_range',
    data=df,
    order=['Teen', 'Student', 'Young Adult', 'Midlife', 'Mature Adult', 'Senior'],
    ax=ax,
)
ax.set_title('Count of Age Groups')
ax.set_xlabel('Age Range')
ax.set_ylabel('Count')
plt.show()

##### <font color='blue'>Plot card fraud distribution by distance from merchant</font>

In [None]:
# Keep only the fraud transactions (is_fraud == 1)
fraud_df = df.loc[df['is_fraud'] == 1]

# Count of fraud transactions per 'distance_category'
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x='distance_category', data=fraud_df, palette='viridis', ax=ax)

# Labels, title and a light horizontal grid
ax.set_xlabel('Distance Category', fontsize=12)
ax.set_ylabel('Number of Fraud Transactions', fontsize=12)
ax.set_title('Fraud Transactions by Distance From Merchant Category', fontsize=14)
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.show()