# Exploratory Data Analysis - Fraud_Data.csv

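This notebook performs exploratory data analysis (EDA) on the `Fraud_Data.csv` dataset: class balance, univariate and bivariate feature distributions, time-based features derived from the signup and purchase timestamps, and fraud rates by country via an IP-to-country lookup. The enriched dataset is saved to `../data/processed/fraud_data_processed.csv` at the end.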

In [1]:
# Import necessary libraries
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: seaborn's white-grid look and a 12x8 default canvas
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 8))

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the raw fraud transactions from disk into a DataFrame
fraud_data_path = '../data/raw/Fraud_Data.csv'
fraud_df = pd.read_csv(filepath_or_buffer=fraud_data_path)

# Preview the top five rows as the cell's rendered output
fraud_df.head(n=5)

In [None]:
# Structural overview: dimensions, per-column dtypes and per-column null counts
print("Shape of the dataset:", fraud_df.shape)
for heading, report in (
    ("\nData types:", fraud_df.dtypes),
    ("\nMissing values:", fraud_df.isna().sum()),
):
    # One heading line followed by the report, exactly as two separate prints would emit
    print(heading, report, sep="\n")

# The describe() table is left as the last expression so it renders as rich output
print("\nSummary statistics:")
fraud_df.describe()

In [None]:
# Target balance: bar chart of the `class` column, then raw counts and percentages
plt.figure(figsize=(8, 6))
class_ax = fraud_df['class'].value_counts().plot.bar()  # a Series plot draws on the current figure
class_ax.set_title('Class Distribution')
class_ax.set_xlabel('Class (0: Non-Fraud, 1: Fraud)')
class_ax.set_ylabel('Count')
plt.show()

# Same numbers as text: absolute counts first, then share of rows in percent
print("Class distribution:", fraud_df['class'].value_counts(), sep="\n")
print("\nPercentage:", fraud_df['class'].value_counts(normalize=True) * 100, sep="\n")

In [None]:
# Univariate analysis - numerical features
# Each feature gets one figure: histogram with KDE on the left, boxplot on the right.
numerical_cols = ['purchase_value', 'age']

for col in numerical_cols:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    sns.histplot(fraud_df[col], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {col}')

    sns.boxplot(x=fraud_df[col], ax=box_ax)
    box_ax.set_title(f'Boxplot of {col}')

    plt.show()

In [None]:
# Univariate analysis - categorical features
# One bar chart of value frequencies per column.
categorical_cols = ['source', 'browser', 'sex']

for col in categorical_cols:
    plt.figure(figsize=(10, 6))
    counts_ax = fraud_df[col].value_counts().plot.bar()  # drawn on the figure created above
    counts_ax.set(title=f'Distribution of {col}', xlabel=col, ylabel='Count')
    plt.show()

In [None]:
# Bivariate analysis - numerical features vs target
# Side-by-side boxplots of each numerical feature, split by the `class` column.
for col in numerical_cols:
    plt.figure(figsize=(10, 6))
    box_ax = sns.boxplot(data=fraud_df, x='class', y=col)
    box_ax.set_title(f'{col} vs Class')
    plt.show()

In [None]:
# Bivariate analysis - Categorical features vs Target
# Stacked bars: for every category value, how many rows fall in each class.
for col in categorical_cols:
    # DataFrame.plot opens its own figure unless it is handed an Axes, so a bare
    # plt.figure() beforehand would be left empty and its figsize would be ignored.
    # Creating the Axes explicitly keeps one correctly sized figure per feature.
    fig, ax = plt.subplots(figsize=(10, 6))
    pd.crosstab(fraud_df[col], fraud_df['class']).plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'{col} vs Class')
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlation between the numerical features and the target column,
# using DataFrame.corr() with its default settings.
corr_cols = [*numerical_cols, 'class']
corr_matrix = fraud_df.loc[:, corr_cols].corr()

plt.figure(figsize=(8, 6))
# Colour scale pinned to [-1, 1] so the midpoint of the palette means "no correlation"
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Parse the signup and purchase timestamp columns into datetime values (in place)
signup_ts = pd.to_datetime(fraud_df['signup_time'])
fraud_df['signup_time'] = signup_ts
purchase_ts = pd.to_datetime(fraud_df['purchase_time'])
fraud_df['purchase_time'] = purchase_ts

# Calendar components of each timestamp; listed in the order the columns are appended
time_parts = (
    ('signup_hour', signup_ts.dt.hour),
    ('signup_day', signup_ts.dt.day),  # day of the month
    ('signup_month', signup_ts.dt.month),
    ('purchase_hour', purchase_ts.dt.hour),
    ('purchase_day', purchase_ts.dt.day),  # day of the month
    ('purchase_month', purchase_ts.dt.month),
)
for feature_name, feature_values in time_parts:
    fraud_df[feature_name] = feature_values

# Gap between signup and purchase, converted from seconds to hours
elapsed = purchase_ts - signup_ts
fraud_df['time_since_signup'] = elapsed.dt.total_seconds() / 3600

print("New time-based features added.")

In [None]:
# Distribution of the signup-to-purchase gap, one overlaid histogram per class
plt.figure(figsize=(12, 6))
gap_hist_ax = sns.histplot(data=fraud_df, x='time_since_signup', hue='class', kde=True, alpha=0.7)
gap_hist_ax.set_title('Distribution of Time Since Signup by Class')
gap_hist_ax.set_xlabel('Time Since Signup (hours)')
plt.show()

# The same feature as a per-class boxplot
plt.figure(figsize=(8, 6))
gap_box_ax = sns.boxplot(data=fraud_df, x='class', y='time_since_signup')
gap_box_ax.set(title='Time Since Signup vs Class', ylabel='Time Since Signup (hours)')
plt.show()

In [None]:
# Read the IP-range-to-country lookup table from disk
ip_country_path = '../data/raw/IpAddress_to_Country.csv'
ip_df = pd.read_csv(filepath_or_buffer=ip_country_path)

# Convert IP addresses to integers for range lookup
import ipaddress

def ip_to_int(ip):
    """Convert an IP address to its integer value.

    Parameters
    ----------
    ip : str, int, float or NumPy scalar
        Address text (e.g. ``'192.168.0.1'``) or an address that is already
        numeric. NumPy integers and floats are normalised to a built-in ``int``
        first, because ``ipaddress.ip_address`` only understands ``int``,
        ``str`` and ``bytes`` and would otherwise reject every value of a
        numeric pandas column. Floats are truncated toward zero.

    Returns
    -------
    int or float
        The integer form of the address, or ``np.nan`` when the value is
        missing, non-finite, or not a valid IPv4/IPv6 address.
    """
    if isinstance(ip, (float, np.floating)):
        # NaN / inf cannot be turned into an address
        if not np.isfinite(ip):
            return np.nan
        ip = int(ip)
    elif isinstance(ip, np.integer):
        # np.int64 is not a subclass of int, so ipaddress would treat it as text
        ip = int(ip)

    try:
        return int(ipaddress.ip_address(ip))
    except (ValueError, TypeError):
        # Invalid or unsupported input: keep the best-effort NaN result, but no
        # longer swallow unrelated errors such as KeyboardInterrupt.
        return np.nan

# Element-wise conversion of every address; unparseable values become NaN
fraud_df['ip_address_int'] = fraud_df['ip_address'].map(ip_to_int)

# Merge with country data
# Note: This is a range-based merge, need to find the country for each IP
def find_country(ip_int):
    """Return the country whose IP range in ``ip_df`` contains ``ip_int``.

    Both range bounds are inclusive. When several rows match, the first one in
    ``ip_df`` order is used. Missing addresses and addresses outside every
    range yield ``'Unknown'``.

    Note: each call compares ``ip_int`` against every row of ``ip_df``.
    """
    if pd.notna(ip_int):
        at_or_above_lower = ip_df['lower_bound_ip_address'].le(ip_int)
        at_or_below_upper = ip_df['upper_bound_ip_address'].ge(ip_int)
        hits = ip_df.loc[at_or_above_lower & at_or_below_upper, 'country']
        if len(hits) > 0:
            return hits.iloc[0]
    return 'Unknown'

# Resolve every integer address to a country name ('Unknown' when nothing matches)
fraud_df['country'] = fraud_df['ip_address_int'].map(find_country)

print("Country mapping completed.", "Top 10 countries by transaction count:", sep="\n")
print(fraud_df['country'].value_counts().head(n=10))

In [None]:
# Fraud patterns by country
# Per-country row count, fraud count and fraud share, restricted to the 20
# countries with the most transactions; fraud_rate is the share in percent.
country_fraud = (
    fraud_df.groupby('country')['class']
    .agg(['count', 'sum', 'mean'])
    .sort_values('count', ascending=False)
    .head(20)
    .assign(fraud_rate=lambda stats: stats['mean'] * 100)
    .rename(columns={'count': 'total_transactions', 'sum': 'fraud_transactions', 'mean': 'fraud_rate_raw'})
)

plt.figure(figsize=(12, 8))
rate_ax = country_fraud['fraud_rate'].plot.bar()  # a Series plot draws on the current figure
rate_ax.set_title('Fraud Rate by Country (Top 20 Countries by Transaction Volume)')
rate_ax.set_xlabel('Country')
rate_ax.set_ylabel('Fraud Rate (%)')
plt.xticks(rotation=45, ha='right')
plt.show()

print(country_fraud)

In [None]:
# Save the processed data (original columns plus the engineered time and country features)
processed_path = '../data/processed/fraud_data_processed.csv'
# to_csv does not create missing folders; make sure the output directory exists
# so the save also works on a fresh checkout.
Path(processed_path).parent.mkdir(parents=True, exist_ok=True)
fraud_df.to_csv(processed_path, index=False)
print(f"Processed data saved to {processed_path}")