In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide display settings: use seaborn's 'whitegrid' style for plots
# and lift the cap on how many DataFrame columns pandas prints.
sns.set_style('whitegrid')
pd.options.display.max_columns = None

In [None]:
# Load datasets
# All three raw CSVs are read from the same folder, given relative to the
# notebook's working directory.
RAW_DATA_DIR = '../data/raw'

try:
    fraud_data = pd.read_csv(f'{RAW_DATA_DIR}/Fraud_Data.csv')
    ip_map = pd.read_csv(f'{RAW_DATA_DIR}/IpAddress_to_Country.csv')
    credit_card = pd.read_csv(f'{RAW_DATA_DIR}/creditcard.csv')
except FileNotFoundError as e:
    print(f"Error loading datasets: {e}")
    print("Please ensure the data files are in the 'data/raw/' directory.")
    # Re-raise so execution stops at this cell. Swallowing the error would
    # leave the DataFrames undefined and make every later cell fail with an
    # unrelated NameError.
    raise
else:
    # Only reached when all three files were read.
    print("Datasets loaded successfully!")

In [None]:
# Initial Inspection - Fraud_Data.csv
# Prints the frame summary, a preview of the first rows, and the number of
# fully duplicated rows.
print("--- Fraud_Data Info ---")
fraud_data.info()

fraud_preview = fraud_data.head()
print("\n--- First 5 Rows ---")
print(fraud_preview)

fraud_duplicate_count = fraud_data.duplicated().sum()
print(f"\n--- Duplicates: {fraud_duplicate_count} ---")

In [None]:
# Data Cleaning - Fraud_Data.csv
# Parse both timestamp columns with pd.to_datetime.
for time_col in ('signup_time', 'purchase_time'):
    fraud_data[time_col] = pd.to_datetime(fraud_data[time_col])
print("\n'signup_time' and 'purchase_time' converted to datetime objects.")

# Per-column count of missing values
missing_per_column = fraud_data.isnull().sum()
print(f"\n--- Missing Values ---\n{missing_per_column}")

In [None]:
# Initial Inspection - creditcard.csv
# Prints the frame summary followed by a preview of the first rows.
print("\n\n--- creditcard.csv Info ---")
credit_card.info()

credit_card_preview = credit_card.head()
print("\n--- First 5 Rows ---")
print(credit_card_preview)


In [None]:
# Data Cleaning - creditcard.csv
# Report the per-column count of missing values.
print(f"\n--- Missing Values ---\n{credit_card.isnull().sum()}")

# Count fully duplicated rows and drop them if any exist.
num_duplicates = credit_card.duplicated().sum()
print(f"\n--- Duplicates found: {num_duplicates} ---")
if num_duplicates > 0:
    # Rebind the name to the de-duplicated frame rather than mutating with
    # inplace=True; the result is the same rows with the original index
    # labels, and the cell stays safe to re-run.
    credit_card = credit_card.drop_duplicates()
    print(f"Removed {num_duplicates} duplicates. New shape: {credit_card.shape}")


In [None]:
# Initial Inspection - IpAddress_to_Country.csv
# Prints the frame summary followed by a preview of the first rows.
print("\n\n--- IpAddress_to_Country.csv Info ---")
ip_map.info()

ip_map_preview = ip_map.head()
print("\n--- First 5 Rows ---")
print(ip_map_preview)

In [None]:
# EDA - Credit Card Data - Target Variable Distribution
# Prints the count and percentage of each 'Class' value, then plots and
# saves a bar chart of the counts.
print("--- Credit Card Data: Class Distribution ---")
class_counts = credit_card['Class'].value_counts()
# Percentages derived from the counts already computed (same values as
# value_counts(normalize=True) * 100, without a second pass over the data).
class_perc = class_counts / class_counts.sum() * 100

print(f"Non-Fraudulent (0): {class_counts[0]} ({class_perc[0]:.4f}%)")
print(f"Fraudulent (1):     {class_counts[1]} ({class_perc[1]:.4f}%)")

plt.figure(figsize=(8, 6))
sns.countplot(x='Class', data=credit_card)
plt.title('Credit Card Transaction Class Distribution', fontsize=16)
plt.ylabel('Number of Transactions')
plt.xlabel('Class (0: Non-Fraud, 1: Fraud)')

creditcard_fig_path = '../reports/figures/creditcard_class_distribution.png'
# savefig does not create missing parent directories; make sure the output
# folder exists so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(creditcard_fig_path), exist_ok=True)
plt.savefig(creditcard_fig_path)
plt.show()

In [None]:
# EDA - Credit Card Data - Transaction Amount & Time Analysis
# Two side-by-side histograms (Amount, Time), followed by a separate
# box plot of Amount split by Class.
fig, axes = plt.subplots(1, 2, figsize=(18, 5))
amount_ax, time_ax = axes

# Left panel: transaction amount (x-axis clipped for better visibility)
sns.histplot(credit_card['Amount'], bins=50, kde=True, ax=amount_ax)
amount_ax.set_title('Distribution of Transaction Amount')
amount_ax.set_xlim(0, 1000)

# Right panel: transaction time
sns.histplot(credit_card['Time'], bins=50, kde=True, ax=time_ax)
time_ax.set_title('Distribution of Transaction Time')

fig.suptitle('Amount and Time Distributions', fontsize=16)
plt.show()

# Amount for Fraud vs. Non-Fraud (y-axis clipped for better visibility)
box_fig, box_ax = plt.subplots(figsize=(10, 7))
sns.boxplot(x='Class', y='Amount', data=credit_card, ax=box_ax)
box_ax.set_title('Transaction Amount by Class')
box_ax.set_ylim(0, 500)
plt.show()

In [None]:
# EDA - E-commerce Data - Target Variable Distribution
# Prints the count and percentage of each 'class' value, then plots and
# saves a bar chart of the counts.
print("\n--- E-commerce Data: Class Distribution ---")
class_counts_ecom = fraud_data['class'].value_counts()
# Percentages derived from the counts already computed (same values as
# value_counts(normalize=True) * 100, without a second pass over the data).
class_perc_ecom = class_counts_ecom / class_counts_ecom.sum() * 100

print(f"Non-Fraudulent (0): {class_counts_ecom[0]} ({class_perc_ecom[0]:.2f}%)")
print(f"Fraudulent (1):     {class_counts_ecom[1]} ({class_perc_ecom[1]:.2f}%)")

plt.figure(figsize=(8, 6))
sns.countplot(x='class', data=fraud_data)
plt.title('E-commerce Transaction Class Distribution', fontsize=16)
# Axis labels so the saved figure is readable on its own.
plt.ylabel('Number of Transactions')
plt.xlabel('Class (0: Non-Fraud, 1: Fraud)')

ecommerce_fig_path = '../reports/figures/ecommerce_class_distribution.png'
# savefig does not create missing parent directories; make sure the output
# folder exists so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(ecommerce_fig_path), exist_ok=True)
plt.savefig(ecommerce_fig_path)
plt.show()

In [None]:
# EDA - E-commerce Data - Categorical Feature Analysis
# One count plot per categorical feature (bars ordered by frequency),
# saved as a single figure, followed by the mean of 'class' per category.
categorical_features = ['source', 'browser', 'sex']
fig, axes = plt.subplots(1, 3, figsize=(22, 6))

for ax, feature in zip(axes, categorical_features):
    # Order the bars from most to least frequent category.
    category_order = fraud_data[feature].value_counts().index
    sns.countplot(x=feature, data=fraud_data, ax=ax, order=category_order)
    ax.set_title(f'Distribution of {feature.capitalize()}')
    ax.tick_params(axis='x', rotation=45)

plt.suptitle('Categorical Feature Distributions', fontsize=16)
# rect leaves room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

categorical_fig_path = '../reports/figures/ecommerce_categorical_distributions.png'
# savefig does not create missing parent directories; make sure the output
# folder exists so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(categorical_fig_path), exist_ok=True)
plt.savefig(categorical_fig_path)
plt.show()

# Fraud rate by category: the mean of the 0/1 'class' column within each
# category, highest first.
for feature in categorical_features:
    fraud_rate = fraud_data.groupby(feature)['class'].mean().sort_values(ascending=False)
    print(f"\n--- Fraud Rate by {feature.capitalize()} ---")
    print(fraud_rate)