In [None]:
from pathlib import Path

import pandas as pd

# Load data
# Keep the raw frame under its own name so the cleaning chain below never mutates it.
fraud_raw = pd.read_csv('data/raw/Fraud_Data.csv')

# Check missing values
# If any count is non-zero, impute (median for numerical, mode for categorical) BEFORE the
# type casts below: casting a column that contains NaN to int raises an error.
print(fraud_raw.isnull().sum())
# Example imputation (if needed):
# fraud_raw = fraud_raw.fillna({'purchase_value': fraud_raw['purchase_value'].median()})

# Remove duplicates and correct data types in a single non-mutating chain.
# `assign` replaces the timestamp columns in place (column order is preserved) and
# `astype` with a dict casts only the listed columns.
fraud_df = (
    fraud_raw
    .drop_duplicates()
    .assign(
        signup_time=lambda frame: pd.to_datetime(frame['signup_time']),
        purchase_time=lambda frame: pd.to_datetime(frame['purchase_time']),
    )
    .astype({
        'user_id': int,
        'purchase_value': float,
        'age': int,
        'class': int,  # Target as int
    })
)

# Save cleaned version
# `to_csv` does not create missing directories, so make sure the target folder exists first.
cleaned_path = Path('data/processed/cleaned_fraud_data.csv')
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
fraud_df.to_csv(cleaned_path, index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Make sure the output folder exists; savefig does not create missing directories.
FIGURES_DIR = Path('figures')
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# Every plot below gets its own figure/axes. Drawing a second seaborn plot without
# opening a new figure would layer it on top of the previous one, and the saved PNG
# would then contain both plots.

# Univariate: Distributions
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(fraud_df['purchase_value'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Purchase Value')
fig.savefig(FIGURES_DIR / 'purchase_value_dist.png')  # Save for report

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(fraud_df['age'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Age')
fig.savefig(FIGURES_DIR / 'age_dist.png')

# Bivariate: Relationships with target
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='class', y='purchase_value', data=fraud_df, ax=ax)
ax.set_title('Purchase Value vs Fraud Class')
fig.savefig(FIGURES_DIR / 'purchase_vs_class.png')

fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='class', y='age', data=fraud_df, ax=ax)
ax.set_title('Age vs Fraud Class')
fig.savefig(FIGURES_DIR / 'age_vs_class.png')

# Class distribution
# Share of each class as a percentage of all rows.
class_dist = fraud_df['class'].value_counts(normalize=True) * 100
print(class_dist)  # e.g., 0: 90.6%, 1: 9.4%
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='class', data=fraud_df, ax=ax)
ax.set_title('Class Distribution')
fig.savefig(FIGURES_DIR / 'class_dist_fraud.png')