# Fraud Prediction — Data Exploration

This notebook explores the sample transaction dataset and demonstrates model training.

In [None]:
import sys
# Put the parent directory first on the import path so the `src` package
# imported below can be resolved. NOTE(review): '..' is relative to the
# current working directory — presumably the notebook is run from a folder
# one level below the project root; confirm.
sys.path.insert(0, '..')

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.preprocessing import load_data, get_features_and_target
from src.model import train_model

# Apply seaborn's 'whitegrid' theme to the plots drawn after this call.
sns.set_theme(style='whitegrid')

## 1. Load data

In [None]:
# Read the sample transactions, report the frame's dimensions, and show
# its leading rows as the cell output.
sample_csv = '../data/sample_data.csv'
df = load_data(sample_csv)
print(df.shape)
df.head()

## 2. Class distribution

In [None]:
# Share of each class in the `fraud` column, relabelled for display.
class_share = df['fraud'].value_counts(normalize=True)
class_share = class_share.rename({0: 'Legitimate', 1: 'Fraudulent'})

# Bar chart of the proportions with horizontal tick labels.
class_share.plot(kind='bar', title='Class Distribution', rot=0)
plt.ylabel('Proportion')
plt.tight_layout()
plt.show()

## 3. Feature distributions

In [None]:
# Split the frame into features and target, then attach the target to a
# *copy* used only for plotting. Writing `X['fraud'] = y` in place would add
# the label to the feature matrix itself (a leakage hazard if X is reused,
# and a possible SettingWithCopyWarning if X is a slice of df).
X, y = get_features_and_target(df)
plot_df = X.assign(fraud=y)

numeric_cols = ['amount', 'distance_from_home', 'distance_from_last_transaction',
                'ratio_to_median_purchase_price']

# Dict lookup instead of list indexing: it also works when the labels are
# floats such as 0.0/1.0, and falls back to the raw label for anything else.
class_names = {0: 'Legitimate', 1: 'Fraudulent'}

# One subplot per feature, with an overlaid histogram for each class.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, col in zip(axes.flat, numeric_cols):
    for label, grp in plot_df.groupby('fraud'):
        grp[col].hist(ax=ax, alpha=0.6, bins=40,
                      label=class_names.get(label, str(label)))
    ax.set_title(col)
    ax.legend()
plt.tight_layout()
plt.show()

## 4. Correlation heatmap

In [None]:
# Correlate only numeric/boolean columns: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0. When every remaining column is
# already numeric this selects them all, so the heatmap is unchanged.
corr_input = (
    df.drop(columns=['transaction_id'])
      .select_dtypes(include=['number', 'bool'])
)

plt.figure(figsize=(10, 8))
# Annotate each cell with its coefficient, rounded to two decimals.
sns.heatmap(corr_input.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

## 5. Train model and evaluate

In [None]:
# Load the sample data again into its own frame and train on it.
df2 = load_data('../data/sample_data.csv')
model, scaler, report, cm, importances = train_model(df2)

# Overall accuracy first, then the metrics stored under the report's '1' key.
print('Accuracy:', report['accuracy'])
for heading, metric in (('Fraud precision:', 'precision'),
                        ('Fraud recall:', 'recall'),
                        ('Fraud F1:', 'f1-score')):
    print(heading, report['1'][metric])

In [None]:
# Sort ascending so that, in a horizontal bar chart, the largest importance
# ends up as the top bar. Requires pandas to be imported as `pd`.
imp_df = pd.Series(importances).sort_values(ascending=True)
imp_df.plot(kind='barh', title='Feature Importances', figsize=(8, 5))
# Label the value axis, matching the labelled axis of the class-distribution plot.
plt.xlabel('Importance')
plt.tight_layout()
plt.show()