# 📓 01_baseline_model.ipynb
**Detecting Fraud — Baseline Model & Evaluation**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

from utils.metrics_helpers import plot_confusion_matrix

# Apply seaborn's white-grid theme to every figure drawn below.
# set_theme() is the documented entry point; sns.set() is only an alias for it
# that the seaborn docs say may be removed in a future release.
sns.set_theme(style='whitegrid')

In [None]:
# Load the dataset
# The path is relative, so it resolves against the kernel's working directory.
DATA_PATH = '../data/imbl_fraud.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell's rich output.
df.head()

In [None]:
# Explore class distribution
# normalize=True yields relative frequencies (fractions of all rows), not raw counts.
fraud_counts = df['isFraud'].value_counts(normalize=True)
print("\nClass Distribution (isFraud):", fraud_counts, sep="\n")

# Bar chart of the raw row count per class, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=df, x='isFraud', ax=ax)
ax.set(
    title="Class Distribution: Fraud vs. Not Fraud",
    xlabel="isFraud",
    ylabel="Count",
)
plt.show()

In [None]:
# Optional EDA: amount by type
# One box per transaction type; the y-axis is logarithmic so that boxes for
# small and large amounts remain readable on the same figure.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=df, x='type', y='amount', ax=ax)
ax.set_yscale('log')
ax.set_title("Transaction Amounts by Type (log scale)")
# Tilt the category labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Prepare data for modeling
# Columns removed from the feature matrix: the target itself, the two account
# name columns, and the four balance columns.
NON_FEATURE_COLUMNS = [
    'isFraud', 'nameOrig', 'nameDest',
    'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest',
]

y = df['isFraud']
features_raw = df.drop(columns=NON_FEATURE_COLUMNS)
# One-hot encode 'type'; drop_first=True omits the first category's indicator column.
X = pd.get_dummies(features_raw, columns=['type'], drop_first=True)

# 70/30 train/test split, stratified on the target, with a fixed seed so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Train a baseline model
# Random forest with 100 trees and a fixed seed, fitted on the training split only.
baseline_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**baseline_params)
model.fit(X_train, y_train)

In [None]:
# Evaluate model
# Hard class predictions on the held-out split feed both the confusion matrix
# and the per-class precision/recall/F1 report.
y_pred = model.predict(X_test)
plot_confusion_matrix(y_test, y_pred, labels=[0, 1])

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

# ROC-AUC is computed from a continuous score rather than a hard label, so use
# the predicted probability in column 1 of predict_proba.
# NOTE(review): column 1 corresponds to model.classes_[1]; assumed to be the
# fraud label (1) -- confirm against model.classes_.
y_scores = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_scores)
print(f"\nROC-AUC Score: {roc_auc:.4f}")

In [None]:
# Closing discussion questions, printed as the cell's text output.
discussion_prompt = """
🤔 Discussion Prompt:
- Our model got high accuracy. Is that enough?
- Which metric would matter most to a bank investigating fraud cases?
- How well is the model doing on detecting the fraud class (1)?

We'll explore better methods in the next notebook!
"""
print(discussion_prompt)