# Initial Data Exploration: Banking Analytics Pipeline

This notebook is an in-depth exploration of the simulated banking data loaded into the Dockerized PostgreSQL database. It connects through the project's robust SQLAlchemy connection utility, reads the data into pandas DataFrames, and then profiles the data, runs integrity checks, computes descriptive statistics, and creates visualizations to understand the dataset's characteristics.

- **Dataset:** Customers, Accounts, Transactions
- **Environment:** Jupyter Notebook (VS Code / Jupyter Lab)
- **Connection:** `src.database.db_connection`

In [None]:
# --- Project Initialization and Environment Setup ---
import sys
import os
from pathlib import Path

# --- Make the project root importable ---
# Modules under 'src' are imported further down, so the directory that holds
# 'src' has to be on sys.path. This assumes the working directory is
# <project_root>/notebooks/, i.e. the project root is the parent of the
# current directory.
project_root = Path.cwd().parent.resolve()
project_root_str = os.fspath(project_root)

# Put the root at the front of the search path, but only once, so re-running
# this cell does not pile up duplicate entries.
already_on_path = project_root_str in sys.path
if not already_on_path:
    sys.path[:0] = [project_root_str]

# --- Core Imports ---
# Standard library and third-party
import logging
from pathlib import Path
# Data handling and visualization (inline plotting is typically handled by the VS Code Jupyter extension)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import text

# Project-specific import (should now work due to path setup)
from src.database.db_connection import get_database_engine

# --- Logging, output location, and plot defaults ---
# Configure root logging at INFO and create this notebook's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("notebook_exploration")

# Generated artifacts go to ./output. The path is relative, so it resolves
# against the current working directory (notebooks/ when run as intended).
# An already-existing directory is reused rather than treated as an error.
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Shared look for every figure: dark-grid style sheet, viridis palette,
# and a 10x6 inch default figure size.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("viridis")
plt.rcParams['figure.figsize'] = (10, 6)

print("✅ Libraries imported, paths configured, and base settings applied.")

# --- Database Connection ---
# Obtain the engine from the project's connection utility, then run a one-row
# query to confirm the connection works and to report which database and
# user this session is using.
try:
    engine = get_database_engine()
    logger.info("Database engine acquired.")

    with engine.connect() as conn:
        # Single round-trip: fetch the database name and the session user.
        db_info_row = conn.execute(
            text("SELECT current_database(), current_user;")
        ).fetchone()

        if db_info_row is not None:
            # The row holds exactly the two selected columns, in order.
            db_name, db_user = db_info_row
            print(f"✅ Connected to database '{db_name}' as user '{db_user}'.")
        else:
            # Not expected for this query; fall back to placeholders so the
            # two names are still defined for anything that reads them later.
            logger.warning("Database info query returned no rows.")
            db_name = "Unknown"
            db_user = "Unknown"
            print("⚠️  Connected to database (info retrieval failed).")

except Exception as exc:
    # Covers failures in engine creation, connecting, executing, or fetching.
    # Log with the traceback, show a short message, then re-raise: nothing
    # after this cell can run without a working database connection.
    logger.exception(f"Database connection or info retrieval failed: {exc}")
    print(f"❌ Error connecting to or querying database: {exc}")
    raise



INFO:src.database.db_connection:Creating database engine for database=banking_analytics_db host=localhost port=5432 driver=postgresql+psycopg
INFO:notebook_exploration:Database engine acquired.


✅ Libraries imported, paths configured, and base settings applied.
✅ Connected to database 'banking_analytics_db' as user 'bank_user'.
