In [None]:
# ====================
# CAPSTONE PROJECT EDA
# ====================

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            classification_report)

# Feature importance & explainability
import shap

# Settings
# Optional: uncomment to show every column when displaying wide DataFrames.
# pd.set_option('display.max_columns', None)
# NOTE(review): the 'seaborn-v0_8-' style prefix presumably requires a recent
# matplotlib release (3.6+) — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
# Shared seed constant. It is not referenced in the cells shown here; presumably
# intended for train_test_split / model constructors later — TODO confirm.
RANDOM_STATE = 42

# Visible confirmation that the setup cell ran to completion.
print("Environment ready! ✓")

Environment ready! ✓


In [None]:
# =======================
# STEP 1: PROBLEM FRAMING
# =======================

# Document your problem statement here.
# Square brackets delimit the template's fill-in fields, so every "[" needs a
# matching "]" (the "Business" success-metric field was previously left open).
# TODO: "ML TASK TYPES" still lists every template option — narrow it to the
# task(s) this project actually uses.
PROBLEM_STATEMENT = """
BUSINESS PROBLEM:
[Air pollution is a growing global challenge with severe consequences for public health and economic sustainability. 
According to the World Health Organization (WHO), air pollution is responsible for millions of premature deaths each year 
and is a major risk factor for lung diseases such as asthma, chronic obstructive pulmonary disease (COPD), lung cancer and 
respiratory infections. The WHO recommends strict limits on pollutants like fine particulate matter, nitrogen dioxide (NO₂) 
and ozone, emphasizing that reducing air pollution is one of the most effective ways to prevent non-communicable diseases 
and lower healthcare burdens.

Analyzing and understanding daily air quality data can play a crucial role in addressing these challenges. To do so, it is 
essential to identify which factors — such as temperature, humidity, time of day, and seasonal patterns — influence air quality 
levels.
By gaining these insights, actionable recommendations can be provided to the public to reduce exposure to harmful pollutants. 
For example, guidance can be given on when to avoid leaving the house or which times of day are safest for outdoor physical 
activities such as exercise.
In addition, weather forecasts combined with air quality predictions can be used proactively by authorities to manage traffic 
and industrial activity before pollution levels rise too high and exceed WHO-recommended thresholds. This preventive approach 
can help protect public health while reducing long-term healthcare costs.]

ML TASK TYPES:
[Classification / Regression / Clustering / Anomaly Detection / Recommendation]

SUCCESS METRICS:
- Technical: [Accuracy > 0.75, F1 > 0.75]
- Business: [Provide recommendations to the public on whether it is safe to leave the house:
	YES (Green)     -> Safe to go outside
	AVOID (Yellow)  -> Limit outdoor activities
	DANGER (Red)    -> Stay indoors, WHO-recommended threshold was exceeded]

TARGET VARIABLE:
[Air pollution levels exceeded the WHO-recommended threshold for this toxin]
"""

print(PROBLEM_STATEMENT)


BUSINESS PROBLEM:
[Air pollution is a growing global challenge with severe consequences for public health and economic sustainability. 
According to the World Health Organization (WHO), air pollution is responsible for millions of premature deaths each year 
and is a major risk factor for lung diseases such as asthma, chronic obstructive pulmonary disease (COPD), lung cancer and 
respiratory infections. The WHO recommends strict limits on pollutants like fine particulate matter, nitrogen dioxide (NO₂) 
and ozone, emphasizing that reducing air pollution is one of the most effective ways to prevent non-communicable diseases 
and lower healthcare burdens.

Analyzing and understanding daily air quality data can play a crucial role in addressing these challenges. To do so, it is 
essential to identify which factors — such as temperature, humidity, time of day, and seasonal patterns — influence air quality 
levels.
By gaining these insights, actionable recommendations can be provided to the

In [None]:
# =============================================================================
# STEP 2: DATA LOADING & INITIAL EXPLORATION
# =============================================================================

# Load your dataset
# df = pd.read_csv('data/raw/your_dataset.csv')

# Quick overview
def data_overview(df):
    """Print a summary of a DataFrame: size, memory, dtypes, missing values, head.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to summarize. It is only read, never modified.

    Returns
    -------
    None
        All information is written to stdout / the notebook output area.
    """
    banner = "=" * 60
    print(banner)
    print("DATASET OVERVIEW")
    print(banner)
    print(f"\nShape: {df.shape[0]} rows × {df.shape[1]} columns")
    print(f"\nMemory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    print(f"\nColumn Types:\n{df.dtypes.value_counts()}")

    # Count nulls once and keep only the columns that actually have some.
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if missing.empty:
        # Avoid printing the confusing "Series([], dtype: int64)" repr.
        print("\nMissing Values: None")
    else:
        print(f"\nMissing Values:\n{missing}")

    print("\nFirst 5 Rows:")
    # `display` is only available as a bare name inside IPython/Jupyter; fall
    # back to plain printing so the helper also works in a regular script.
    try:
        show = display
    except NameError:
        show = print
    show(df.head())

# Uncomment when you have data loaded:
# data_overview(df)

In [None]:
# =============================================================================
# STEP 3: EDA & FEATURE ENGINEERING (Template)
# =============================================================================

# 3a. Missing Value Analysis
def analyze_missing(df):
    """Summarize null counts per column.

    Returns a DataFrame indexed by column name with 'Missing Count' and
    'Missing %' columns, restricted to columns that contain at least one
    null and ordered from the highest to the lowest missing percentage.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing %': 100 * null_counts / len(df),
    })
    # Drop fully populated columns before ranking.
    has_nulls = summary['Missing Count'] > 0
    affected = summary[has_nulls]
    return affected.sort_values('Missing %', ascending=False)

# 3b. Distribution plotting
def plot_distributions(df, columns, figsize=(15, 5)):
    """Plot one distribution panel per column in a grid up to three wide.

    Numeric columns (any integer/float dtype, excluding booleans) are drawn as
    30-bin histograms of their non-null values; every other column is drawn as
    a bar chart of its value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    columns : iterable of str
        Column names to plot. If empty, nothing is drawn.
    figsize : tuple, default (15, 5)
        Size of the whole figure, passed to ``plt.subplots``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    columns = list(columns)
    if not columns:
        # Nothing to draw; the grid arithmetic below would divide by zero.
        return

    n_cols = min(3, len(columns))
    n_rows = (len(columns) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always returns a 2-D array of axes, so a single-panel
    # figure needs no special-casing before flattening.
    _, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, columns):
        series = df[col]
        # Comparing against the literal names 'int64'/'float64' would send
        # int32, float32 and nullable numeric columns to the bar-chart branch.
        is_numeric = (pd.api.types.is_numeric_dtype(series)
                      and not pd.api.types.is_bool_dtype(series))
        if is_numeric:
            # Cast to float so nullable extension dtypes reach matplotlib as a
            # plain numeric array.
            ax.hist(series.dropna().astype(float), bins=30, edgecolor='black')
        else:
            series.value_counts().plot(kind='bar', ax=ax)
        ax.set_title(col)

    # Hide leftover grid cells when the column count does not fill the grid.
    for ax in axes[len(columns):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Confirmation output so the cell shows that the helper definitions above ran.
print("EDA functions defined ✓")