# Air Quality Data Exploration

This notebook provides exploratory data analysis for the air quality forecasting dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress every warning for the rest of the session to keep cell output tidy.
# NOTE(review): this blanket filter also hides deprecation notices from
# pandas/matplotlib — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Plot appearance: apply the 'seaborn-v0_8' matplotlib style and make seaborn's
# "husl" palette the default color cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Put the parent directory on the import path so the `config` and `src`
# packages imported below can be resolved. Presumably the notebook is run from
# a folder one level below the project root — confirm against the repo layout.
import sys
sys.path.append('..')

from config.settings import settings
from src.utils.logger import setup_logging, get_logger

# Configure logging through the project's helper, then grab a logger named
# after this module.
setup_logging()
logger = get_logger(__name__)

## 1. Load and Inspect Data

In [None]:
# Load the dataset into `df`.
# Replace 'your_data.csv' with the actual path to your dataset. When the file
# is missing, a reproducible synthetic hourly dataset is generated instead so
# the remaining cells can still be executed.
data_path = '../data/raw/your_data.csv'

if Path(data_path).exists():
    df = pd.read_csv(data_path)
    # read_csv leaves timestamps as plain strings. Convert them immediately so
    # the date-range summary in the next cell compares real timestamps rather
    # than ordering text lexicographically.
    if 'datetime' in df.columns:
        df['datetime'] = pd.to_datetime(df['datetime'])
    print(f"Data loaded successfully: {df.shape}")
else:
    print(f"Data file not found: {data_path}")
    print("Please place your dataset in the data/raw/ directory")

    # Create sample data for demonstration
    print("Creating sample data for demonstration...")

    # Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is
    # deprecated in recent pandas releases.
    dates = pd.date_range('2020-01-01', '2023-12-31', freq='h')
    n_samples = len(dates)

    # Fixed seed so the synthetic values are identical on every run. The
    # columns below are drawn in a fixed order; reordering them would change
    # the generated numbers.
    np.random.seed(42)

    df = pd.DataFrame({
        'datetime': dates,
        'latitude': np.random.normal(28.6, 0.1, n_samples),
        'longitude': np.random.normal(77.2, 0.1, n_samples),
        'NO2': np.random.lognormal(3.5, 0.5, n_samples),
        'O3': np.random.lognormal(3.8, 0.4, n_samples),
        'temperature_2m': np.random.normal(25, 8, n_samples) + 273.15,  # Kelvin
        'relative_humidity_2m': np.random.beta(2, 2, n_samples) * 100,
        'surface_pressure': np.random.normal(101325, 1000, n_samples),
        'wind_speed_10m': np.random.exponential(3, n_samples),
        'wind_direction_10m': np.random.uniform(0, 360, n_samples),
        'boundary_layer_height': np.random.lognormal(6.5, 0.5, n_samples),
        # Clip at zero: the normal draw can go negative, radiation cannot
        'solar_radiation': np.maximum(0, np.random.normal(200, 150, n_samples)),
        'precipitation': np.random.exponential(0.1, n_samples),
    })

    print(f"Sample data created: {df.shape}")

In [None]:
# Headline facts about the loaded frame: size, covered period, column names
first_stamp = df['datetime'].min()
last_stamp = df['datetime'].max()
column_names = list(df.columns)

print("Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Date range: {first_stamp} to {last_stamp}")
print(f"Columns: {column_names}")

# Preview the first rows (rendered as the cell's output)
df.head()

In [None]:
# Per-column overview of dtype and missing values
print("Data Types and Missing Values:")

# Count nulls once and reuse the result for both the count and the percentage
missing_per_column = df.isnull().sum()
info_df = pd.DataFrame({
    'Data Type': df.dtypes,
    'Missing Count': missing_per_column,
    'Missing %': missing_per_column / len(df) * 100,
})
print(info_df)

## 2. Statistical Summary

In [None]:
# Descriptive statistics restricted to the numeric columns
numeric_frame = df.select_dtypes(include=[np.number])
numerical_cols = numeric_frame.columns
numeric_frame.describe()

## 3. Target Variables Analysis

In [None]:
# Distribution of the target pollutants: histograms on the top row and box
# plots on the bottom row, one column of panels per pollutant.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for panel_col, pollutant in enumerate(['NO2', 'O3']):
    # Drop missing readings before plotting: the box-plot statistics are
    # NaN-sensitive, so a single missing value would leave the box empty.
    # For data without gaps this is a no-op.
    readings = df[pollutant].dropna().to_numpy()
    unit_label = f'{pollutant} (μg/m³)'

    hist_ax = axes[0, panel_col]
    hist_ax.hist(readings, bins=50, alpha=0.7, edgecolor='black')
    hist_ax.set_title(f'{pollutant} Distribution')
    hist_ax.set_xlabel(unit_label)
    hist_ax.set_ylabel('Frequency')

    box_ax = axes[1, panel_col]
    box_ax.boxplot(readings)
    box_ax.set_title(f'{pollutant} Box Plot')
    box_ax.set_ylabel(unit_label)

plt.tight_layout()
plt.show()

In [None]:
# Scatter of the two pollutants against each other, annotated with their
# correlation coefficient
plt.figure(figsize=(8, 6))
ax = plt.gca()

ax.scatter(df['NO2'], df['O3'], alpha=0.5)
ax.set_xlabel('NO2 (μg/m³)')
ax.set_ylabel('O3 (μg/m³)')
ax.set_title('NO2 vs O3 Correlation')

# Place the coefficient in axes coordinates so it stays in the top-left corner
# regardless of the data range
correlation = df['NO2'].corr(df['O3'])
label_box = dict(boxstyle='round', facecolor='white', alpha=0.8)
ax.text(
    0.05,
    0.95,
    f'Correlation: {correlation:.3f}',
    transform=ax.transAxes,
    fontsize=12,
    bbox=label_box,
)

ax.grid(True, alpha=0.3)
plt.show()

## 4. Temporal Patterns

In [None]:
# Ensure the timestamp column is datetime-typed, then derive the calendar
# features used by the temporal plots below
df['datetime'] = pd.to_datetime(df['datetime'])

stamp_parts = df['datetime'].dt
for feature, attribute in (
    ('hour', 'hour'),
    ('day_of_week', 'dayofweek'),
    ('month', 'month'),
    ('year', 'year'),
):
    df[feature] = getattr(stamp_parts, attribute)

In [None]:
# Time series of both pollutants over the first 30 days of the record
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# Select the window by timestamp rather than by row position. Taking the first
# 24*30 rows only spans 30 days when the frame is sorted, strictly hourly and
# has exactly one row per timestamp; filtering on the datetime column keeps
# the "First 30 Days" titles truthful for any input.
window_start = df['datetime'].min()
window_end = window_start + pd.Timedelta(days=30)
subset = df[df['datetime'] < window_end].sort_values('datetime')

series_specs = (
    (axes[0], 'NO2', {}),
    (axes[1], 'O3', {'color': 'orange'}),
)
for panel, pollutant, line_kwargs in series_specs:
    panel.plot(subset['datetime'], subset[pollutant], label=pollutant,
               alpha=0.8, **line_kwargs)
    panel.set_title(f'{pollutant} Time Series (First 30 Days)')
    panel.set_ylabel(f'{pollutant} (μg/m³)')
    panel.legend()
    panel.grid(True, alpha=0.3)

# Only the bottom panel carries the shared x-axis label
axes[1].set_xlabel('Date')

plt.tight_layout()
plt.show()

In [None]:
# Diurnal cycle: mean concentration of each pollutant per hour of the day
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

hourly_no2 = df.groupby('hour')['NO2'].mean()
hourly_o3 = df.groupby('hour')['O3'].mean()

# (axes, hourly means, pollutant name, extra line styling) for each panel
hourly_panels = (
    (axes[0], hourly_no2, 'NO2', {}),
    (axes[1], hourly_o3, 'O3', {'color': 'orange'}),
)
for panel, hourly_mean, pollutant, line_kwargs in hourly_panels:
    panel.plot(hourly_mean.index, hourly_mean.values, marker='o', **line_kwargs)
    panel.set_title(f'Average {pollutant} by Hour of Day')
    panel.set_xlabel('Hour')
    panel.set_ylabel(f'{pollutant} (μg/m³)')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Seasonal cycle: mean concentration of each pollutant per calendar month
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

monthly_no2 = df.groupby('month')['NO2'].mean()
monthly_o3 = df.groupby('month')['O3'].mean()

# (axes, monthly means, pollutant name, extra line styling) for each panel
monthly_panels = (
    (axes[0], monthly_no2, 'NO2', {}),
    (axes[1], monthly_o3, 'O3', {'color': 'orange'}),
)
for panel, monthly_mean, pollutant, line_kwargs in monthly_panels:
    panel.plot(monthly_mean.index, monthly_mean.values, marker='o', **line_kwargs)
    panel.set_title(f'Average {pollutant} by Month')
    panel.set_xlabel('Month')
    panel.set_ylabel(f'{pollutant} (μg/m³)')
    # Always show all twelve months on the axis, even if some are absent
    panel.set_xticks(range(1, 13))
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Meteorological Variables Analysis

In [None]:
# Pairwise correlations between the meteorological variables and the pollutants.
# `met_vars` and `target_vars` are reused by later cells.
met_vars = [
    'temperature_2m',
    'relative_humidity_2m',
    'surface_pressure',
    'wind_speed_10m',
    'boundary_layer_height',
    'solar_radiation',
]
target_vars = ['NO2', 'O3']

corr_matrix = df.loc[:, met_vars + target_vars].corr()

plt.figure(figsize=(10, 8))
# Diverging colormap centered on zero so positive and negative correlations
# are visually symmetric
heatmap_options = dict(annot=True, cmap='coolwarm', center=0, square=True, fmt='.2f')
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix: Meteorological Variables vs Pollutants')
plt.tight_layout()
plt.show()

In [None]:
# One scatter panel per meteorological variable, with both pollutants overlaid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for i, met_var in enumerate(met_vars):
    panel = axes[i]
    predictor = df[met_var]

    # Tiny, translucent markers keep the dense point clouds readable
    for pollutant in ('NO2', 'O3'):
        panel.scatter(predictor, df[pollutant], alpha=0.3, s=1, label=pollutant)

    panel.set_xlabel(met_var)
    panel.set_ylabel('Concentration (μg/m³)')
    panel.set_title(f'{met_var} vs Pollutants')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Data Quality Assessment

In [None]:
# Check for outliers using IQR method
def detect_outliers(df, column):
    """Count the values of ``df[column]`` that fall outside the IQR fences.

    A value is treated as an outlier when it is below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 25th/75th
    percentiles of the column. Missing values are never counted as outliers
    but do count towards the row total used for the percentage.

    Args:
        df: DataFrame containing the data.
        column: Name of a numeric column in ``df``.

    Returns:
        A ``(count, percentage)`` tuple: the number of outlying rows as an
        ``int`` and that number expressed as a percentage of ``len(df)``.
        An empty frame yields ``(0, 0.0)``.
    """
    total_rows = len(df)
    if total_rows == 0:
        # Nothing to inspect; also avoids dividing by zero below
        return 0, 0.0

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Count straight from the boolean mask instead of materialising a filtered
    # copy of the frame just to take its length
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outlier_count = int(is_outlier.sum())
    return outlier_count, (outlier_count / total_rows) * 100

# Report the IQR outlier count for both pollutants and every meteorological
# variable
print("Outlier Analysis:")
columns_to_check = ['NO2', 'O3', *met_vars]
for col in columns_to_check:
    n_outliers, outlier_share = detect_outliers(df, col)
    print(f"{col}: {n_outliers} outliers ({outlier_share:.2f}%)")

In [None]:
# Look for holes in the timeline: consecutive timestamps (after sorting) that
# are further apart than the expected sampling step
expected_freq = pd.Timedelta(hours=1)  # Assuming hourly data

df_sorted = df.sort_values('datetime')
time_diff = df_sorted['datetime'].diff()
gaps = time_diff.loc[time_diff > expected_freq]

print(f"Data gaps found: {len(gaps)}")

if not gaps.empty:
    print("\nLargest gaps:")
    print(gaps.nlargest(5))

## 7. Summary and Recommendations

In [None]:
# Consolidated text summary of the exploration, followed by next-step advice
print("DATA EXPLORATION SUMMARY")
print("=" * 50)
print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['datetime'].min()} to {df['datetime'].max()}")

# Count the missing cells once and reuse the total for both lines
total_missing = df.isnull().sum().sum()
print(f"Total missing values: {total_missing}")
print(f"Missing percentage: {(total_missing / df.size) * 100:.2f}%")

print("\nTARGET VARIABLES:")
for pollutant in ('NO2', 'O3'):
    series = df[pollutant]
    print(
        f"{pollutant} - Mean: {series.mean():.2f}, Std: {series.std():.2f}, "
        f"Range: [{series.min():.2f}, {series.max():.2f}]"
    )
print(f"NO2-O3 Correlation: {df['NO2'].corr(df['O3']):.3f}")

# Heading casing fixed to match the other all-caps section headings
print("\nRECOMMENDATIONS:")
recommendations = [
    "Handle missing values using appropriate imputation methods",
    "Consider outlier treatment for extreme values",
    "Engineer temporal features (hour, day, season)",
    "Create lag features for time series modeling",
    "Normalize/standardize features before modeling",
    "Consider meteorological variables with high correlation to targets",
]
for number, advice in enumerate(recommendations, start=1):
    print(f"{number}. {advice}")