# Climate Data Analytics

This notebook performs advanced analytics on the processed climate data, including trend analysis, correlation studies, and predictive modeling to extract insights for Singapore's climate resilience.

## Setup and Imports

In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# In a real Databricks environment, we would use:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression as SparkLR
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName("ClimateDataAnalytics").getOrCreate()

# Define data directories - in Databricks these would typically be in DBFS
PROCESSED_DIR = "/dbfs/FileStore/climate_resilience/processed"
ANALYTICS_DIR = "/dbfs/FileStore/climate_resilience/analytics"
os.makedirs(ANALYTICS_DIR, exist_ok=True)

# Set up plotting style.
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 removed the old names, so use whichever spelling this install provides
# and fall back to the default style if neither is present.
_WHITEGRID_STYLES = ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
plt.style.use(
    next((name for name in _WHITEGRID_STYLES if name in plt.style.available), "default")
)
sns.set_palette("viridis")

print("Climate Data Analytics environment initialized.")

## Load Processed Data

This function loads the processed climate data from Delta Lake, falling back to the CSV files in the processed-data directory if the Delta read fails.

In [None]:
def load_processed_data():
    """
    Read the processed climate tables, preferring Delta Lake over CSV.

    Returns a dict holding pandas DataFrames under 'long' and/or 'wide', or
    None when neither source yields any data.
    """
    print("Loading processed climate data for analytics...")

    loaded = {}

    try:
        # Primary source: both Delta tables, converted to pandas. Nothing is
        # stored until both reads succeed, so a partial Delta read never
        # leaks into the CSV fallback below.
        delta_long = spark.read.format("delta").load("/dbfs/FileStore/climate_resilience/delta/climate_data_long").toPandas()
        delta_wide = spark.read.format("delta").load("/dbfs/FileStore/climate_resilience/delta/climate_data_wide").toPandas()
    except Exception as delta_error:
        print(f"Error loading data from Delta Lake: {delta_error}")
        print("Falling back to CSV files...")

        try:
            # Secondary source: whichever CSV exports are present on disk
            for kind in ('long', 'wide'):
                csv_path = os.path.join(PROCESSED_DIR, f"climate_data_{kind}.csv")
                if os.path.exists(csv_path):
                    loaded[kind] = pd.read_csv(csv_path)
                    print(f"Loaded {kind} format data from {csv_path}")
        except Exception as csv_error:
            print(f"Error loading processed data: {csv_error}")
            return None

        if not loaded:
            print("No processed data files found")
            return None
        return loaded

    loaded['long'] = delta_long
    loaded['wide'] = delta_wide

    print("Loaded data from Delta Lake")
    return loaded

## Perform Trend Analysis

This function analyzes trends in climate indicators over time.

In [None]:
def perform_trend_analysis(data_dict):
    """
    Fit a linear trend over time for each climate indicator.

    For every indicator with at least five usable observations, fits
    ``Value ~ Year`` with LinearRegression, records summary metrics, and
    saves a scatter + trend-line plot in ANALYTICS_DIR. The combined metrics
    are written to ``trend_analysis_results.csv`` in ANALYTICS_DIR.

    Args:
        data_dict: Mapping whose 'long' entry is a pandas DataFrame with
            'Year', 'Indicator' and 'Value' columns. The DataFrame is not
            modified.

    Returns:
        DataFrame with one row per analysed indicator (Indicator, Slope,
        R_Squared, Total_Change, Percent_Change, Avg_Annual_Change,
        Start_Year, End_Year, Start_Value, End_Value), or None when the
        input is missing or lacks a required column.
    """
    print("Performing trend analysis on climate indicators...")

    if data_dict is None or 'long' not in data_dict or data_dict['long'] is None:
        print("No data for trend analysis")
        return None

    source_df = data_dict['long']

    # Report a missing column the same way as missing data instead of
    # failing later with a KeyError.
    missing_cols = [col for col in ('Year', 'Indicator', 'Value') if col not in source_df.columns]
    if missing_cols:
        print(f"No data for trend analysis: missing columns {missing_cols}")
        return None

    # Ensure Year is in datetime format. The dtype test accepts any datetime64
    # flavour (tz-aware, non-nanosecond units), not just 'datetime64[ns]'.
    years = source_df['Year']
    if not pd.api.types.is_datetime64_any_dtype(years):
        years = pd.to_datetime(years.astype(str), format='%Y')

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    # Rows without a year or a value are dropped because LinearRegression
    # rejects NaN input.
    df_long = (
        source_df.assign(Year=years, Year_Numeric=years.dt.year)
        .dropna(subset=['Year_Numeric', 'Value'])
        .astype({'Year_Numeric': int})
    )

    result_columns = [
        'Indicator', 'Slope', 'R_Squared', 'Total_Change', 'Percent_Change',
        'Avg_Annual_Change', 'Start_Year', 'End_Year', 'Start_Value', 'End_Value',
    ]
    trend_results = []

    # sort=False keeps indicators in order of first appearance
    for indicator, indicator_data in df_long.groupby('Indicator', sort=False):
        # Skip if too few data points
        if len(indicator_data) < 5:
            continue

        # Prepare data for regression
        year_values = indicator_data['Year_Numeric'].to_numpy()
        X = year_values.reshape(-1, 1)
        y = indicator_data['Value'].to_numpy()

        # Fit linear regression model
        model = LinearRegression()
        model.fit(X, y)

        # Get model metrics
        y_pred = model.predict(X)
        r_squared = r2_score(y, y_pred)
        slope = model.coef_[0]

        # Calculate additional trend metrics from the first and last observed years
        earliest_year = year_values.min()
        latest_year = year_values.max()
        earliest_value = indicator_data.loc[indicator_data['Year_Numeric'] == earliest_year, 'Value'].iloc[0]
        latest_value = indicator_data.loc[indicator_data['Year_Numeric'] == latest_year, 'Value'].iloc[0]

        total_change = latest_value - earliest_value
        percent_change = (total_change / earliest_value) * 100 if earliest_value != 0 else float('inf')
        avg_annual_change = total_change / (latest_year - earliest_year) if latest_year > earliest_year else 0

        trend_results.append({
            'Indicator': indicator,
            'Slope': slope,
            'R_Squared': r_squared,
            'Total_Change': total_change,
            'Percent_Change': percent_change,
            'Avg_Annual_Change': avg_annual_change,
            'Start_Year': earliest_year,
            'End_Year': latest_year,
            'Start_Value': earliest_value,
            'End_Value': latest_value
        })

        # A perfectly flat fit is neither increasing nor decreasing
        if slope > 0:
            trend_direction = "Increasing"
        elif slope < 0:
            trend_direction = "Decreasing"
        else:
            trend_direction = "Stable"

        # Create trend visualization; close the figure even if saving fails
        fig = plt.figure(figsize=(10, 6))
        try:
            plt.scatter(year_values, y, alpha=0.7)
            plt.plot(year_values, y_pred, color='red', linewidth=2)

            # Title shows the period actually covered by this indicator
            plt.title(f'Trend Analysis: {indicator} in Singapore ({earliest_year}-{latest_year})')
            plt.xlabel('Year')
            plt.ylabel(indicator)
            plt.grid(True)

            # Add trend information to plot
            plt.figtext(0.15, 0.85,
                        f"Trend: {trend_direction}\nAnnual Change: {avg_annual_change:.4f}\nR²: {r_squared:.4f}",
                        bbox=dict(facecolor='white', alpha=0.8))

            # '/' in an indicator name would otherwise be read as a directory
            safe_name = str(indicator).replace(' ', '_').replace('/', '_')
            plot_path = os.path.join(ANALYTICS_DIR, f"{safe_name}_trend.png")
            plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    # Explicit columns keep the CSV header even when no indicator qualified
    trend_df = pd.DataFrame(trend_results, columns=result_columns)

    # Save trend results
    trend_path = os.path.join(ANALYTICS_DIR, "trend_analysis_results.csv")
    trend_df.to_csv(trend_path, index=False)
    print(f"Saved trend analysis results to {trend_path}")

    return trend_df

## Analyze Correlations

This function analyzes correlations between different climate indicators.

In [None]:
def analyze_correlations(data_dict):
    """
    Compute pairwise correlations between the wide-format indicator columns.

    Saves a lower-triangle heatmap (``correlation_heatmap.png``), the full
    matrix (``indicator_correlations.csv``) and the pairs with
    ``|r| > 0.5`` (``strong_correlations.csv``) in ANALYTICS_DIR.

    Args:
        data_dict: Mapping whose 'wide' entry is a pandas DataFrame with one
            numeric column per indicator.

    Returns:
        The correlation matrix as a DataFrame, or None when the input is
        missing or fewer than two indicator columns are available.
    """
    from itertools import combinations

    print("Analyzing correlations between climate indicators...")

    if data_dict is None or 'wide' not in data_dict or data_dict['wide'] is None:
        print("No data for correlation analysis")
        return None

    df_wide = data_dict['wide']

    # Numeric columns are the indicators, except for the time-axis columns.
    # 'Year' is excluded too: when it is stored as an integer it would
    # otherwise be correlated as if it were an indicator.
    time_cols = {'Year', 'Year_Numeric', 'Decade'}
    indicator_cols = [
        col for col in df_wide.select_dtypes(include=['number']).columns
        if col not in time_cols
    ]

    if len(indicator_cols) <= 1:
        print("Not enough indicators for correlation analysis")
        return None

    corr_matrix = df_wide[indicator_cols].corr()

    # Create correlation heatmap; close the figure even if saving fails
    fig = plt.figure(figsize=(12, 10))
    try:
        # Hide the upper triangle (and diagonal) - it mirrors the lower one
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap="coolwarm",
                    vmin=-1, vmax=1, square=True, linewidths=.5)

        plt.title('Correlation Matrix of Climate Indicators for Singapore')
        plt.tight_layout()

        plot_path = os.path.join(ANALYTICS_DIR, "correlation_heatmap.png")
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)

    # Save correlation matrix
    corr_path = os.path.join(ANALYTICS_DIR, "indicator_correlations.csv")
    corr_matrix.to_csv(corr_path)
    print(f"Saved correlation matrix to {corr_path}")

    # Find strongest correlations: each unordered pair once, |r| > 0.5.
    # NaN correlations fail the comparison and are skipped.
    strong_correlations = []
    for (i, indicator1), (j, indicator2) in combinations(enumerate(indicator_cols), 2):
        correlation = corr_matrix.iloc[i, j]
        if abs(correlation) > 0.5:
            magnitude = "Strong" if abs(correlation) > 0.7 else "Moderate"
            direction = "Positive" if correlation > 0 else "Negative"
            strong_correlations.append({
                'Indicator1': indicator1,
                'Indicator2': indicator2,
                'Correlation': correlation,
                'Strength': f"{magnitude} {direction}"
            })

    # Explicit columns keep the CSV header even when no pair qualified
    strong_corr_df = pd.DataFrame(
        strong_correlations,
        columns=['Indicator1', 'Indicator2', 'Correlation', 'Strength'],
    )

    # Save strong correlations
    strong_corr_path = os.path.join(ANALYTICS_DIR, "strong_correlations.csv")
    strong_corr_df.to_csv(strong_corr_path, index=False)
    print(f"Saved strong correlations to {strong_corr_path}")

    return corr_matrix

## Build Predictive Models

This function builds predictive models for climate indicators.

In [None]:
def build_predictive_models(data_dict):
    """
    Fit a linear year -> value model per indicator and project it to 2050.

    For every indicator with at least ten usable observations, trains
    LinearRegression on an 80/20 random split, scores it on the held-out
    part, predicts 2024-2050, and saves a plot in ANALYTICS_DIR. Model
    metrics and projections are written to ``predictive_model_results.csv``
    and ``future_predictions.csv`` in ANALYTICS_DIR.

    Args:
        data_dict: Mapping whose 'long' entry is a pandas DataFrame with
            'Year', 'Indicator' and 'Value' columns. The DataFrame is not
            modified.

    Returns:
        Dict with 'model_results' and 'future_predictions' DataFrames, or
        None when the input is missing or lacks a required column.
    """
    print("Building predictive models for climate indicators...")

    if data_dict is None or 'long' not in data_dict or data_dict['long'] is None:
        print("No data for predictive modeling")
        return None

    source_df = data_dict['long']

    # Report a missing column the same way as missing data instead of
    # failing later with a KeyError.
    missing_cols = [col for col in ('Year', 'Indicator', 'Value') if col not in source_df.columns]
    if missing_cols:
        print(f"No data for predictive modeling: missing columns {missing_cols}")
        return None

    # Ensure Year is in datetime format. The dtype test accepts any datetime64
    # flavour (tz-aware, non-nanosecond units), not just 'datetime64[ns]'.
    years = source_df['Year']
    if not pd.api.types.is_datetime64_any_dtype(years):
        years = pd.to_datetime(years.astype(str), format='%Y')

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    # Rows without a year or a value are dropped because LinearRegression
    # rejects NaN input.
    df_long = (
        source_df.assign(Year=years, Year_Numeric=years.dt.year)
        .dropna(subset=['Year_Numeric', 'Value'])
        .astype({'Year_Numeric': int})
    )

    # The projection horizon is the same for every indicator, so build it once
    future_years = np.arange(2024, 2051).reshape(-1, 1)

    model_results = []
    future_predictions = []

    # sort=False keeps indicators in order of first appearance
    for indicator, indicator_data in df_long.groupby('Indicator', sort=False):
        # Skip if too few data points
        if len(indicator_data) < 10:
            continue

        print(f"Building model for {indicator}...")

        # Prepare data for modeling
        year_values = indicator_data['Year_Numeric'].to_numpy()
        X = year_values.reshape(-1, 1)
        y = indicator_data['Value'].to_numpy()

        # Split data into training and testing sets (fixed seed for repeatability)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train linear regression model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Evaluate model on the held-out observations
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        model_results.append({
            'Indicator': indicator,
            'MSE': mse,
            'R_Squared': r2,
            'Coefficient': model.coef_[0],
            'Intercept': model.intercept_
        })

        # Generate and store future predictions
        future_values = model.predict(future_years)
        future_predictions.extend(
            {'Indicator': indicator, 'Year': int(year), 'Predicted_Value': value}
            for year, value in zip(future_years.ravel(), future_values)
        )

        # Create prediction visualization; close the figure even if saving fails
        fig = plt.figure(figsize=(12, 6))
        try:
            # Plot historical data
            plt.scatter(year_values, y, alpha=0.7, label='Historical Data')

            # Plot model fit on all data
            plt.plot(X, model.predict(X), color='blue', linewidth=2, label='Model Fit')

            # Plot future predictions
            plt.plot(future_years, future_values, color='red', linestyle='--',
                     linewidth=2, label='Future Predictions')

            # Add confidence interval (simplified): +/- 1.96 * test RMSE around
            # the projection, constant over the whole horizon
            half_width = 1.96 * np.sqrt(mse)
            plt.fill_between(future_years.ravel(),
                             future_values - half_width,
                             future_values + half_width,
                             color='red', alpha=0.2, label='95% Confidence Interval')

            # Title starts at the first year actually observed for this indicator
            plt.title(f'Predictive Model: {indicator} in Singapore ({year_values.min()}-2050)')
            plt.xlabel('Year')
            plt.ylabel(indicator)
            plt.grid(True)
            plt.legend()

            # Add model metrics to plot
            plt.figtext(0.15, 0.85,
                        f"Model R²: {r2:.4f}\nMSE: {mse:.4f}",
                        bbox=dict(facecolor='white', alpha=0.8))

            # '/' in an indicator name would otherwise be read as a directory
            safe_name = str(indicator).replace(' ', '_').replace('/', '_')
            plot_path = os.path.join(ANALYTICS_DIR, f"{safe_name}_prediction.png")
            plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    # Explicit columns keep the CSV headers even when no indicator qualified
    model_df = pd.DataFrame(
        model_results,
        columns=['Indicator', 'MSE', 'R_Squared', 'Coefficient', 'Intercept'],
    )

    # Save model results
    model_path = os.path.join(ANALYTICS_DIR, "predictive_model_results.csv")
    model_df.to_csv(model_path, index=False)
    print(f"Saved model results to {model_path}")

    future_df = pd.DataFrame(
        future_predictions,
        columns=['Indicator', 'Year', 'Predicted_Value'],
    )

    # Save future predictions
    future_path = os.path.join(ANALYTICS_DIR, "future_predictions.csv")
    future_df.to_csv(future_path, index=False)
    print(f"Saved future predictions to {future_path}")

    return {'model_results': model_df, 'future_predictions': future_df}

## Calculate Climate Vulnerability Index

This function plots and exports the Climate Vulnerability Index when the processed data already contains it; it does not compute the index itself.

In [None]:
def calculate_vulnerability_index(data_dict):
    """
    Plot and export the Climate Vulnerability Index found in the wide table.

    The index is not computed here: the function expects a
    'Climate_Vulnerability_Index' column to exist already. It saves a time
    series plot (``vulnerability_index.png``) and a CSV
    (``climate_vulnerability_index.csv``) in ANALYTICS_DIR. The CSV also
    carries any columns whose name contains '_Norm'.

    Args:
        data_dict: Mapping whose 'wide' entry is a pandas DataFrame with
            'Year' and 'Climate_Vulnerability_Index' columns. The DataFrame
            is not modified.

    Returns:
        DataFrame with Year, Year_Numeric, the index and any '_Norm'
        columns, or None when the input, the index column or the Year
        column is missing.
    """
    print("Calculating Climate Vulnerability Index...")

    if data_dict is None or 'wide' not in data_dict or data_dict['wide'] is None:
        print("No data for vulnerability index calculation")
        return None

    source_df = data_dict['wide']

    # Check if Climate Vulnerability Index already exists
    if 'Climate_Vulnerability_Index' not in source_df.columns:
        print("Climate Vulnerability Index not found in data")
        return None

    print("Climate Vulnerability Index already calculated")

    # Without a Year column there is no time axis; report it instead of
    # failing with a KeyError.
    if 'Year' not in source_df.columns:
        print("Year column not found in data; cannot analyze vulnerability index")
        return None

    # Ensure Year is in datetime format. The dtype test accepts any datetime64
    # flavour (tz-aware, non-nanosecond units), not just 'datetime64[ns]'.
    years = source_df['Year']
    if not pd.api.types.is_datetime64_any_dtype(years):
        years = pd.to_datetime(years.astype(str), format='%Y')

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    df_wide = source_df.assign(Year=years, Year_Numeric=years.dt.year)

    # Plot in chronological order so the line does not zig-zag on unsorted rows
    plot_data = df_wide.sort_values('Year_Numeric')
    observed_years = plot_data['Year_Numeric'].dropna()
    period = f" ({int(observed_years.min())}-{int(observed_years.max())})" if not observed_years.empty else ""

    # Create vulnerability index visualization; close the figure even if saving fails
    fig = plt.figure(figsize=(12, 6))
    try:
        plt.plot(plot_data['Year_Numeric'], plot_data['Climate_Vulnerability_Index'],
                 marker='o', linestyle='-', linewidth=2, color='darkred')

        # Title shows the period actually covered by the data
        plt.title(f'Climate Vulnerability Index for Singapore{period}')
        plt.xlabel('Year')
        plt.ylabel('Vulnerability Index')
        plt.grid(True)

        # Add reference lines for risk levels
        plt.axhline(y=75, color='red', linestyle='--', alpha=0.7, label='High Risk')
        plt.axhline(y=50, color='orange', linestyle='--', alpha=0.7, label='Medium Risk')
        plt.axhline(y=25, color='green', linestyle='--', alpha=0.7, label='Low Risk')
        plt.legend()

        plot_path = os.path.join(ANALYTICS_DIR, "vulnerability_index.png")
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)

    # Index columns plus the component columns (names containing '_Norm'), if any
    component_cols = [col for col in df_wide.columns if '_Norm' in str(col)]
    vuln_df = df_wide[['Year', 'Year_Numeric', 'Climate_Vulnerability_Index'] + component_cols]

    # Save vulnerability index data
    vuln_path = os.path.join(ANALYTICS_DIR, "climate_vulnerability_index.csv")
    vuln_df.to_csv(vuln_path, index=False)
    print(f"Saved vulnerability index data to {vuln_path}")

    return vuln_df

## Main Function

This function orchestrates the analytics process.

In [None]:
def main():
    """
    Run the analytics pipeline: load the data, then run each analysis in turn.

    Returns a dict with the 'trend_analysis', 'correlations',
    'model_results' and 'vulnerability_index' outputs, or None when no
    processed data could be loaded.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    print(f"Starting climate data analytics at {datetime.now().strftime(stamp_format)}")

    # Load the processed data; nothing else can run without it
    data_dict = load_processed_data()
    if data_dict is None:
        print("Analytics failed: No data available")
        return None

    # Dict entries are evaluated top to bottom, so the analyses run in this order
    results = {
        'trend_analysis': perform_trend_analysis(data_dict),
        'correlations': analyze_correlations(data_dict),
        'model_results': build_predictive_models(data_dict),
        'vulnerability_index': calculate_vulnerability_index(data_dict),
    }

    print(f"Climate data analytics completed at {datetime.now().strftime(stamp_format)}")
    print("Analytics results are now ready for visualization and interpretation")

    return results

## Execute Analytics

In [None]:
# Run the analytics process
analytics_results = main()

# Display trend analysis results if available.
# A bare expression nested inside an `if` is not auto-rendered by the
# notebook (only a cell's final top-level expression is), so print it.
if analytics_results is not None and analytics_results.get('trend_analysis') is not None:
    print(analytics_results['trend_analysis'].head())