# Task 4: Forecasting Access and Usage (2025-2027)

## Objective
Forecast Ethiopia's financial inclusion indicators for 2025-2027:
- **Access**: Account Ownership Rate
- **Usage**: Digital Payment Adoption Rate

## Approach
1. Define forecast targets
2. Generate baseline trend forecasts
3. Apply event impacts from Task 3
4. Create scenario analysis (optimistic/base/pessimistic)
5. Calculate confidence intervals
6. Interpret and validate results

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import sys
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Add parent directory to path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.loader import load_data
from src.forecast import generate_baseline_forecast
from src.scenarios import (
    generate_all_scenarios,
    calculate_forecast_metrics,
    create_future_events
)

# Notebook-wide plotting defaults: seaborn grid theme plus a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 7)})

print("Libraries imported successfully!")

## 1. Load Data and Define Targets

In [None]:
# Read the unified .xlsx file once, then unpack the four entries used by later cells.
data = load_data('../data/raw/ethiopia_fi_unified_data.xlsx')
observations, events, targets, impacts = (
    data['observations'],
    data['events'],
    data['targets'],
    data['impacts'],
)

# Quick size check of what was loaded (targets are not reported here).
n_observations, n_events, n_impacts = len(observations), len(events), len(impacts)
print(f"Loaded {n_observations} observations")
print(f"Loaded {n_events} events")
print(f"Loaded {n_impacts} impact links")

In [None]:
# Forecast horizon and the two headline indicators (code -> display name)
# that every later cell iterates over.
FORECAST_YEARS = [2025, 2026, 2027]
PRIMARY_INDICATORS = {
    'ACC_OWNERSHIP': 'Account Ownership Rate (Access)',
    'USG_DIGITAL_PAYMENT': 'Digital Payment Usage',
}

# Echo the configuration so the notebook output documents what is being forecast.
print("\n=== FORECAST TARGETS ===")
print(f"Years: {FORECAST_YEARS}")
print("\nPrimary Indicators:")
for code in PRIMARY_INDICATORS:
    name = PRIMARY_INDICATORS[code]
    print(f"  - {code}: {name}")

# List every row of the targets table as "<indicator>: <value>%".
print("\n=== POLICY TARGETS (2027) ===")
for _, target in targets.iterrows():
    indicator_label = target['indicator']
    target_pct = target['value_numeric']
    print(f"{indicator_label}: {target_pct}%")

## 2. Historical Trend Analysis

In [None]:
# Plot the historical series of each primary indicator side by side and
# annotate each panel with its compound annual growth rate (CAGR).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for idx, (code, name) in enumerate(PRIMARY_INDICATORS.items()):
    ax = axes[idx]
    hist_data = observations[observations['indicator_code'] == code].copy()
    hist_data = hist_data.sort_values('observation_date')

    if hist_data.empty:
        # No observations for this indicator: leave its panel blank.
        continue

    ax.plot(hist_data['observation_date'], hist_data['value_numeric'],
            marker='o', linewidth=2, markersize=8, label='Historical Data')
    ax.set_title(f'{name}\nHistorical Trajectory', fontsize=13, fontweight='bold')
    ax.set_xlabel('Year', fontsize=11)
    ax.set_ylabel('Percentage (%)', fontsize=11)
    ax.grid(True, alpha=0.3)
    ax.legend()

    # CAGR needs two distinct points in time.
    if len(hist_data) < 2:
        continue

    first_val = hist_data.iloc[0]['value_numeric']
    last_val = hist_data.iloc[-1]['value_numeric']
    years = (hist_data.iloc[-1]['observation_date'] - hist_data.iloc[0]['observation_date']).days / 365.25

    # CAGR is undefined for a zero/negative/missing starting value (division
    # by zero or a fractional power of a negative number) and for a zero-length
    # time span (1/years). Skip the annotation instead of crashing or showing
    # "nan"/"inf". Comparisons with NaN are False, so missing values are skipped too.
    if not (first_val > 0 and last_val >= 0 and years > 0):
        continue

    cagr = ((last_val / first_val) ** (1 / years) - 1) * 100
    ax.text(0.05, 0.95, f'CAGR: {cagr:.1f}%',
            transform=ax.transAxes,
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
            verticalalignment='top')

# Create the output folder if it is missing so savefig does not fail on a fresh checkout.
figure_path = '../reports/figures/task4_historical_trends.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

plt.tight_layout()
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

print("✓ Historical trends chart saved")

## 3. Generate Baseline Forecasts

Baseline forecasts use linear regression on the historical data to project "business as usual" trends.

In [None]:
# Build one baseline forecast table per primary indicator, keyed by indicator code.
# Indicators for which generate_baseline_forecast returns an empty frame are
# reported and left out of the dictionary.
baseline_forecasts = {}

for code, name in PRIMARY_INDICATORS.items():
    baseline = generate_baseline_forecast(observations, code, FORECAST_YEARS)

    if baseline.empty:
        print(f"\nWarning: No baseline forecast generated for {code}")
        continue

    baseline_forecasts[code] = baseline
    print(f"\n=== BASELINE FORECAST: {name} ===")
    # One "<year>: <value>%" line per forecast row.
    for forecast_date, forecast_value in zip(baseline['observation_date'], baseline['value_numeric']):
        print(f"{forecast_date.year}: {forecast_value:.1f}%")

## 4. Generate Scenario Forecasts

Create three scenarios incorporating event impacts:
- **Optimistic**: Events have 30% higher impact than expected
- **Base**: Events have expected impact
- **Pessimistic**: Events have 30% lower impact than expected

In [None]:
# Generate the scenario table for every indicator that has a baseline forecast,
# store it by indicator code, and print the end-of-horizon value per scenario
# together with the base-case interval for 2027.
scenario_forecasts = {}

for code, name in PRIMARY_INDICATORS.items():
    if code not in baseline_forecasts:
        # No baseline was produced for this indicator, so there is nothing to adjust.
        continue

    scenarios = generate_all_scenarios(
        baseline_forecasts[code],
        observations,
        impacts,
        events,
        code
    )
    scenario_forecasts[code] = scenarios

    print(f"\n=== SCENARIO FORECASTS: {name} ===")

    for scenario_type in ['pessimistic', 'base', 'optimistic']:
        # Sort by date so the last row really is the latest forecast year;
        # the row order returned by generate_all_scenarios is not guaranteed here.
        scenario_data = scenarios[scenarios['scenario'] == scenario_type].sort_values('observation_date')
        if not scenario_data.empty:
            final_value = scenario_data.iloc[-1]['value_numeric']
            print(f"{scenario_type.capitalize()}: {final_value:.1f}% by 2027")

    # Show confidence interval for base case. Both bound columns must be
    # present, because both are read below.
    base_2027 = scenarios[(scenarios['scenario'] == 'base') &
                          (scenarios['observation_date'].dt.year == 2027)]
    has_ci_columns = {'ci_lower', 'ci_upper'}.issubset(base_2027.columns)
    if not base_2027.empty and has_ci_columns:
        ci_lower = base_2027.iloc[0]['ci_lower']
        ci_upper = base_2027.iloc[0]['ci_upper']
        print(f"\n95% Confidence Interval (2027): [{ci_lower:.1f}%, {ci_upper:.1f}%]")

## 5. Visualize Forecast Scenarios

In [None]:
# One panel per primary indicator: historical series, the three scenario
# paths, the base-case interval band, and the policy target line if one is found.
fig, axes = plt.subplots(1, 2, figsize=(18, 7))

colors = {
    'optimistic': '#27AE60',
    'base': '#3498DB',
    'pessimistic': '#E74C3C'
}

for idx, (code, name) in enumerate(PRIMARY_INDICATORS.items()):
    ax = axes[idx]

    # Plot historical data
    hist_data = observations[observations['indicator_code'] == code].copy()
    hist_data = hist_data.sort_values('observation_date')

    if not hist_data.empty:
        ax.plot(hist_data['observation_date'], hist_data['value_numeric'],
                marker='o', linewidth=2.5, markersize=10, label='Historical',
                color='#34495E', zorder=5)

    # Plot scenarios
    if code in scenario_forecasts:
        scenarios = scenario_forecasts[code]

        for scenario_type in ['pessimistic', 'base', 'optimistic']:
            scenario_data = scenarios[scenarios['scenario'] == scenario_type].sort_values('observation_date')

            if not scenario_data.empty:
                # The base case is drawn solid and slightly thicker so it stands out.
                linestyle = '-' if scenario_type == 'base' else '--'
                linewidth = 2.5 if scenario_type == 'base' else 2

                ax.plot(scenario_data['observation_date'], scenario_data['value_numeric'],
                        linestyle=linestyle, linewidth=linewidth,
                        label=f'{scenario_type.capitalize()} Scenario',
                        color=colors[scenario_type], alpha=0.8)

        # Add confidence interval for base scenario (both bound columns are required).
        base_data = scenarios[scenarios['scenario'] == 'base'].sort_values('observation_date')
        if not base_data.empty and {'ci_lower', 'ci_upper'}.issubset(base_data.columns):
            ax.fill_between(base_data['observation_date'],
                            base_data['ci_lower'],
                            base_data['ci_upper'],
                            alpha=0.2, color=colors['base'],
                            label='95% Confidence Interval')

    # Add target line if exists. Prefer an exact indicator_code match; only
    # fall back to the looser keyword match (second token of the code, e.g.
    # 'OWNERSHIP') when no exact row exists, because a substring match can
    # select a target that belongs to a different indicator.
    target_row = targets[targets['indicator_code'] == code]
    if target_row.empty:
        keyword = code.split('_')[1]
        target_row = targets[
            targets['indicator_code'].str.contains(keyword, case=False, na=False, regex=False)
        ]
    if not target_row.empty:
        target_value = target_row.iloc[0]['value_numeric']
        ax.axhline(y=target_value, color='red', linestyle=':', linewidth=2,
                   label=f'2027 Target ({target_value}%)', alpha=0.7)

    ax.set_title(f'{name}\nForecast Scenarios (2025-2027)', fontsize=14, fontweight='bold', pad=15)
    ax.set_xlabel('Year', fontsize=12, fontweight='bold')
    ax.set_ylabel('Percentage (%)', fontsize=12, fontweight='bold')
    ax.legend(loc='best', fontsize=10)
    ax.grid(True, alpha=0.3)

# Create the output folder if it is missing so savefig does not fail on a fresh checkout.
figure_path = '../reports/figures/task4_scenario_forecasts.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

plt.tight_layout()
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

print("✓ Scenario forecasts chart saved")

## 6. Calculate Forecast Metrics

In [None]:
# Print headline metrics for each primary indicator: base-case end value,
# growth over the horizon, the scenario range, and the comparison with the
# policy target when one is available.
print("\n" + "="*80)
print("FORECAST METRICS AND ANALYSIS")
print("="*80)

for code, name in PRIMARY_INDICATORS.items():
    print(f"\n{'='*80}")
    print(f"{name}")
    print(f"{'='*80}")

    if code not in scenario_forecasts:
        # No scenarios were generated for this indicator; only the header is shown.
        continue

    # Get target value if exists. Prefer an exact indicator_code match and
    # fall back to the looser keyword match (second token of the code) only
    # when no exact row exists, since a substring match can hit another indicator.
    target_row = targets[targets['indicator_code'] == code]
    if target_row.empty:
        keyword = code.split('_')[1]
        target_row = targets[
            targets['indicator_code'].str.contains(keyword, case=False, na=False, regex=False)
        ]
    target_value = target_row.iloc[0]['value_numeric'] if not target_row.empty else None
    # A blank (NaN) target cell means "no target"; NaN is truthy, so it must
    # be normalised explicitly or it would be printed as a real target.
    if target_value is not None and pd.isna(target_value):
        target_value = None

    metrics = calculate_forecast_metrics(scenario_forecasts[code], target_value)

    # Format the base-case value once. Applying ':.1f' to the 'N/A' fallback
    # string would raise ValueError, so the fallback is handled separately.
    final_forecast = metrics.get('final_forecast')
    final_text = f"{final_forecast:.1f}%" if final_forecast is not None else "N/A"

    print(f"\nBase Case Forecast (2027): {final_text}")
    # ':+.1f' prints the real sign; a hard-coded '+' would show '+-1.0' for a decline.
    print(f"Total Growth (2025-2027): {metrics.get('total_growth', 0):+.1f} percentage points")
    print(f"Average Annual Growth: {metrics.get('avg_annual_growth', 0):+.1f} pp/year")

    if 'best_case' in metrics:
        print(f"\nScenario Range:")
        print(f"  Optimistic: {metrics['best_case']:.1f}%")
        print(f"  Base: {final_text}")
        print(f"  Pessimistic: {metrics['worst_case']:.1f}%")
        print(f"  Range: {metrics['scenario_range']:.1f} pp")

    # 'is not None' rather than truthiness, so a numeric target of 0 is still reported.
    if target_value is not None:
        print(f"\nTarget Comparison:")
        print(f"  2027 Target: {target_value:.1f}%")
        print(f"  Gap to Target: {metrics.get('gap_to_target', 0):.1f} pp")
        print(f"  On Track: {'Yes ✓' if metrics.get('on_track', False) else 'No ✗'}")

## 7. Key Drivers Analysis

In [None]:
# List, for each primary indicator, the event impacts linked to it, ordered
# from the largest to the smallest effect size.
print("\n" + "="*80)
print("KEY DRIVERS OF FORECAST")
print("="*80)

for code, name in PRIMARY_INDICATORS.items():
    print(f"\n{name}:")
    print("-" * 80)

    # Get impacts for this indicator.
    # NOTE(review): other cells match on 'indicator_code'; confirm that
    # impacts['indicator'] really holds indicator codes such as 'ACC_OWNERSHIP'.
    indicator_impacts = impacts[impacts['indicator'] == code].copy()

    if indicator_impacts.empty:
        print("  No direct event impacts modeled")
        continue

    # Merge with event details. Overlapping event columns get the '_event'
    # suffix, which is where 'indicator_event' below comes from.
    impact_details = indicator_impacts.merge(
        events[['record_id', 'indicator', 'observation_date']],
        left_on='parent_id',
        right_on='record_id',
        suffixes=('', '_event')
    )

    # Sort by impact magnitude: rank on the absolute value so that a large
    # negative impact is not pushed to the bottom of the list.
    impact_details = impact_details.sort_values(
        'impact_estimate', key=lambda col: col.abs(), ascending=False
    )

    for _, imp in impact_details.iterrows():
        # ':+.1f' prints the real sign; a hard-coded '+' would show '+-1.0'
        # for a negative impact.
        print(f"  • {imp['indicator_event']}: {imp['impact_estimate']:+.1f} pp (lag: {imp['lag_months']} months)")

## 8. Uncertainty Quantification

In [None]:
# Banner for the uncertainty section: blank line, rule, title, rule.
print("\n" + "="*80, "UNCERTAINTY ANALYSIS", "="*80, sep="\n")

# Static narrative: known limitations of the data and model, followed by a
# qualitative confidence rating for each forecast year.
print("""
KEY UNCERTAINTIES:

1. DATA LIMITATIONS
   - Findex surveys only every 3 years (limited data points)
   - Recent events (M-Pesa, Interoperability) lack validation data
   - Active vs. registered account gap not fully captured
   - Regional and demographic variations not modeled

2. MODEL ASSUMPTIONS
   - Linear trend extrapolation may not hold
   - Event impacts assumed additive (no interactions)
   - Fixed lag periods (reality varies)
   - No saturation effects modeled

3. EXTERNAL FACTORS NOT MODELED
   - Macroeconomic conditions (inflation, growth)
   - Political stability and policy continuity
   - Competitive dynamics between providers
   - Technology adoption rates
   - Agent network quality and distribution

4. COMPARABLE EVIDENCE TRANSFERABILITY
   - Kenya/Tanzania contexts differ from Ethiopia
   - Market maturity differences
   - Regulatory environment variations
   - Consumer behavior differences

CONFIDENCE ASSESSMENT:
- 2025 forecasts: MODERATE confidence (near-term, most events already occurred)
- 2026 forecasts: MODERATE-LOW confidence (medium-term uncertainty increases)
- 2027 forecasts: LOW-MODERATE confidence (long-term, many unknowns)

RECOMMENDATION:
Use scenario ranges rather than point estimates for decision-making.
Update forecasts as new Findex data becomes available.
""")

## 9. Export Forecast Results

In [None]:
# Combine all scenario forecasts, write them to CSV, and export a pivoted
# summary table (rows: indicator and scenario, columns: forecast year).
#
# assign() returns a tagged copy, so the frames stored in scenario_forecasts
# are not modified by this cell.
all_forecasts = [
    scenarios.assign(indicator_code=code)
    for code, scenarios in scenario_forecasts.items()
]

if all_forecasts:
    combined_forecasts = pd.concat(all_forecasts, ignore_index=True)

    # Save to CSV (create the folder first so a fresh checkout does not fail).
    output_path = '../data/processed/task4_forecasts_2025_2027.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_forecasts.to_csv(output_path, index=False)
    print(f"✓ Forecasts saved to {output_path}")

    # Create summary table: one record per indicator / year / scenario that
    # has at least one matching forecast row (the first match is used).
    summary_data = []

    for code, name in PRIMARY_INDICATORS.items():
        for year in FORECAST_YEARS:
            for scenario in ['pessimistic', 'base', 'optimistic']:
                row = combined_forecasts[
                    (combined_forecasts['indicator_code'] == code) &
                    (combined_forecasts['observation_date'].dt.year == year) &
                    (combined_forecasts['scenario'] == scenario)
                ]

                if not row.empty:
                    summary_data.append({
                        'Indicator': name,
                        'Year': year,
                        'Scenario': scenario.capitalize(),
                        'Forecast (%)': round(row.iloc[0]['value_numeric'], 1)
                    })

    if summary_data:
        summary_df = pd.DataFrame(summary_data)
        summary_pivot = summary_df.pivot_table(
            index=['Indicator', 'Scenario'],
            columns='Year',
            values='Forecast (%)'
        )

        print("\n=== FORECAST SUMMARY TABLE ===")
        print(summary_pivot)

        summary_path = '../reports/task4_forecast_summary.csv'
        os.makedirs(os.path.dirname(summary_path), exist_ok=True)
        summary_pivot.to_csv(summary_path)
        print("\n✓ Summary table saved to reports/task4_forecast_summary.csv")
    else:
        # pivot_table cannot be built from a frame with no rows or columns, so
        # report the situation instead of failing with a KeyError.
        print("\nWarning: No forecast rows matched the requested years/scenarios; summary table not created")

print("\n✅ Task 4 Complete! Forecasts ready for dashboard visualization.")