In [None]:
pip install dtaidistance

In [None]:
### 1. Import Necessary Libraries ###

import pandas as pd
import numpy as np
from dtaidistance import dtw
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Suppress scientific notation in NumPy: with suppress=True, floating point
# values in printed NumPy arrays use fixed-point notation.
np.set_printoptions(suppress=True)

In [None]:
### 2. Load the Data from CSV File ###

# Read the dataset from the CSV file (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='growth_forecast_data2.csv')

# Preview the first five rows of the loaded frame
print(data.head(n=5))

In [None]:
### 3. Define the Base Period ###

# Indicator columns compared between the target country and every candidate
# period; the abbreviations are spelled out in the KEY below.
variables = [
    'GDP_cap',
    'Cap_OR',
    'Con_OR',
    'LP',
    'G_rgdp',
    'G_pop',
]

"""
KEY:

GDP_cap = 'GDP per capita'
Cap_OR = 'Capital-output ratio'
Con_OR = 'Consumption-output ratio'
LP = 'Labor productivity'
G_rgdp = 'Real GDP growth'
G_pop = 'Population growth'

"""

# Order rows chronologically within each country before differencing.
# groupby(...).diff() subtracts the previous *row* of the group, so the result
# is only a year-on-year change when rows are in year order; the row order of
# the CSV is not established anywhere in this notebook, so enforce it here.
data.sort_values(['Country', 'Year'], inplace=True)

# Calculate first differences for the required variables, separately per
# country so a difference never spans two different countries
diff_columns = [var + '_diff' for var in variables]
for var in variables:
    data[var + '_diff'] = data.groupby('Country')[var].diff()

# Drop rows with NaN values resulting from the diff calculation (the first
# observation of each country, plus rows where a source value was missing).
# The check is restricted to the diff columns so that a NaN in an unrelated
# column of the CSV does not silently remove otherwise usable rows.
data.dropna(subset=diff_columns, inplace=True)

# Normalize the data using StandardScaler. The scaler is refit for every
# variable, over all remaining rows of all countries together.
scaler = StandardScaler()
for var in variables:
    # fit_transform takes a 2-D (n_samples, 1) array and returns the same
    # shape; flatten it back to 1-D before storing it as a column.
    normalized = scaler.fit_transform(data[var + '_diff'].values.reshape(-1, 1))
    data[var + '_diff_norm'] = normalized.ravel()

# Target country and the inclusive year window used as the comparison baseline
target_country = 'Cambodia'
base_period_end = 2019
base_period_start = base_period_end - 4  # 5 years base period

# Keep only the target country's rows whose Year falls inside
# [base_period_start, base_period_end] (both ends included)
base_period_data = data[
    (data['Country'] == target_country)
    & data['Year'].between(base_period_start, base_period_end)
]

In [None]:
### 4. Compute DTW Distances and Identify Similar Periods in Other Countries' Histories ###

# Candidate windows must cover the same number of years as the base period, so
# derive the window size from the base period instead of repeating a constant.
window_span = base_period_end - base_period_start  # years between first and last year
window_length = window_span + 1                    # number of yearly observations

# Fail early with a clear message instead of comparing against an empty series
if base_period_data.empty:
    raise ValueError(
        f'No base period data for {target_country} between '
        f'{base_period_start} and {base_period_end}'
    )

# The base-period series are identical for every candidate window: extract once
base_series = {
    var: base_period_data[var + '_diff_norm'].values for var in variables
}

# Calculate DTW distances for each variable's normalized first differences
distances = []
countries = data['Country'].unique()

for country in countries:
    if country == target_country:
        continue
    country_data = data[data['Country'] == country]
    first_year = int(country_data['Year'].min())
    last_year = int(country_data['Year'].max())
    # range() excludes its stop value, so the latest window evaluated ends at
    # last_year - 1.
    # NOTE(review): the window ending in the country's final year is never
    # scored - confirm this is intended.
    for start_year in range(first_year, last_year - window_span):
        end_year = start_year + window_span
        period_data = country_data[(country_data['Year'] >= start_year) &
                                   (country_data['Year'] <= end_year)]
        # Skip windows with missing years so only complete periods are compared
        if len(period_data) < window_length:
            continue
        # Total distance = sum of the per-variable DTW distances
        distance = 0
        for var in variables:
            distance += dtw.distance(base_series[var],
                                     period_data[var + '_diff_norm'].values)
        distances.append([country, start_year, end_year, distance])

# Collect the scored windows into a DataFrame ordered by ascending distance,
# so the most similar country-periods come first
distance_df = pd.DataFrame(
    distances,
    columns=['Country', 'Start Year', 'End Year', 'Distance'],
).sort_values('Distance')

# Display top 50 similar country-periods based on DTW distances
top_50 = distance_df.iloc[:50]
print(top_50)

In [None]:
### 5. Forecast Future Growth Based on the Most Similar Periods ###

# Calculate the forecasted growth paths
forecast_horizon = 11
top_percentages = [0.2, 0.5, 1, 2]
forecasts = {}

# Real GDP growth of every country indexed by year, built once so each analogue
# lookup is a reindex instead of a scan of the whole frame.
# Assumes one row per (Country, Year); the first row is kept if duplicates exist.
growth_by_country = {
    name: rows.drop_duplicates('Year').set_index('Year')['G_rgdp']
    for name, rows in data.groupby('Country')
}


def _growth_after(country, last_year, horizon):
    """Return `country`'s G_rgdp for the `horizon` years following `last_year`.

    The result always has length `horizon`; position i holds the value for
    year `last_year + 1 + i`, or NaN when that year is absent from `data`.
    Values are aligned by year, so a gap in the data cannot shift later
    observations into the wrong slot.
    """
    wanted_years = np.arange(last_year + 1, last_year + horizon + 1)
    return growth_by_country[country].reindex(wanted_years).to_numpy(dtype=float)


def _nan_weighted_mean(matrix, weights):
    """Column-wise weighted mean of `matrix` (rows = analogues) ignoring NaNs.

    A NaN entry contributes neither its value nor its weight to its column.
    Columns without any valid entry yield NaN.
    """
    valid = ~np.isnan(matrix)
    effective_weights = np.where(valid, weights[:, None], 0.0)
    weighted_sum = (np.where(valid, matrix, 0.0) * effective_weights).sum(axis=0)
    weight_total = effective_weights.sum(axis=0)
    # np.where evaluates both branches, so silence the 0/0 of empty columns
    with np.errstate(invalid='ignore', divide='ignore'):
        return np.where(weight_total > 0, weighted_sum / weight_total, np.nan)


for top_percentage in top_percentages:
    top_n = int(len(distance_df) * top_percentage / 100)
    # With few candidate periods the percentage can round down to zero;
    # always keep at least the single best match when one exists.
    if len(distance_df) > 0:
        top_n = max(top_n, 1)
    selected_periods = distance_df.head(top_n)

    # The analogue path starts in the year AFTER the matched period ends, in
    # the same way the target's forecast starts after base_period_end.
    gdp_growth_forecasts = [
        _growth_after(country, int(end_year), forecast_horizon)
        for country, end_year in zip(selected_periods['Country'],
                                     selected_periods['End Year'])
    ]

    # One row per analogue period, one column per forecast year
    forecast_matrix = np.array(gdp_growth_forecasts, dtype=float).reshape(-1, forecast_horizon)

    # Inverse-distance weights; the distance is floored at machine epsilon so
    # an exact match (distance 0) gets a very large but finite weight.
    selected_distances = selected_periods['Distance'].to_numpy(dtype=float)
    weights = 1 / np.maximum(selected_distances, np.finfo(float).eps)

    forecasts[top_percentage] = {
        'Median': np.nanmedian(forecast_matrix, axis=0),
        'Average': np.nanmean(forecast_matrix, axis=0),
        'Weighted Average': _nan_weighted_mean(forecast_matrix, weights),
        '45th Percentile': np.nanpercentile(forecast_matrix, 45, axis=0),
        '55th Percentile': np.nanpercentile(forecast_matrix, 55, axis=0),
        '60th Percentile': np.nanpercentile(forecast_matrix, 60, axis=0),
        '65th Percentile': np.nanpercentile(forecast_matrix, 65, axis=0),
        '70th Percentile': np.nanpercentile(forecast_matrix, 70, axis=0)
    }

In [None]:
### 6. Calculate RMSE for Different Forecast Options ###

# Observed real GDP growth of the target country for the years that follow
# the base period, up to the end of the forecast horizon
actual_growth = data.loc[
    (data['Country'] == target_country)
    & (data['Year'] > base_period_end)
    & (data['Year'] <= base_period_end + forecast_horizon),
    'G_rgdp',
].values

# When fewer observations than forecast_horizon exist, right-pad with np.nan
# so the array length equals forecast_horizon
if forecast_horizon > len(actual_growth):
    actual_growth = np.pad(
        actual_growth,
        (0, forecast_horizon - len(actual_growth)),
        'constant',
        constant_values=np.nan,
    )

# Ensure there are no NaN values in the RMSE calculation by filtering them out
def safe_rmse(a, b):
    """Root-mean-square error between `a` and `b`, skipping NaN positions.

    Parameters
    ----------
    a, b : array-like of numbers
        Sequences of equal length (NumPy arrays, lists or tuples). They are
        converted to float arrays, so plain Python sequences are accepted.

    Returns
    -------
    float
        RMSE over the positions where both inputs are non-NaN, or np.nan when
        no such position exists.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    # A position is usable only when neither side is NaN
    mask = ~(np.isnan(a) | np.isnan(b))
    if not mask.any():
        return np.nan
    return np.sqrt(np.mean((a[mask] - b[mask]) ** 2))

# RMSE of every forecast variant against the observed growth, keyed first by
# top percentage and then by forecast label
rmse_values = {
    top_percentage: {
        key: safe_rmse(actual_growth, value)
        for key, value in forecast.items()
    }
    for top_percentage, forecast in forecasts.items()
}

# Columns are the top percentages, rows are the forecast labels
rmse_df = pd.DataFrame(rmse_values)
print(rmse_df)

In [None]:
### 7. Visualize the Results ###

# Plot the forecasted growth paths for each top percentage.
# The x axis covers the forecast_horizon years that follow the base period.
years = np.arange(base_period_end + 1, base_period_end + forecast_horizon + 1)

for top_percentage in top_percentages:
    # Explicit figure/axes objects instead of the implicit pyplot state
    fig, ax = plt.subplots(figsize=(16, 8))
    for label, values in forecasts[top_percentage].items():
        ax.plot(years, values, label=f'{label} Forecast', linestyle='--')

    ax.set_xlabel('Year')
    ax.set_ylabel('Real GDP Growth')
    # Use the configured target country so the title stays correct when
    # target_country is changed
    ax.set_title(f'Forecasted Real GDP Growth of {target_country} (Top {top_percentage}%)')
    # One tick per forecast year, so no fractional years appear on the axis
    ax.set_xticks(years)
    ax.grid(True)
    ax.legend()
    plt.show()
    # Release the figure; one is created per top percentage
    plt.close(fig)