In [9]:
import os
import json
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_squared_error

def train_prophet(train_df, test_y):
    train_df['ds'] = pd.to_datetime(train_df['ds'])  # Ensure datetime format
    
    param_grid = {
        'changepoint_prior_scale': [0.01, 0.1, 0.5],
        'seasonality_mode': ['additive', 'multiplicative'],
    }

    best_rmse = float('inf')
    best_params = None

    for changepoint_prior in param_grid['changepoint_prior_scale']:
        for seasonality_mode in param_grid['seasonality_mode']:
            model = Prophet(
                yearly_seasonality=True,
                weekly_seasonality=False,
                daily_seasonality=False,
                changepoint_prior_scale=changepoint_prior,
                seasonality_mode=seasonality_mode
            )
            model.fit(train_df)

            future = model.make_future_dataframe(periods=len(test_y), freq='Y')
            forecast = model.predict(future)
            predictions = forecast['yhat'].iloc[-len(test_y):].values
            rmse = np.sqrt(mean_squared_error(test_y, predictions))

            if rmse < best_rmse:
                best_rmse = rmse
                best_params = (changepoint_prior, seasonality_mode)

    # Train best model
    best_model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=best_params[0],
        seasonality_mode=best_params[1]
    )
    best_model.fit(train_df)
    future = best_model.make_future_dataframe(periods=len(test_y), freq='Y')
    forecast = best_model.predict(future)
    predictions = forecast['yhat'].iloc[-len(test_y):].values

    return np.sqrt(mean_squared_error(test_y, predictions))


# Load country and indicator data
with open("countries.json", "r") as f:
    country_names = json.load(f)

with open("indicators.json", "r") as f:
    indicators = json.load(f)

data_folder = "data"
model_errors_rmse = {}
log_data = []

for country, country_code in country_names.items():
    for indicator, indicator_code in indicators.items():
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)
        
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            if 'Year' in df.columns and 'Value' in df.columns:
                df = df.set_index('Year').sort_index()
                df = df.drop('Indicator', axis = 1)
                df.index = pd.to_datetime(df.index, format='%Y')
                df = df.dropna()

                # Train-Test Split
                train_size = int(len(df) * 0.8)
                train, test = df.iloc[:train_size], df.iloc[train_size:]

                # Prepare data for Prophet
                prophet_train_df = train.reset_index().rename(columns={'Year': 'ds', 'Value': 'y'})
                prophet_test_df = test.reset_index().rename(columns={'Year': 'ds', 'Value': 'y'})

                # Convert 'ds' to datetime explicitly
                prophet_train_df['ds'] = pd.to_datetime(prophet_train_df['ds'])
                prophet_test_df['ds'] = pd.to_datetime(prophet_test_df['ds'])

                # Train and evaluate Prophet model
                model_errors_rmse[(country, indicator)] = {}
                model_errors_rmse[(country, indicator)]['Prophet'] = train_prophet(prophet_train_df, prophet_test_df['y'])
                
                sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
                for rank, (model_name, rmse) in enumerate(sorted_models, start=1):
                    log_data.append([country, indicator, model_name, rmse, rank])

# Save results
log_df = pd.DataFrame(log_data, columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'])
log_df.to_csv("model_PROPHET_error_log5.csv", index=False)


13:36:37 - cmdstanpy - INFO - Chain [1] start processing
13:36:38 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
13:36:38 - cmdstanpy - INFO - Chain [1] start processing
13:36:38 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
13:36:38 - cmdstanpy - INFO - Chain [1] start processing
13:36:38 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
13:36:38 - cmdstanpy - INFO - Chain [1] start processing
13:36:38 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
13:36:38 - cmdstanpy - INFO - Chain [1] start processing
13:36:39 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
13:36:39 - cmdstanpy - INFO - Chain [1] start processing
13:36:39 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
13:36:39 - cmdstanpy - INFO - Chain [1] start processing
13:36:40 - cmdstanpy - INFO - Chain [1] done processing
  dates = pd.date_range(
13:36:40 - cmdstanpy - INFO - Chai

In [None]:
import os
import json
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error

def train_holt_winters(train, test_y):
    param_grid = {
        'trend': [None, 'add', 'mul'],
        'seasonal': [None, 'add', 'mul'],
        'seasonal_periods': [None, 12]
    }

    best_rmse = float('inf')
    best_params = None

    for trend in param_grid['trend']:
        for seasonal in param_grid['seasonal']:
            for seasonal_periods in param_grid['seasonal_periods']:
                try:
                    model = ExponentialSmoothing(train, trend=trend, seasonal=seasonal, seasonal_periods=seasonal_periods)
                    fitted_model = model.fit()
                    predictions = fitted_model.forecast(len(test_y))
                    rmse = np.sqrt(mean_squared_error(test_y, predictions))

                    if rmse < best_rmse:
                        best_rmse = rmse
                        best_params = (trend, seasonal, seasonal_periods)
                except:
                    continue

    best_model = ExponentialSmoothing(train, trend=best_params[0], seasonal=best_params[1], seasonal_periods=best_params[2])
    best_fitted_model = best_model.fit()
    predictions = best_fitted_model.forecast(len(test_y))

    return np.sqrt(mean_squared_error(test_y, predictions))

with open("countries.json", "r") as f:
    country_names = json.load(f)

with open("indicators.json", "r") as f:
    indicators = json.load(f)

data_folder = "data"
model_errors_rmse = {}
log_data = []

for country, country_code in country_names.items():
    for indicator, indicator_code in indicators.items():
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)
        
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            if 'Year' in df.columns and 'Value' in df.columns:
                df = df.set_index('Year').sort_index()
                df = df.drop('Indicator', axis=1)
                df.index = pd.to_datetime(df.index, format='%Y')
                df = df.dropna()

                train_size = int(len(df) * 0.8)
                train, test = df.iloc[:train_size], df.iloc[train_size:]

                model_errors_rmse[(country, indicator)] = {}
                model_errors_rmse[(country, indicator)]['Holt-Winters'] = train_holt_winters(train['Value'], test['Value'])
                
                sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
                for rank, (model_name, rmse) in enumerate(sorted_models, start=1):
                    log_data.append([country, indicator, model_name, rmse, rank])

log_df = pd.DataFrame(log_data, columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'])
log_df.to_csv("model_HOLT_WINTERS_error_log.csv", index=False)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

In [None]:
import os
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.layers import Bidirectional, GRU
from keras.optimizers import Nadam
from keras.callbacks import LearningRateScheduler, EarlyStopping
from keras.layers import Input, LSTM, Dropout, Dense, Bidirectional, BatchNormalization
from keras.optimizers import Nadam
from keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau

def create_sequences(data, seq_length):
    sequences = []
    labels = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])
        labels.append(data[i + seq_length])
    return np.array(sequences), np.array(labels)

def train_lstm(train, test_y, seq_length=5, epochs=50, batch_size=16):
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train.values.reshape(-1, 1))
    test_scaled = scaler.transform(test_y.values.reshape(-1, 1))
    
    X_train, y_train = create_sequences(train_scaled, seq_length)
    X_test, y_test = create_sequences(test_scaled, seq_length)
    
    model = Sequential([
        LSTM(100, activation='relu', return_sequences=True, input_shape=(seq_length, 1)),
        Dropout(0.3),
        LSTM(100, activation='relu'),
        Dropout(0.3),
        Dense(1)
    ])
    
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=0, callbacks=[early_stop])
    
    predictions = model.predict(X_test).flatten()
    predictions = scaler.inverse_transform(predictions.reshape(-1, 1)).flatten()
    
    rmse = np.sqrt(mean_squared_error(test_y[seq_length:], predictions))

    
    return rmse

with open("countries.json", "r") as f:
    country_names = json.load(f)

with open("indicators.json", "r") as f:
    indicators = json.load(f)

data_folder = "data"
model_errors_rmse = {}
log_data = []

for country, country_code in country_names.items():
    for indicator, indicator_code in indicators.items():
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)
        
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            if 'Year' in df.columns and 'Value' in df.columns:
                df = df.set_index('Year').sort_index()
                df = df.drop('Indicator', axis=1)
                df.index = pd.to_datetime(df.index, format='%Y')
                df = df.dropna()

                train_size = int(len(df) * 0.8)
                train, test = df.iloc[:train_size], df.iloc[train_size:]

                model_errors_rmse[(country, indicator)] = {}
                model_errors_rmse[(country, indicator)]['LSTM'] = train_lstm(train['Value'], test['Value'])
                
                sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
                for rank, (model_name, rmse) in enumerate(sorted_models, start=1):
                    log_data.append([country, indicator, model_name, rmse, rank])

log_df = pd.DataFrame(log_data, columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'])
log_df.to_csv("model_LSTM_error_log.csv", index=False)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 535ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 468ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 453ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 605ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 453ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 746ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 653ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 570ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 468ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 558ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 454ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 509ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 640ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 