In [8]:
import os
import json
import pandas as pd
import numpy as np
import xgboost as xgb
from prophet import Prophet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense , Dropout
from sklearn.preprocessing import MinMaxScaler
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

def save_plot(train, test_y, predictions, country, indicator, model_name):
    """Function to save the plot in both Indicators and Countries folders."""

    # Plotting predicted vs actual
    plt.figure(figsize=(10, 6))
    if model_name == "Prophet":
        plt.plot(train['ds'], train['y'], label='Train Data', color='green', linestyle='--')
        plt.plot(test_y['ds'], test_y['y'], label='Actual', color='blue', linestyle='-', marker='o')
        plt.plot(test_y['ds'], predictions, label='Predicted', color='red', linestyle='-', marker='x')
    else:
        plt.plot(train.index, train, label='Train Data', color='green', linestyle='--')
        plt.plot(test_y.index, test_y, label='Actual', color='blue', linestyle='-', marker='o')
        plt.plot(test_y.index, predictions, label='Predicted', color='red', linestyle='-', marker='x')
    

    
    plt.title(f'Predicted vs Actual for {country} - {indicator}')
    plt.xlabel('Year')
    plt.ylabel('Value')
    plt.legend()

    # Create subfolder for the indicator if it doesn't exist
    indicator_folder = os.path.join('images', 'model_plot', 'Indicators', indicator)
    os.makedirs(indicator_folder, exist_ok=True)
    
    # Save the plot in the Indicators folder with dynamic model name
    plot_filename_indicator = os.path.join(indicator_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_indicator)

    # Create subfolder for the country if it doesn't exist
    country_folder = os.path.join('images', 'model_plot', 'Countries', country)
    os.makedirs(country_folder, exist_ok=True)
    
    # Save the same plot in the Countries folder with dynamic model name
    plot_filename_country = os.path.join(country_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_country)

    plt.close()






def train_prophet(train_df, test_y,country,indicator):
    #train_df['ds'] = pd.to_datetime(train_df['ds'])  # Ensure datetime format
    
    param_grid = {
        'changepoint_prior_scale': [0.01, 0.1, 0.5],
        'seasonality_mode': ['additive', 'multiplicative'],
    }

    best_rmse = float('inf')
    best_params = None

    for changepoint_prior in param_grid['changepoint_prior_scale']:
        for seasonality_mode in param_grid['seasonality_mode']:
            model = Prophet(
                yearly_seasonality=True,
                weekly_seasonality=False,
                daily_seasonality=False,
                changepoint_prior_scale=changepoint_prior,
                seasonality_mode=seasonality_mode
            )
            model.fit(train_df)

            future = model.make_future_dataframe(periods=len(test_y['y']), freq='Y')
            forecast = model.predict(future)
            predictions = forecast['yhat'].iloc[-len(test_y['y']):].values
            rmse = np.sqrt(mean_squared_error(test_y['y'], predictions))

            if rmse < best_rmse:
                best_rmse = rmse
                best_params = (changepoint_prior, seasonality_mode)

    # Train best model
    best_model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=best_params[0],
        seasonality_mode=best_params[1]
    )
    best_model.fit(train_df)
    future = best_model.make_future_dataframe(periods=len(test_y['y']), freq='Y')
    forecast = best_model.predict(future)
    predictions = forecast['yhat'].iloc[-len(test_y['y']):].values

    save_plot(train_df, test_y, predictions, country, indicator, model_name="Prophet")
    return np.sqrt(mean_squared_error(test_y['y'], predictions))




with open("countries.json", "r") as f:
    country_names = json.load(f)

with open("indicators.json", "r") as f:
    indicators = json.load(f)

data_folder = "data"
model_errors_rmse = {}
log_data = []

for country, country_code in country_names.items():
    for indicator, indicator_code in indicators.items():
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)
        
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            if 'Year' in df.columns and 'Value' in df.columns:
                df = df.set_index('Year').sort_index()
                df.index = pd.to_datetime(df.index, format='%Y')
                df = df.dropna()
                df = df.drop('Indicator', axis = 1)
                df_original = df.copy()
                
                
                
                #df = df.dropna()
                train_size = int(len(df) * 0.8)
                
                
                model_errors_rmse[(country, indicator)] = {}


                prophet_train_df = df_original.iloc[:train_size].reset_index().rename(columns={'Year': 'ds', 'Value': 'y'})
                prophet_test_df = df_original.iloc[train_size:].reset_index().rename(columns={'Year': 'ds', 'Value': 'y'})
                prophet_train_df['ds'] = pd.to_datetime(prophet_train_df['ds'])
                prophet_test_df['ds'] = pd.to_datetime(prophet_test_df['ds'])
                model_errors_rmse[(country, indicator)]['Prophet'] = train_prophet(prophet_train_df, prophet_test_df,
                                                                                   country,indicator)
                
                sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
                for rank, (model_name, rmse) in enumerate(sorted_models, start=1):
                    log_data.append([country, indicator, model_name, rmse, rank])

log_df = pd.DataFrame(log_data, columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'])
log_df.to_csv("model_error_log.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm


AttributeError: partially initialized module 'patsy' has no attribute 'highlevel' (most likely due to a circular import)

In [37]:
import os
import json
import pandas as pd
import numpy as np
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense , Dropout
from sklearn.preprocessing import MinMaxScaler
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

def save_plot(train, test_y, predictions, country, indicator, model_name):
    """Function to save the plot in both Indicators and Countries folders."""

    # Plotting predicted vs actual
    plt.figure(figsize=(10, 6))
    if model_name == "Prophet":
        plt.plot(train['ds'], train['y'], label='Train Data', color='green', linestyle='--')
        plt.plot(test_y['ds'], test_y['y'], label='Actual', color='blue', linestyle='-', marker='o')
        plt.plot(test_y['ds'], predictions, label='Predicted', color='red', linestyle='-', marker='x')
    else:
        plt.plot(train.index, train, label='Train Data', color='green', linestyle='--')
        plt.plot(test_y.index, test_y, label='Actual', color='blue', linestyle='-', marker='o')
        plt.plot(test_y.index, predictions, label='Predicted', color='red', linestyle='-', marker='x')
    

    
    plt.title(f'Predicted vs Actual for {country} - {indicator}')
    plt.xlabel('Year')
    plt.ylabel('Value')
    plt.legend()

    # Create subfolder for the indicator if it doesn't exist
    indicator_folder = os.path.join('images', 'model_plot', 'Indicators', indicator)
    os.makedirs(indicator_folder, exist_ok=True)
    
    # Save the plot in the Indicators folder with dynamic model name
    plot_filename_indicator = os.path.join(indicator_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_indicator)

    # Create subfolder for the country if it doesn't exist
    country_folder = os.path.join('images', 'model_plot', 'Countries', country)
    os.makedirs(country_folder, exist_ok=True)
    
    # Save the same plot in the Countries folder with dynamic model name
    plot_filename_country = os.path.join(country_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_country)

    plt.close()


def train_xgboost(train_x, train_y, test_x, test_y,country,indicator):
    # Define the parameter grid for GridSearchCV
    top_n_features=4
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5]
    }

    # Initialize the XGBoost regressor
    model = xgb.XGBRegressor(objective='reg:squarederror')

    # Perform GridSearchCV to find the best hyperparameters
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')
    grid_search.fit(train_x, train_y)

    # Get the best model from GridSearchCV
    best_model = grid_search.best_estimator_

    # Perform Recursive Feature Elimination (RFE)
    selector = RFE(estimator=best_model, n_features_to_select=top_n_features)
    selector = selector.fit(train_x, train_y)

    # Select the top N important features based on RFE
    selected_train_x = selector.transform(train_x)
    selected_test_x = selector.transform(test_x)

    # Train the model again with the selected features
    best_model.fit(selected_train_x, train_y)

    # Make predictions on the test set using the selected features
    predictions = best_model.predict(selected_test_x)

    save_plot(train_y, test_y, predictions, country, indicator, model_name="XGBoost")

    # Calculate the RMSE on the test set
    return np.sqrt(mean_squared_error(test_y, predictions))




with open("countries.json", "r") as f:
    country_names = json.load(f)

with open("indicators.json", "r") as f:
    indicators = json.load(f)

data_folder = "data"
model_errors_rmse = {}
log_data = []

for country, country_code in country_names.items():
    for indicator, indicator_code in indicators.items():
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)
        
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            if 'Year' in df.columns and 'Value' in df.columns:
                df = df.set_index('Year').sort_index()
                df.index = pd.to_datetime(df.index, format='%Y')
                df = df.dropna()
                df = df.drop('Indicator', axis = 1)
                df_original = df.copy()
                
                for lag in range(1, 6):  
                    df[f'lag_{lag}'] = df['Value'].shift(lag)
                
                df['expanding_mean'] = df['Value'].expanding().mean()
                df['expanding_std'] = df['Value'].expanding().std()
                df['expanding_max'] = df['Value'].expanding().max()
                df['expanding_min'] = df['Value'].expanding().min()
                
                window_size = 3  
                df['rolling_mean'] = df['Value'].rolling(window=window_size, min_periods=1).mean()
                df['rolling_std'] = df['Value'].rolling(window=window_size, min_periods=1).std()
                df['rolling_max'] = df['Value'].rolling(window=window_size, min_periods=1).max()
                df['rolling_min'] = df['Value'].rolling(window=window_size, min_periods=1).min()
                
                
                #df = df.dropna()
                train_size = int(len(df) * 0.8)
                
                train_xgb, test_xgb = df.iloc[:train_size], df.iloc[train_size:]
                feature_columns = ['rolling_mean', 'rolling_std', 'rolling_max', 'rolling_min', 
                                   'expanding_mean', 'expanding_std', 'expanding_max', 'expanding_min',
                                   'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5']
                train_x, train_y = train_xgb[feature_columns], train_xgb['Value']
                test_x, test_y = test_xgb[feature_columns], test_xgb['Value']
                
                model_errors_rmse[(country, indicator)] = {}
                model_errors_rmse[(country, indicator)]['XGBoost'] = train_xgboost(train_x, train_y, test_x, test_y , country,indicator)

                
                sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
                for rank, (model_name, rmse) in enumerate(sorted_models, start=1):
                    log_data.append([country, indicator, model_name, rmse, rank])

log_df = pd.DataFrame(log_data, columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'])
log_df.to_csv("model_error_log.csv", index=False)


In [None]:
train_df['ds']

In [None]:
import os
import json
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

def save_plot(train, test_y, predictions, country, indicator, model_name):
    """Function to save the plot in both Indicators and Countries folders."""
    
    # Plotting predicted vs actual
    plt.figure(figsize=(10, 6))
    plt.plot(train.index, train, label='Train Data', color='green', linestyle='--')
    
    # Connect actual and predicted values with a continuous line
    plt.plot(test_y.index, test_y, label='Actual', color='blue', linestyle='-', marker='o')
    plt.plot(test_y.index, predictions, label='Predicted', color='red', linestyle='-', marker='x')
    
    plt.title(f'Predicted vs Actual for {country} - {indicator}')
    plt.xlabel('Year')
    plt.ylabel('Value')
    plt.legend()

    # Create subfolder for the indicator if it doesn't exist
    indicator_folder = os.path.join('images', 'model_plot', 'Indicators', indicator)
    os.makedirs(indicator_folder, exist_ok=True)
    
    # Save the plot in the Indicators folder with dynamic model name
    plot_filename_indicator = os.path.join(indicator_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_indicator)

    # Create subfolder for the country if it doesn't exist
    country_folder = os.path.join('images', 'model_plot', 'Countries', country)
    os.makedirs(country_folder, exist_ok=True)
    
    # Save the same plot in the Countries folder with dynamic model name
    plot_filename_country = os.path.join(country_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_country)

    plt.close()

def train_holt_winters(train, test_y, country, indicator):
    param_grid = {
        'trend': [None, 'add', 'mul'],
        'seasonal': [None, 'add', 'mul'],
        'seasonal_periods': [None, 12]
    }

    best_rmse = float('inf')
    best_params = None

    for trend in param_grid['trend']:
        for seasonal in param_grid['seasonal']:
            for seasonal_periods in param_grid['seasonal_periods']:
                try:
                    model = ExponentialSmoothing(train, trend=trend, seasonal=seasonal, seasonal_periods=seasonal_periods)
                    fitted_model = model.fit()
                    predictions = fitted_model.forecast(len(test_y))
                    rmse = np.sqrt(mean_squared_error(test_y, predictions))

                    if rmse < best_rmse:
                        best_rmse = rmse
                        best_params = (trend, seasonal, seasonal_periods)
                except:
                    continue

    best_model = ExponentialSmoothing(train, trend=best_params[0], seasonal=best_params[1], seasonal_periods=best_params[2])
    best_fitted_model = best_model.fit()
    predictions = best_fitted_model.forecast(len(test_y))
    
    # Call the save_plot function to save the plot
    save_plot(train, test_y, predictions, country, indicator, model_name="Holt_Winters")

    return np.sqrt(mean_squared_error(test_y, predictions))

with open("countries.json", "r") as f:
    country_names = json.load(f)

with open("indicators.json", "r") as f:
    indicators = json.load(f)

data_folder = "data"
model_errors_rmse = {}
log_data = []

for country, country_code in country_names.items():
    for indicator, indicator_code in indicators.items():
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)
        
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            if 'Year' in df.columns and 'Value' in df.columns:
                df = df.set_index('Year').sort_index()
                df = df.drop('Indicator', axis=1)
                df.index = pd.to_datetime(df.index, format='%Y')
                df = df.dropna()

                train_size = int(len(df) * 0.8)
                train, test = df.iloc[:train_size], df.iloc[train_size:]

                model_errors_rmse[(country, indicator)] = {}
                model_errors_rmse[(country, indicator)]['Holt-Winters'] = train_holt_winters(train['Value'], test['Value'], country, indicator)
                
                sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
                for rank, (model_name, rmse) in enumerate(sorted_models, start=1):
                    log_data.append([country, indicator, model_name, rmse, rank])

log_df = pd.DataFrame(log_data, columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'])
log_df.to_csv("model_HOLT_WINTERS_error_log.csv", index=False)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

In [None]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

def save_plot(train, test_y, predictions, country, indicator, model_name):
    """Function to save the plot in both Indicators and Countries folders."""
    
    # Plotting predicted vs actual
    plt.figure(figsize=(10, 6))
    plt.plot(train.index, train, label='Train Data', color='green', linestyle='--')
    
    # Connect actual and predicted values with a continuous line
    plt.plot(test_y.index, test_y, label='Actual', color='blue', linestyle='-', marker='o')
    plt.plot(test_y.index, predictions, label='Predicted', color='red', linestyle='-', marker='x')
    
    plt.title(f'Predicted vs Actual for {country} - {indicator}')
    plt.xlabel('Year')
    plt.ylabel('Value')
    plt.legend()

    # Create subfolder for the indicator if it doesn't exist
    indicator_folder = os.path.join('images', 'model_plot', 'Indicators', indicator)
    os.makedirs(indicator_folder, exist_ok=True)
    
    # Save the plot in the Indicators folder with dynamic model name
    plot_filename_indicator = os.path.join(indicator_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_indicator)

    # Create subfolder for the country if it doesn't exist
    country_folder = os.path.join('images', 'model_plot', 'Countries', country)
    os.makedirs(country_folder, exist_ok=True)
    
    # Save the same plot in the Countries folder with dynamic model name
    plot_filename_country = os.path.join(country_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_country)

    plt.close()

def prepare_data(df, n_timesteps=1):
    """Prepare the data for LSTM by scaling and reshaping."""
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(df.values.reshape(-1, 1))
    
    X, y = [], []
    for i in range(n_timesteps, len(scaled_data)):
        X.append(scaled_data[i-n_timesteps:i, 0])
        y.append(scaled_data[i, 0])

    X = np.array(X)
    y = np.array(y)
    
    # Reshape X to be suitable for LSTM input: [samples, time steps, features]
    X = X.reshape((X.shape[0], X.shape[1], 1))
    return X, y, scaler

def build_lstm_model(n_timesteps):
    """Build and compile the LSTM model."""
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(n_timesteps, 1)))
    model.add(Dense(1))
    model.compile(optimizer=Adam(), loss='mean_squared_error')
    return model

def train_lstm(train, test_y, country, indicator, n_timesteps=1):
    # Prepare the data
    X_train, y_train, scaler = prepare_data(train, n_timesteps)
    X_test, y_test, _ = prepare_data(test_y, n_timesteps)
    
    # Build the LSTM model
    model = build_lstm_model(n_timesteps)
    
    # Fit the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
    
    # Make predictions
    predictions = model.predict(X_test)
    
    # Inverse transform to get the actual scale values
    predictions = scaler.inverse_transform(predictions)
    test_y_actual = scaler.inverse_transform(y_test.reshape(-1, 1))
    
    rmse = np.sqrt(mean_squared_error(test_y_actual, predictions))
    
    # Call the save_plot function to save the plot
    #save_plot(train, test_y_actual, predictions, country, indicator, model_name="LSTM")

    return rmse

with open("countries.json", "r") as f:
    country_names = json.load(f)

with open("indicators.json", "r") as f:
    indicators = json.load(f)

data_folder = "data"
model_errors_rmse = {}
log_data = []

for country, country_code in country_names.items():
    for indicator, indicator_code in indicators.items():
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)
        
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            if 'Year' in df.columns and 'Value' in df.columns:
                df = df.set_index('Year').sort_index()
                df = df.drop('Indicator', axis=1)
                df.index = pd.to_datetime(df.index, format='%Y')
                df = df.dropna()

                train_size = int(len(df) * 0.8)
                train, test = df.iloc[:train_size], df.iloc[train_size:]

                model_errors_rmse[(country, indicator)] = {}
                model_errors_rmse[(country, indicator)]['LSTM'] = train_lstm(train['Value'], test['Value'], country, indicator)
                
                sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
                for rank, (model_name, rmse) in enumerate(sorted_models, start=1):
                    log_data.append([country, indicator, model_name, rmse, rank])

log_df = pd.DataFrame(log_data, columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'])
log_df.to_csv("model_LSTM_error_log.csv", index=False)


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step


  super().__init__(**kwargs)


In [8]:
import os
import json
import pandas as pd
import numpy as np
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense , Dropout
from sklearn.preprocessing import MinMaxScaler
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

def save_plot(train, test_y, predictions, country, indicator, model_name):
    """Function to save the plot in both Indicators and Countries folders."""

    # Plotting predicted vs actual
    plt.figure(figsize=(10, 6))
    if model_name == "Prophet":
        plt.plot(train['ds'], train['y'], label='Train Data', color='green', linestyle='--')
        plt.plot(test_y['ds'], test_y['y'], label='Actual', color='blue', linestyle='-', marker='o')
        plt.plot(test_y['ds'], predictions, label='Predicted', color='red', linestyle='-', marker='x')
    else:
        plt.plot(train.index, train, label='Train Data', color='green', linestyle='--')
        plt.plot(test_y.index, test_y, label='Actual', color='blue', linestyle='-', marker='o')
        plt.plot(test_y.index, predictions, label='Predicted', color='red', linestyle='-', marker='x')
    

    
    plt.title(f'Predicted vs Actual for {country} - {indicator}')
    plt.xlabel('Year')
    plt.ylabel('Value')
    plt.legend()

    # Create subfolder for the indicator if it doesn't exist
    indicator_folder = os.path.join('images', 'model_plot', 'Indicators', indicator)
    os.makedirs(indicator_folder, exist_ok=True)
    
    # Save the plot in the Indicators folder with dynamic model name
    plot_filename_indicator = os.path.join(indicator_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_indicator)

    # Create subfolder for the country if it doesn't exist
    country_folder = os.path.join('images', 'model_plot', 'Countries', country)
    os.makedirs(country_folder, exist_ok=True)
    
    # Save the same plot in the Countries folder with dynamic model name
    plot_filename_country = os.path.join(country_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_country)

    plt.close()

def train_xgboost(train_x, train_y, test_x, test_y,country,indicator):
    # Define the parameter grid for GridSearchCV
    top_n_features=4
    param_grid = {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 5]
    }

    # Initialize the XGBoost regressor
    model = xgb.XGBRegressor(objective='reg:squarederror')

    # Perform GridSearchCV to find the best hyperparameters
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error')
    grid_search.fit(train_x, train_y)

    # Get the best model from GridSearchCV
    best_model = grid_search.best_estimator_

    # Perform Recursive Feature Elimination (RFE)
    selector = RFE(estimator=best_model, n_features_to_select=top_n_features)
    selector = selector.fit(train_x, train_y)

    # Select the top N important features based on RFE
    selected_train_x = selector.transform(train_x)
    selected_test_x = selector.transform(test_x)

    # Train the model again with the selected features
    best_model.fit(selected_train_x, train_y)

    # Make predictions on the test set using the selected features
    predictions = best_model.predict(selected_test_x)

    save_plot(train_y, test_y, predictions, country, indicator, model_name="XGBoost")

    # Calculate the RMSE on the test set
    return np.sqrt(mean_squared_error(test_y, predictions)), predictions


def train_arima(train_y, test_y, country, indicator):
    best_rmse = float('inf')
    best_order = None
    best_predictions = None

    for p in range(4):
        for d in range(4):
            for q in range(4):
                try:
                    # Fit the ARIMA model
                    model = ARIMA(train_y, order=(p, d, q))
                    model_fit = model.fit()
                    
                    predictions = model_fit.forecast(steps=len(test_y))
                    rmse = np.sqrt(mean_squared_error(test_y, predictions))

                    if rmse < best_rmse:
                        best_rmse = rmse
                        best_order = (p, d, q)
                        best_predictions = predictions
                except Exception as e:
                    continue

    # After finding the best model, save the predictions plot
    if best_predictions is not None:
        save_plot(train_y, test_y, best_predictions, country, indicator, model_name="ARIMA")

    return best_rmse , best_predictions


def train_prophet(train_df, test_y,country,indicator):
    #train_df['ds'] = pd.to_datetime(train_df['ds'])  # Ensure datetime format
    
    param_grid = {
        'changepoint_prior_scale': [0.01, 0.1, 0.5],
        'seasonality_mode': ['additive', 'multiplicative'],
    }

    best_rmse = float('inf')
    best_params = None

    for changepoint_prior in param_grid['changepoint_prior_scale']:
        for seasonality_mode in param_grid['seasonality_mode']:
            model = Prophet(
                yearly_seasonality=True,
                weekly_seasonality=False,
                daily_seasonality=False,
                changepoint_prior_scale=changepoint_prior,
                seasonality_mode=seasonality_mode
            )
            model.fit(train_df)

            future = model.make_future_dataframe(periods=len(test_y['y']), freq='Y')
            forecast = model.predict(future)
            predictions = forecast['yhat'].iloc[-len(test_y['y']):].values
            rmse = np.sqrt(mean_squared_error(test_y['y'], predictions))

            if rmse < best_rmse:
                best_rmse = rmse
                best_params = (changepoint_prior, seasonality_mode)

    # Train best model
    best_model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False,
        changepoint_prior_scale=best_params[0],
        seasonality_mode=best_params[1]
    )
    best_model.fit(train_df)
    future = best_model.make_future_dataframe(periods=len(test_y['y']), freq='Y')
    forecast = best_model.predict(future)
    predictions = forecast['yhat'].iloc[-len(test_y['y']):].values

    save_plot(train_df, test_y, predictions, country, indicator, model_name="Prophet")
    return np.sqrt(mean_squared_error(test_y['y'], predictions)) , predictions

def train_holt_winters(train, test_y, country, indicator):
    param_grid = {
        'trend': [None, 'add', 'mul'],
        'seasonal': [None, 'add', 'mul'],
        'seasonal_periods': [None, 12]
    }

    best_rmse = float('inf')
    best_params = None

    for trend in param_grid['trend']:
        for seasonal in param_grid['seasonal']:
            for seasonal_periods in param_grid['seasonal_periods']:
                try:
                    model = ExponentialSmoothing(train, trend=trend, seasonal=seasonal, seasonal_periods=seasonal_periods)
                    fitted_model = model.fit()
                    predictions = fitted_model.forecast(len(test_y))
                    rmse = np.sqrt(mean_squared_error(test_y, predictions))

                    if rmse < best_rmse:
                        best_rmse = rmse
                        best_params = (trend, seasonal, seasonal_periods)
                except:
                    continue

    best_model = ExponentialSmoothing(train, trend=best_params[0], seasonal=best_params[1], seasonal_periods=best_params[2])
    best_fitted_model = best_model.fit()
    predictions = best_fitted_model.forecast(len(test_y))
    
    # Call the save_plot function to save the plot
    save_plot(train, test_y, predictions, country, indicator, model_name="Holt_Winters")

    return np.sqrt(mean_squared_error(test_y, predictions)) , predictions



def create_sequences(data, seq_length):
    sequences = []
    labels = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])
        labels.append(data[i + seq_length])
    return np.array(sequences), np.array(labels)

def train_lstm(train, test_y, country, indicator, seq_length=5, epochs=50, batch_size=16):
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train.values.reshape(-1, 1))
    test_scaled = scaler.transform(test_y.values.reshape(-1, 1))

    X_train, y_train = create_sequences(train_scaled, seq_length)

    # Define LSTM model
    model = Sequential([
        LSTM(100, activation='relu', return_sequences=True, input_shape=(seq_length, 1)),
        Dropout(0.3),
        LSTM(100, activation='relu'),
        Dropout(0.3),
        Dense(1)
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, callbacks=[early_stop])

    # **Rolling Prediction to Use Entire `test_y`**
    predictions = []
    input_seq = train_scaled[-seq_length:].tolist()  # Start with last known sequence

    for _ in range(len(test_y)):  # Predict for every step in test_y
        X_input = np.array(input_seq[-seq_length:]).reshape(1, seq_length, 1)
        y_pred = model.predict(X_input, verbose=0).flatten()[0]  # Predict next value
        predictions.append(y_pred)  # Store predicted value
        input_seq.append([y_pred])  # Append prediction to sequence for next step

    # Convert predictions back to original scale
    predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

    # Calculate RMSE using full `test_y`
    rmse = np.sqrt(mean_squared_error(test_y, predictions))

    # Save plot
    save_plot(train, test_y, predictions, country, indicator, model_name="LSTM")
    return rmse , predictions


with open("countries.json", "r") as f:
    country_names = json.load(f)

with open("indicators.json", "r") as f:
    indicators = json.load(f)

indicators = {
        "GDP per Capita (USD)": "NY.GDP.PCAP.CD",
    "GDP (USD)": "NY.GDP.MKTP.CD",
}

country_names = {
        "Czech Republic": "CZ",
    "Hungary": "HU",
}
data_folder = "data"
model_errors_rmse = {}
log_data = []
country_indicators_plots = {}
for country, country_code in country_names.items():
    for indicator, indicator_code in indicators.items():
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)
        
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            if 'Year' in df.columns and 'Value' in df.columns:
                df = df.set_index('Year').sort_index()
                df.index = pd.to_datetime(df.index, format='%Y')
                df = df.dropna()
                df = df.drop('Indicator', axis = 1)
                df_original = df.copy()
                
                for lag in range(1, 6):  
                    df[f'lag_{lag}'] = df['Value'].shift(lag)
                
                df['expanding_mean'] = df['Value'].expanding().mean()
                df['expanding_std'] = df['Value'].expanding().std()
                df['expanding_max'] = df['Value'].expanding().max()
                df['expanding_min'] = df['Value'].expanding().min()
                
                window_size = 3  
                df['rolling_mean'] = df['Value'].rolling(window=window_size, min_periods=1).mean()
                df['rolling_std'] = df['Value'].rolling(window=window_size, min_periods=1).std()
                df['rolling_max'] = df['Value'].rolling(window=window_size, min_periods=1).max()
                df['rolling_min'] = df['Value'].rolling(window=window_size, min_periods=1).min()
                
                
                #df = df.dropna()
                train_size = int(len(df) * 0.8)
                
                train_xgb, test_xgb = df.iloc[:train_size], df.iloc[train_size:]
                feature_columns = ['rolling_mean', 'rolling_std', 'rolling_max', 'rolling_min', 
                                   'expanding_mean', 'expanding_std', 'expanding_max', 'expanding_min',
                                   'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5']
                train_x, train_y = train_xgb[feature_columns], train_xgb['Value']
                test_x, test_y = test_xgb[feature_columns], test_xgb['Value']
                
                model_errors_rmse[(country, indicator)] = {}
                model_errors_rmse[(country, indicator)]['XGBoost'], xgb_error = train_xgboost(train_x, train_y, test_x, test_y , country,indicator)
                model_errors_rmse[(country, indicator)]['ARIMA'],arima_error = train_arima(df_original.iloc[:train_size]['Value'], 
                                                                               df_original.iloc[train_size:]['Value'], 
                                                                               country,indicator)
                model_errors_rmse[(country, indicator)]['Holt-Winters'] , es_error = train_holt_winters(df_original.iloc[:train_size]['Value'], 
                                                                                             df_original.iloc[train_size:]['Value'],
                                                                                             country,indicator)
                model_errors_rmse[(country, indicator)]['LSTM'] , lstm_error = train_lstm(df_original.iloc[:train_size]['Value'], 
                                                                             df_original.iloc[train_size:]['Value'],
                                                                             country,indicator)


                prophet_train_df = df_original.iloc[:train_size].reset_index().rename(columns={'Year': 'ds', 'Value': 'y'})
                prophet_test_df = df_original.iloc[train_size:].reset_index().rename(columns={'Year': 'ds', 'Value': 'y'})
                prophet_train_df['ds'] = pd.to_datetime(prophet_train_df['ds'])
                prophet_test_df['ds'] = pd.to_datetime(prophet_test_df['ds'])
                model_errors_rmse[(country, indicator)]['Prophet'] , prop_error = train_prophet(prophet_train_df, prophet_test_df,
                                                                                   country,indicator)
                
                sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
                for rank, (model_name, rmse) in enumerate(sorted_models, start=1):
                    log_data.append([country, indicator, model_name, rmse, rank])

                
                model_ranks = {(entry[0], entry[1], entry[2]): entry[4] for entry in log_data}
                plt.figure(figsize=(10, 6))
                plt.plot(df_original.iloc[:train_size].index, df_original.iloc[:train_size]['Value'], 
                         label='Train Data', color='green', linestyle='--')
                plt.plot(df_original.iloc[train_size:].index, df_original.iloc[train_size:]['Value'],
                          label='Test Data', color='red', linestyle='--')
                
                plt.plot(df_original.iloc[train_size:].index, arima_error, 
                         label=f'ARIMA ({model_ranks.get((country, indicator, "ARIMA"), "N/A")})', 
                         color='blue', linestyle='-', marker='o')
                
                plt.plot(df_original.iloc[train_size:].index, es_error, 
                          label=f'Holt-Winters ({model_ranks.get((country, indicator, "Holt-Winters"), "N/A")})',
                         color='yellow',linestyle='-', marker='o')
                
                plt.plot(df_original.iloc[train_size:].index, lstm_error, 
                         label=f'LSTM ({model_ranks.get((country, indicator, "LSTM"), "N/A")})', 
                         color='black', linestyle='-', marker='o')
                
                plt.plot(df_original.iloc[train_size:].index, xgb_error, 
                         label=f'XGBoost ({model_ranks.get((country, indicator, "XGBoost"), "N/A")})', 
                         color='pink', linestyle='-', marker='o')
                
                plt.plot(df_original.iloc[train_size:].index, prop_error, 
                         label=f'Prophet ({model_ranks.get((country, indicator, "Prophet"), "N/A")})', 
                         color='brown', linestyle='-', marker='o')
                
                
                plt.title(f'Predicted vs Actual for {country} - {indicator}')
                plt.xlabel('Year')
                plt.ylabel('Value')
                plt.legend()

                # Create subfolder for the indicator if it doesn't exist
                indicator_folder = os.path.join('images', 'model_plot', 'Indicators', indicator)
                os.makedirs(indicator_folder, exist_ok=True)
                
                # Save the plot in the Indicators folder with dynamic model name
                plot_filename_indicator = os.path.join(indicator_folder, f'AllModels_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
                plt.savefig(plot_filename_indicator)

                # Create subfolder for the country if it doesn't exist
                country_folder = os.path.join('images', 'model_plot', 'Countries', country)
                os.makedirs(country_folder, exist_ok=True)
                
                # Save the same plot in the Countries folder with dynamic model name
                plot_filename_country = os.path.join(country_folder, f'AllModels_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
                plt.savefig(plot_filename_country)

                
                #plt.savefig(f'Predicted vs Actual for {country} - {indicator}')

                if country not in country_indicators_plots:
                    country_indicators_plots[country] = []
                country_indicators_plots[country].append(plt.gcf())
                plt.close()


# After collecting all plots for each country, create a combined plot
for country, plots in country_indicators_plots.items():
    n_plots = len(plots)
    n_cols = 2  # Set number of columns in the grid
    n_rows = (n_plots + 1) // n_cols  # Calculate required number of rows
    
    plt.figure(figsize=(15, 5 * n_rows))  # Adjust figure size for grid layout
    for i, plot in enumerate(plots, start=1):
        plt.subplot(n_rows, n_cols, i)
        
        # Copy each plot's data by extracting from the original plot and plotting again
        for ax in plot.get_axes():  # Iterate through all axes in the current plot
            for line in ax.get_lines():  # Get lines (or other elements) from the original plot
                plt.plot(line.get_xdata(), line.get_ydata(), label=line.get_label(), color=line.get_color(), linestyle=line.get_linestyle(), marker=line.get_marker())
        
        indicator_name = list(indicators.keys())[i - 1]
        plt.title(f'{country} - {indicator_name}')
        plt.xlabel('Year')
        plt.ylabel('Value')
        plt.legend()

    # Save the combined plot
    country_folder = os.path.join('images', 'model_plot', 'Countries', country)
    os.makedirs(country_folder, exist_ok=True)
    plot_filename_country = os.path.join(country_folder, f'AllIndicators_{country.replace(" ", "_")}.png')
    plt.tight_layout()
    plt.savefig(plot_filename_country)
    plt.close()

# Create a dictionary to store all indicator plots for later use
indicator_plots = {indicator: [] for indicator in indicators.keys()}

# After collecting all plots for each country, create a combined plot for each indicator
for country, plots in country_indicators_plots.items():
    for i, plot in enumerate(plots, start=1):
        indicator_name = list(indicators.keys())[i - 1]
        
        # Append each plot to the corresponding indicator's list
        indicator_plots[indicator_name].append(plot)

# Now create a combined plot for all countries for each indicator
for indicator, plots in indicator_plots.items():
    n_plots = len(plots)
    n_cols = 2  # Set number of columns in the grid
    n_rows = (n_plots + 1) // n_cols  # Calculate required number of rows
    
    plt.figure(figsize=(15, 5 * n_rows))  # Adjust figure size for grid layout
    for i, plot in enumerate(plots, start=1):
        plt.subplot(n_rows, n_cols, i)
        
        # Copy each plot's data by extracting from the original plot and plotting again
        for ax in plot.get_axes():  # Iterate through all axes in the current plot
            for line in ax.get_lines():  # Get lines (or other elements) from the original plot
                plt.plot(line.get_xdata(), line.get_ydata(), label=line.get_label(), color=line.get_color(), linestyle=line.get_linestyle(), marker=line.get_marker())
        
        plt.title(f'{indicator} - {country}')
        plt.xlabel('Year')
        plt.ylabel('Value')
        plt.legend()

    # Save the combined plot for the indicator across all countries
    indicator_folder = os.path.join('images', 'model_plot', 'Indicators', indicator)
    os.makedirs(indicator_folder, exist_ok=True)
    plot_filename_indicator = os.path.join(indicator_folder, f'AllCountries_{indicator.replace(" ", "_")}.png')
    plt.tight_layout()
    plt.savefig(plot_filename_indicator)
    plt.close()

log_df = pd.DataFrame(log_data, columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'])
log_df.to_csv("model_error_log.csv", index=False)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, fr