In [7]:
import os
import json
import pandas as pd
import numpy as np
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense , Dropout
from sklearn.preprocessing import MinMaxScaler
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

def save_plot(train, test_y, predictions, country, indicator, model_name):
    """Function to save the plot in both Indicators and Countries folders."""

    model_colors = {
    "ARIMA": "blue",
    "Holt_Winters": "yellow",
    "LSTM": "black",
    "XGBoost": "pink",
    "Prophet": "brown"
}

    # Plotting predicted vs actual
    plt.figure(figsize=(10, 6))
    if model_name == "Prophet":
        plt.plot(train['ds'], train['y'], label='Train Data', color='green', linestyle='--')
        plt.plot(test_y['ds'], test_y['y'], label='Actual', color='red', linestyle='--')
        plt.plot(test_y['ds'], predictions, label=f'Predicted({model_name})', color=f'{model_colors["Prophet"]}', 
                 linestyle='-', marker='o')
    else:
        plt.plot(train.index, train, label='Train Data', color='green', linestyle='--')
        plt.plot(test_y.index, test_y, label='Actual', color='red', linestyle='--')
        plt.plot(test_y.index, predictions, label=f'Predicted({model_name})', color=f'{model_colors[model_name]}', 
                 linestyle='-', marker='o')
    

    
    plt.title(f'Predicted({model_name}) vs Actual for {country} - {indicator}')
    plt.xlabel('Year')
    plt.ylabel('Value')
    plt.legend()

    # Create subfolder for the indicator if it doesn't exist
    indicator_folder = os.path.join('../images', 'model_plot', 'Indicators', indicator)
    os.makedirs(indicator_folder, exist_ok=True)
    
    # Save the plot in the Indicators folder with dynamic model name
    plot_filename_indicator = os.path.join(indicator_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_indicator)

    # Create subfolder for the country if it doesn't exist
    country_folder = os.path.join('../images', 'model_plot', 'Countries', country)
    os.makedirs(country_folder, exist_ok=True)
    
    # Save the same plot in the Countries folder with dynamic model name
    plot_filename_country = os.path.join(country_folder, f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png')
    plt.savefig(plot_filename_country)

    plt.close()


def create_sequences(data, seq_length):
    sequences = []
    labels = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])
        labels.append(data[i + seq_length])
    return np.array(sequences), np.array(labels)

def train_lstm(train, test_y, country, indicator, seq_length=5, epochs=50, batch_size=16):
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train.values.reshape(-1, 1))
    test_scaled = scaler.transform(test_y.values.reshape(-1, 1))

    X_train, y_train = create_sequences(train_scaled, seq_length)

    # Define LSTM model
    model = Sequential([
        LSTM(100, activation='relu', return_sequences=True, input_shape=(seq_length, 1)),
        Dropout(0.3),
        LSTM(100, activation='relu'),
        Dropout(0.3),
        Dense(1)
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, callbacks=[early_stop])

    # **Rolling Prediction to Use Entire `test_y`**
    predictions = []
    input_seq = train_scaled[-seq_length:].tolist()  # Start with last known sequence

    for _ in range(len(test_y)):  # Predict for every step in test_y
        X_input = np.array(input_seq[-seq_length:]).reshape(1, seq_length, 1)
        y_pred = model.predict(X_input, verbose=0).flatten()[0]  # Predict next value
        predictions.append(y_pred)  # Store predicted value
        input_seq.append([y_pred])  # Append prediction to sequence for next step

    # Convert predictions back to original scale
    predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1)).flatten()

    # Calculate RMSE using full `test_y`
    rmse = np.sqrt(mean_squared_error(test_y, predictions))

    # Save plot
    save_plot(train, test_y, predictions, country, indicator, model_name="LSTM")
    return rmse , predictions


with open("../countries.json", "r") as f:
    country_names = json.load(f)

with open("../indicators.json", "r") as f:
    indicators = json.load(f)

data_folder = "../data/base"
model_errors_rmse = {}
log_data = []
country_indicators_plots = {}
for country, country_code in country_names.items():
    for indicator, indicator_code in indicators.items():
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)
        
        if os.path.exists(filepath):
            df = pd.read_parquet(filepath)
            if 'Year' in df.columns and 'Value' in df.columns:
                df = df.set_index('Year').sort_index()
                df.index = pd.to_datetime(df.index, format='%Y')
                df = df.dropna()
                df = df.drop('Indicator', axis = 1)
                df_original = df.copy()
                
                
                
                #df = df.dropna()
                train_size = int(len(df) * 0.8)
                              
                model_errors_rmse[(country, indicator)] = {}

                model_errors_rmse[(country, indicator)]['LSTM'] , lstm_pred = train_lstm(df_original.iloc[:train_size]['Value'], 
                                                                             df_original.iloc[train_size:]['Value'],
                                                                             country,indicator)



                
                sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
                log_current_data = []
                for rank, (model_name, rmse) in enumerate(sorted_models, start=1):
                    log_data.append([country, indicator, model_name, rmse, rank])
                    log_current_data.append([country, indicator, model_name, rmse, rank])

from datetime import datetime
model ="LSMT"
log_dir = f"../data/{model}_train"
os.makedirs(log_dir, exist_ok=True)

timestamp = datetime.now().strftime("%Y-%m-%d--%H-%M")
log_filename = os.path.join(log_dir, f"{model}_error_log_{timestamp}.csv")

log_df = pd.DataFrame(log_data, columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'])
log_df.to_csv(log_filename, index=False)

  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(**kwargs)
  current = self.get_monitor_value(logs)
  super().__init__(*