# In [10]:
import os
import json
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

def save_plot(train, test_y, predictions, country, indicator, model_name):
    """Plot train data, actual test values and predictions, and save the figure twice.

    The same PNG is written to ``../images/model_plot/Indicators/<indicator>/``
    and to ``../images/model_plot/Countries/<country>/``; both folders are
    created if they do not exist. The file name is
    ``<model_name>_<country>_<indicator>.png`` with spaces in ``country`` and
    ``indicator`` replaced by underscores (the folder names keep their spaces).

    Args:
        train: Training data. When ``model_name == "Prophet"`` it is read via
            ``train['ds']`` and ``train['y']``; otherwise it is plotted
            against ``train.index``.
        test_y: Actual test values, accessed the same way as ``train``.
        predictions: Predicted values, plotted against the test x-axis.
        country: Country name used in the title, folder and file name.
        indicator: Indicator name used in the title, folder and file name.
        model_name: Model label shown in the legend and title; also selects
            the colour of the prediction line.
    """
    model_colors = {
        "ARIMA": "blue",
        "Holt_Winters": "yellow",
        "LSTM": "black",
        "XGBoost": "pink",
        "Prophet": "brown",
    }
    # An unknown model name gets a neutral colour instead of raising KeyError
    # halfway through plotting.
    prediction_color = model_colors.get(model_name, "gray")

    # Resolve the x/y series up front so a malformed input fails before a
    # figure has been allocated.
    if model_name == "Prophet":
        train_x, train_values = train['ds'], train['y']
        test_x, test_values = test_y['ds'], test_y['y']
    else:
        train_x, train_values = train.index, train
        test_x, test_values = test_y.index, test_y

    plot_filename = f'{model_name}_{country.replace(" ", "_")}_{indicator.replace(" ", "_")}.png'
    base_folder = os.path.join('../images', 'model_plot')
    target_folders = (
        os.path.join(base_folder, 'Indicators', indicator),
        os.path.join(base_folder, 'Countries', country),
    )

    fig = plt.figure(figsize=(10, 6))
    try:
        # Plotting predicted vs actual
        plt.plot(train_x, train_values, label='Train Data', color='green', linestyle='--')
        plt.plot(test_x, test_values, label='Actual', color='red', linestyle='--')
        plt.plot(test_x, predictions, label=f'Predicted({model_name})', color=prediction_color,
                 linestyle='-', marker='o')

        plt.title(f'Predicted({model_name}) vs Actual for {country} - {indicator}')
        plt.xlabel('Year')
        plt.ylabel('Value')
        plt.legend()

        # Save the same plot under the indicator folder, then the country folder.
        for folder in target_folders:
            os.makedirs(folder, exist_ok=True)
            plt.savefig(os.path.join(folder, plot_filename))
    finally:
        # Always release the figure, even if plotting or saving failed;
        # this function is called once per country/indicator pair.
        plt.close(fig)

def train_xgboost(train_x, train_y, test_x, test_y, country, indicator,
                  *, top_n_features=4, param_grid=None, cv=3):
    """Tune, feature-select and evaluate an XGBoost regressor, saving a plot.

    Steps:
      1. Grid-search the hyper-parameters with ``GridSearchCV`` (scored by
         negative mean squared error).
      2. Run recursive feature elimination (RFE) with the best estimator to
         keep ``top_n_features`` columns.
      3. Refit the best estimator on the selected columns and predict the
         test set.
      4. Save the prediction plot through ``save_plot`` with
         ``model_name="XGBoost"``.

    Args:
        train_x: Training feature matrix.
        train_y: Training target values.
        test_x: Test feature matrix with the same columns as ``train_x``.
        test_y: Test target values.
        country: Country name, forwarded to ``save_plot``.
        indicator: Indicator name, forwarded to ``save_plot``.
        top_n_features: Number of features RFE should keep (default 4).
        param_grid: Grid passed to ``GridSearchCV``. ``None`` uses the
            built-in grid over ``n_estimators``, ``learning_rate`` and
            ``max_depth``.
        cv: Cross-validation strategy passed to ``GridSearchCV`` (default 3).
            NOTE(review): an integer selects scikit-learn's default K-fold
            splitter, which is not time-aware; for ordered data consider
            passing a ``TimeSeriesSplit`` instance.

    Returns:
        tuple: ``(rmse, predictions)`` where ``rmse`` is the root mean squared
        error on the test set and ``predictions`` are the test predictions.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.05],
            'max_depth': [3, 5],
        }

    # Initialize the XGBoost regressor
    model = xgb.XGBRegressor(objective='reg:squarederror')

    # Perform GridSearchCV to find the best hyperparameters
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='neg_mean_squared_error')
    grid_search.fit(train_x, train_y)
    best_model = grid_search.best_estimator_

    # Recursive Feature Elimination using the tuned estimator
    selector = RFE(estimator=best_model, n_features_to_select=top_n_features)
    selector.fit(train_x, train_y)

    # Keep only the columns RFE selected, for both splits
    selected_train_x = selector.transform(train_x)
    selected_test_x = selector.transform(test_x)

    # Train the tuned model again on the reduced feature set and predict
    best_model.fit(selected_train_x, train_y)
    predictions = best_model.predict(selected_test_x)

    save_plot(train_y, test_y, predictions, country, indicator, model_name="XGBoost")

    # RMSE on the test set
    rmse = np.sqrt(mean_squared_error(test_y, predictions))
    return rmse, predictions



# Load the country and indicator mappings. Only their keys (the display
# names) are used below to build the parquet file names.
with open("../countries.json", "r", encoding="utf-8") as f:
    country_names = json.load(f)

with open("../indicators.json", "r", encoding="utf-8") as f:
    indicators = json.load(f)

data_folder = "../data/base"
model_errors_rmse = {}
log_data = []
country_indicators_plots = {}

feature_columns = ['rolling_mean', 'rolling_std', 'rolling_max', 'rolling_min',
                   'expanding_mean', 'expanding_std', 'expanding_max', 'expanding_min',
                   'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5']
window_size = 3
# train_xgboost cross-validates with 3 folds by default, so it needs at
# least that many training rows.
min_train_rows = 3

for country in country_names:
    for indicator in indicators:
        filename = f"{country.replace(' ', '_')}_{indicator.replace(' ', '_')}.parquet"
        filepath = os.path.join(data_folder, filename)

        # Pairs without a data file are skipped.
        if not os.path.exists(filepath):
            continue

        df = pd.read_parquet(filepath)
        if 'Year' not in df.columns or 'Value' not in df.columns:
            continue

        df = df.set_index('Year').sort_index()
        df.index = pd.to_datetime(df.index, format='%Y')
        df = df.dropna()
        # Tolerate files that do not carry the 'Indicator' column.
        df = df.drop(columns='Indicator', errors='ignore')

        # Lag features: the value 1..5 rows earlier.
        for lag in range(1, 6):
            df[f'lag_{lag}'] = df['Value'].shift(lag)

        # Expanding and rolling statistics are computed on the series shifted
        # by one row so that each row's features only describe earlier
        # observations. Computing them on 'Value' directly would leak the
        # target being predicted into its own features.
        past_values = df['Value'].shift(1)

        expanding = past_values.expanding()
        df['expanding_mean'] = expanding.mean()
        df['expanding_std'] = expanding.std()
        df['expanding_max'] = expanding.max()
        df['expanding_min'] = expanding.min()

        rolling = past_values.rolling(window=window_size, min_periods=1)
        df['rolling_mean'] = rolling.mean()
        df['rolling_std'] = rolling.std()
        df['rolling_max'] = rolling.max()
        df['rolling_min'] = rolling.min()

        # Chronological 80/20 split; rows with NaN features are kept.
        train_size = int(len(df) * 0.8)
        if train_size < min_train_rows:
            # Too little history to cross-validate; skip instead of aborting
            # the whole run and losing the results collected so far.
            print(f"Skipping {country} - {indicator}: only {len(df)} usable rows")
            continue

        train_xgb, test_xgb = df.iloc[:train_size], df.iloc[train_size:]
        train_x, train_y = train_xgb[feature_columns], train_xgb['Value']
        test_x, test_y = test_xgb[feature_columns], test_xgb['Value']

        xgb_rmse, _ = train_xgboost(train_x, train_y, test_x, test_y, country, indicator)
        model_errors_rmse[(country, indicator)] = {'XGBoost': xgb_rmse}

        # Rank the models for this pair by RMSE (lowest first) and log them.
        sorted_models = sorted(model_errors_rmse[(country, indicator)].items(), key=lambda x: x[1])
        for rank, (model_name, model_rmse) in enumerate(sorted_models, start=1):
            log_data.append([country, indicator, model_name, model_rmse, rank])


from datetime import datetime

# Write the collected RMSE rows to a timestamped CSV in ../data/<model>_train.
model = "XGBoost"

log_dir = f"../data/{model}_train"
os.makedirs(log_dir, exist_ok=True)

# Minute-resolution timestamp keeps successive runs in separate files.
timestamp = datetime.now().strftime("%Y-%m-%d--%H-%M")
log_filename = os.path.join(
    log_dir,
    f"{model}_error_log_{timestamp}.csv",
)

log_df = pd.DataFrame(
    data=log_data,
    columns=['Country', 'Indicator', 'Model', 'RMSE', 'Rank'],
)
log_df.to_csv(log_filename, index=False)
