In [21]:
# =============================================================================
# 1. IMPORTACIÓN DE LIBRERÍAS Y CONFIGURACIÓN INICIAL
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from tqdm import tqdm

# Preprocesamiento y modelado de Scikit-Learn
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split


# Modelado de Deep Learning con TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


# General notebook configuration.
# NOTE(review): 'ignore' silences every warning category for the rest of the
# session, so deprecation and numerical warnings raised later are hidden too.
warnings.filterwarnings('ignore')
# Use seaborn's "whitegrid" style for the plots drawn after this cell.
sns.set_style("whitegrid")
# Default matplotlib figure size as (width, height) in inches.
plt.rcParams['figure.figsize'] = (15, 7)

In [None]:
# =============================================================================
# 2. DATA LOADING AND PREPARATION
# =============================================================================
url = "https://raw.githubusercontent.com/rortizgeo/Maestria_CD_Proyecto-Aplicado/main/Data_final.csv"
Data_final = pd.read_csv(url)

# Analysis window (critical adjustment): published data is available from
# START_YEAR onwards, so earlier years are discarded.
START_YEAR = 2008
END_YEAR = 2022

# Keep only the rows for Colombia, then only the years inside the inclusive
# [START_YEAR, END_YEAR] window. The final copy detaches the result from Data_final.
Data_final_CO = (
    Data_final
    .loc[Data_final['country'].eq('Colombia')]
    .loc[lambda frame: frame['year'].between(START_YEAR, END_YEAR)]
    .copy()
)
print(f"‚úÖ Datos filtrados para Colombia entre {START_YEAR} y {END_YEAR}.")

# The target column is overwritten with its log1p transform; from here on the
# column named TARGET holds log-scale values, not raw counts.
TARGET = 'occurrenceCount_publisher'
Data_final_CO[TARGET] = np.log1p(Data_final_CO[TARGET])
print("Transformaci√≥n log1p aplicada al target.")

# Columns removed from the working frame.
columns_to_drop = ['Overall score', 'areas_protegidas', 'countryCode']
Data_final_CO = Data_final_CO.drop(columns_to_drop, axis=1)

# Full catalogue of candidate columns. It still lists 'Overall score' (dropped
# just above) as well as the target and identifier columns.
features_total = [
    'country', 'year', 'ds',
    'superficie_total_km2', 'gasto_RD_pib', 'efectividad_gobierno',
    'uso_internet', 'pib_per_capita', 'Overall score',
    'region', 'incomeLevel', 'occurrenceCount_publisher',
    'gbif_member', 'ogp_membership',
    'PC1', 'PC2', 'PC3',
]

‚úÖ Datos filtrados para Colombia entre 2008 y 2022.
Transformaci√≥n log1p aplicada al target.


In [25]:
# =============================================================================
# 3. PREPARACIÓN DE DATOS BASE PARA MODELOS (SIN ESCALADO GLOBAL)
# =============================================================================

# La función se simplifica para procesar directamente un solo país (df)
def create_lstm_sequences_single_country(df, features, target, look_back=3):
    """
    Build sliding-window LSTM sequences from a single-country DataFrame.

    Rows are sorted by 'year'. Each window of `look_back` consecutive rows of
    `features` is paired with the `target` value of the row right after it.

    Args:
        df: DataFrame already filtered to one country and period. Must contain
            the columns 'year', 'country', `target` and every name in `features`.
        features: list of column names used as model inputs (cast to float32).
        target: name of the column to predict (cast to float32).
        look_back: number of consecutive rows per input window (>= 1).

    Returns:
        Tuple (X_seq, y_seq, years, countries):
            X_seq     -- shape (n, look_back, len(features)), float32.
            y_seq     -- shape (n,), target of the row following each window.
            years     -- shape (n,), 'year' of the predicted row.
            countries -- shape (n,), the country name repeated n times.
        n = max(len(df) - look_back, 0). When n is 0 (empty frame or not enough
        rows) the arrays are empty but keep the shapes above, so downstream
        code can still read X_seq.shape[2].

    Raises:
        ValueError: if look_back is smaller than 1.
    """
    if look_back < 1:
        raise ValueError(f"look_back must be >= 1, got {look_back}")

    # Ensure chronological order (sort_values already returns a new frame).
    df_country = df.sort_values('year')

    # Numeric arrays for inputs and target; 'country' is never part of X.
    X_country = df_country[features].to_numpy().astype(np.float32)
    y_country = df_country[target].to_numpy().astype(np.float32)
    years_country = df_country['year'].to_numpy()

    # At least look_back + 1 rows are needed to produce one sequence.
    n_sequences = len(X_country) - look_back
    if n_sequences <= 0:
        # Return here, before reading the country name: .iloc[0] on an empty
        # frame would raise IndexError.
        return (
            np.empty((0, look_back, len(features)), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
            np.empty((0,), dtype=years_country.dtype),
            np.empty((0,), dtype=object),
        )

    country_name = df_country['country'].iloc[0]

    # Window i covers rows [i, i + look_back); its label is row i + look_back.
    X_seq = np.stack([X_country[i:i + look_back] for i in range(n_sequences)])
    y_seq = y_country[look_back:].copy()
    years = years_country[look_back:].copy()
    countries = np.array([country_name] * n_sequences)

    return X_seq, y_seq, years, countries

# Inputs for the sequence builder.

# Columns that must not be fed to the model: identifiers and the target itself.
# (TARGET and 'occurrenceCount_publisher' are the same name at this point.)
columnas_a_excluir_lstm = ['country', TARGET, 'occurrenceCount_publisher', 'ds']

# Final LSTM feature list: every entry of features_total that is neither
# excluded nor missing from the filtered frame, in features_total order.
features_lstm = [
    name
    for name in features_total
    if name not in columnas_a_excluir_lstm and name in Data_final_CO.columns
]

look_back = 3

# Build the sequences from the Colombia-only frame.
X_seq, y_seq, years_seq, countries_seq = create_lstm_sequences_single_country(
    Data_final_CO, features_lstm, TARGET, look_back
)

print(f"‚úÖ Secuencias LSTM generadas. Total de secuencias: {len(X_seq)}.")
print(f"‚úÖ Las secuencias corresponden √∫nicamente a: {countries_seq[0] if len(countries_seq) > 0 else 'N/A'}")

‚úÖ Secuencias LSTM generadas. Total de secuencias: 12.
‚úÖ Las secuencias corresponden √∫nicamente a: Colombia


In [27]:
# =============================================================================
# MÓDULO DE ENTRENAMIENTO FINAL (Solo para Colombia)
# =============================================================================

# LSTM hyperparameters (fixed values taken from the best cross-validation fold).
# This dict is the single parameter set used by train_final_lstm_model, where it
# is the default value of `final_params`.
# NOTE(review): LSTM_PARAMS, defined in a later cell, uses learning_rate 0.001
# while this dict uses 0.005 -- confirm which value is the intended one.
LSTM_FINAL_PARAMS = {
    "units": 50, 
    "dropout": 0.3, 
    "epochs": 30, 
    "batch_size": 32, 
    "learning_rate": 0.005, 
    "lstm_activation": "tanh"
}

def train_final_lstm_model(X_train, y_train, look_back, final_params=LSTM_FINAL_PARAMS):
    """
    Train the final LSTM model with a fixed, preselected set of hyperparameters.

    Args:
        X_train (np.array): 3-D array of input sequences; the last axis is the
            number of features.
        y_train (np.array): target value for each sequence.
        look_back (int): sequence length declared as the model's input length.
        final_params (dict): hyperparameters; the keys "units", "dropout",
            "epochs", "batch_size", "learning_rate" and "lstm_activation" are read.

    Returns:
        tf.keras.models.Sequential: the fitted model.
    """
    hp = final_params

    # Hold out the last 5% of the samples (no shuffling, so the split stays in
    # input order) purely as the validation set that drives early stopping.
    X_train_final, X_val, y_train_final, y_val = train_test_split(
        X_train, y_train, test_size=0.05, random_state=42, shuffle=False
    )

    # Stop when validation loss has not improved for 5 epochs and roll the
    # weights back to the best epoch seen.
    es = EarlyStopping(
        monitor='val_loss',
        mode='min',
        patience=5,
        verbose=1,
        restore_best_weights=True,
    )

    # Architecture: one LSTM layer -> dropout -> single-unit regression head.
    n_features = X_train.shape[2]
    layers = [
        LSTM(
            hp["units"],
            activation=hp["lstm_activation"],
            input_shape=(look_back, n_features),
        ),
        Dropout(hp["dropout"]),
        Dense(1),
    ]
    model = Sequential(layers)

    # Adam optimiser with the configured learning rate, MAE loss.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp['learning_rate']),
        loss="mae",
    )

    print(f"\nüöÄ Iniciando entrenamiento final con {len(X_train_final)} muestras...")

    model.fit(
        X_train_final,
        y_train_final,
        epochs=hp["epochs"],
        batch_size=hp["batch_size"],
        verbose=0,
        validation_data=(X_val, y_val),
        callbacks=[es],
    )

    # The hyperparameters are fixed, so only the model is returned.
    return model

In [None]:
# =============================================================================
# 6. MÉTRICAS
# =============================================================================

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

def smape(y_true, y_pred, eps=1e-8):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is 2*|pred - true| / (|true| + |pred| + eps); `eps` keeps the
    denominator non-zero when both values are 0.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    scale = (np.abs(actual) + np.abs(forecast)) + eps
    per_point = 2.0 * np.abs(forecast - actual) / scale
    return np.mean(per_point) * 100

def mape(y_true, y_pred, eps=1e-8):
    """Return the mean absolute percentage error, in percent.

    Each term is |true - pred| / (|true| + eps).

    The denominator uses |true| + eps instead of true + eps: adding eps to a
    signed value shrinks the magnitude of negative actuals and is exactly zero
    at true == -eps, which produced inf. For non-negative actuals the result is
    unchanged.

    Args:
        y_true: array-like of actual values.
        y_pred: array-like of predicted values, same shape as y_true.
        eps: small positive constant that keeps the denominator non-zero.

    Returns:
        The mean of the per-point percentage errors, multiplied by 100.
    """
    # Cast to float so unsigned-integer inputs cannot wrap on subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true) + eps
    return np.mean(np.abs(y_true - y_pred) / denom) * 100

def compute_metrics(y_true, y_pred):
    """Compute regression metrics on the original (un-logged) scale.

    Both inputs are passed through expm1 (the inverse of log1p) before any
    metric is computed.

    Returns:
        dict with the keys "MAE", "RMSE", "R2", "MAPE" and "SMAPE".
    """
    # Undo the log1p transform.
    actual = np.expm1(y_true)
    predicted = np.expm1(y_pred)

    metrics = {}
    metrics["MAE"] = mean_absolute_error(actual, predicted)
    metrics["RMSE"] = np.sqrt(mean_squared_error(actual, predicted))
    metrics["R2"] = r2_score(actual, predicted)
    metrics["MAPE"] = mape(actual, predicted)
    metrics["SMAPE"] = smape(actual, predicted)
    return metrics


In [3]:
# =============================================================================
# MÓDULO 1: CONFIGURACIÓN, LIBRERÍAS Y CARGA DE DATOS
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Librerías de Machine Learning/Deep Learning
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

warnings.filterwarnings('ignore')
sns.set_style("whitegrid")

# --- GLOBAL AND MODEL CONSTANTS ---
# Name of the raw count column in the data.
TARGET_RAW = 'occurrenceCount_publisher'
# Name of the derived log1p column used as the modelling target.
# NOTE(review): an earlier cell binds TARGET to 'occurrenceCount_publisher';
# this assignment rebinds it, so the value depends on which cell ran last.
TARGET = f'{TARGET_RAW}_log1p' 
# Despite the name, this holds a country *name*; it is compared against the
# 'country' column when filtering.
COUNTRY_CODE = 'Colombia'
SEQUENCE_LENGTH = 3      # Look-back window length (rows per input sequence)
START_YEAR = 2008
END_YEAR = 2022
YEARS_TO_FORECAST = 8    # Project from 2023 to 2030 (8 years)

# Numeric columns used as LSTM inputs; the feature scaler is fitted on these.
FEATURES_NUMERIC_FOR_SCALING = [
    "year", "superficie_total_km2", "gasto_RD_pib", "efectividad_gobierno", 
    "uso_internet", "pib_per_capita", "Overall score", "region", "incomeLevel", 
    "gbif_member", "ogp_membership", "PC1", "PC2", "PC3"
]

# LSTM hyperparameters (fixed, taken from the best fold).
# NOTE(review): learning_rate here is 0.001, whereas LSTM_FINAL_PARAMS in an
# earlier cell uses 0.005 -- confirm which value is intended.
LSTM_PARAMS = {'units': 50, 'dropout': 0.3, 'epochs': 30, 'batch_size': 32, 'learning_rate': 0.001, 'lstm_activation': 'tanh'}

# --- DATA LOADING AND FILTERING ---
# 1. Load the global dataset. Only the file read is guarded, because it is the
#    only statement that can raise FileNotFoundError.
try:
    df_global = pd.read_csv("Data_final.csv")
except FileNotFoundError:
    print("❌ ERROR: No se encontró Data_final.csv.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, while raising stops this cell with a traceback and keeps the
    # session (and a "Run All") in a well-defined state.
    raise

# 2. Add the log1p-transformed target as a new column and make 'year' an int.
df_global[TARGET] = np.log1p(df_global[TARGET_RAW])
df_global['year'] = df_global['year'].astype(int)

# 3. Keep the rows of the selected country, then the START_YEAR..END_YEAR
#    window (both ends inclusive).
df_colombia_all = df_global[df_global['country'] == COUNTRY_CODE].copy()

df_colombia_filtered = df_colombia_all[
    (df_colombia_all['year'] >= START_YEAR) &
    (df_colombia_all['year'] <= END_YEAR)
].copy()

print(f"✅ Datos cargados y filtrados. Filas históricas para entrenamiento: {len(df_colombia_filtered)}")

‚úÖ Datos cargados y filtrados. Filas hist√≥ricas para entrenamiento: 15


In [4]:
# =============================================================================
# MÓDULO 2: FUNCIONES CORE (Secuenciación y Preparación)
# =============================================================================

def create_sequences(X, y, time_steps):
    """Slice X and y into sliding windows for an LSTM.

    Window i is X[i : i + time_steps] and its label is y[i + time_steps].
    Returns (windows, labels) as numpy arrays; both are empty when len(X) is
    not greater than time_steps.
    """
    window_starts = range(len(X) - time_steps)
    windows = [X[start:start + time_steps] for start in window_starts]
    labels = [y[start + time_steps] for start in window_starts]
    return np.array(windows), np.array(labels)

def build_lstm_model(timesteps, features, params):
    """Build and compile the LSTM regressor.

    Args:
        timesteps: length of each input sequence.
        features: number of features per time step.
        params: dict read with .get() for the keys 'units', 'lstm_activation',
            'dropout' and 'learning_rate'.

    Returns:
        A compiled Sequential model (Adam optimiser, MAE loss) made of one LSTM
        layer, a dropout layer and a single-unit dense output.
    """
    layers = [
        LSTM(
            params.get('units'),
            activation=params.get('lstm_activation'),
            input_shape=(timesteps, features),
            return_sequences=False,
        ),
        Dropout(params.get('dropout')),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer=Adam(learning_rate=params.get('learning_rate')),
        loss='mae',
    )
    return network

def prepare_data_lstm(df_global, df_colombia_filtered, features_numeric, target_col, sequence_length):
    """Fit scalers on the global data, scale the country rows and build sequences.

    Args:
        df_global: full dataset; used only to fit the two MinMaxScalers.
        df_colombia_filtered: rows of the country/period to model.
        features_numeric: list of numeric feature column names.
        target_col: name of the target column.
        sequence_length: look-back window length passed to create_sequences.

    Returns:
        Tuple (X_seq, y_seq, X_raw_scaled, y_scaler, X_scaler, df_train_raw):
            X_seq, y_seq  -- windows and labels from create_sequences.
            X_raw_scaled  -- scaled feature matrix of the cleaned country rows.
            y_scaler      -- MinMaxScaler fitted on the global target.
            X_scaler      -- MinMaxScaler fitted on the global features.
            df_train_raw  -- country rows after dropna, in chronological order.

    Raises:
        ValueError: if no country row is left after dropping missing values.
    """
    # 1. Fit the scalers on the GLOBAL data so the country rows are scaled in
    #    the same range as the wider dataset.
    y_scaler = MinMaxScaler(feature_range=(0, 1))
    y_scaler.fit(df_global[[target_col]].values.astype(np.float32))

    X_scaler = MinMaxScaler(feature_range=(0, 1))
    X_scaler.fit(df_global[features_numeric].values.astype(np.float32))

    # 2. Drop country rows with a missing feature or target.
    df_train_raw = df_colombia_filtered.dropna(subset=features_numeric + [target_col]).copy()
    if df_train_raw.empty:
        raise ValueError(
            "No rows left after dropping missing values in the feature/target columns."
        )

    # The sliding windows below (and callers that take the "last" rows) assume
    # chronological order, which the input frame does not guarantee. A stable
    # sort leaves already-ordered data untouched.
    if 'year' in df_train_raw.columns:
        df_train_raw = df_train_raw.sort_values('year', kind='stable')

    # 3. Scale the country target and features with the global scalers.
    y_colombia_scaled = y_scaler.transform(df_train_raw[[target_col]].values)
    X_colombia_scaled = X_scaler.transform(df_train_raw[features_numeric].values)

    # 4. Build the look-back sequences.
    X_seq, y_seq = create_sequences(X_colombia_scaled, y_colombia_scaled, sequence_length)
    X_raw_scaled = X_colombia_scaled

    return X_seq, y_seq, X_raw_scaled, y_scaler, X_scaler, df_train_raw

In [6]:
# =============================================================================
# MÓDULO 3: PRONÓSTICO AUTORREGRESIVO (EJECUCIÓN FINAL CORREGIDA)
# =============================================================================

def execute_lstm_forecast_final(df_global, df_colombia_filtered, sequence_length, lstm_params, years_to_forecast):
    """Retrain the LSTM on the historical rows and project the target forward.

    Reads the module-level constants FEATURES_NUMERIC_FOR_SCALING, TARGET,
    TARGET_RAW and COUNTRY_CODE.

    Args:
        df_global: full dataset, used by prepare_data_lstm to fit the scalers.
        df_colombia_filtered: historical rows of the country to model.
        sequence_length: look-back window length.
        lstm_params: hyperparameter dict for build_lstm_model; 'epochs' and
            'batch_size' default to 30 and 32 when absent.
        years_to_forecast: number of years to project after the last
            historical year.

    Returns:
        Tuple (df_final, last_historical_year). df_final has one row per
        historical and projected year with the columns:
            'year', 'country' (set on projected rows only),
            'historical_records' (rounded raw counts, NaN for projected years),
            'predicted_records' (rounded projections, filled with the
            historical values for past years so a plot is continuous).

    Raises:
        ValueError: if the historical data yields no training sequence.
    """
    # 1. Data preparation (scalers, scaled features and sequences).
    X_seq, y_seq, X_raw_scaled, y_scaler, X_scaler, df_train_raw = prepare_data_lstm(
        df_global, df_colombia_filtered, FEATURES_NUMERIC_FOR_SCALING, TARGET, sequence_length
    )

    if len(X_seq) == 0:
        # Without this check the shape lookups below fail with an opaque IndexError.
        raise ValueError(
            f"Not enough historical rows ({len(df_train_raw)}) to build a sequence "
            f"of length {sequence_length}."
        )

    # 2. Final training on every available sequence. There is no validation
    #    split, so early stopping monitors the training loss.
    timesteps = X_seq.shape[1]
    n_features = X_seq.shape[2]

    model = build_lstm_model(timesteps, n_features, lstm_params)
    es = EarlyStopping(monitor='loss', patience=5, verbose=0, mode='min')

    model.fit(X_seq, y_seq,
              epochs=lstm_params.get('epochs', 30),
              batch_size=lstm_params.get('batch_size', 32),
              verbose=0, callbacks=[es])
    print(f"✅ Modelo LSTM reentrenado en {len(X_seq)} secuencias.")

    # 3. Forecast loop set-up.
    last_historical_year = int(df_train_raw['year'].max())
    end_year = last_historical_year + years_to_forecast

    projected_rows_list = []
    last_sequence_scaled = X_raw_scaled[-sequence_length:]

    # Continuity assumption: every future step reuses the feature values of the
    # last historical row. That row never changes, so it is scaled once here.
    # NOTE(review): the "year" feature is frozen as well, and the prediction is
    # never fed back as an input. After `sequence_length` steps the window is
    # the same row repeated, so later forecasts are fed identical inputs --
    # confirm this is the intended behaviour.
    X_next_raw = df_train_raw[FEATURES_NUMERIC_FOR_SCALING].iloc[-1]
    X_next_scaled = X_scaler.transform(X_next_raw.values.reshape(1, -1))

    for year in range(last_historical_year + 1, end_year + 1):
        # Slide the window: drop the oldest step, append the assumed next step.
        X_forecast_seq = np.vstack([last_sequence_scaled[1:], X_next_scaled])
        X_forecast_seq_3D = X_forecast_seq.reshape(1, sequence_length, n_features)

        # Predict on the scaled log1p scale, then undo the scaling and the log1p.
        predicted_scaled = model.predict(X_forecast_seq_3D, verbose=0)[0]
        predicted_log1p = y_scaler.inverse_transform(predicted_scaled.reshape(-1, 1))[0, 0]
        predicted_records = np.expm1(predicted_log1p)

        # Fix: the original referenced an undefined name `country_code`.
        projected_rows_list.append({
            'year': year,
            'country': COUNTRY_CODE,
            TARGET_RAW: predicted_records,
        })

        last_sequence_scaled = X_forecast_seq

    # 4. Combine the history and the projection into one output frame.
    df_projected_only = pd.DataFrame(projected_rows_list)
    df_historical_view = df_train_raw[['year', TARGET_RAW]].copy()
    df_final = pd.concat([df_historical_view, df_projected_only], ignore_index=True)

    # Observed values: only for years up to the last historical year.
    df_final['historical_records'] = df_final[TARGET_RAW].where(df_final['year'] <= last_historical_year).round(0)

    # Forecast values: projected years, back-filled with the observed values so
    # the plotted forecast line starts where the history ends.
    df_final['predicted_records'] = df_final[TARGET_RAW].where(df_final['year'] > last_historical_year)
    df_final['predicted_records'] = df_final['predicted_records'].fillna(df_final['historical_records']).round(0)

    df_final = df_final.drop(columns=[TARGET_RAW], errors='ignore')

    return df_final, last_historical_year