# 0 Preparación del entorno.

## 0.1 Definición de parámetros

In [1]:
tfm_path='C:/Users/raqga/OneDrive - Universidad Complutense de Madrid (UCM)/Documentos/tsa4dst/TFM_data/'
H1_code = 'OMNI2_H0_MRG1HR'
lookback = 12
lookforward = 2
tfm_path_Nh_models = f'PRED_{lookforward}h/'
cols_to_use = ['Bx', 'By_gse', 'Bz_gse', 'By_gsm', 'Bz_gsm', 'P_density', 'E_field', 'plasma_T', 'plasma_V', 'Dst'] # 'AP', out
col_to_pred = "Dst"
hstorms_data = 'historical_storms_gruet2018.csv'
weak_threshold = -30 #1
moderate_threshold = -50 #2
strong_threshold = -100 #3
severe_threshold = -200 #4
great_threshold = -300 #5
gamma_value=0.0001
temporal_margin=5*24 # margen para obtener tiempos ampliados de las tormentas de gruet et al 2018
test_size = 0.2
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

In [2]:
PATH_NN_RES_BY_STORM = "SVR_{}h_res_by_storm/storm_{}.csv"
PATH_NN_RES_BY_STORM_PLOTS = "SVR_{}h_res_by_storm/storm_{}.png"

## 0.2 Montar Google Drive (obtención de datos)

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')
# !rm -rf sample_data/

## 0.3 Importación de librerías

In [4]:
# !pip install optuna

In [5]:
import os

import numpy as np

# librerías de manipulación de datos y gráficos
import pandas as pd
import matplotlib.dates as mdates
import numpy as np

# gráficos
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# modelo
from sklearn.svm import SVR
#from thundersvm import SVR
# escalado y división en train/test
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# obtención de métricas
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_log_error, median_absolute_error
from sklearn.metrics import explained_variance_score, max_error

# meta
# timer
import time


## 0.4 Definición de funciones

In [6]:
def exploracion_inicial_datos(df):
    """
    Función para realizar una exploración inicial de los datos.

    Parámetros:
    df (dataframe): El dataframe que contiene los datos.

    Muestra las primeras filas, estadísticas descriptivas, valores faltantes,
    histogramas de variables numéricas y un mapa de calor de la correlación.
    """
    # Configuración de visualización
    sns.set(style="whitegrid")  # Estilo de gráficos


    print("Primeras filas del DataFrame:")
    print(df.head())


    print("\nDescripción estadística de los datos:")
    print(df.describe())


    print("\nValores faltantes por columna:")
    print(df.isnull().sum())


    print("\nVisualización de histogramas para variables numéricas:")
    df.hist(bins=15, figsize=(15, 10), layout=(5, 4))
    plt.show()
    print("\nMapa de calor de la matriz de correlación:")
    plt.figure(figsize=(10, 8))
    sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
    plt.show()


def calcular_layout_optimo(num_cols):
    """
    Calcula el número óptimo de filas y columnas para una figura con subgráficos,
    tratando de mantener una forma que sea visualmente agradable y que aproveche el espacio.

    Parámetros:
    num_cols (int): Número total de columnas (gráficos) a mostrar.

    Retorna:
    (int, int): Número de filas y columnas para el layout de los subgráficos.
    """
    # Calcula el número óptimo de columnas teniendo un límite visual razonable
    cols_per_row = int(np.sqrt(num_cols)) + 1  # Ajuste para maximizar el uso del espacio y la forma de la figura
    rows_needed = (num_cols + cols_per_row - 1) // cols_per_row  # Redondeo hacia arriba para incluir todas las columnas
    return rows_needed, cols_per_row

def exploracion_histogramas(df):
    """
    Función para generar histogramas para todas las columnas numéricas en un DataFrame,
    excluyendo las columnas de tipo datetime y no numéricas.

    Parámetros:
    df (DataFrame): DataFrame de pandas con los datos a analizar.
    """
    # Eliminar columnas no numéricas y de tipo datetime
    df_numerico = df.select_dtypes(include=[np.number])

    # Número de columnas numéricas
    num_cols = df_numerico.shape[1]

    # Verificar si hay columnas para mostrar
    if num_cols == 0:
        print("No hay columnas numéricas para mostrar.")
        return

    # Calculando el layout necesario
    rows_needed, cols_per_row = calcular_layout_optimo(num_cols)

    # Crear histogramas
    df_numerico.hist(bins=15, figsize=(15, 10), layout=(rows_needed, cols_per_row))
    plt.show()

def imputar_nan(df):
  df.interpolate(method='linear', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)
  if sum(df.isnull().sum())!=0:
    print("Faltan nulos por tratar")
  return df

def visualizar_nulos_plot(df, variable_with_nans):
    """
    Plot the specified 'variable_with_nans' column and 'Dst' column from the DataFrame.

    Parameters:
    - df: pandas.DataFrame containing the data to plot.
    - variable_with_nans: str, the name of the column in the DataFrame to plot, which may contain NaNs.

    The function assumes that 'Dst' is a column name in the DataFrame and that the DataFrame's index is suitable for plotting (e.g., datetime).
    """
    # Create the figure and subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(20, 8))

    # Handling NaNs in the 'variable_with_nans' column before plotting
    df_plot = df.copy()
    df_plot[variable_with_nans] = df_plot[variable_with_nans].fillna(method='ffill')  # Forward fill to handle NaNs

    # Plotting 'variable_with_nans' on the first subplot
    ax1.scatter(df_plot.index, df_plot[variable_with_nans], label=variable_with_nans, color='blue')
    ax1.set_ylabel(variable_with_nans)
    ax1.legend(loc='upper right')
    ax1.grid(True)

    # Plotting 'Dst' on the second subplot
    ax2.plot(df_plot.index, df_plot['Dst'], label='Dst', color='red')
    ax2.set_ylabel('Dst')
    ax2.legend(loc='upper right')
    ax2.grid(True)

    # Setting the x-axis label only on the bottom subplot
    ax2.set_xlabel('Datetime')

    # Improve layout to prevent overlap
    plt.tight_layout()

    # Show the plot
    plt.show()

# Example of how to use this function with a DataFrame containing NaNs
data = {
    'Datetime': pd.date_range(start='2021-01-01', periods=100, freq='D'),
    'variable_with_nans': pd.Series(range(100)).where(lambda x : x % 10 != 0),
    'Dst': range(100, 0, -1)
}
df = pd.DataFrame(data)
df.set_index('Datetime', inplace=True)

def create_window_df_svr(list_dfs, lookback, lookforward, cols_to_use, col_to_pred, scaler_label=None):
    """
    Creates input and output datasets for SVR training from a list of DataFrames, incorporating windowing and optional descaling for target

    Parameters:
    list_dfs (list of pandas.DataFrame): List of DataFrames to process.
    lookback (int): Number of past records to include as features for each prediction.
    lookforward (int): Number of records ahead to predict.
    cols_to_use (list of str): List of column names to use as features.
    col_to_pred (str): Column name to predict.
    scaler_label (StandardScaler, optional): Scaler for the output variable, used for inverse transformation.

    Returns:
    tuple: A tuple containing arrays for training features and labels.
    """
    x_train, y_train = [], []

    for df_ in list_dfs:
        df = df_.copy()

        for i in range(len(df) - lookback - lookforward + 1):
            x_train.append(np.asarray(df.iloc[i:i+lookback][cols_to_use].values))
            y_train.append(np.asarray(df.iloc[i+lookback][col_to_pred]))

    if scaler_label is not None:
        y_train = scaler_label.inverse_transform(np.asarray(y_train).reshape(-1,1))

    return np.asarray(x_train), np.asarray(y_train)


def filter_storms(df, historical_storms, temporal_margin):
    """
    Filter DataFrame entries based on the occurrence of storms within specific time intervals.

    Parameters:
    df (pandas.DataFrame): DataFrame containing time-series data with a 'Datetime' column.
    historical_storms (pandas.DataFrame): DataFrame containing the start and end times of historical storms.
    temporal_margin (int): Number of rows before and after the minimum Dst index to include in the result.

    Returns:
    list: A list of DataFrame snippets corresponding to the specified storm intervals.
    """
    all_storms = []
    for i in range(len(historical_storms)):
        df_tmp = df[(df["Datetime"] >= historical_storms.iloc[i]["start"]) & (df["Datetime"] <= historical_storms.iloc[i]["end"])]
        idx = df_tmp['Dst'].idxmin()
        all_storms.append(df.iloc[idx-temporal_margin:idx+temporal_margin])
    return all_storms

def combinar_dataframes_solapados(dfs):
    """
    Combines overlapping DataFrames in a list into non-overlapping DataFrames based on the 'Datetime' column.

    Parameters:
    dfs (list of pandas.DataFrame): List of DataFrames to combine.

    Returns:
    list: A list of combined DataFrames without overlap.
    """
    dfs.sort(key=lambda x: x['Datetime'].min())
    combinados = []
    combinacion_actual = dfs[0]

    for df in dfs[1:]:
        if df['Datetime'].min() <= combinacion_actual['Datetime'].max():
            combinacion_actual = pd.concat([combinacion_actual, df]).drop_duplicates().sort_values(by='Datetime')
        else:
            combinados.append(combinacion_actual)
            combinacion_actual = df
    combinados.append(combinacion_actual)
    return combinados

def scale_data(list_dfs, cols_to_use, col_to_pred):
    """
    Scales columns in a list of DataFrames using StandardScaler.

    Parameters:
    list_dfs (list of pandas.DataFrame): List of DataFrames to scale.
    cols_to_use (list of str): Column names to apply scaling to.
    col_to_pred (str): Column name used as a label for prediction.

    Returns:
    tuple: A tuple containing the list of scaled DataFrames and the label scaler.
    """
    list_dfs_ = []
    scaler_cols = StandardScaler()
    scaler_label = StandardScaler()
    scaler_cols.fit(pd.concat(list_dfs)[cols_to_use])
    scaler_label.fit(np.asarray(pd.concat(list_dfs)[col_to_pred]).reshape(-1,1))

    for df_ in list_dfs:
        df = df_.copy()
        df[cols_to_use] = scaler_cols.transform(df[cols_to_use])
        list_dfs_.append(df)

    return list_dfs_, scaler_label



def calc_metrics(predictions, y_test):
  # Mean Squared Error
  mse = mean_squared_error(y_test, predictions)
  # Mean Absolute Error
  mae = mean_absolute_error(y_test, predictions)
  # R^2 Score, the coefficient of determination
  r2 = r2_score(y_test, predictions)
  # Median Absolute Error
  medae = median_absolute_error(y_test, predictions)
  # Explained Variance Score
  explained_variance = explained_variance_score(y_test, predictions)
  # Max Error
  max_err = max_error(y_test, predictions)

  return mse, mae, r2, medae, explained_variance, max_err


def formatear_tiempo(segundos):
    horas = int(segundos // 3600)
    minutos = int((segundos % 3600) // 60)
    segundos = segundos % 60
    return f"{horas} horas, {minutos} minutos, {segundos:.2f} segundos"


In [7]:
def evaluate_models(X_train, y_train, X_test, y_test, kernels, C_values, epsilon_values, gamma_values, degree_values, verbose=True):
    """
    Evaluate SVR models with different hyperparameter configurations and kernel types.

    Args:
    X_train (array): Independent training data.
    y_train (array): Dependent training data (target).
    X_test (array): Independent test data.
    y_test (array): Dependent test data (target).
    kernels (list): List of kernel types to evaluate.
    C_values (list): List of values for the penalty parameter C.
    epsilon_values (list): List of values for the epsilon parameter.
    gamma_values (list): List of values for the gamma parameter.
    degree_values (list): List of values for the degree parameter (used only in polynomial kernels).
    verbose (bool, optional): If True, prints messages during the evaluation process. Default is True.

    Returns:
    DataFrame: A pandas DataFrame containing evaluation metrics for each parameter configuration.
    dict: A dictionary of trained models, with keys describing the specific parameter configuration.

    Note:
    Assumes that the 'precomputed' kernel is only used if the 'precomputed_matrix' is defined in the local environment.
    """
    results = []
    models_dict = {}

    for kernel_ in kernels:
        for C in C_values:
            for epsilon in epsilon_values:
                # Check parameter relevance for gamma and degree for the current kernel
                relevant_gamma_values = gamma_values if kernel_ in ['rbf', 'sigmoid', 'poly'] else [None]
                relevant_degree_values = degree_values if kernel_ == 'poly' else [None]

                for gamma in relevant_gamma_values:
                    for degree in relevant_degree_values:
                        start = time.time()
                        config_key = f"{kernel_}_C{C}_eps{epsilon}_gamma{gamma}_deg{degree}"
                        if verbose:
                            print(f"Starting {config_key}")

                        if kernel_ == "precomputed":
                            if 'precomputed_matrix' in locals():
                                model = SVR(kernel='precomputed', C=C, epsilon=epsilon)
                                model.fit(precomputed_matrix, y_train)
                                models_dict[config_key] = model
                                continue
                            else:
                                if verbose:
                                    print("Precomputed matrix not defined for kernel='precomputed'")
                                continue

                        model = SVR(kernel=kernel_, C=C, epsilon=epsilon, gamma=(gamma if gamma is not None else 0.01), degree=(degree if degree is not None else 3))
                        model.fit(X_train, y_train)
                        models_dict[config_key] = model

                        predictions = model.predict(X_test)
                        mse, mae, r2, medae, explained_variance, max_err = calc_metrics(predictions, y_test)
                        end = time.time()
                        time_exec = formatear_tiempo(end - start)

                        results.append({
                            "kernel": kernel_,
                            "C": C,
                            "epsilon": epsilon,
                            "gamma": gamma,
                            "degree": degree,
                            "mse": mse,
                            "mae": mae,
                            "r2": r2,
                            "medae": medae,
                            "exp_var": explained_variance,
                            "max_err": max_err,
                            "time_exec": time_exec
                        })

                        print(f"{config_key} finished. Time for iteration: {time_exec} | mae: {mae} | mse: {mse} | exp_var: {explained_variance} | max_err: {max_err}")

    df_results = pd.DataFrame(results)
    return df_results, models_dict


Función para extraer las fechas de la nueva división de datos, una vez ampliadas las ventanas de cada tormenta a +-5 días desde el mínimo de cada tormenta, y eliminados los datos solapados

In [8]:
def get_new_splits_dates(all_storms):
    dates_start = []
    dates_end = []
    num_storm = []
    min_dst_values = []
    
    for idx, df in enumerate(all_storms):
        dates_start.append(df["Datetime"].iloc[0])
        dates_end.append(df["Datetime"].iloc[-1])
        num_storm.append(idx+1)
        min_dst_values.append(df["Dst"].min())
    df_new_storms = pd.DataFrame(
        {
            "storm_index": num_storm,
            "date_start": dates_start,
            "date_end": dates_end,
            "min_DST": min_dst_values
        }
    )
    return df_new_storms

Función para sacar los scalers de las variables de input y de la label (Dst)

In [9]:
def get_scalers(list_dfs, cols_to_use, col_to_pred):
    list_dfs_ = []
    scaler_cols = StandardScaler()
    scaler_label = StandardScaler()
    scaler_cols.fit(pd.concat(list_dfs)[cols_to_use])
    scaler_label.fit(np.asarray(pd.concat(list_dfs)[col_to_pred]).reshape(-1,1))

    return scaler_cols, scaler_label

Nueva función para crear ventanas de datos para input del modelo

In [10]:
def create_window_df_nn(list_dfs, lookback, lookforward, cols_to_use, col_to_pred, scaler_cols):
    x_train, y_train = [], []

    for df_ in list_dfs:
        df = df_[cols_to_use].copy()
        df.loc[:, cols_to_use] = scaler_cols.transform(df[cols_to_use])

        for i in range(len(df) - lookback - lookforward + 1):
            x_train.append(np.asarray(df.iloc[i:i+lookback].values))
            y_train.append(np.asarray(df.iloc[i+lookback+lookforward-1][col_to_pred]))

    return np.asarray(x_train), np.asarray(y_train)

def create_window_df_nn_test(list_dfs, lookback, lookforward, cols_to_use, col_to_pred, scaler_cols):
    x_train, y_train, date_pred, date_last_data = [], [], [], []

    for df_ in list_dfs:
        df = df_.copy()
        df.loc[:, cols_to_use] = scaler_cols.transform(df[cols_to_use])

        for i in range(len(df) - lookback - lookforward + 1):
            x_train.append(np.asarray(df.iloc[i:i+lookback][cols_to_use].values))
            y_train.append(np.asarray(df.iloc[i+lookback+lookforward-1][col_to_pred]))
            date_last_data.append(df.iloc[i+lookback-1]["Datetime"])
            date_pred.append(df.iloc[i+lookback+lookforward-1]["Datetime"])

    return np.asarray(x_train), np.asarray(y_train), np.asarray(date_last_data), np.asarray(date_pred)

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, max_error, median_absolute_error
from math import sqrt
import sklearn

def error_nn(y_pred, y_true, index_scaler):
    y_pred = index_scaler.inverse_transform(y_pred.reshape(-1, 1))
    y_true = index_scaler.inverse_transform(y_true.reshape(-1, 1))
    
    print(y_pred.reshape(-1).shape)
    print(y_true.reshape(-1).shape)
    df_ = pd.DataFrame({"y_pred": y_pred.reshape(-1), "y_true": y_true.reshape(-1)})
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # Median Absolute Error
    medae = median_absolute_error(y_true, y_pred)
    # Explained Variance Score
    explained_variance = explained_variance_score(y_true, y_pred)
    # Max Error
    max_err = max_error(y_true, y_pred)
    
    
    data = {
        'Métrica': ['RMSE', 'MSE', 'MAE', 'R²', 'MedAE', 'Varianza explicada', 'Max error'],
        'Valor': [rmse, mse, mae, r2, medae, explained_variance, max_err]
    }
    df = pd.DataFrame(data)
    df_invertido = df.transpose()
    df_invertido.columns = df_invertido.iloc[0]
    df_invertido = df_invertido[1:]
    display(df_invertido)
    
    # scatter plot
    plt.figure(figsize=(8, 4))
    plt.scatter(y_true, y_pred, c='blue', label='Predicciones vs. Valores reales')
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], 'k--', lw=2, label='Línea de referencia')
    plt.xlabel('Valores reales')
    plt.ylabel('Predicciones')
    plt.title('Diagrama de dispersión de Predicciones vs. Valores reales')
    plt.legend()
    plt.grid(True)
    plt.show()
    
    return 

# 1. Carga y preparación de datos

## 1.1 Carga de los datos

In [12]:
hd = pd.read_csv(tfm_path+H1_code+'.csv', parse_dates=["Datetime"])
# md = pd.read_csv(tfm_path+M5_code+'.csv', parse_dates=["Datetime"]) # no se va a usar por ahora
historical_storms = pd.read_csv(tfm_path+hstorms_data)
# historical_storms = historical_storms.drop(columns=['Min. Dst (nT)','Unnamed: 0'], axis=1)

- Ver qué columnas y tipo de datos contienen los df.

In [13]:
hd.columns

Index(['ID_IMF', 'ID_plasma', 'Bmag', 'dev_Bmag', 'Bx', 'By_gse', 'Bz_gse',
       'By_gsm', 'Bz_gsm', 'dev_Bx', 'dev_By', 'dev_Bz', 'P_density',
       'dev_P_density', 'AP', 'dev_AP', 'E_field', 'plasma_T', 'dev_plasma_T',
       'plasma_V', 'Dst', 'Datetime'],
      dtype='object')

In [14]:
hd.dtypes

ID_IMF                  float64
ID_plasma               float64
Bmag                    float64
dev_Bmag                float64
Bx                      float64
By_gse                  float64
Bz_gse                  float64
By_gsm                  float64
Bz_gsm                  float64
dev_Bx                  float64
dev_By                  float64
dev_Bz                  float64
P_density               float64
dev_P_density           float64
AP                      float64
dev_AP                  float64
E_field                 float64
plasma_T                float64
dev_plasma_T            float64
plasma_V                float64
Dst                     float64
Datetime         datetime64[ns]
dtype: object

In [15]:
historical_storms.columns

Index(['Unnamed: 0', 'Min. Dst (nT)', 'start', 'end', 'storm'], dtype='object')

In [16]:
historical_storms.dtypes

Unnamed: 0        int64
Min. Dst (nT)     int64
start            object
end              object
storm             int64
dtype: object

- Ordenar las tormentas en caso de que no lo estén
- Convertir todas las fechas a datetime de pd.

In [17]:
# Ordenar los dataframes por fecha
historical_storms = historical_storms.sort_values(by='start')

# convertir las columnas de tiempo a datetime64
hd['Datetime']=pd.to_datetime(hd['Datetime'])
historical_storms['start']=pd.to_datetime(historical_storms['start'])
historical_storms['end']=pd.to_datetime(historical_storms['end'])

# Cuando se utilizan los datos a 5 minutos, se unen a 5min
#data = pd.merge(md, hd[["Datetime", "Dst"]], on='Datetime', how='left')

In [18]:
hd = imputar_nan(hd)

  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)


In [19]:
all_storms = filter_storms(hd, historical_storms, temporal_margin)
all_storms = combinar_dataframes_solapados(all_storms)

In [20]:
historical_storms_new = get_new_splits_dates(all_storms)

Storm index de cada conjunto de datos, test y validación, entrenamiento el resto

In [21]:
index_test_storms = [2, 12, 23, 32, 8, 40, 42, 3, 31]
index_val_storms = [25, 13, 1, 37, 33, 7]

TORMENTAS DE ENTRENAMIENTO

In [22]:
train_storms = [df for idx, df in enumerate(all_storms) if idx not in [x-1 for x in index_test_storms]]
# historical_storms_new[~historical_storms_new["storm_index"].isin(index_test_storms)].reset_index(drop=True)

TORMENTAS DE TEST

In [23]:
train_storms

[      ID_IMF  ID_plasma  Bmag  dev_Bmag   Bx  By_gse  Bz_gse  By_gsm  Bz_gsm  \
 1765    71.0       71.0   4.2       0.2  0.8    -4.0    -0.2    -3.8    -1.2   
 1766    71.0       71.0   4.0       0.1  1.1    -3.8     0.3    -3.7    -0.8   
 1767    71.0       71.0   3.5       0.5 -0.8    -2.9     0.3    -2.8    -0.7   
 1768    71.0       71.0   4.0       0.4 -1.2    -3.2     1.8    -3.6     0.5   
 1769    71.0       71.0   4.4       0.7 -0.7    -1.4     3.4    -2.7     2.5   
 ...      ...        ...   ...       ...  ...     ...     ...     ...     ...   
 2000    71.0       71.0   4.9       0.3 -0.2    -4.7    -0.4    -4.4    -1.6   
 2001    71.0       71.0   5.1       0.5 -0.2    -4.9     0.3    -4.8    -0.9   
 2002    71.0       71.0   4.9       0.2 -0.3    -4.6     0.0    -4.5    -1.0   
 2003    71.0       71.0   4.8       0.3 -2.6    -3.1    -0.4    -2.9    -1.1   
 2004    71.0       71.0   3.9       0.6 -2.7    -2.0    -0.2    -1.9    -0.7   
 
       dev_Bx  ...  P_dens

In [24]:
test_storms = [df for idx, df in enumerate(all_storms) if idx in [x-1 for x in index_test_storms]]
# historical_storms_new[historical_storms_new["storm_index"].isin(index_test_storms)].reset_index(drop=True)

Obtenemos scalers de input y label con todo el conjunto de datos

In [25]:
scaler_cols, scaler_label = get_scalers(all_storms, cols_to_use, col_to_pred)

Genereamos ventanas en el formato deseado de los conjuntos de entrenamiento test y validación

In [26]:
x_train, y_train = create_window_df_nn(train_storms, lookback, lookforward, cols_to_use, col_to_pred, scaler_cols)
print("x_train shape : ", x_train.shape, "\ny_train shape:", y_train.shape)

x_train shape :  (7845, 12, 10) 
y_train shape: (7845,)


In [27]:
x_test, y_test = create_window_df_nn(test_storms, lookback, lookforward, cols_to_use, col_to_pred, scaler_cols)
print("x_test shape : ", x_test.shape, "\ny_test shape:", y_test.shape)

x_test shape :  (2395, 12, 10) 
y_test shape: (2395,)


Necesario hacer reshape de las ventanas, flatten(). Requerimientos de SVR y algoritmos similares (XGBoost, RandomForest...)  
La shape de y_train, y_test se mantiene, comprobamos:

In [28]:
X_train = x_train.reshape(x_train.shape[0], -1)  # Transforma a (n_samples, n_features*lookback)
X_test = x_test.reshape(x_test.shape[0], -1)  # Lo mismo para el conjunto de prueba

print("x_train shape : ", X_train.shape, "\ny_train shape:", y_train.shape)
print("x_test shape : ", X_test.shape, "\ny_test shape:", y_test.shape)

x_train shape :  (7845, 120) 
y_train shape: (7845,)
x_test shape :  (2395, 120) 
y_test shape: (2395,)


# 2. Entrenamiento de modelo ÓPTIMO

In [29]:
storm_data_test = test_storms[4].iloc[130]

In [30]:
print(type(storm_data_test))
print(storm_data_test.shape)
print(storm_data_test)

<class 'pandas.core.series.Series'>
(22,)
ID_IMF                          51.0
ID_plasma                       52.0
Bmag                            10.2
dev_Bmag                         1.0
Bx                               4.5
By_gse                           5.0
Bz_gse                          -7.5
By_gsm                           1.7
Bz_gsm                          -8.9
dev_Bx                           1.0
dev_By                           0.9
dev_Bz                           1.0
P_density                        0.8
dev_P_density                    0.5
AP                             0.032
dev_AP                         0.009
E_field                         4.84
plasma_T                    113029.0
dev_plasma_T                 80403.0
plasma_V                       544.0
Dst                            -59.0
Datetime         2004-07-23 12:00:00
Name: 31188, dtype: object


In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Función para escalar los datos
def get_scalers(df, cols_to_use, col_to_pred):
    scaler_cols = StandardScaler()
    scaler_label = StandardScaler()
    scaler_cols.fit(df[cols_to_use])
    scaler_label.fit(df[col_to_pred].values.reshape(-1, 1))
    return scaler_cols, scaler_label

# Función para crear ventanas de datos
def create_window_df_nn(df, lookback, lookforward, cols_to_use, col_to_pred, scaler_cols):
    x_train, y_train = [], []
    df_ = df[cols_to_use].copy()
    df_[cols_to_use] = scaler_cols.transform(df_[cols_to_use])
    for i in range(len(df_) - lookback - lookforward + 1):
        x_train.append(df_.iloc[i:i+lookback].values.flatten())
        y_train.append(df_.iloc[i+lookback+lookforward-1][col_to_pred])
    return np.array(x_train), np.array(y_train)

# Función para evaluar el modelo SVR
def evaluate_svr(model, X_test, y_test, scaler_y):
    y_pred_scaled = model.predict(X_test)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2, y_pred

df = pd.concat(train_storms, ignore_index=False)
print(df.shape)
# Obtener los escaladores
scaler_cols, scaler_label = get_scalers(df, cols_to_use, col_to_pred)

# Crear las ventanas de datos
X, y = create_window_df_nn(df, lookback, lookforward, cols_to_use, col_to_pred, scaler_cols)

# Escalar la variable objetivo
y = scaler_label.transform(y.reshape(-1, 1)).flatten()

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo
model = SVR(kernel="linear", C=0.7196856730011522, epsilon=0.1)
# # Entrenar el modelo SVR con kernel lineal
# # model = SVR(kernel="linear", C=0.13894954943731375, epsilon=0.1)
# # model = SVR(kernel="linear", C=0.02682695795279726, epsilon=0.1)

model.fit(X_train, y_train)

(8274, 22)


In [47]:
# Obtener los vectores de soporte, los coeficientes duales y el intercepto
support_vectors = model.support_vectors_
dual_coef = model.dual_coef_[0]
intercept = model.intercept_[0]

# Calcular los coeficientes del modelo (esto es válido solo para kernel lineal)
coefficients = np.dot(dual_coef, support_vectors)

# Construir la ecuación del modelo usando los nombres de las columnas
equation_terms = [f"({coeff:.4f} * {var})" for coeff, var in zip(coefficients, cols_to_use)]
equation = " + ".join(equation_terms) + f" + {intercept:.4f}"

print("Ecuación del modelo:", equation)

Ecuación del modelo: (-0.0000 * Bx) + (-0.0001 * By_gse) + (0.0000 * Bz_gse) + (-0.0001 * By_gsm) + (0.0001 * Bz_gsm) + (-0.0000 * P_density) + (-0.0001 * E_field) + (-0.0001 * plasma_T) + (-0.0002 * plasma_V) + (0.0001 * Dst) + 0.8449


In [37]:
storm_data_test1 = test_storms[4].iloc[132]
print(storm_data_test1['Dst'])

-65.0


In [38]:
storm_data_test = storm_data_test[cols_to_use]

In [39]:
# Calcular la predicción manualmente usando la ecuación
predicted_value = sum(coeff * storm_data_test[var] for coeff, var in zip(coefficients, cols_to_use)) + intercept
print("Predicción manual:", predicted_value)

# Comparar con la predicción del modelo
model_prediction = model.predict([storm_data_test])[0]
print("Predicción del modelo:", model_prediction)

Predicción manual: -10.07232404944434


ValueError: X has 10 features, but SVR is expecting 120 features as input.