In [1]:
#Importaciones
import numpy as np
import pandas as pd
import warnings

# Configuración
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
# Carga de Datos Brutos
print("Cargando datos brutos desde './Datasets/Apple.csv'...")
file_path = './Datasets/Apple.csv'

try:
    df = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    df = df.sort_index()
    # Usar nombres de columna consistentes (minúsculas)
    df.columns = df.columns.str.lower()
    if 'adj_close' in df.columns:
        df = df.drop(columns=['adj_close'])
    print(f"Datos brutos cargados. Forma inicial: {df.shape}")
except FileNotFoundError:
    print(f"Error: No se encontró el archivo '{file_path}'.")

Cargando datos brutos desde './Datasets/Apple.csv'...
Datos brutos cargados. Forma inicial: (10836, 6)


In [3]:
# Limpieza de Anomalías (Valores Faltantes/Inconsistentes)

# Como descubrimos en el EDA, hay un día con Volumen 0 que debe ser tratado.
print(f"Días con Volumen 0 antes de limpiar: {len(df[df['volume'] == 0])}")

# Reemplazar 0 con NaN (No Nulo) para que pueda ser rellenado
df['volume'] = df['volume'].replace(0, np.nan)

# Usar 'forward fill' (ffill) para rellenar el NaN con el valor del día anterior.
# Esto es apropiado para series temporales donde el volumen no debería ser 0.
df['volume'] = df['volume'].ffill()

print(f"Días con Volumen 0 después de limpiar: {len(df[df['volume'] == 0])}")
print("Anomalía de Volumen 0 corregida usando forward fill (ffill).")

Días con Volumen 0 antes de limpiar: 1
Días con Volumen 0 después de limpiar: 0
Anomalía de Volumen 0 corregida usando forward fill (ffill).


In [4]:
# Ingeniería de Características 

# Función de Volatilidad (Garman-Klass)
def calculate_garman_klass_volatility(data):
    log_hl = np.log(data['high'] / data['low'])
    log_co = np.log(data['close'] / data['open'])
    gk_vol = 0.5 * (log_hl**2) - (2 * np.log(2) - 1) * (log_co**2)
    gk_vol = np.sqrt(np.maximum(gk_vol, 0.000001)) 
    return pd.Series(gk_vol, index=data.index, name='volatility')

df['volatility'] = calculate_garman_klass_volatility(df)
print("Variable 'volatility' (Garman-Klass) calculada.")

# Features (X)
df['log_return'] = np.log(df['close'] / df['close'].shift(1))
df['realized_vol_5d'] = df['log_return'].rolling(window=5).std()
df['return_range'] = (df['high'] - df['low']) / df['close']
df['volume_change'] = df['volume'].pct_change()
print("Features (X) calculadas: log_return, realized_vol_5d, return_range, volume_change.")

# Target (y) - El cambio en la volatilidad del día siguiente
PREDICTION_HORIZON = 1
df['target_delta_vol'] = df['volatility'].shift(-PREDICTION_HORIZON) - df['volatility']
print(f"Target (y) calculado: 'target_delta_vol' (Horizonte T+{PREDICTION_HORIZON}).")

Variable 'volatility' (Garman-Klass) calculada.
Features (X) calculadas: log_return, realized_vol_5d, return_range, volume_change.
Target (y) calculado: 'target_delta_vol' (Horizonte T+1).


In [5]:
# Limpieza Final

print(f"Forma ANTES de limpiar NaNs/Infs: {df.shape}")

# Reemplazar infinitos 
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Lista de columnas que DEBEN tener un valor para el modelo
features_and_target = [
    'log_return', 'realized_vol_5d', 'return_range', 'volume_change', 
    'volatility', 'target_delta_vol'
]

# Eliminar cualquier fila que tenga NaN en estas columnas clave
df_processed = df.dropna(subset=features_and_target)

print(f"Forma DESPUÉS de limpiar NaNs/Infs: {df_processed.shape}")

Forma ANTES de limpiar NaNs/Infs: (10836, 12)
Forma DESPUÉS de limpiar NaNs/Infs: (10830, 12)


In [6]:
# Guardar Datos Preprocesados

output_path = './Datasets/Apple_processed.csv'
df_processed.to_csv(output_path)

print(f"Datos limpios y con características guardados en: {output_path}")
print(f"Este archivo contiene {df_processed.shape[0]} filas listas para el modelado.")

# Mostrar las primeras filas del archivo final
display(df_processed.head())

Datos limpios y con características guardados en: ./Datasets/Apple_processed.csv
Este archivo contiene 10830 filas listas para el modelado.


Unnamed: 0_level_0,open,high,low,close,adj close,volume,volatility,log_return,realized_vol_5d,return_range,volume_change,target_delta_vol
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980-12-19,0.126116,0.126674,0.126116,0.126116,0.097591,48630400.0,0.003122,0.059239,0.05819,0.004424,-0.337908,-0.000139
1980-12-22,0.132254,0.132813,0.132254,0.132254,0.102341,37363200.0,0.002982,0.047522,0.053845,0.004227,-0.23169,-0.000126
1980-12-23,0.137835,0.138393,0.137835,0.137835,0.10666,46950400.0,0.002857,0.041333,0.014146,0.004048,0.256595,-0.000143
1980-12-24,0.145089,0.145647,0.145089,0.145089,0.112273,48003200.0,0.002714,0.05129,0.011511,0.003846,0.022424,-0.000229
1980-12-26,0.158482,0.15904,0.158482,0.158482,0.122637,55574400.0,0.002485,0.088294,0.018376,0.003521,0.157723,-3.4e-05
