In [10]:
import pandas as pd
import numpy as np
import os

# Allowed category values for each categorical feature. generate_all_features
# one-hot encodes each of these columns (and its "_lag1" copy) against exactly
# this list, so a value outside the list produces an all-zero dummy row.
# Key order matters: it fixes the order of the encoded output columns.
CATEGORICAL_RANGES = {
    # 16-level state codes
    'MACDState': list(range(1, 17)),
    # 3-level codes
    'HAColor': [1, 2, 3],
    'HALongWick': [1, 2, 3],
    # consecutive-candle counters, 0 through 50 inclusive
    'HAGreenConsec': list(range(0, 51)),
    'HARedConsec': list(range(0, 51)),
    # 16-level state codes
    'EMAState': list(range(1, 17)),
    # 3-level codes, long EMA then short EMA
    'HAHighToEMALong': [1, 2, 3],
    'HACloseToEMALong': [1, 2, 3],
    'HALowToEMALong': [1, 2, 3],
    'HAHighToEMAShort': [1, 2, 3],
    'HACloseToEMAShort': [1, 2, 3],
    'HALowToEMAShort': [1, 2, 3],
}

# Every column the input workbook must provide besides 'Date': the continuous
# features first, followed by the categorical features in the same order as
# the keys of CATEGORICAL_RANGES.
INPUT_FEATURES = [
    'Open', 'High', 'Low', 'Close', 'RSI', 'MACD', 'HABodyRangeRatio', 'MyWAZLTTrend',
    *CATEGORICAL_RANGES,
]

def generate_all_features(input_file: str, output_csv: str) -> None:
    """Build the lagged, one-hot encoded feature table and write it to CSV.

    Steps, in order:

    1. Read ``input_file`` with ``pd.read_excel`` and check that ``Date`` and
       every name in ``INPUT_FEATURES`` is present.
    2. Keep only rows whose ``Date`` time-of-day lies between 08:35:00 and
       10:25:00, both ends inclusive.
    3. For every input feature add a ``<name>_lag1`` column holding the
       previous row's value within the same calendar day. Rows are lagged in
       the order they appear in the file (no sorting is done here). The first
       row of each day has no predecessor and is filled with 0.
    4. Replace every column named in ``CATEGORICAL_RANGES``, and its lag, with
       one-hot columns over the listed category values. A value that is not
       in the list (including the 0 fill, for features whose list does not
       contain 0) becomes an all-zero row of dummies.
    5. Write the frame to ``output_csv`` with ``Date`` as the first column and
       without the index.

    Args:
        input_file: Path of the Excel workbook to read.
        output_csv: Path of the CSV file to write.

    Raises:
        ValueError: If ``Date`` or any of ``INPUT_FEATURES`` is missing from
            the workbook.
    """
    print(" Loading data...")

    # Load from XLSX
    df = pd.read_excel(input_file)

    required_cols = ['Date'] + INPUT_FEATURES
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns in input file: {missing}")

    # Parse Date *before* touching the .dt accessor: if the workbook delivers
    # the column as text, .dt would otherwise raise AttributeError.
    df['Date'] = pd.to_datetime(df['Date'])

    # Window bounds are loop-invariant, so build them once rather than per row.
    window_open = pd.Timestamp('08:35:00').time()
    window_close = pd.Timestamp('10:25:00').time()

    df['Time'] = df['Date'].dt.time
    # astype(bool) keeps the mask a boolean indexer even when the frame has
    # no rows (apply on an empty object Series does not yield a bool dtype).
    in_window = df['Time'].apply(lambda t: window_open <= t <= window_close).astype(bool)
    # drop() returns a fresh frame, so the column assignments below write to
    # an independent object rather than to a slice of the loaded data.
    df = df[in_window].drop(columns=['Time'])

    df['DayOnly'] = df['Date'].dt.date  # For grouping

    print(np.unique(df["DayOnly"]).shape)

    # Lag columns are appended continuous-first, then categorical, which fixes
    # their order in the output file.
    continuous = [col for col in INPUT_FEATURES if col not in CATEGORICAL_RANGES]
    by_day = df.groupby('DayOnly')
    for col in continuous + list(CATEGORICAL_RANGES):
        df[f"{col}_lag1"] = by_day[col].shift(1).fillna(0)

    # One-hot encode each categorical column and its lag. The pieces are
    # collected and concatenated once; the resulting column order is the
    # untouched columns followed by, per feature, its dummies and then its
    # lag dummies.
    encoded_parts = []
    consumed = []
    for col, cats in CATEGORICAL_RANGES.items():
        for name in (col, f"{col}_lag1"):
            as_category = pd.Series(
                pd.Categorical(df[name], categories=cats), index=df.index
            )
            encoded_parts.append(pd.get_dummies(as_category, prefix=name))
            consumed.append(name)

    # DayOnly was only a grouping helper, so it is dropped together with the
    # raw categorical columns.
    untouched = df.drop(columns=consumed + ['DayOnly'])
    df = pd.concat([untouched] + encoded_parts, axis=1)

    # Ensure Date is first column
    df = df[['Date'] + [col for col in df.columns if col != 'Date']]

    # Save
    df.to_csv(output_csv, index=False)
    print(f" All features generated and saved to: {output_csv}")

# Script entry point: runs only when this file is executed directly.
# Reads the workbook at input_xlsx and writes the feature CSV into output_dir.
if __name__ == "__main__":
    output_dir = 'data/'
    input_xlsx = 'data/concat_ESData.xlsx'  # Change path as needed
    output_csv = os.path.join(output_dir, "ES_preprocessed_all_features_training_zeros_corrected.csv")

    # Make sure the destination folder exists before anything is written.
    os.makedirs(output_dir, exist_ok=True)

    print("Starting feature generation...")
    generate_all_features(input_xlsx, output_csv)

Starting feature generation...
 Loading data...
(72,)
 All features generated and saved to: data/ES_preprocessed_all_features_training_zeros_corrected.csv


In [None]:
# # Main block to run directly
# if __name__ == "__main__":
#     input_xlsx = 'data/concat_ESData.xlsx'  # Change path as needed
#     output_dir = 'data/'
#     os.makedirs(output_dir, exist_ok=True)
#     output_csv = os.path.join(output_dir, "ES_preprocessed_all_features_training_zeros_v1.csv")
   
#     print("Starting feature generation...")
#     generate_all_features(input_xlsx, output_csv)