In [243]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [244]:
# Load the raw price table. The first two CSV rows form a two-level column
# header, and the first column becomes the index with date parsing enabled.
data_raw = pd.read_csv(
    'raw_data/most_traded_stocks_data.csv',
    header=[0, 1],
    index_col=0,
    parse_dates=True,
)

In [245]:
# Discard every column whose second-level label is 'Volume'.
second_level_labels = data_raw.columns.get_level_values(1)
columns_to_drop = list(data_raw.columns[second_level_labels == 'Volume'])
data_raw = data_raw.drop(columns=columns_to_drop)

In [246]:
# Convert VIX to daily volatility: divide by sqrt(252), then by 100.
# NOTE: this overwrites data_raw['^VIX'] in place, so running the cell a
# second time without reloading data_raw applies the scaling again.
data_raw['^VIX'] = data_raw['^VIX'].div(np.sqrt(252)).div(100)

In [247]:
# Start the feature table from a copy of data_raw, dropping every column
# whose second-level label is not the empty string.
# NOTE(review): presumably this leaves only the index as a base for the
# engineered columns added later - confirm.
labelled_columns = [column for column in data_raw.columns if column[1] != '']
data_features = data_raw.copy()
data_features = data_features.drop(columns=labelled_columns)

In [248]:
# Build per-ticker features and the target signal.
# For every first-level column label in data_raw this cell derives
# range-position indicators, rolling ratio statistics for the Close, High and
# Low series, rolling correlations against the VIX close, and a three-class
# signal, then appends all new columns to data_features with a single concat.
# NOTE: re-running this cell without first re-running the cell that creates
# data_features appends the same columns a second time.

# Signal trigger: returns above Trigger_up map to +1, below Trigger_down to -1,
# everything else (including NaN returns) to 0.
Trigger_up = 0.005
Trigger_down = -0.005

# Rolling window lengths (in rows). MA_N windows produce the *_MA_* columns and
# the VIX correlations; Trend_N windows produce the *_TREND_* columns.
MA_N = [5, 10, 15, 35, 40, 45]
Trend_N = [50, 100, 150]

# Get unique tickers
# NOTE(review): this includes every first-level label in data_raw, so '^VIX'
# itself is processed like any other ticker - confirm that is intended.
tickers = data_raw.columns.get_level_values(0).unique()

# The VIX rolling means do not depend on the ticker, so compute them once.
vix_close = data_raw[('^VIX', 'Close')]
vix_rolling_mean = {N: vix_close.rolling(window=N).mean() for N in MA_N}

# Collect all calculations
data = {}
for ticker in tickers:
    # Select the relevant columns for the ticker
    high = data_raw[(ticker, 'High')]
    low = data_raw[(ticker, 'Low')]
    close = data_raw[(ticker, 'Close')]

    # Position of the close inside the day's high-low range, clipped to [0, 1].
    # Rows where high == low divide by zero and are not handled specially.
    stochastic = np.clip((close - low) / (high - low), 0, 1)
    william = np.clip((high - close) / (high - low), 0, 1)
    basic_mid = (high + low + close) / 3
    close_to_high = high / close - 1
    close_to_low = low / close - 1
    day_to_day_return = close.pct_change()
    # NOTE(review): the signal is derived from the same row's return that is
    # also stored as the DD_Return feature - confirm the target is shifted
    # downstream before it is used for prediction.
    signal = np.where(day_to_day_return > Trigger_up, 1, np.where(day_to_day_return < Trigger_down, -1, 0))

    # Store in dict with multi-index key
    data[(ticker, 'Stochastic')] = stochastic
    data[(ticker, 'William')] = william
    data[(ticker, 'Basic_Mid')] = basic_mid
    data[(ticker, 'Close_to_High')] = close_to_high
    data[(ticker, 'Close_to_Low')] = close_to_low
    data[(ticker, 'DD_Return')] = day_to_day_return

    Series_dict = {'Close': close, 'High': high, 'Low': low}

    # Rolling statistics per price series. The MA windows are processed for
    # all three series first, then the TREND windows, which keeps the column
    # order of the output unchanged.
    for label, windows in (('MA', MA_N), ('TREND', Trend_N)):
        for serie, matrice in Series_dict.items():
            for N in windows:
                rolling = matrice.rolling(window=N)
                rolling_mean = rolling.mean()
                # Rolling mean relative to the current value
                data[(ticker, f'{serie}_{label}_{N}')] = rolling_mean / matrice
                # Rolling standard deviation relative to the rolling mean
                data[(ticker, f'{serie}_STD_{N}')] = rolling.std() / rolling_mean
                # N-row growth raised to the power 252 / N. Computed from the
                # series itself: previously `close` was used for every series,
                # so the High_/Low_ growth columns duplicated the Close_ ones.
                data[(ticker, f'{serie}_GROWTH_{N}')] = (matrice / matrice.shift(N)) ** (252 / N) - 1

    for N in MA_N:
        # Rolling correlation between the close's relative std and the VIX rolling mean
        data[(ticker, f'Close_CORR_{N}')] = data[(ticker, f'Close_STD_{N}')].rolling(window=N).corr(vix_rolling_mean[N])

    # Target signal to predict
    data[(ticker, 'Signal')] = signal

# Create a DataFrame from the collected data
new_columns_df = pd.DataFrame(data)

# Concatenate with data_features to avoid fragmentation
data_features = pd.concat([data_features, new_columns_df], axis=1)

In [250]:
# List the distinct second-level column labels, i.e. the feature names.
feature_level = data_features.columns.get_level_values(1)
features = feature_level.unique()
print(f"features available: {features}")

features available: Index(['Stochastic', 'William', 'Basic_Mid', 'Close_to_High', 'Close_to_Low',
       'DD_Return', 'Close_MA_5', 'Close_STD_5', 'Close_GROWTH_5',
       'Close_MA_10', 'Close_STD_10', 'Close_GROWTH_10', 'Close_MA_15',
       'Close_STD_15', 'Close_GROWTH_15', 'Close_MA_35', 'Close_STD_35',
       'Close_GROWTH_35', 'Close_MA_40', 'Close_STD_40', 'Close_GROWTH_40',
       'Close_MA_45', 'Close_STD_45', 'Close_GROWTH_45', 'High_MA_5',
       'High_STD_5', 'High_GROWTH_5', 'High_MA_10', 'High_STD_10',
       'High_GROWTH_10', 'High_MA_15', 'High_STD_15', 'High_GROWTH_15',
       'High_MA_35', 'High_STD_35', 'High_GROWTH_35', 'High_MA_40',
       'High_STD_40', 'High_GROWTH_40', 'High_MA_45', 'High_STD_45',
       'High_GROWTH_45', 'Low_MA_5', 'Low_STD_5', 'Low_GROWTH_5', 'Low_MA_10',
       'Low_STD_10', 'Low_GROWTH_10', 'Low_MA_15', 'Low_STD_15',
       'Low_GROWTH_15', 'Low_MA_35', 'Low_STD_35', 'Low_GROWTH_35',
       'Low_MA_40', 'Low_STD_40', 'Low_GROWTH_40', 'Low_

In [251]:
# Write the feature table to CSV. The output folder is created first because
# to_csv fails when the target directory does not exist (e.g. fresh checkout).
os.makedirs('processed_data', exist_ok=True)
data_features.to_csv('processed_data/data_features.csv')