<a href="https://colab.research.google.com/github/roni762583/NEAT/blob/main/feature_reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
uploaded = files.upload()


In [1]:

# prompt: create pandas data frame from DAT_ASCII_EURJPY_M1_202406.csv
import pandas as pd
df = pd.read_csv('DAT_ASCII_EURJPY_M1_202407.csv', delimiter=';')

# rename columns
df.rename(columns={df.columns[i]: col_name for i, col_name in enumerate(['Time', 'Open', 'High', 'Low', 'Close', 'Garbage'])}, inplace=True)


df = df.drop('Garbage', axis=1)

df['Time'] = pd.to_datetime(df['Time'])
df.set_index('Time', inplace=True)
print(df.shape)
df.head()

(28262, 4)


Unnamed: 0_level_0,Open,High,Low,Close
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-07-15 00:01:00,172.071,172.085,172.071,172.084
2024-07-15 00:02:00,172.084,172.088,172.084,172.088
2024-07-15 00:03:00,172.088,172.091,172.087,172.09
2024-07-15 00:04:00,172.09,172.094,172.088,172.094
2024-07-15 00:05:00,172.095,172.105,172.09,172.102


In [2]:
# reduce size of df to first 3000 rows
df_reduced = df#.iloc[0:15000]
#df_reduced.head()

# Down sample one minute data
M5_df = df_reduced.resample('5T').agg({'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last'})
H1_df = df_reduced.resample('1H').agg({'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last'})
#df_8H = df_reduced.resample('8H').agg({'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last'})
#df_1D = df_reduced.resample('1D').agg({'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last'})

# push timestamp index one period forward to mark the endtime of bar instead of start time
print('M5')
print(M5_df.head())
M5_df.index = M5_df.index + pd.Timedelta(minutes=5)
print(M5_df.head())
print('H1')
print(H1_df.head())
H1_df.index = H1_df.index + pd.Timedelta(hours=1)
print(H1_df.head())
# shape
print(M5_df.shape)
print(H1_df.shape)

# put df's in list
#dfs_lst = [M5_df, H1_df]
dfs_lst = [('M5_df', M5_df), ('H1_df', H1_df)]

M5
                        Open     High      Low    Close
Time                                                   
2024-07-01 00:00:00  173.171  173.205  173.163  173.183
2024-07-01 00:05:00  173.181  173.195  173.157  173.157
2024-07-01 00:10:00  173.158  173.248  173.149  173.240
2024-07-01 00:15:00  173.239  173.256  173.214  173.219
2024-07-01 00:20:00  173.219  173.228  173.202  173.213
                        Open     High      Low    Close
Time                                                   
2024-07-01 00:05:00  173.171  173.205  173.163  173.183
2024-07-01 00:10:00  173.181  173.195  173.157  173.157
2024-07-01 00:15:00  173.158  173.248  173.149  173.240
2024-07-01 00:20:00  173.239  173.256  173.214  173.219
2024-07-01 00:25:00  173.219  173.228  173.202  173.213
H1
                        Open     High      Low    Close
Time                                                   
2024-07-01 00:00:00  173.171  173.256  173.149  173.231
2024-07-01 01:00:00  173.232  173.318  173

In [3]:

# prompt: delete df, and files DAT_ASCII_*.csv to conserve resources

del df, df_reduced
# !rm DAT_ASCII_*.csv


In [13]:
def find_extreme_si_values(df, column_name='SI', threshold_multiplier=2):
    """
    Find and print rows where the values in the specified column are extreme.
    An extreme value is defined as being more than a certain number of standard deviations
    away from the mean.

    Args:
        df (pd.DataFrame): The DataFrame to analyze.
        column_name (str): The name of the column to check for extreme values. Defaults to 'ASI'.
        threshold_multiplier (float): The number of standard deviations away from the mean to define as extreme. Defaults to 2.

    Returns:
        None
    """
    # Calculate the mean and standard deviation of the specified column
    mean_val = df[column_name].mean()
    std_val = df[column_name].std()

    # Define a threshold for extreme values
    threshold = threshold_multiplier * std_val

    # Find rows with extreme values
    extreme_rows = df[(df[column_name] > mean_val + threshold) | (df[column_name] < mean_val - threshold)]

    # Print the number of extreme rows found
    print(f"Number of rows with extreme {column_name} values: {len(extreme_rows)}")

    # Print the extreme rows with row numbers
    for index, row in extreme_rows.iterrows():
        print(f"Row {df.index.get_loc(index)}: {column_name}: {row[column_name]}")


In [6]:
import numpy as np
import pandas as pd

def calculate_asi_with_peaks_troughs(df, pip_size=0.01):
    """
    Calculates the Accumulative Swing Index (ASI), detects peaks and troughs,
    and handles conflicts where a data point is marked as both.
    """

    # Yesterday's values
    df['C1'] = df['Close']
    df['O1'] = df['Open']

    # Today's values
    df['C2'] = df['Close'].shift(-1)
    df['O2'] = df['Open'].shift(-1)
    df['H2'] = df['High'].shift(-1)
    df['L2'] = df['Low'].shift(-1)

    # For numerator (only place NOT to use ABS Val.)
    df['C2C1'] = (df['C2'] - df['C1']) / pip_size
    df['C2O2'] = (df['C2'] - df['O2']) / pip_size
    df['C1O1'] = (df['C1'] - df['O1']) / pip_size

    # For R
    df['H2C1'] = (df['H2'] - df['C1']).abs() / pip_size
    df['L2C1'] = (df['L2'] - df['C1']).abs() / pip_size
    df['H2L2'] = (df['H2'] - df['L2']).abs() / pip_size
    df['absC1O1'] = (df['C1'] - df['O1']).abs() / pip_size

    df['numerator'] = df['C2C1'] + 0.5 * df['C2O2'] + 0.25 * df['C1O1']

    # Create a new column 'R' and assign values based on the greatest value
    conditions = [
        (df['H2C1'] >= df[['H2C1', 'L2C1', 'H2L2']].max(axis=1)),
        (df['L2C1'] >= df[['H2C1', 'L2C1', 'H2L2']].max(axis=1)),
        (df['H2L2'] >= df[['H2C1', 'L2C1', 'H2L2']].max(axis=1))
    ]

    choices = [
        df['H2C1'] - 0.5 * df['L2C1'] + 0.25 * df['absC1O1'],
        df['L2C1'] - 0.5 * df['H2C1'] + 0.25 * df['absC1O1'],
        df['H2L2'] + 0.25 * df['absC1O1']
    ]

    df['R'] = np.select(conditions, choices, default=np.nan)

    df['K'] = np.maximum(df['H2C1'], df['L2C1'])

    df['time_diff'] = df.index.to_series().diff().dt.total_seconds() / 60.0
    time_diff_1_2 = df['time_diff'].iloc[1]
    time_diff_2_3 = df['time_diff'].iloc[2]

    if time_diff_1_2 == time_diff_2_3:
        time_frame = time_diff_1_2
    else:
        print('Check your data file, time frame not consistent')
        return None

    number_of_minutes_per_bar = time_frame
    df['SI'] = round(50 * (df['numerator'] / df['R']))

    df['ASI'] = df['SI'].cumsum()

    # Detect peaks, troughs, and handle conflicts
    df['swing_points'] = np.nan

    for i in range(2, len(df)):
        is_peak = df['ASI'].iloc[i-1] > df['ASI'].iloc[i-2] and df['ASI'].iloc[i-1] > df['ASI'].iloc[i]
        is_trough = df['ASI'].iloc[i-1] < df['ASI'].iloc[i-2] and df['ASI'].iloc[i-1] < df['ASI'].iloc[i]

        if is_peak and is_trough:
            df.at[df.index[i-1], 'swing_points'] = -9
        elif is_peak:
            df.at[df.index[i-1], 'swing_points'] = 1
        elif is_trough:
            df.at[df.index[i-1], 'swing_points'] = -1

    # Drop unused columns
    df.drop(columns=['R', 'K', 'C1', 'O1', 'C2', 'O2', 'H2', 'L2', 'C2C1', 'C2O2', 'C1O1', 'H2C1', 'L2C1', 'H2L2', 'absC1O1', 'numerator', 'time_diff'], inplace=True)

    return df


In [10]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_ohlc_asi(df, df_name, row_start=0, row_end=None):
    # Set row_end to the number of rows in the DataFrame if not provided
    if row_end is None:
        row_end = df.shape[0]

    # Create figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add candlestick trace
    fig.add_trace(go.Candlestick(x=df.index[row_start:row_end],
                                 open=df['Open'][row_start:row_end],
                                 high=df['High'][row_start:row_end],
                                 low=df['Low'][row_start:row_end],
                                 close=df['Close'][row_start:row_end]),
                  secondary_y=False)

    # Add ASI trace
    fig.add_trace(go.Scatter(x=df.index[row_start:row_end],
                             y=df['ASI'][row_start:row_end],
                             name='ASI'),
                  secondary_y=True)

    # Add peaks and troughs based on swing_points
    peaks = df[(df['swing_points'] == 1)].index
    troughs = df[(df['swing_points'] == -1)].index

    if not peaks.empty:
        peak_y = df['ASI'][peaks]  # ASI values for peaks
        fig.add_trace(go.Scatter(x=peaks, y=peak_y, mode='markers', name='Peaks', marker=dict(color='red', size=8, symbol='triangle-up')))

    if not troughs.empty:
        trough_y = df['ASI'][troughs]  # ASI values for troughs
        fig.add_trace(go.Scatter(x=troughs, y=trough_y, mode='markers', name='Troughs', marker=dict(color='blue', size=8, symbol='triangle-down')))

    # Set title
    fig.update_layout(title_text=f"{df_name}: OHLC and ASI")

    # Set x-axis title
    fig.update_xaxes(title_text="Date")

    # Set y-axes titles
    fig.update_yaxes(title_text="<b>OHLC</b>", secondary_y=False)
    fig.update_yaxes(title_text="<b>ASI</b>", secondary_y=True)

    # Show plot
    fig.show()


In [15]:
# ASI Calculated for TF's
for df_name, df in dfs_lst:
    calculate_asi_with_peaks_troughs(df)
    # drop rows with NaN's except in swing_points column
    df.dropna(subset=df.columns.difference(['swing_points']), inplace=True)
    print(df_name, df.shape,'\n', df.head(3),'\n','extreme values:')
    # examine SI's fall in range of 0 to 100
    find_extreme_si_values(df, column_name='SI', threshold_multiplier=2)
    print('\n')
    # plot peaks and troughs
    plot_ohlc_asi(df, df_name)




M5_df (5648, 7) 
                         Open     High      Low    Close    SI   ASI  \
Time                                                                  
2024-07-01 00:05:00  173.171  173.205  173.163  173.183 -43.0 -43.0   
2024-07-01 00:10:00  173.181  173.195  173.157  173.157  56.0  13.0   
2024-07-01 00:15:00  173.158  173.248  173.149  173.240  -8.0   5.0   

                     swing_points  
Time                               
2024-07-01 00:05:00           NaN  
2024-07-01 00:10:00           1.0  
2024-07-01 00:15:00           NaN   
 extreme values:
Number of rows with extreme SI values: 16
Row 212: SI: -73.0
Row 738: SI: -73.0
Row 1270: SI: -74.0
Row 1356: SI: -74.0
Row 1633: SI: -95.0
Row 1689: SI: -74.0
Row 1712: SI: 75.0
Row 2207: SI: -94.0
Row 3042: SI: -74.0
Row 3582: SI: -73.0
Row 3743: SI: -73.0
Row 3962: SI: -74.0
Row 4649: SI: -73.0
Row 4786: SI: -78.0
Row 5074: SI: -73.0
Row 5414: SI: -73.0




H1_df (469, 7) 
                         Open     High      Low    Close    SI   ASI  \
Time                                                                  
2024-07-01 01:00:00  173.171  173.256  173.149  173.231  -8.0  -8.0   
2024-07-01 02:00:00  173.232  173.318  173.142  173.202  21.0  13.0   
2024-07-01 03:00:00  173.196  173.470  173.195  173.285 -25.0 -12.0   

                     swing_points  
Time                               
2024-07-01 01:00:00           NaN  
2024-07-01 02:00:00           1.0  
2024-07-01 03:00:00          -1.0   
 extreme values:
Number of rows with extreme SI values: 5
Row 60: SI: -73.0
Row 198: SI: -68.0
Row 285: SI: -72.0
Row 357: SI: -70.0
Row 442: SI: -70.0


