# Main

In [1]:
import ccxt
import pandas as pd
from datetime import datetime

# Initialize the Indodax exchange
exchange = ccxt.indodax()

# Define the trading pair and timeframe
symbol = 'USDT/IDR'  # Example: USDT to Indonesian Rupiah
timeframe = '1h'     # Supported timeframes: '1m', '5m', '15m', '1h', '1d', etc.
limit = 60000      # Number of candles to fetch (max depends on the exchange)

# Fetch OHLCV data
ohlcv = exchange.fetch_ohlcv(symbol, timeframe, limit=limit)

# Convert to DataFrame
columns = ['date', 'open', 'high', 'low', 'close', 'volume']
data = [
    [datetime.utcfromtimestamp(c[0] / 1000).strftime('%Y-%m-%d %H:%M:%S'), *c[1:]]
    for c in ohlcv
]
indodax_df = pd.DataFrame(data, columns=columns)

# Ensure 'date' column is in datetime format
indodax_df['date'] = pd.to_datetime(indodax_df['date'])

# Convert from UTC to Bali time (UTC+8)
indodax_df['date'] = indodax_df['date'] + pd.Timedelta(hours=8)

indodax_df = indodax_df.tail(len(indodax_df) - 100)

# Print the DataFrame
indodax_df

Unnamed: 0,date,open,high,low,close,volume
100,2018-08-28 19:00:00,14543.0,14543.0,14537.0,14537.0,5159.853746
101,2018-08-28 20:00:00,14540.0,14551.0,14538.0,14550.0,1839.865859
102,2018-08-28 21:00:00,14550.0,14551.0,14550.0,14550.0,6104.525120
103,2018-08-28 22:00:00,14550.0,14551.0,14550.0,14551.0,4093.335581
104,2018-08-28 23:00:00,14550.0,14551.0,14550.0,14550.0,11764.586659
...,...,...,...,...,...,...
55050,2024-12-04 09:00:00,15987.0,16001.0,15982.0,15986.0,201488.095733
55051,2024-12-04 10:00:00,15986.0,16037.0,15955.0,16019.0,607856.204671
55052,2024-12-04 11:00:00,16013.0,16038.0,15992.0,16034.0,679751.652550
55053,2024-12-04 12:00:00,16007.0,16036.0,16000.0,16005.0,455462.783172


In [3]:
import ccxt
import pandas as pd
from datetime import datetime, timedelta

# Initialize the Indodax exchange
exchange = ccxt.tokocrypto()

# Define the trading pair and timeframe
symbol = 'USDT/IDR'  # Example: USDT to Indonesian Rupiah
timeframe = '1h'     # Supported timeframes: '1m', '5m', '15m', '1h', '1d', etc.
limit = 1000         # Number of candles to fetch per request

# Define the start and end times
start_time = int(datetime(2018, 8, 24, 15, 0).timestamp() * 1000)  # Start time in milliseconds
end_time = int(datetime.now().timestamp() * 1000)                  # Current time in milliseconds

# Initialize an empty list to collect OHLCV data
all_ohlcv = []
current_time = start_time

# Loop to fetch data in chunks of `limit`
while current_time < end_time:
    print(f"Fetching data starting from: {datetime.utcfromtimestamp(current_time / 1000).strftime('%Y-%m-%d %H:%M:%S')} UTC")
    ohlcv = exchange.fetch_ohlcv(symbol, timeframe, since=current_time, limit=limit)

    if not ohlcv:
        print("No more data available.")
        break

    all_ohlcv.extend(ohlcv)
    current_time = ohlcv[-1][0] + 1  # Increment start time to avoid overlapping

# Convert the collected data into a DataFrame
columns = ['date', 'open', 'high', 'low', 'close', 'volume']
data = [
    [datetime.utcfromtimestamp(c[0] / 1000).strftime('%Y-%m-%d %H:%M:%S'), *c[1:]]
    for c in all_ohlcv
]
tokocrypto_df = pd.DataFrame(data, columns=columns)

# Ensure 'date' column is in datetime format
tokocrypto_df['date'] = pd.to_datetime(tokocrypto_df['date'])

# Convert from UTC to Bali time (UTC+8)
tokocrypto_df['date'] = tokocrypto_df['date'] + pd.Timedelta(hours=8)

# Print the DataFrame
tokocrypto_df

Fetching data starting from: 2018-08-24 15:00:00 UTC
Fetching data starting from: 2023-09-12 03:00:00 UTC
Fetching data starting from: 2023-10-23 19:00:00 UTC
Fetching data starting from: 2023-12-04 11:00:00 UTC
Fetching data starting from: 2024-01-15 03:00:00 UTC
Fetching data starting from: 2024-02-25 19:00:00 UTC
Fetching data starting from: 2024-04-07 11:00:00 UTC
Fetching data starting from: 2024-05-19 03:00:00 UTC
Fetching data starting from: 2024-06-29 19:00:00 UTC
Fetching data starting from: 2024-08-10 11:00:00 UTC
Fetching data starting from: 2024-09-21 03:00:00 UTC
Fetching data starting from: 2024-11-01 19:00:00 UTC
Fetching data starting from: 2024-11-29 09:00:00 UTC
No more data available.


Unnamed: 0,date,open,high,low,close,volume
0,2023-08-01 20:00:00,15500.0,15500.0,15500.0,15500.0,0.000000
1,2023-08-01 21:00:00,15500.0,15500.0,15500.0,15500.0,0.000000
2,2023-08-01 22:00:00,15500.0,15500.0,15500.0,15500.0,0.000000
3,2023-08-01 23:00:00,15159.0,21000.0,15050.0,15120.0,22060.430480
4,2023-08-02 00:00:00,15120.0,15138.0,15084.0,15137.0,44587.912635
...,...,...,...,...,...,...
11657,2024-11-29 13:00:00,15895.0,15900.0,15866.0,15900.0,228724.000000
11658,2024-11-29 14:00:00,15900.0,15908.0,15860.0,15876.0,241338.600000
11659,2024-11-29 15:00:00,15885.0,15899.0,15873.0,15878.0,339245.700000
11660,2024-11-29 16:00:00,15893.0,15904.0,15878.0,15886.0,203342.200000


In [4]:
import pandas as pd

# Merge the two DataFrames on the 'date' column
df = pd.merge(indodax_df, tokocrypto_df, on='date', suffixes=('_indodax', '_tokocrypto'))

# Calculate the average of 'open' and 'close' for the two sources
df['open'] = (df['open_indodax'] + df['open_tokocrypto']) / 2
df['close'] = (df['close_indodax'] + df['close_tokocrypto']) / 2

# Keep only the desired columns
df = df[['date', 'open', 'close']]

# Display the resulting DataFrame
df


Unnamed: 0,date,open,close
0,2023-08-01 20:00:00,15279.0,15279.5
1,2023-08-01 21:00:00,15279.0,15279.5
2,2023-08-01 22:00:00,15279.0,15279.0
3,2023-08-01 23:00:00,15108.5,15089.0
4,2023-08-02 00:00:00,15089.5,15098.0
...,...,...,...
11657,2024-11-29 13:00:00,15887.5,15894.5
11658,2024-11-29 14:00:00,15894.5,15869.0
11659,2024-11-29 15:00:00,15873.5,15867.5
11660,2024-11-29 16:00:00,15875.0,15876.0


In [5]:
# Lag features
for lag in range(1, 6):
    df[f'open_lag_{lag}'] = df['open'].shift(lag)
    df[f'close_lag_{lag}'] = df['close'].shift(lag)

# Moving averages
for window in [5, 10, 20]:
    df[f'ma_open_{window}'] = df['open'].rolling(window=window).mean()
    df[f'ma_close_{window}'] = df['close'].rolling(window=window).mean()

# Percentage changes
df['open_pct_change'] = df['open'].pct_change() * 100
df['close_pct_change'] = df['close'].pct_change() * 100

# Time-based features
df['hour'] = df['date'].dt.hour
df['dayofweek'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

# Rolling statistics
for window in [5, 10, 20]:
    df[f'rolling_min_close_{window}'] = df['close'].rolling(window=window).min()
    df[f'rolling_max_close_{window}'] = df['close'].rolling(window=window).max()

# Exponential moving averages
for span in [5, 10, 20]:
    df[f'ema_close_{span}'] = df['close'].ewm(span=span).mean()

# Target variable
df['next_close'] = df['close'].shift(-1)

# Drop rows with NaN values
df = df.dropna()

df

Unnamed: 0,date,open,close,open_lag_1,close_lag_1,open_lag_2,close_lag_2,open_lag_3,close_lag_3,open_lag_4,...,rolling_min_close_5,rolling_max_close_5,rolling_min_close_10,rolling_max_close_10,rolling_min_close_20,rolling_max_close_20,ema_close_5,ema_close_10,ema_close_20,next_close
19,2023-08-02 15:00:00,15150.5,15150.5,15150.0,15151.0,15120.5,15149.5,15137.5,15137.0,15121.0,...,15137.0,15151.0,15098.0,15170.5,15089.0,15279.5,15143.557081,15136.594145,15134.912676,15150.5
20,2023-08-02 16:00:00,15151.0,15150.5,15150.5,15150.5,15150.0,15151.0,15120.5,15149.5,15137.5,...,15137.0,15151.0,15098.0,15170.5,15089.0,15279.5,15145.871851,15139.160427,15136.603925,15150.5
21,2023-08-02 17:00:00,15150.5,15150.5,15151.0,15150.5,15150.5,15150.5,15150.0,15151.0,15120.5,...,15149.5,15151.0,15098.0,15151.0,15089.0,15279.0,15147.414774,15141.247415,15138.091934,15151.0
22,2023-08-02 18:00:00,15151.0,15151.0,15150.5,15150.5,15151.0,15150.5,15150.5,15150.5,15150.0,...,15150.5,15151.0,15099.0,15151.0,15089.0,15170.5,15148.609956,15143.038339,15139.457968,15151.0
23,2023-08-02 19:00:00,15150.5,15151.0,15151.0,15151.0,15150.5,15150.5,15151.0,15150.5,15150.5,...,15150.5,15151.0,15121.5,15151.0,15089.0,15170.5,15149.406684,15144.497732,15140.666638,15151.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11656,2024-11-29 12:00:00,15873.5,15887.0,15876.0,15877.5,15887.5,15875.5,15901.5,15894.0,15895.5,...,15875.5,15894.5,15875.5,15928.5,15875.5,15942.0,15887.183139,15895.866393,15907.510155,15894.5
11657,2024-11-29 13:00:00,15887.5,15894.5,15873.5,15887.0,15876.0,15877.5,15887.5,15875.5,15901.5,...,15875.5,15894.5,15875.5,15928.5,15875.5,15938.5,15889.622093,15895.617958,15906.271092,15869.0
11658,2024-11-29 14:00:00,15894.5,15869.0,15887.5,15894.5,15873.5,15887.0,15876.0,15877.5,15887.5,...,15869.0,15894.5,15869.0,15917.5,15869.0,15936.0,15882.748062,15890.778329,15902.721465,15867.5
11659,2024-11-29 15:00:00,15873.5,15867.5,15894.5,15869.0,15887.5,15894.5,15873.5,15887.0,15876.0,...,15867.5,15894.5,15867.5,15900.0,15867.5,15933.5,15877.665375,15886.545906,15899.367039,15876.0


In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

# Separate df into features and target
features_df = df.drop(['date', 'next_close'], axis=1)
target_df = df['next_close']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Specify parameters for the sliding window approach
num_predictions = 1   # Number of rows to predict
gap = 1               # Gap (number of rows to skip after each window)
max_windows = 50      # Maximum number of windows to process
set_limit = False     # Set this to False to process all windows

# Define list of window sizes
# window_sizes = list(range(765, 776, 1))
window_sizes = [1000]

# List to store results
results = []

# Loop through each window size
for window_size in window_sizes:

    # Initiate lists to store RMSEs and percentages
    all_val_rmse = []
    all_val_rmse_perc = []
    all_train_rmse = []
    all_train_rmse_perc = []
    total_window_times = 0  # Variable to store total time for all windows

    # Counters and variables for prediction comparison to actual values
    lower_count = 0
    higher_count = 0
    max_rmse_perc_lower = 0  # To store max RMSE% when prediction is lower than actual
    max_rmse_perc_higher = 0  # To store max RMSE% when prediction is higher than actual

    # Separate lists to store RMSE percentage for lower and higher predictions
    lower_rmse_percs = []
    higher_rmse_percs = []

    # Separate lists to store val_rmse for lower and higher predictions
    rmse_lower_perc = []  # To store RMSE when prediction is lower than actual
    rmse_higher_perc = []  # To store RMSE when prediction is higher than actual

    # Calculate the number of windows based on dataset size
    num_windows = len(X) - window_size - num_predictions

    # Apply maximum window limit if set
    if set_limit:
        num_windows = min(num_windows, max_windows)

    # Loop through each sliding window with the gap applied
    window_number = 0
    while window_number < num_windows:
        if window_number % 100:
           print(f"Processing Window {window_number}/{num_windows}")

        start = window_number
        end = start + window_size
        X_train = X[start:end]
        y_train = y[start:end]

        # Prepare validation data for prediction
        X_val = X[end:end + num_predictions]
        y_val = y[end:end + num_predictions]

        # Track the start time of the window processing
        start_time = time.time()

        # Initialize and fit the model
        model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
        model.fit(X_train, y_train)

        # Predict on validation data
        y_pred_val = model.predict(X_val)
        # Predict on training data
        y_pred_train = model.predict(X_train)

        # Calculate RMSE and RMSE percentage for validation
        mse_val = np.mean((y_val - y_pred_val) ** 2)
        rmse_val = np.sqrt(mse_val)
        rmse_val_perc = (rmse_val / y_val)[0] * 100  # Convert to percentage

        # Calculate RMSE for training
        mse_train = np.mean((y_train - y_pred_train) ** 2)
        rmse_train = np.sqrt(mse_train)
        rmse_train_perc = (rmse_train / y_train).mean() * 100  # Convert to percentage

        # Track the end time of the window processing
        end_time = time.time()

        # Calculate the time taken for this window
        window_time = end_time - start_time
        total_window_times += window_time  # Add the window time to the total time

        # Append RMSEs and percentage errors
        all_val_rmse.append(rmse_val)
        all_val_rmse_perc.append(rmse_val_perc)
        all_train_rmse.append(rmse_train)
        all_train_rmse_perc.append(rmse_train_perc)

        # Count predictions relative to actual values and update max RMSE percentage
        if y_pred_val < y_val:
            lower_count += 1
            lower_rmse_percs.append(rmse_val_perc)  # Store RMSE percentage for lower predictions
            max_rmse_perc_lower = max(max_rmse_perc_lower, rmse_val_perc)
        elif y_pred_val > y_val:
            higher_count += 1
            higher_rmse_percs.append(rmse_val_perc)  # Store RMSE percentage for higher predictions
            max_rmse_perc_higher = max(max_rmse_perc_higher, rmse_val_perc)

        # Move to the next window based on the gap
        window_number += gap

    # Calculate percentage for lower and higher counts
    lower_count_perc = (lower_count / num_windows) * 100
    higher_count_perc = (higher_count / num_windows) * 100

    # Calculate average RMSE percentage errors for lower and higher predictions
    avg_rmse_perc_lower = np.mean(lower_rmse_percs) if lower_rmse_percs else 0
    avg_rmse_perc_higher = np.mean(higher_rmse_percs) if higher_rmse_percs else 0

    # Calculate average, max, min, and variance for validation and training RMSEs, percentages
    avg_val_rmse = np.mean(all_val_rmse)
    var_val_rmse = np.var(all_val_rmse)

    avg_val_rmse_perc = np.mean(all_val_rmse_perc)
    var_val_rmse_perc = np.var(all_val_rmse_perc)
    max_avg_val_rmse_perc = np.max(all_val_rmse_perc)

    avg_train_rmse = np.mean(all_train_rmse)
    var_train_rmse = np.var(all_train_rmse)

    avg_train_rmse_perc = np.mean(all_train_rmse_perc)
    var_train_rmse_perc = np.var(all_train_rmse_perc)

    # Calculate percentage of times prediction is lower or equal to average RMSE
    percentage_lower_equal_avg_rmse = (np.sum(np.array(all_val_rmse) <= avg_val_rmse) / len(all_val_rmse)) * 100

    # Append results to the list with updated metric name
    results.append({
        'window_size': window_size,
        'avg_val_rmse': avg_val_rmse,
        'windowed_confidence_level': percentage_lower_equal_avg_rmse,  # Renamed metric
        'var_val_rmse': var_val_rmse,
        'avg_val_rmse_perc': avg_val_rmse_perc,
        'var_val_rmse_perc': var_val_rmse_perc,
        'avg_train_rmse': avg_train_rmse,
        'var_train_rmse': var_train_rmse,
        'avg_train_rmse_perc': avg_train_rmse_perc,
        'var_train_rmse_perc': var_train_rmse_perc,
        'window_time': total_window_times,
        'max_avg_val_rmse_perc': max_avg_val_rmse_perc,
        'lower_count_perc': lower_count_perc,
        'higher_count_perc': higher_count_perc,
        'max_rmse_perc_lower': max_rmse_perc_lower,
        'max_rmse_perc_higher': max_rmse_perc_higher,
        'avg_rmse_perc_lower': avg_rmse_perc_lower,
        'avg_rmse_perc_higher': avg_rmse_perc_higher,

    })

    # Print results for the current window size with the new name
    print(f'Window size [{window_size}] | Time Elapsed: {total_window_times:.3f} seconds')
    print(f'Average Prediction Error: {avg_val_rmse:.3f} IDR | {avg_val_rmse_perc:.3f} % | Confidence Level: {percentage_lower_equal_avg_rmse:.3f} % ')
    print(f'Under Predict - Count: {lower_count_perc:.3f} % | Average: {avg_rmse_perc_lower:.3f} % | Max: {max_rmse_perc_lower:.3f} %')
    print(f'Over Predict - Count: {higher_count_perc:.3f} % | Average: {avg_rmse_perc_higher:.3f} % | Max: {max_rmse_perc_higher:.3f} %')
    print(f'===========================================================================')

results_summary = pd.DataFrame(results)
results_summary

Processing Window 0/10617
Processing Window 1/10617
Processing Window 2/10617
Processing Window 3/10617
Processing Window 4/10617
Processing Window 5/10617
Processing Window 6/10617
Processing Window 7/10617
Processing Window 8/10617
Processing Window 9/10617
Processing Window 10/10617
Processing Window 11/10617
Processing Window 12/10617
Processing Window 13/10617
Processing Window 14/10617
Processing Window 15/10617
Processing Window 16/10617
Processing Window 17/10617
Processing Window 18/10617
Processing Window 19/10617
Processing Window 20/10617
Processing Window 21/10617
Processing Window 22/10617
Processing Window 23/10617
Processing Window 24/10617
Processing Window 25/10617
Processing Window 26/10617
Processing Window 27/10617
Processing Window 28/10617
Processing Window 29/10617
Processing Window 30/10617
Processing Window 31/10617
Processing Window 32/10617
Processing Window 33/10617
Processing Window 34/10617
Processing Window 35/10617
Processing Window 36/10617
Processing 

Unnamed: 0,window_size,avg_val_rmse,windowed_confidence_level,var_val_rmse,avg_val_rmse_perc,var_val_rmse_perc,avg_train_rmse,var_train_rmse,avg_train_rmse_perc,var_train_rmse_perc,window_time,max_avg_val_rmse_perc,lower_count_perc,higher_count_perc,max_rmse_perc_lower,max_rmse_perc_higher,avg_rmse_perc_lower,avg_rmse_perc_higher
0,1000,10.505201,65.31977,164.35588,0.066235,0.006303,8.536352,1.779455,0.053966,6.6e-05,7893.48363,2.399144,52.208722,47.791278,2.399144,1.382095,0.064805,0.067797


In [None]:
import time
import numpy as np
import pandas as pd
import logging
from sklearn.ensemble import GradientBoostingRegressor

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Separate df into features and target
features_df = df.drop(['date', 'next_close'], axis=1)
target_df = df['next_close']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Specify parameters for the sliding window approach
num_predictions = 1   # Number of rows to predict
gap = 1               # Gap (number of rows to skip after each window)
max_windows = 50      # Maximum number of windows to process
set_limit = False     # Set this to False to process all windows

# Define list of window sizes
window_sizes = list(range(2000, 11001, 1000))
# window_sizes = [1000]

# List to store results
results = []

# Loop through each window size
for window_size in window_sizes:

    # Initiate lists to store RMSEs and percentages
    all_val_rmse = []
    all_val_rmse_perc = []
    all_train_rmse = []
    all_train_rmse_perc = []
    total_window_times = 0  # Variable to store total time for all windows

    # Counters and variables for prediction comparison to actual values
    lower_count = 0
    higher_count = 0
    max_rmse_perc_lower = 0  # To store max RMSE% when prediction is lower than actual
    max_rmse_perc_higher = 0  # To store max RMSE% when prediction is higher than actual

    # Separate lists to store RMSE percentage for lower and higher predictions
    lower_rmse_percs = []
    higher_rmse_percs = []

    # Separate lists to store val_rmse for lower and higher predictions
    rmse_lower_perc = []  # To store RMSE when prediction is lower than actual
    rmse_higher_perc = []  # To store RMSE when prediction is higher than actual

    # Calculate the number of windows based on dataset size
    num_windows = len(X) - window_size - num_predictions

    # Apply maximum window limit if set
    if set_limit:
        num_windows = min(num_windows, max_windows)

    # Loop through each sliding window with the gap applied
    window_number = 0
    while window_number < num_windows:
        if window_number % 100:
            logger.info(f"Processing Window {window_number}/{num_windows}")

        start = window_number
        end = start + window_size
        X_train = X[start:end]
        y_train = y[start:end]

        # Prepare validation data for prediction
        X_val = X[end:end + num_predictions]
        y_val = y[end:end + num_predictions]

        # Track the start time of the window processing
        start_time = time.time()

        # Initialize and fit the model
        model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
        model.fit(X_train, y_train)

        # Predict on validation data
        y_pred_val = model.predict(X_val)
        # Predict on training data
        y_pred_train = model.predict(X_train)

        # Calculate RMSE and RMSE percentage for validation
        mse_val = np.mean((y_val - y_pred_val) ** 2)
        rmse_val = np.sqrt(mse_val)
        rmse_val_perc = (rmse_val / y_val)[0] * 100  # Convert to percentage

        # Calculate RMSE for training
        mse_train = np.mean((y_train - y_pred_train) ** 2)
        rmse_train = np.sqrt(mse_train)
        rmse_train_perc = (rmse_train / y_train).mean() * 100  # Convert to percentage

        # Track the end time of the window processing
        end_time = time.time()

        # Calculate the time taken for this window
        window_time = end_time - start_time
        total_window_times += window_time  # Add the window time to the total time

        # Append RMSEs and percentage errors
        all_val_rmse.append(rmse_val)
        all_val_rmse_perc.append(rmse_val_perc)
        all_train_rmse.append(rmse_train)
        all_train_rmse_perc.append(rmse_train_perc)

        # Count predictions relative to actual values and update max RMSE percentage
        if y_pred_val < y_val:
            lower_count += 1
            lower_rmse_percs.append(rmse_val_perc)  # Store RMSE percentage for lower predictions
            max_rmse_perc_lower = max(max_rmse_perc_lower, rmse_val_perc)
        elif y_pred_val > y_val:
            higher_count += 1
            higher_rmse_percs.append(rmse_val_perc)  # Store RMSE percentage for higher predictions
            max_rmse_perc_higher = max(max_rmse_perc_higher, rmse_val_perc)

        # Move to the next window based on the gap
        window_number += gap

    # Calculate percentage for lower and higher counts
    lower_count_perc = (lower_count / num_windows) * 100
    higher_count_perc = (higher_count / num_windows) * 100

    # Calculate average RMSE percentage errors for lower and higher predictions
    avg_rmse_perc_lower = np.mean(lower_rmse_percs) if lower_rmse_percs else 0
    avg_rmse_perc_higher = np.mean(higher_rmse_percs) if higher_rmse_percs else 0

    # Calculate average, max, min, and variance for validation and training RMSEs, percentages
    avg_val_rmse = np.mean(all_val_rmse)
    var_val_rmse = np.var(all_val_rmse)

    avg_val_rmse_perc = np.mean(all_val_rmse_perc)
    var_val_rmse_perc = np.var(all_val_rmse_perc)
    max_avg_val_rmse_perc = np.max(all_val_rmse_perc)

    avg_train_rmse = np.mean(all_train_rmse)
    var_train_rmse = np.var(all_train_rmse)

    avg_train_rmse_perc = np.mean(all_train_rmse_perc)
    var_train_rmse_perc = np.var(all_train_rmse_perc)

    # Calculate percentage of times prediction is lower or equal to average RMSE
    percentage_lower_equal_avg_rmse = (np.sum(np.array(all_val_rmse) <= avg_val_rmse) / len(all_val_rmse)) * 100

    # Append results to the list with updated metric name
    results.append({
        'window_size': window_size,
        'avg_val_rmse': avg_val_rmse,
        'windowed_confidence_level': percentage_lower_equal_avg_rmse,  # Renamed metric
        'var_val_rmse': var_val_rmse,
        'avg_val_rmse_perc': avg_val_rmse_perc,
        'var_val_rmse_perc': var_val_rmse_perc,
        'avg_train_rmse': avg_train_rmse,
        'var_train_rmse': var_train_rmse,
        'avg_train_rmse_perc': avg_train_rmse_perc,
        'var_train_rmse_perc': var_train_rmse_perc,
        'window_time': total_window_times,
        'max_avg_val_rmse_perc': max_avg_val_rmse_perc,
        'lower_count_perc': lower_count_perc,
        'higher_count_perc': higher_count_perc,
        'max_rmse_perc_lower': max_rmse_perc_lower,
        'max_rmse_perc_higher': max_rmse_perc_higher,
        'avg_rmse_perc_lower': avg_rmse_perc_lower,
        'avg_rmse_perc_higher': avg_rmse_perc_higher,
    })

    # Log results for the current window size with the new name
    logger.info(f'Window size [{window_size}] | Time Elapsed: {total_window_times:.3f} seconds')
    logger.info(f'Average Prediction Error: {avg_val_rmse:.3f} IDR | {avg_val_rmse_perc:.3f} % | Confidence Level: {percentage_lower_equal_avg_rmse:.3f} % ')
    logger.info(f'Under Predict - Count: {lower_count_perc:.3f} % | Average: {avg_rmse_perc_lower:.3f} % | Max: {max_rmse_perc_lower:.3f} %')
    logger.info(f'Over Predict - Count: {higher_count_perc:.3f} % | Average: {avg_rmse_perc_higher:.3f} % | Max: {max_rmse_perc_higher:.3f} %')
    logger.info(f'===========================================================================')

results_summary = pd.DataFrame(results)

In [13]:
import logging
import os
from datetime import datetime

# Get the current timestamp for the log file name
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")

# Set a custom script name with the current time
script_name = f"gbm_usdt_idr-{current_time}"

# Construct the log file name
log_filename = os.path.join(os.getcwd(), f"{script_name}.log")

# Configure logging
logging.basicConfig(
    filename=log_filename,
    level=logging.INFO,  # Set the desired logging level
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Test logging
logging.info("Log file initiated successfully.")
logging.info("This is a sample log message.")

print(f"Log file created: {log_filename}")

Log file created: /home/ubuntu/Rheza/local-share/gbm_usdt_idr-20241129_100052.log


# Indodax Data Exploration

In [2]:
indodax_df

Unnamed: 0,date,open,high,low,close,volume
100,2018-08-28 19:00:00,14543.0,14543.0,14537.0,14537.0,5159.853746
101,2018-08-28 20:00:00,14540.0,14551.0,14538.0,14550.0,1839.865859
102,2018-08-28 21:00:00,14550.0,14551.0,14550.0,14550.0,6104.525120
103,2018-08-28 22:00:00,14550.0,14551.0,14550.0,14551.0,4093.335581
104,2018-08-28 23:00:00,14550.0,14551.0,14550.0,14550.0,11764.586659
...,...,...,...,...,...,...
55048,2024-12-04 07:00:00,16034.0,16050.0,16004.0,16027.0,128705.673654
55049,2024-12-04 08:00:00,16010.0,16027.0,15967.0,15987.0,265645.961721
55050,2024-12-04 09:00:00,15987.0,16001.0,15982.0,15986.0,201488.095733
55051,2024-12-04 10:00:00,15986.0,16037.0,15955.0,16019.0,607856.204671


In [3]:
check_1 = indodax_df.copy()

check_1 = check_1[['date','close']]

check_1['close_1h_chg_pct'] = (check_1['close'].shift(-1) - check_1['close']) / check_1['close'] * 100
check_1['close_2h_chg_pct'] = (check_1['close'].shift(-2) - check_1['close']) / check_1['close'] * 100
check_1['close_3h_chg_pct'] = (check_1['close'].shift(-3) - check_1['close']) / check_1['close'] * 100
check_1['close_4h_chg_pct'] = (check_1['close'].shift(-4) - check_1['close']) / check_1['close'] * 100
check_1['close_5h_chg_pct'] = (check_1['close'].shift(-5) - check_1['close']) / check_1['close'] * 100
check_1['close_6h_chg_pct'] = (check_1['close'].shift(-6) - check_1['close']) / check_1['close'] * 100
check_1['close_7h_chg_pct'] = (check_1['close'].shift(-7) - check_1['close']) / check_1['close'] * 100
check_1['close_8h_chg_pct'] = (check_1['close'].shift(-8) - check_1['close']) / check_1['close'] * 100
check_1['close_12h_chg_pct'] = (check_1['close'].shift(-12) - check_1['close']) / check_1['close'] * 100
check_1['close_16h_chg_pct'] = (check_1['close'].shift(-16) - check_1['close']) / check_1['close'] * 100
check_1['close_24h_chg_pct'] = (check_1['close'].shift(-24) - check_1['close']) / check_1['close'] * 100
check_1['close_48h_chg_pct'] = (check_1['close'].shift(-48) - check_1['close']) / check_1['close'] * 100
check_1['close_72h_chg_pct'] = (check_1['close'].shift(-72) - check_1['close']) / check_1['close'] * 100

check_1.dropna(inplace=True)

check_1

Unnamed: 0,date,close,close_1h_chg_pct,close_2h_chg_pct,close_3h_chg_pct,close_4h_chg_pct,close_5h_chg_pct,close_6h_chg_pct,close_7h_chg_pct,close_8h_chg_pct,close_12h_chg_pct,close_16h_chg_pct,close_24h_chg_pct,close_48h_chg_pct,close_72h_chg_pct
100,2018-08-28 19:00:00,14537.0,0.089427,0.089427,0.096306,0.089427,0.103185,0.089427,0.089427,0.089427,0.089427,-0.213249,0.020637,0.433377,0.811722
101,2018-08-28 20:00:00,14550.0,0.000000,0.006873,0.000000,0.013746,0.000000,0.000000,0.000000,-0.013746,-0.075601,-0.130584,-0.061856,0.336770,1.209622
102,2018-08-28 21:00:00,14550.0,0.006873,0.000000,0.013746,0.000000,0.000000,0.000000,-0.013746,-0.061856,-0.171821,-0.295533,0.068729,0.261168,0.996564
103,2018-08-28 22:00:00,14551.0,-0.006872,0.006872,-0.006872,-0.006872,-0.006872,-0.020617,-0.068724,-0.089341,0.020617,-0.041234,0.048107,0.501684,1.223284
104,2018-08-28 23:00:00,14550.0,0.013746,0.000000,0.000000,0.000000,-0.013746,-0.061856,-0.082474,0.000000,-0.302405,0.096220,0.240550,0.515464,1.010309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54976,2024-12-01 07:00:00,15845.0,0.119912,0.075734,0.189334,0.164090,0.189334,0.189334,0.183023,0.201956,0.063111,-0.088356,-0.618492,0.347113,1.148627
54977,2024-12-01 08:00:00,15864.0,-0.044125,0.069339,0.044125,0.069339,0.069339,0.063036,0.081947,-0.081947,0.000000,-0.271054,-0.743822,0.208018,0.775340
54978,2024-12-01 09:00:00,15857.0,0.113515,0.088289,0.113515,0.113515,0.107208,0.126127,-0.037838,0.044145,-0.012613,-0.233335,-0.693700,0.201804,0.813521
54979,2024-12-01 10:00:00,15875.0,-0.025197,0.000000,0.000000,-0.006299,0.012598,-0.151181,-0.069291,-0.094488,-0.119685,-0.409449,-0.277165,0.088189,0.907087


In [5]:
# Function to calculate the required statistics
def calculate_statistics(df):
    results = {}

    for column in df.columns[1:]:  # Exclude the 'date' column
        col_data = df[column]

        # Percentages
        positive_pct = (col_data > 0).mean() * 100
        negative_pct = (col_data < 0).mean() * 100
        zero_pct = (col_data == 0).mean() * 100

        # Positive statistics
        positive_values = col_data[col_data > 0]
        positive_avg = positive_values.mean()
        positive_min = positive_values.min()
        positive_max = positive_values.max()

        # Negative statistics
        negative_values = col_data[col_data < 0]
        negative_avg = negative_values.mean()
        negative_min = negative_values.min()
        negative_max = negative_values.max()

        results[column] = {
            "positive_pct": positive_pct,
            "negative_pct": negative_pct,
            "zero_pct": zero_pct,
            "positive_avg": positive_avg,
            "positive_min": positive_min,
            "positive_max": positive_max,
            "negative_avg": negative_avg,
            "negative_min": negative_min,
            "negative_max": negative_max,
        }

    return pd.DataFrame(results).T

# Calculate statistics
stats_1 = calculate_statistics(check_1)

# Display the results
stats_1

Unnamed: 0,positive_pct,negative_pct,zero_pct,positive_avg,positive_min,positive_max,negative_avg,negative_min,negative_max
close,100.0,0.0,0.0,14885.054992,13267.0,16998.0,,,
close_1h_chg_pct,39.844391,40.669813,19.485797,0.129489,0.005952,8.961083,-0.125914,-6.938319,-0.006022
close_2h_chg_pct,42.198575,43.116926,14.684499,0.160175,0.006022,9.145763,-0.155276,-6.211566,-0.005952
close_3h_chg_pct,43.459485,44.512673,12.027842,0.182472,0.006022,8.571621,-0.176177,-6.484875,-0.006022
close_4h_chg_pct,44.191979,45.195969,10.612052,0.201747,0.005953,8.418075,-0.194787,-6.633952,-0.006022
close_5h_chg_pct,44.867987,45.811847,9.320165,0.218645,0.006022,9.561726,-0.211171,-7.074974,-0.005952
close_6h_chg_pct,45.267032,46.025036,8.707932,0.233913,0.006024,9.464881,-0.226602,-7.037704,-0.006013
close_7h_chg_pct,45.686121,46.513365,7.800514,0.248211,0.00602,9.509557,-0.239874,-7.447668,-0.006009
close_8h_chg_pct,46.01957,46.6974,7.283031,0.261638,0.006024,9.079206,-0.253437,-6.832723,-0.006013
close_12h_chg_pct,47.090979,47.439004,5.470017,0.306526,0.006024,10.638444,-0.298009,-6.510862,-0.00601


# Clean + Features Engineering

In [2]:
df_1h = indodax_df[['date','open','close']]
df_1h = df_1h[df_1h['date'] >= '2023-01-01']
df_1h

Unnamed: 0,date,open,close
38169,2023-01-01 00:00:00,15668.0,15668.0
38170,2023-01-01 01:00:00,15668.0,15669.0
38171,2023-01-01 02:00:00,15669.0,15668.0
38172,2023-01-01 03:00:00,15669.0,15668.0
38173,2023-01-01 04:00:00,15669.0,15668.0
...,...,...,...
55050,2024-12-04 09:00:00,15987.0,15986.0
55051,2024-12-04 10:00:00,15986.0,16019.0
55052,2024-12-04 11:00:00,16013.0,16034.0
55053,2024-12-04 12:00:00,16007.0,16005.0


In [5]:
import pandas as pd

df = df_1h.copy()
# Step 1: Sort by date (if not already sorted)
df = df.sort_values('date').reset_index(drop=True)

# Step 2: Create lagged features
df['prev_close'] = df['close'].shift(1)

# Step 3: Calculate price changes
df['open_change'] = ((df['open'] - df['open'].shift(1)) / df['open'].shift(1)) * 100
df['close_change'] = ((df['close'] - df['close'].shift(1)) / df['close'].shift(1)) * 100

# Step 4: Calculate price range
df['price_range'] = df['open'] - df['close']

# Step 5: Calculate moving averages
df['ma_close_3'] = df['close'].rolling(window=3).mean()
df['ma_close_6'] = df['close'].rolling(window=6).mean()

# Step 6: Calculate trend indicators
df['gap'] = df['open'] - df['prev_close']

# Step 7: Relative strength (close to moving averages ratio)
df['close_ma3_ratio'] = df['close'] / df['ma_close_3']
df['close_ma6_ratio'] = df['close'] / df['ma_close_6']

# Step 9: Drop rows with NaN values due to shifting and rolling
df = df.dropna().reset_index(drop=True)

# Step 8: Create the target variable (next_close_change)
df['next_close_change'] = ((df['close'].shift(-1) - df['close']) / df['close']).apply(lambda x: 1 if x >= 0 else (0 if pd.notna(x) else x))

df

Unnamed: 0,date,open,close,prev_close,open_change,close_change,price_range,ma_close_3,ma_close_6,gap,close_ma3_ratio,close_ma6_ratio,next_close_change
0,2023-01-01 05:00:00,15669.0,15668.0,15668.0,0.000000,0.000000,1.0,15668.000000,15668.166667,1.0,1.000000,0.999989,1.0
1,2023-01-01 06:00:00,15669.0,15669.0,15668.0,0.000000,0.006382,0.0,15668.333333,15668.333333,1.0,1.000043,1.000043,0.0
2,2023-01-01 07:00:00,15669.0,15668.0,15669.0,0.000000,-0.006382,1.0,15668.333333,15668.166667,0.0,0.999979,0.999989,1.0
3,2023-01-01 08:00:00,15669.0,15668.0,15668.0,0.000000,0.000000,1.0,15668.333333,15668.166667,1.0,0.999979,0.999989,1.0
4,2023-01-01 09:00:00,15669.0,15668.0,15668.0,0.000000,0.000000,1.0,15668.000000,15668.166667,1.0,1.000000,0.999989,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16876,2024-12-04 09:00:00,15987.0,15986.0,15987.0,-0.143660,-0.006255,1.0,16000.000000,16011.833333,0.0,0.999125,0.998387,1.0
16877,2024-12-04 10:00:00,15986.0,16019.0,15986.0,-0.006255,0.206431,-33.0,15997.333333,16011.833333,0.0,1.001354,1.000448,1.0
16878,2024-12-04 11:00:00,16013.0,16034.0,16019.0,0.168898,0.093639,-21.0,16013.000000,16014.500000,-6.0,1.001311,1.001218,0.0
16879,2024-12-04 12:00:00,16007.0,16005.0,16034.0,-0.037470,-0.180866,2.0,16019.333333,16009.666667,-27.0,0.999105,0.999709,0.0


In [6]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Separate df into features and target
features_df = df.drop(['date', 'next_close_change'], axis=1)
target_df = df['next_close_change']

# Convert to NumPy arrays
X = features_df.values
y = target_df.values

# Specify parameters for the sliding window approach
num_predictions = 1   # Number of rows to predict
gap = 1               # Gap (number of rows to skip after each window)
max_windows = 50      # Maximum number of windows to process
set_limit = False     # Set this to False to process all windows

# Define list of window sizes
# window_sizes = list(range(2000, 16001, 1000))
window_sizes = [2000]

# List to store results
results = []

# Loop through each window size
for window_size in window_sizes:

    # Initiate lists to store F1 scores and accuracies
    all_f1_scores = []
    accuracies = []
    total_window_times = 0  # Variable to store total time for all windows

    # Calculate the number of windows based on dataset size
    num_windows = len(X) - window_size - num_predictions

    # Apply maximum window limit if set
    if set_limit:
        num_windows = min(num_windows, max_windows)

    # Loop through each sliding window with the gap applied
    window_number = 0
    while window_number < num_windows:

        if window_number % 1000 == 0:
            print(f'Processing window {window_number} of {num_windows}')

        start = window_number
        end = start + window_size
        X_train = X[start:end]
        y_train = y[start:end]

        X_train_mean = np.mean(X_train, axis=0)
        X_train_std = np.std(X_train, axis=0)
        X_train_normalized = (X_train - X_train_mean) / X_train_std

        # Prepare validation data for prediction
        X_val = X[end:end + num_predictions]
        y_val = y[end:end + num_predictions]

        # Normalize validation data using the statistics from the training set
        X_val_normalized = (X_val - X_train_mean) / X_train_std

        # Track the start time of the window processing
        start_time = time.time()

        # Initialize and fit the Logistic Regression model
        model = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence if needed
        model.fit(X_train_normalized, y_train)  # Train without normalizing y_train

        # Predict on validation data
        y_pred_val = model.predict(X_val_normalized)
        
        # Binary prediction and ground truth
        y_pred_binary_val = np.where(y_pred_val  == True, 1, 0)  # Binary prediction (1 if predicted value <= 0.03)
        y_binary_val = np.where(y_val == True, 1, 0)  # Binary ground truth (1 if actual value is True, else 0)


        # Calculate True Positives (TP), False Positives (FP), False Negatives (FN), and True Negatives (TN)
        TP = np.sum((y_pred_binary_val == 1) & (y_binary_val == 1))
        FP = np.sum((y_pred_binary_val == 1) & (y_binary_val == 0))
        FN = np.sum((y_pred_binary_val == 0) & (y_binary_val == 1))
        TN = np.sum((y_pred_binary_val == 0) & (y_binary_val == 0))

        # Calculate Precision and Recall safely
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0  # Avoid division by zero
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0  # Avoid division by zero

        # Calculate F1 Score safely
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # Calculate Accuracy
        accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) > 0 else 0

        # Store the results
        all_f1_scores.append(f1)
        accuracies.append(accuracy)

        # Track the end time of the window processing
        end_time = time.time()

        # Calculate the time taken for this window
        window_time = end_time - start_time
        total_window_times += window_time  # Add the window time to the total time

        # Move to the next window based on the gap
        window_number += gap
    
    # Calculate average F1 score and accuracy
    avg_f1_score = np.mean(all_f1_scores)
    avg_accuracy = np.mean(accuracies)

    # Store the results
    results.append({
        'window_size': window_size,
        'avg_f1_score': avg_f1_score,
        'avg_accuracy': avg_accuracy,
        'total_time': total_window_times
    })
    
    # Print results for the current window size with the new name
    print(f'Window size [{window_size}] | Time Elapsed: {total_window_times:.3f} seconds')
    print(f'Average F1 Score: {avg_f1_score:.3f} | Average Accuracy: {avg_accuracy:.3f} %')
    print(f'===================================================================================')

# Convert the results into a DataFrame for easy viewing
results_summary = pd.DataFrame(results)

# Display the results
results_summary

Processing window 0 of 14880
Processing window 1000 of 14880


: 