## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from typing import Tuple
import pandas_ta as ta

# backtesting library
import vectorbt as vbt

# machine learning
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# get data from TradingView
from tvDatafeed import TvDatafeed, Interval
# No credentials are passed, so the client runs in anonymous ("nologin") mode;
# the library itself warns that the data reachable in this mode may be limited.
tv = TvDatafeed()

you are using nologin method, data you access may be limited


## Import Data

In [2]:
# Bar size used for the download (one candle per hour).
interval = Interval.in_1_hour

# Ask TradingView for up to 100,000 BTCUSDT candles from Binance.
raw_df = tv.get_hist(symbol='BTCUSDT', exchange='BINANCE', interval=interval, n_bars=100000)

# Stop here with a clear message if nothing came back, instead of failing in a
# later cell with an unrelated-looking error.
# NOTE(review): tvDatafeed is expected to return None (rather than raise) when
# the request fails -- confirm against the installed version.
if raw_df is None or raw_df.empty:
    raise RuntimeError(
        "TradingView returned no data for BINANCE:BTCUSDT; "
        "check the network connection, symbol and exchange."
    )

raw_df

Unnamed: 0_level_0,symbol,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01 07:00:00,BINANCE:BTCUSDT,46216.93,46731.39,46208.37,46656.13,1503.33095
2022-01-01 08:00:00,BINANCE:BTCUSDT,46656.14,46949.99,46574.06,46778.14,943.81539
2022-01-01 09:00:00,BINANCE:BTCUSDT,46778.14,46928.94,46721.96,46811.77,485.16860
2022-01-01 10:00:00,BINANCE:BTCUSDT,46811.77,46916.63,46760.12,46813.20,562.88971
2022-01-01 11:00:00,BINANCE:BTCUSDT,46813.21,46887.33,46591.23,46711.05,861.88389
...,...,...,...,...,...,...
2023-05-30 10:00:00,BINANCE:BTCUSDT,27783.43,27940.55,27773.69,27829.23,1430.88893
2023-05-30 11:00:00,BINANCE:BTCUSDT,27829.23,27848.09,27760.38,27776.19,733.62881
2023-05-30 12:00:00,BINANCE:BTCUSDT,27776.18,27776.19,27672.93,27753.38,969.42806
2023-05-30 13:00:00,BINANCE:BTCUSDT,27753.38,27821.86,27752.88,27795.81,835.43097


## Add technical indicators

In [3]:
strategy_name = "Demo Strategy"

# Indicator bundle evaluated by pandas_ta in a single pass. Each entry names
# the indicator (`kind`), its parameters and, where given, explicit names for
# the output columns (`col_names`).
strategy1 = ta.Strategy(
    name=strategy_name,
    description='',
    ta=[
        # Trend strength and directional movement
        dict(kind='adx', length=14, col_names=('ADX_14', 'DMP_14', 'DMN_14')),
        # Bollinger bands
        dict(kind='bbands', length=20, std=2, col_names=('BBL', 'BBM', 'BBU', 'BBB', 'BBP')),
        # Exponential moving averages (library default column names)
        dict(kind='ema', length=50),
        dict(kind='ema', length=100),
        dict(kind='ema', length=200),
        # Donchian channel
        dict(kind='donchian', lower_length=12, upper_length=24, col_names=('DCL', 'DCM', 'DCU')),
        # MACD
        dict(kind='macd', fast=12, slow=26, col_names=('MACD', 'MACD_H', 'MACD_S')),
    ],
)

# Work on a copy so the downloaded frame stays untouched; the indicator
# columns are added to `strat1` by the accessor call.
strat1 = raw_df.copy()
strat1.ta.strategy(strategy1)

# strat1.head(5)



In [4]:
# Target for the classifier.
# `ret` on row t is the close-to-close return of the *following* bar:
# pct_change() measures t-1 -> t, and shift(-1) pulls that value back one row.
strat1['ret'] = strat1['close'].pct_change().shift(-1)

# y is 1.0 when the next bar closes higher, 0.0 otherwise, and stays NaN
# wherever the forward return is unknown.
strat1['y'] = (strat1['ret'] > 0).astype(float).where(strat1['ret'].notna())
# strat1

## Data Cleaning

In [5]:
def num_split_data(data: pd.DataFrame, train_size: float, test_size: float, out_of_sample_size: float,
                   n_splits: int, bars_per_day: float = 24) -> Tuple[int, int, int]:
    """
    Calculate the row counts of the training, testing, and out-of-sample sets
    for time series cross-validation, and print a short summary.

    The out-of-sample share is taken from the whole frame first. The remaining
    rows are divided evenly into ``n_splits`` folds, and each fold is then
    split into training and testing rows by ``train_size`` / ``test_size``.

    Parameters
    ----------
    data : pd.DataFrame
        A pandas DataFrame containing the time series data (only its length is used).
    train_size : float
        The proportion of each fold to use for training.
    test_size : float
        The proportion of each fold to use for testing.
    out_of_sample_size : float
        The proportion of all rows to hold back for out-of-sample prediction.
    n_splits : int
        The number of folds to use for cross-validation.
    bars_per_day : float, default 24
        Number of rows that make up one day. Only used for the printed
        "Number of Days" figure; the default matches hourly bars.

    Returns
    -------
    Tuple[int, int, int]
        Rows per fold for training, rows per fold for testing, and the total
        number of out-of-sample rows.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1, ``bars_per_day`` is not positive,
        ``out_of_sample_size`` is outside [0, 1], or ``train_size`` /
        ``test_size`` is negative.
    """
    # Reject inputs that would otherwise fail cryptically or yield negative sizes.
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits!r}")
    if bars_per_day <= 0:
        raise ValueError(f"bars_per_day must be positive, got {bars_per_day!r}")
    if not 0 <= out_of_sample_size <= 1:
        raise ValueError(f"out_of_sample_size must be within [0, 1], got {out_of_sample_size!r}")
    if train_size < 0 or test_size < 0:
        raise ValueError(
            f"train_size and test_size must be non-negative, got {train_size!r} and {test_size!r}"
        )

    total_rows = len(data)

    # Rows reserved for the final out-of-sample evaluation (still fractional here).
    rows_out_of_sample = total_rows * out_of_sample_size

    # Rows left for the train/test folds, and the time span they cover.
    train_test_size = total_rows - rows_out_of_sample
    num_of_days = train_test_size / bars_per_day

    # Whole rows available to each fold, then the train/test share inside a fold.
    split_train_test_size = train_test_size // n_splits
    rows_train_size = round(split_train_test_size * train_size)
    rows_test_size = round(split_train_test_size * test_size)
    rows_out_of_sample = round(rows_out_of_sample)

    # Summary of the computed sizes.
    print(f"Number of Days: {num_of_days}")
    print(f"Training Size for Each Fold: {rows_train_size}")
    print(f"Testing Size for Each Fold: {rows_test_size}")
    print(f"Out of Sample Size for Each Fold: {rows_out_of_sample}")

    return rows_train_size, rows_test_size, rows_out_of_sample

In [6]:
# Number of cross-validation folds shared by the cells below.
n_splits = 10

# Sizes are derived from the length of the raw download, i.e. before the
# rows with incomplete indicators are dropped from the feature matrix.
rows_train_size, rows_test_size, rows_out_of_sample = num_split_data(
    data=raw_df,
    train_size=0.80,
    test_size=0.20,
    out_of_sample_size=0.1,
    n_splits=n_splits,
)

Number of Days: 462.825
Training Size for Each Fold: 888
Testing Size for Each Fold: 222
Out of Sample Size for Each Fold: 1234


In [7]:

# Feature matrix: indicator columns only. The identifier, the label columns and
# the raw OHLCV columns are removed by name (not by position), then every row
# that still has a missing indicator value is dropped.
X = (
    strat1
    .drop(columns=['symbol', 'ret', 'y', 'open', 'high', 'low', 'close', 'volume'])
    .dropna()
)

# Labels aligned to the surviving feature rows.
# NOTE: `ret` is built with shift(-1), so the final bar has no forward return
# and its label is NaN; drop that row before scoring anything against y_test.
y = strat1.loc[X.index, 'y']

# The training and validation datasets are used to explore and choose the methodology.
# The testing dataset is unseen data, used only once the best result has been
# reached on the training and validation datasets.
# An explicit split position is used because `iloc[:-0]` would return an empty
# training set when rows_out_of_sample is 0.
split_at = max(len(X) - rows_out_of_sample, 0)
X_train_val, X_test = X.iloc[:split_at], X.iloc[split_at:]
y_train_val, y_test = y.iloc[:split_at], y.iloc[split_at:]

## Split Train/Validation Dataset

In [8]:
# Rolling-window lengths, counted in hourly bars (hours per day * days).

# train on 4 weeks
train_size = 24 * 28

# validate on 1 week
test_size = 24 * 7

In [9]:
rf = RandomForestClassifier(random_state=42)

# Same splitter configuration as the scoring cells below: `n_splits` folds,
# each with at most `train_size` training rows followed by `test_size` test rows.
tscv = TimeSeriesSplit(n_splits=n_splits, max_train_size=train_size, test_size=test_size)

# Preview the folds: split() yields positional indices for the training rows
# and the test rows of each fold.
for k_fold, (train_idx, test_idx) in enumerate(tscv.split(X_train_val)):
    train_data = X_train_val.iloc[train_idx]
    test_data = X_train_val.iloc[test_idx]
    print('--------------------')
    # number of fold (1-based for display)
    print('Fold: ', k_fold+1)
    # shapes of the training and testing sets for this fold
    print('Training data shape:', train_data.shape)
    print('Testing data shape:', test_data.shape)

    # first and last timestamp of the training and testing sets for this fold
    print('Training date:', train_data.index[0], "-", train_data.index[-1])
    print('Testing date:', test_data.index[0], "-", test_data.index[-1])

--------------------
Fold:  1
Training data shape: (672, 17)
Testing data shape: (168, 17)
Training date: 2023-01-01 03:00:00 - 2023-01-29 02:00:00
Testing date: 2023-01-29 03:00:00 - 2023-02-05 02:00:00
--------------------
Fold:  2
Training data shape: (672, 17)
Testing data shape: (168, 17)
Training date: 2023-01-08 03:00:00 - 2023-02-05 02:00:00
Testing date: 2023-02-05 03:00:00 - 2023-02-12 02:00:00
--------------------
Fold:  3
Training data shape: (672, 17)
Testing data shape: (168, 17)
Training date: 2023-01-15 03:00:00 - 2023-02-12 02:00:00
Testing date: 2023-02-12 03:00:00 - 2023-02-19 02:00:00
--------------------
Fold:  4
Training data shape: (672, 17)
Testing data shape: (168, 17)
Training date: 2023-01-22 03:00:00 - 2023-02-19 02:00:00
Testing date: 2023-02-19 03:00:00 - 2023-02-26 02:00:00
--------------------
Fold:  5
Training data shape: (672, 17)
Testing data shape: (168, 17)
Training date: 2023-01-29 03:00:00 - 2023-02-26 02:00:00
Testing date: 2023-02-26 03:00:00 - 

## Normalization

In [10]:
# Candidate scaling steps; the None entry is the "no scaling" baseline.
normalizers = [
    ('scaler', None),
    ('scaler', RobustScaler()),
    ('scaler', MinMaxScaler()),
    ('scaler', StandardScaler())
]

# One splitter serves every candidate: its configuration does not change
# between iterations.
tscv = TimeSeriesSplit(n_splits=n_splits, max_train_size=train_size, test_size=test_size)

# NOTE(review): a random forest splits on thresholds, so rescaling features is
# expected to move the score very little -- compare the outputs below.
for step_name, scaler in normalizers:
    pipeline = Pipeline([
        (step_name, scaler),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # ROC AUC per fold for this scaler + forest combination
    score = cross_val_score(pipeline, X_train_val, y_train_val, scoring='roc_auc', cv=tscv)

    print("---------------------------")
    print(f'Normalizer: {type(scaler).__name__}')
    print('{}: {:.4f} +- {:.4f}'.format(type(pipeline).__name__, score.mean(), score.std()))
    print(score)

---------------------------
Normalizer: NoneType
Pipeline: 0.5449 +- 0.0515
[0.48072289 0.62900014 0.49337607 0.53389109 0.48431957 0.56591457
 0.52875216 0.63506807 0.54369279 0.55404637]
---------------------------
Normalizer: RobustScaler
Pipeline: 0.5432 +- 0.0487
[0.48072289 0.62359551 0.49337607 0.53389109 0.48474528 0.56108983
 0.52824899 0.625      0.54726831 0.55404637]
---------------------------
Normalizer: MinMaxScaler
Pipeline: 0.5443 +- 0.0498
[0.48072289 0.62900014 0.49337607 0.5417612  0.48247481 0.56108983
 0.52824899 0.625      0.54726831 0.55404637]
---------------------------
Normalizer: StandardScaler
Pipeline: 0.5432 +- 0.0487
[0.48072289 0.62359551 0.49337607 0.53389109 0.48474528 0.56108983
 0.52824899 0.625      0.54726831 0.55404637]


## Select the Best Normalizer

In [11]:
# Hyper-parameter candidates for the forest. The `rf__` prefix addresses the
# pipeline step named 'rf'.
param_grid = dict(
    rf__n_estimators=[50, 100, 200],
    rf__max_depth=[5, 10, None],
    rf__min_samples_split=[2, 5, 10],
    rf__min_samples_leaf=[1, 2, 4],
)

# Scaler + classifier evaluated as a single estimator
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Same rolling-window folds as in the cells above
tscv = TimeSeriesSplit(n_splits=n_splits, max_train_size=train_size, test_size=test_size)

# Exhaustive search over the grid, ranked by mean ROC AUC across the folds
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='roc_auc', cv=tscv)
grid_search.fit(X_train_val, y_train_val)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 10, 'rf__n_estimators': 50}
Best score:  0.5613308639172659


In [12]:
# ROC AUC of the selected model on the very rows the grid search was fitted
# on -- an in-sample figure, so expect it to sit above the cross-validated score.
grid_search.score(X=X_train_val, y=y_train_val)

0.8350252530005795

## Out of Sample

In [13]:
signal_df = X_test.copy()

# Boolean signal: True where the model predicts class 1 on the held-out rows
signal_df['signal'] = grid_search.predict(X_test) == 1

# more realistic: delay every signal by one bar; the first row then has no
# signal and is removed
signal_df['signal'] = signal_df['signal'].shift()
signal_df = signal_df.dropna()

# create signal backtesting (entry/exit columns derived from the signal)
signal_vectorbt = signal_df.ta.tsignals(signal_df.signal, asbool=True, append=True)

# close price over the same span as the signals
close_price = strat1.loc[signal_df.index[0]:signal_df.index[-1], 'close']

port = vbt.Portfolio.from_signals(
    close_price,
    entries=signal_vectorbt.TS_Entries,
    exits=signal_vectorbt.TS_Exits,
    freq="1h",
    init_cash=1000,
    size=0.1,
    # fees = 0.00075,     # 0.075% (BNB 25% off)
    # slippage = 0.001    # 0.1%
)

port.plot().show()
port.stats()

Start                               2023-04-09 06:00:00
End                                 2023-05-30 14:00:00
Period                                 51 days 09:00:00
Start Value                                      1000.0
End Value                                    958.897439
Total Return [%]                              -4.110256
Benchmark Return [%]                           -0.63411
Max Gross Exposure [%]                            100.0
Total Fees Paid                                     0.0
Max Drawdown [%]                               11.17887
Max Drawdown Duration                  47 days 14:00:00
Total Trades                                        122
Total Closed Trades                                 122
Total Open Trades                                     0
Open Trade PnL                                      0.0
Win Rate [%]                                   61.47541
Best Trade [%]                                 2.584027
Worst Trade [%]                               -9