In [1]:
import numpy as np
import pandas as pd
import talib
import numpy
import datetime as dt
# pd.set_option('display.max_rows', None)  # show all rows
# pd.set_option('display.max_columns', None)  # show all columns
# pd.set_option('display.width', None)  # don't wrap lines
# pd.set_option('display.max_colwidth', None)  # show full text in cells

df = pd.read_excel(r'aud_usd_1h_5year.xlsx')
df['atr'] = talib.ATR(
    df['high'],
    df['low'],
    df['close'],
    timeperiod=24
)
df['ma'] = talib.KAMA(
    df['close'],
    timeperiod=14
)

df['adx'] = talib.ADX(
    df['high'],
    df['low'],
    df['close'],
    timeperiod=24
) / 100

df['rsi'] = talib.RSI(
    df['close'],
    timeperiod=14
) / 100



In [2]:
df = df.iloc[47:].reset_index(drop=True)

In [3]:
df = df.drop(columns=['Unnamed: 0'])

In [4]:
def set_ma_status(row):
    if row['close'] > row['ma']:
        return 'upper'
    if row['close'] < row['ma']:
        return 'under'
    return 'eq'


df['ma_status'] = df.apply(set_ma_status, axis=1)

In [5]:
df['id'] = df.index

In [6]:
import numpy as np

In [7]:
df['entry'] = df['close']
df['tp_buy'] = df['close'] + (df['atr'] * 1.5)
df['tp_sell'] = df['close'] - (df['atr'] * 1.5)


In [8]:
df['datetime'] = pd.to_datetime(df['datetime'])
df['timestamp'] = df['datetime'].apply(lambda d: int(d.timestamp()))

In [13]:
df.dtypes

datetime     datetime64[ns, UTC]
open                     float64
high                     float64
low                      float64
close                    float64
timeframe                 object
atr                      float64
ma                       float64
adx                      float64
rsi                      float64
ma_status                 object
id                         int64
entry                    float64
tp_buy                   float64
tp_sell                  float64
timestamp                  int64
dtype: object

In [9]:
array = np.array(df[['timestamp', 'open', 'high', 'low', 'close', 'atr', 'adx', 'tp_buy', 'tp_sell']])

In [10]:
patterns = []
array_size = len(array)

for i, element in enumerate(array):
    if i + 24 == array_size:
        break

    pat = array[i:i + 24, :]
    patterns.append(pat)

In [11]:
def get_trade_from_pattern(pattern: np.array):
    timestamp = pattern[-1, 0].copy()
    entry = pattern[-1, 4].copy()
    tp_buy = pattern[-1, 7].copy()
    tp_sell = pattern[-1, 8].copy()

    return timestamp, entry, tp_buy, tp_sell


any_trade = get_trade_from_pattern(patterns[10])

In [12]:
def evaluate_trade_result(chart: np.array, trade: tuple):
    timestamp, entry, tp_buy, tp_sell = trade

    backtest_chart = chart[chart[:, 0] >= timestamp]

    for i, element in enumerate(backtest_chart):
        price_high = element[2]
        price_low = element[3]

        # print(f'entry: {entry} price_high: {price_high} price_low: {price_low}')
        # print(f'tp_buy: {tp_buy} tp_sell: {tp_sell}')

        if price_high > tp_buy:
            return "buy"

        if price_low < tp_sell:
            return "sell"

    return "None"

In [13]:
trade_result = []
for pat in patterns:
    trade = get_trade_from_pattern(pat)
    result = evaluate_trade_result(array, trade)
    trade_result.append((trade[0], result))

In [14]:
patterns_for_train = []
for pat in patterns:
    id = pat[-1, 0]
    closes = pat[:, 4]
    atr = pat[-1, 5]
    adx = pat[-1, 6]
    result = np.array([id, atr, adx])
    result = np.hstack((result, closes))
    patterns_for_train.append(result)

In [15]:
patterns_for_train_df = pd.DataFrame(patterns_for_train, columns=[
    'timestamp',
    'atr',
    'adx',
    'close_1',
    'close_2',
    'close_3',
    'close_4',
    'close_5',
    'close_6',
    'close_7',
    'close_8',
    'close_9',
    'close_10',
    'close_11',
    'close_12',
    'close_13',
    'close_14',
    'close_15',
    'close_16',
    'close_17',
    'close_18',
    'close_19',
    'close_20',
    'close_21',
    'close_22',
    'close_23',
    'close_24',
])

In [16]:
trade_result_df = pd.DataFrame(trade_result, columns=['timestamp', 'result'])

In [17]:
patterns_for_train_df = pd.merge(patterns_for_train_df, trade_result_df, on='timestamp')

In [18]:
df_ma_statuses = df[['timestamp', 'ma_status', 'rsi']]

In [19]:
final_patterns_for_training = pd.merge(patterns_for_train_df, df_ma_statuses, on='timestamp')

In [20]:
final_patterns_for_training = final_patterns_for_training[
    final_patterns_for_training['result'] != 'None'
    ]

In [21]:
final_patterns_for_training['result'] = (
    final_patterns_for_training['result'].apply(lambda x: 1 if x == 'buy' else 0)
)

In [22]:
final_patterns_for_training['ma_status'] = (
    final_patterns_for_training['ma_status'].apply(lambda x: 1 if x == 'upper' else 0)
)

In [23]:
final_patterns_for_training.columns

Index(['timestamp', 'atr', 'adx', 'close_1', 'close_2', 'close_3', 'close_4',
       'close_5', 'close_6', 'close_7', 'close_8', 'close_9', 'close_10',
       'close_11', 'close_12', 'close_13', 'close_14', 'close_15', 'close_16',
       'close_17', 'close_18', 'close_19', 'close_20', 'close_21', 'close_22',
       'close_23', 'close_24', 'result', 'ma_status', 'rsi'],
      dtype='object')

In [24]:
# for i in range(1,25):
#     final_patterns_for_training[f'close_{i}'] = final_patterns_for_training[f'close_{i}'] * i

In [27]:
data_set = final_patterns_for_training[
    ['atr', 'adx', 'ma_status', 'rsi', 'close_1', 'close_2', 'close_3', 'close_4',
       'close_5', 'close_6', 'close_7', 'close_8', 'close_9', 'close_10',
       'close_11', 'close_12', 'close_13', 'close_14', 'close_15', 'close_16',
       'close_17', 'close_18', 'close_19', 'close_20', 'close_21', 'close_22',
       'close_23', 'close_24', 'result']].copy()

In [28]:
data_set_array = np.array(data_set)
# np.random.shuffle(data_set_array)

In [29]:
X , y = data_set_array[:, :-1], data_set_array[:, -1]

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, train_size=0.8,random_state=42)

In [31]:
from lightgbm import LGBMClassifier

lgbm_classifier = LGBMClassifier(
        # Core settings
    objective="binary",           # or "multiclass" if more than 2 labels
    boosting_type="gbdt",         # could try "dart" for more robustness
    n_estimators=800,             # enough trees for stable learning
    learning_rate=0.03,           # small learning rate = smoother fit

    # Tree structure
    num_leaves=31,                # typical good start, can tune 15–63
    max_depth=-1,                 # let it grow naturally first
    min_data_in_leaf=50,          # prevents overfitting with low correlations
    min_child_weight=1e-3,        # use slightly higher if data is noisy

    # Regularization
    reg_alpha=0.2,                # L1
    reg_lambda=0.8,               # L2
    bagging_fraction=0.8,         # row subsampling
    bagging_freq=1,               # every iteration
    feature_fraction=0.8,         # column subsampling
    subsample_freq=1,

    # Randomness
    random_state=42,
    n_jobs=-1,

    # Extra tricks
    verbose=-1,
    boosting="gbdt"
)

In [32]:
lgbm_classifier.fit(X_train, y_train)

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,800
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [33]:
from sklearn.model_selection import cross_val_predict
pred_lgbm_clf =  cross_val_predict(lgbm_classifier, X_train, y_train, cv = 3)




In [34]:
from sklearn.metrics import f1_score, precision_score, recall_score,accuracy_score

print(f"lgbm_clf f1 score is: {f1_score(y_train, pred_lgbm_clf)}")
print(f"lgbm_clf precision is: {precision_score(y_train, pred_lgbm_clf)}")
print(f"lgbm_clf recall score is: {recall_score(y_train, pred_lgbm_clf)}")

lgbm_clf f1 score is: 0.6870969886325866
lgbm_clf precision is: 0.6727414735745899
lgbm_clf recall score is: 0.7020785219399538


In [35]:
y_pred_test = lgbm_classifier.predict(X_test)



In [36]:
result = y_pred_test == y_test

In [37]:
result_df = pd.DataFrame(result, columns=['result'])

In [38]:
len(result_df[result_df['result'] == True]) / len(y_test)

0.7026649923259384

In [39]:
len(result_df[result_df['result'] == False])

2131

In [45]:
pd.DataFrame(y_pred_test)

Unnamed: 0,0
0,1.0
1,0.0
2,1.0
3,0.0
4,0.0
...,...
7162,0.0
7163,1.0
7164,1.0
7165,0.0


In [38]:
final_model = LGBMClassifier(
        # Core settings
    objective="binary",           # or "multiclass" if more than 2 labels
    boosting_type="gbdt",         # could try "dart" for more robustness
    n_estimators=800,             # enough trees for stable learning
    learning_rate=0.03,           # small learning rate = smoother fit

    # Tree structure
    num_leaves=31,                # typical good start, can tune 15–63
    max_depth=-1,                 # let it grow naturally first
    min_data_in_leaf=50,          # prevents overfitting with low correlations
    min_child_weight=1e-3,        # use slightly higher if data is noisy

    # Regularization
    reg_alpha=0.2,                # L1
    reg_lambda=0.8,               # L2
    bagging_fraction=0.8,         # row subsampling
    bagging_freq=1,               # every iteration
    feature_fraction=0.8,         # column subsampling
    subsample_freq=1,

    # Randomness
    random_state=42,
    n_jobs=-1,

    # Extra tricks
    verbose=-1,
    boosting="gbdt"
)

In [39]:
final_model.fit(X, y)

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,800
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [40]:
from sklearn.metrics import f1_score, precision_score, recall_score,accuracy_score

pred_final_lgbm_clf =  cross_val_predict(final_model, X, y, cv = 3)

print(f"lgbm_clf f1 score is: {f1_score(y, pred_final_lgbm_clf)}")
print(f"lgbm_clf precision is: {precision_score(y, pred_final_lgbm_clf)}")
print(f"lgbm_clf recall score is: {recall_score(y, pred_final_lgbm_clf)}")



lgbm_clf f1 score is: 0.699453264640942
lgbm_clf precision is: 0.6778236283050588
lgbm_clf recall score is: 0.7225088243279935




In [41]:
import joblib

joblib.dump(final_model, f"predictor0_0.0.1.pkl")


['predictor0_0.0.1.pkl']

In [42]:
y_test_df = pd.DataFrame(y_test, columns=['y'])

In [43]:
len(y_test_df[y_test_df['y'] == 0])

3454

In [44]:
df

Unnamed: 0,datetime,open,high,low,close,timeframe,atr,ma,adx,rsi,ma_status,id,entry,tp_buy,tp_sell,timestamp
0,2020-01-03 21:00:00+00:00,0.69567,0.69584,0.69526,0.69582,h1,0.000941,0.694661,0.375250,0.452116,upper,0,0.69582,0.697231,0.694409,1578085200
1,2020-01-03 22:00:00+00:00,0.69584,0.69584,0.69519,0.69531,h1,0.000929,0.694673,0.370280,0.417925,upper,1,0.69531,0.696703,0.693917,1578088800
2,2020-01-06 00:00:00+00:00,0.69405,0.69427,0.69339,0.69367,h1,0.000970,0.694637,0.370768,0.331189,under,2,0.69367,0.695125,0.692215,1578268800
3,2020-01-06 01:00:00+00:00,0.69366,0.69448,0.69326,0.69434,h1,0.000980,0.694633,0.370072,0.387148,under,3,0.69434,0.695811,0.692869,1578272400
4,2020-01-06 02:00:00+00:00,0.69433,0.69558,0.69431,0.69526,h1,0.000992,0.694654,0.363794,0.454626,upper,4,0.69526,0.696749,0.693771,1578276000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35852,2025-10-17 11:00:00+00:00,0.64630,0.64646,0.64514,0.64591,h1,0.001170,0.647903,0.309632,0.310925,under,35852,0.64591,0.647665,0.644155,1760698800
35853,2025-10-17 12:00:00+00:00,0.64592,0.64686,0.64580,0.64673,h1,0.001166,0.647837,0.315655,0.394323,under,35853,0.64673,0.648478,0.644982,1760702400
35854,2025-10-17 13:00:00+00:00,0.64674,0.64797,0.64608,0.64776,h1,0.001196,0.647836,0.316039,0.479532,under,35854,0.64776,0.649554,0.645966,1760706000
35855,2025-10-17 14:00:00+00:00,0.64783,0.65021,0.64740,0.64859,h1,0.001263,0.647840,0.307701,0.536162,upper,35855,0.64859,0.650485,0.646695,1760709600
