In [2]:
#Packages
import pandas as pd
import numpy as np 
from talib import abstract
from talib.abstract import *
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier


df = pd.read_csv("D:\\RA_stock_prediction\\RA_stock_prediction\\data\\futures_full_1min_continuous_adjusted_kzvua9e\\A6_1min_continuous_adjusted.txt", sep=',', header = None)

In [3]:
#Setting inputs 
inputs = {
    "open": df[0], 
    "high": df[1], 
    'low': df[2], 
    'close': df[3],
    'volume': df[4]
}

signals = pd.DataFrame(index=df.index)

# Calculating indicators
signals['EMA10'] = abstract.EMA(inputs, timeperiod=10)
signals['SMA30'] = abstract.SMA(inputs, timeperiod=30)
signals['MACD'], signals['MACD_signal'], _ = abstract.MACD(inputs, fastperiod=12, slowperiod=26, signalperiod=9)
signals['upper_band'], signals['middle_band'], signals['lower_band'] = abstract.BBANDS(inputs, timeperiod=20, nbdevup=1.0, nbdevdn=1.0, matype=0)
signals['RSI'] = abstract.RSI(inputs, timeperiod=14)
signals['ROC'] = abstract.ROC(inputs, timeperiod=10)
signals['VOL'] = df[4]
signals['ADX'] = abstract.ADX(inputs, timeperiod=14)
signals['ATR'] = abstract.ATR(inputs, timeperiod=14)
signals['CMO'] = abstract.CMO(inputs, timeperiod=14)
signals['OBV'] = abstract.OBV(inputs['close'], inputs['volume'])
signals['WILLR'] = abstract.WILLR(inputs, timeperiod=14)
signals['CCI'] = abstract.CCI(inputs, timeperiod=14)
signals['MFI'] = abstract.MFI(inputs, timeperiod=14)
signals['PPO'] = abstract.PPO(inputs, fastperiod=12, slowperiod=26, matype=0)
signals['ULTOSC'] = abstract.ULTOSC(inputs, timeperiod1=7, timeperiod2=14, timeperiod3=28)
signals['CMF'] = abstract.ADOSC(inputs, fastperiod=3, slowperiod=10)
signals['SAR'] = abstract.SAR(inputs, acceleration=0.02, maximum=0.2)
signals['slowk'], signals['slowd'] = abstract.STOCH(inputs, fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
signals['APO'] = abstract.APO(inputs, fastperiod=12, slowperiod=26)
signals['AROON_down'], signals['AROON_up'] = abstract.AROON(inputs, timeperiod=14)
signals['DEMA'] = abstract.DEMA(inputs, timeperiod=30)
signals['HT_TRENDLINE'] = abstract.HT_TRENDLINE(inputs)
signals['KAMA'] = abstract.KAMA(inputs, timeperiod=30)
signals['MOM'] = abstract.MOM(inputs, timeperiod=10)
signals['TEMA'] = abstract.TEMA(inputs, timeperiod=30)
signals['TRIX'] = abstract.TRIX(inputs, timeperiod=15)


EMA20 = abstract.EMA(inputs, timeperiod=20)
ATR10 = abstract.ATR(inputs, timeperiod=10)
signals['KC_upper'] = EMA20 + (ATR10 * 2)
signals['KC_middle'] = EMA20
signals['KC_lower'] = EMA20 - (ATR10 * 2)


In [4]:
# Calculate the 2-hour (120 minutes) rolling maximum of the 'close' price
signals['2hr_max_close'] = df[3].rolling(window=120, min_periods=1).max().shift(-120)

# Define the threshold for the price increase 
price_increase_threshold = 1.01  
signals['target'] = (signals['2hr_max_close'] >= df[3] * price_increase_threshold).astype(int)

num_buy_signals = signals['target'].sum()
print(f"Number of 'buy' signals: {int(num_buy_signals)}/{int(len(df))}")


# Drop rows with NaN values
clean_signals = signals.dropna()


# Split data into features and target
X = clean_signals[['EMA10', 'SMA30', 'MACD', 'MACD_signal', 'upper_band', 'middle_band', 'lower_band', 'RSI', 'ROC', 'VOL', 'ADX', 'ATR', 'CMO', 'OBV', 'WILLR', 'CCI', "MFI", "PPO", "ULTOSC", "CMF", "SAR", "slowk", "APO", "AROON_down", "DEMA", "HT_TRENDLINE", "KAMA", "MOM", "TEMA", "TRIX", "KC_upper", "KC_middle", "KC_lower"]]
y = clean_signals['target']



Number of 'buy' signals: 57324/5271627


In [5]:
# add , random_state=42 for a seed
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5, random_state = 42)

In [6]:
# Initialize and train the classifier

clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


KeyboardInterrupt: 

In [None]:

#Testing 
f1 = f1_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)

print("Gradient Boosting Classifier Results")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")

Gradient Boosting Classifier Results
F1 Score: 0.2581046324298352
Precision: 0.6643387167175906
Recall: 0.16016556755998315
Accuracy: 0.9900410136168243


In [None]:
random_predictions = np.random.randint(2, size=len(y_test))

# Evaluate the random model
f1 = f1_score(y_test, random_predictions)
precision = precision_score(y_test, random_predictions)
recall = recall_score(y_test, random_predictions)
accuracy = accuracy_score(y_test, random_predictions)

print(f"Random Test Results:")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")


Random Test Results:
F1 Score: 0.021234086327769616
Precision: 0.01084688307782965
Recall: 0.5010523361863337
Accuracy: 0.500394201182983
